1 /*
   2  * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/macroAssembler.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "gc/shenandoah/brooksPointer.hpp"
  29 #include "gc/shenandoah/shenandoahBarrierSet.hpp"
  30 #include "gc/shenandoah/shenandoahHeap.hpp"
  31 #include "gc/shenandoah/shenandoahHeapRegion.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "nativeInst_x86.hpp"
  34 #include "oops/instanceOop.hpp"
  35 #include "oops/method.hpp"
  36 #include "oops/objArrayKlass.hpp"
  37 #include "oops/oop.inline.hpp"
  38 #include "prims/methodHandles.hpp"
  39 #include "runtime/frame.inline.hpp"
  40 #include "runtime/handles.inline.hpp"
  41 #include "runtime/sharedRuntime.hpp"
  42 #include "runtime/stubCodeGenerator.hpp"
  43 #include "runtime/stubRoutines.hpp"
  44 #include "runtime/thread.inline.hpp"
  45 #ifdef COMPILER2
  46 #include "opto/runtime.hpp"
  47 #endif
  48 
  49 // Declaration and definition of StubGenerator (no .hpp file).
  50 // For a more detailed description of the stub routine structure
  51 // see the comment in stubRoutines.hpp
  52 
  53 #define __ _masm->
  54 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
  55 #define a__ ((Assembler*)_masm)->
  56 
  57 #ifdef PRODUCT
  58 #define BLOCK_COMMENT(str) /* nothing */
  59 #else
  60 #define BLOCK_COMMENT(str) __ block_comment(str)
  61 #endif
  62 
  63 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  64 const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
  65 
  66 // Stub Code definitions
  67 
  68 class StubGenerator: public StubCodeGenerator {
  69  private:
  70 
  71 #ifdef PRODUCT
  72 #define inc_counter_np(counter) ((void)0)
  73 #else
  74   void inc_counter_np_(int& counter) {
  75     // This can destroy rscratch1 if counter is far from the code cache
  76     __ incrementl(ExternalAddress((address)&counter));
  77   }
  78 #define inc_counter_np(counter) \
  79   BLOCK_COMMENT("inc_counter " #counter); \
  80   inc_counter_np_(counter);
  81 #endif
  82 
  83   // Call stubs are used to call Java from C
  84   //
  85   // Linux Arguments:
  86   //    c_rarg0:   call wrapper address                   address
  87   //    c_rarg1:   result                                 address
  88   //    c_rarg2:   result type                            BasicType
  89   //    c_rarg3:   method                                 Method*
  90   //    c_rarg4:   (interpreter) entry point              address
  91   //    c_rarg5:   parameters                             intptr_t*
  92   //    16(rbp): parameter size (in words)              int
  93   //    24(rbp): thread                                 Thread*
  94   //
  95   //     [ return_from_Java     ] <--- rsp
  96   //     [ argument word n      ]
  97   //      ...
  98   // -12 [ argument word 1      ]
  99   // -11 [ saved r15            ] <--- rsp_after_call
 100   // -10 [ saved r14            ]
 101   //  -9 [ saved r13            ]
 102   //  -8 [ saved r12            ]
 103   //  -7 [ saved rbx            ]
 104   //  -6 [ call wrapper         ]
 105   //  -5 [ result               ]
 106   //  -4 [ result type          ]
 107   //  -3 [ method               ]
 108   //  -2 [ entry point          ]
 109   //  -1 [ parameters           ]
 110   //   0 [ saved rbp            ] <--- rbp
 111   //   1 [ return address       ]
 112   //   2 [ parameter size       ]
 113   //   3 [ thread               ]
 114   //
 115   // Windows Arguments:
 116   //    c_rarg0:   call wrapper address                   address
 117   //    c_rarg1:   result                                 address
 118   //    c_rarg2:   result type                            BasicType
 119   //    c_rarg3:   method                                 Method*
 120   //    48(rbp): (interpreter) entry point              address
 121   //    56(rbp): parameters                             intptr_t*
 122   //    64(rbp): parameter size (in words)              int
 123   //    72(rbp): thread                                 Thread*
 124   //
 125   //     [ return_from_Java     ] <--- rsp
 126   //     [ argument word n      ]
 127   //      ...
 128   // -60 [ argument word 1      ]
 129  // -59 [ saved xmm31          ] <--- rsp_after_call
 130   //     [ saved xmm16-xmm30    ] (EVEX enabled, else the space is blank)
 131   // -27 [ saved xmm15          ]
 132   //     [ saved xmm7-xmm14     ]
 133   //  -9 [ saved xmm6           ] (each xmm register takes 2 slots)
 134   //  -7 [ saved r15            ]
 135   //  -6 [ saved r14            ]
 136   //  -5 [ saved r13            ]
 137   //  -4 [ saved r12            ]
 138   //  -3 [ saved rdi            ]
 139   //  -2 [ saved rsi            ]
 140   //  -1 [ saved rbx            ]
 141   //   0 [ saved rbp            ] <--- rbp
 142   //   1 [ return address       ]
 143   //   2 [ call wrapper         ]
 144   //   3 [ result               ]
 145   //   4 [ result type          ]
 146   //   5 [ method               ]
 147   //   6 [ entry point          ]
 148   //   7 [ parameters           ]
 149   //   8 [ parameter size       ]
 150   //   9 [ thread               ]
 151   //
 152   //    Windows reserves the caller's stack space for arguments 1-4.
 153   //    We spill c_rarg0-c_rarg3 to this space.
 154 
 155   // Call stub stack layout word offsets from rbp
 156   enum call_stub_layout {
 157 #ifdef _WIN64
 158     xmm_save_first     = 6,  // save from xmm6
 159     xmm_save_last      = 31, // to xmm31
 160     xmm_save_base      = -9,
 161     rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -27
 162     r15_off            = -7,
 163     r14_off            = -6,
 164     r13_off            = -5,
 165     r12_off            = -4,
 166     rdi_off            = -3,
 167     rsi_off            = -2,
 168     rbx_off            = -1,
 169     rbp_off            =  0,
 170     retaddr_off        =  1,
 171     call_wrapper_off   =  2,
 172     result_off         =  3,
 173     result_type_off    =  4,
 174     method_off         =  5,
 175     entry_point_off    =  6,
 176     parameters_off     =  7,
 177     parameter_size_off =  8,
 178     thread_off         =  9
 179 #else
 180     rsp_after_call_off = -12,
 181     mxcsr_off          = rsp_after_call_off,
 182     r15_off            = -11,
 183     r14_off            = -10,
 184     r13_off            = -9,
 185     r12_off            = -8,
 186     rbx_off            = -7,
 187     call_wrapper_off   = -6,
 188     result_off         = -5,
 189     result_type_off    = -4,
 190     method_off         = -3,
 191     entry_point_off    = -2,
 192     parameters_off     = -1,
 193     rbp_off            =  0,
 194     retaddr_off        =  1,
 195     parameter_size_off =  2,
 196     thread_off         =  3
 197 #endif
 198   };
 199 
 200 #ifdef _WIN64
 201   Address xmm_save(int reg) {
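         // Each saved xmm register occupies two stack words; the slots grow
         // toward lower addresses starting at xmm_save_base (see layout above).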
 202     assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range");
 203     return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize);
 204   }
 205 #endif
 206 
 207   address generate_call_stub(address& return_address) {
 208     assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 &&
 209            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 210            "adjust this code");
 211     StubCodeMark mark(this, "StubRoutines", "call_stub");
 212     address start = __ pc();
 213 
 214     // same as in generate_catch_exception()!
 215     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 216 
 217     const Address call_wrapper  (rbp, call_wrapper_off   * wordSize);
 218     const Address result        (rbp, result_off         * wordSize);
 219     const Address result_type   (rbp, result_type_off    * wordSize);
 220     const Address method        (rbp, method_off         * wordSize);
 221     const Address entry_point   (rbp, entry_point_off    * wordSize);
 222     const Address parameters    (rbp, parameters_off     * wordSize);
 223     const Address parameter_size(rbp, parameter_size_off * wordSize);
 224 
 225     // same as in generate_catch_exception()!
 226     const Address thread        (rbp, thread_off         * wordSize);
 227 
 228     const Address r15_save(rbp, r15_off * wordSize);
 229     const Address r14_save(rbp, r14_off * wordSize);
 230     const Address r13_save(rbp, r13_off * wordSize);
 231     const Address r12_save(rbp, r12_off * wordSize);
 232     const Address rbx_save(rbp, rbx_off * wordSize);
 233 
 234     // stub code
 235     __ enter();
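         // rsp_after_call_off is negative, so this reserves the whole
         // register-save area laid out above (down to rsp_after_call).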
 236     __ subptr(rsp, -rsp_after_call_off * wordSize);
 237 
 238     // save register parameters
 239 #ifndef _WIN64
 240     __ movptr(parameters,   c_rarg5); // parameters
 241     __ movptr(entry_point,  c_rarg4); // entry_point
 242 #endif
 243 
 244     __ movptr(method,       c_rarg3); // method
 245     __ movl(result_type,  c_rarg2);   // result type
 246     __ movptr(result,       c_rarg1); // result
 247     __ movptr(call_wrapper, c_rarg0); // call wrapper
 248 
 249     // save regs belonging to calling function
 250     __ movptr(rbx_save, rbx);
 251     __ movptr(r12_save, r12);
 252     __ movptr(r13_save, r13);
 253     __ movptr(r14_save, r14);
 254     __ movptr(r15_save, r15);
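         // On AVX-512, reset opmask register k1 to 0xffff (all 16 lanes
         // enabled); EVEX-encoded instructions emitted elsewhere use k1 as
         // their default merge mask and expect this value.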
 255     if (UseAVX > 2) {
 256       __ movl(rbx, 0xffff);
 257       __ kmovwl(k1, rbx);
 258     }
 259 #ifdef _WIN64
 260     int last_reg = 15;
 261     if (UseAVX > 2) {
 262       last_reg = 31;
 263     }
 264     if (VM_Version::supports_evex()) {
 265       for (int i = xmm_save_first; i <= last_reg; i++) {
 266         __ vextractf32x4(xmm_save(i), as_XMMRegister(i), 0);
 267       }
 268     } else {
 269       for (int i = xmm_save_first; i <= last_reg; i++) {
 270         __ movdqu(xmm_save(i), as_XMMRegister(i));
 271       }
 272     }
 273 
 274     const Address rdi_save(rbp, rdi_off * wordSize);
 275     const Address rsi_save(rbp, rsi_off * wordSize);
 276 
 277     __ movptr(rsi_save, rsi);
 278     __ movptr(rdi_save, rdi);
 279 #else
 280     const Address mxcsr_save(rbp, mxcsr_off * wordSize);
 281     {
 282       Label skip_ldmx;
 283       __ stmxcsr(mxcsr_save);
 284       __ movl(rax, mxcsr_save);
 285       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 286       ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
 287       __ cmp32(rax, mxcsr_std);
 288       __ jcc(Assembler::equal, skip_ldmx);
 289       __ ldmxcsr(mxcsr_std);
 290       __ bind(skip_ldmx);
 291     }
 292 #endif
 293 
 294     // Load up thread register
 295     __ movptr(r15_thread, thread);
 296     __ reinit_heapbase();
 297 
 298 #ifdef ASSERT
 299     // make sure we have no pending exceptions
 300     {
 301       Label L;
 302       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 303       __ jcc(Assembler::equal, L);
 304       __ stop("StubRoutines::call_stub: entered with pending exception");
 305       __ bind(L);
 306     }
 307 #endif
 308 
 309     // pass parameters if any
 310     BLOCK_COMMENT("pass parameters if any");
 311     Label parameters_done;
 312     __ movl(c_rarg3, parameter_size);
 313     __ testl(c_rarg3, c_rarg3);
 314     __ jcc(Assembler::zero, parameters_done);
 315 
 316     Label loop;
 317     __ movptr(c_rarg2, parameters);       // parameter pointer
 318     __ movl(c_rarg1, c_rarg3);            // parameter counter is in c_rarg1
 319     __ BIND(loop);
 320     __ movptr(rax, Address(c_rarg2, 0));// get parameter
 321     __ addptr(c_rarg2, wordSize);       // advance to next parameter
 322     __ decrementl(c_rarg1);             // decrement counter
 323     __ push(rax);                       // pass parameter
 324     __ jcc(Assembler::notZero, loop);
 325 
 326     // call Java function
 327     __ BIND(parameters_done);
 328     __ movptr(rbx, method);             // get Method*
 329     __ movptr(c_rarg1, entry_point);    // get entry_point
 330     __ mov(r13, rsp);                   // set sender sp
 331     BLOCK_COMMENT("call Java function");
 332     __ call(c_rarg1);
 333 
 334     BLOCK_COMMENT("call_stub_return_address:");
 335     return_address = __ pc();
 336 
 337     // store result depending on type (everything that is not
 338     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 339     __ movptr(c_rarg0, result);
 340     Label is_long, is_float, is_double, exit;
 341     __ movl(c_rarg1, result_type);
 342     __ cmpl(c_rarg1, T_OBJECT);
 343     __ jcc(Assembler::equal, is_long);
 344     __ cmpl(c_rarg1, T_LONG);
 345     __ jcc(Assembler::equal, is_long);
 346     __ cmpl(c_rarg1, T_FLOAT);
 347     __ jcc(Assembler::equal, is_float);
 348     __ cmpl(c_rarg1, T_DOUBLE);
 349     __ jcc(Assembler::equal, is_double);
 350 
 351     // handle T_INT case
 352     __ movl(Address(c_rarg0, 0), rax);
 353 
 354     __ BIND(exit);
 355 
 356     // pop parameters
 357     __ lea(rsp, rsp_after_call);
 358 
 359 #ifdef ASSERT
 360     // verify that threads correspond
 361     {
 362       Label L1, L2, L3;
 363       __ cmpptr(r15_thread, thread);
 364       __ jcc(Assembler::equal, L1);
 365       __ stop("StubRoutines::call_stub: r15_thread is corrupted");
 366       __ bind(L1);
 367       __ get_thread(rbx);
 368       __ cmpptr(r15_thread, thread);
 369       __ jcc(Assembler::equal, L2);
 370       __ stop("StubRoutines::call_stub: r15_thread is modified by call");
 371       __ bind(L2);
 372       __ cmpptr(r15_thread, rbx);
 373       __ jcc(Assembler::equal, L3);
 374       __ stop("StubRoutines::call_stub: threads must correspond");
 375       __ bind(L3);
 376     }
 377 #endif
 378 
 379     // restore regs belonging to calling function
 380 #ifdef _WIN64
 381     // emit the restores for xmm regs
 382     if (VM_Version::supports_evex()) {
 383       for (int i = xmm_save_first; i <= last_reg; i++) {
 384         __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);
 385       }
 386     } else {
 387       for (int i = xmm_save_first; i <= last_reg; i++) {
 388         __ movdqu(as_XMMRegister(i), xmm_save(i));
 389       }
 390     }
 391 #endif
 392     __ movptr(r15, r15_save);
 393     __ movptr(r14, r14_save);
 394     __ movptr(r13, r13_save);
 395     __ movptr(r12, r12_save);
 396     __ movptr(rbx, rbx_save);
 397 
 398 #ifdef _WIN64
 399     __ movptr(rdi, rdi_save);
 400     __ movptr(rsi, rsi_save);
 401 #else
 402     __ ldmxcsr(mxcsr_save);
 403 #endif
 404 
 405     // restore rsp
 406     __ addptr(rsp, -rsp_after_call_off * wordSize);
 407 
 408     // return
 409     __ vzeroupper();
 410     __ pop(rbp);
 411     __ ret(0);
 412 
 413     // handle return types different from T_INT
 414     __ BIND(is_long);
 415     __ movq(Address(c_rarg0, 0), rax);
 416     __ jmp(exit);
 417 
 418     __ BIND(is_float);
 419     __ movflt(Address(c_rarg0, 0), xmm0);
 420     __ jmp(exit);
 421 
 422     __ BIND(is_double);
 423     __ movdbl(Address(c_rarg0, 0), xmm0);
 424     __ jmp(exit);
 425 
 426     return start;
 427   }
 428 
 429   // Return point for a Java call if there's an exception thrown in
 430   // Java code.  The exception is caught and transformed into a
 431   // pending exception stored in JavaThread that can be tested from
 432   // within the VM.
 433   //
 434   // Note: Usually the parameters are removed by the callee. In case
 435   // of an exception crossing an activation frame boundary, that is
 436   // not the case if the callee is compiled code => need to set up the
 437   // rsp.
 438   //
 439   // rax: exception oop
 440 
 441   address generate_catch_exception() {
 442     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 443     address start = __ pc();
 444 
 445     // same as in generate_call_stub():
 446     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 447     const Address thread        (rbp, thread_off         * wordSize);
 448 
 449 #ifdef ASSERT
 450     // verify that threads correspond
 451     {
 452       Label L1, L2, L3;
 453       __ cmpptr(r15_thread, thread);
 454       __ jcc(Assembler::equal, L1);
 455       __ stop("StubRoutines::catch_exception: r15_thread is corrupted");
 456       __ bind(L1);
 457       __ get_thread(rbx);
 458       __ cmpptr(r15_thread, thread);
 459       __ jcc(Assembler::equal, L2);
 460       __ stop("StubRoutines::catch_exception: r15_thread is modified by call");
 461       __ bind(L2);
 462       __ cmpptr(r15_thread, rbx);
 463       __ jcc(Assembler::equal, L3);
 464       __ stop("StubRoutines::catch_exception: threads must correspond");
 465       __ bind(L3);
 466     }
 467 #endif
 468 
 469     // set pending exception
 470     __ verify_oop(rax);
 471 
 472     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax);
 473     __ lea(rscratch1, ExternalAddress((address)__FILE__));
 474     __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
 475     __ movl(Address(r15_thread, Thread::exception_line_offset()), (int)  __LINE__);
 476 
 477     // complete return to VM
 478     assert(StubRoutines::_call_stub_return_address != NULL,
 479            "_call_stub_return_address must have been generated before");
 480     __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
 481 
 482     return start;
 483   }
 484 
 485   // Continuation point for runtime calls returning with a pending
 486   // exception.  The pending exception check happened in the runtime
 487   // or native call stub.  The pending exception in Thread is
 488   // converted into a Java-level exception.
 489   //
 490   // Contract with Java-level exception handlers:
 491   // rax: exception
 492   // rdx: throwing pc
 493   //
 494   // NOTE: At entry of this stub, exception-pc must be on stack !!
 495 
 496   address generate_forward_exception() {
 497     StubCodeMark mark(this, "StubRoutines", "forward exception");
 498     address start = __ pc();
 499 
 500     // Upon entry, the sp points to the return address returning into
 501     // Java (interpreted or compiled) code; i.e., the return address
 502     // becomes the throwing pc.
 503     //
 504     // Arguments pushed before the runtime call are still on the stack
 505     // but the exception handler will reset the stack pointer ->
 506     // ignore them.  A potential result in registers can be ignored as
 507     // well.
 508 
 509 #ifdef ASSERT
 510     // make sure this code is only executed if there is a pending exception
 511     {
 512       Label L;
 513       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 514       __ jcc(Assembler::notEqual, L);
 515       __ stop("StubRoutines::forward exception: no pending exception (1)");
 516       __ bind(L);
 517     }
 518 #endif
 519 
 520     // compute exception handler into rbx
 521     __ movptr(c_rarg0, Address(rsp, 0));
 522     BLOCK_COMMENT("call exception_handler_for_return_address");
 523     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 524                          SharedRuntime::exception_handler_for_return_address),
 525                     r15_thread, c_rarg0);
 526     __ mov(rbx, rax);
 527 
 528     // setup rax & rdx, remove return address & clear pending exception
 529     __ pop(rdx);
 530     __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 531     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 532 
 533 #ifdef ASSERT
 534     // make sure exception is set
 535     {
 536       Label L;
 537       __ testptr(rax, rax);
 538       __ jcc(Assembler::notEqual, L);
 539       __ stop("StubRoutines::forward exception: no pending exception (2)");
 540       __ bind(L);
 541     }
 542 #endif
 543 
 544     // continue at exception handler (return address removed)
 545     // rax: exception
 546     // rbx: exception handler
 547     // rdx: throwing pc
 548     __ verify_oop(rax);
 549     __ jmp(rbx);
 550 
 551     return start;
 552   }
 553 
 554   // Support for jint atomic::xchg(jint exchange_value, volatile jint* dest)
 555   //
 556   // Arguments :
 557   //    c_rarg0: exchange_value
 558   //    c_rarg1: dest
 559   //
 560   // Result:
 561   //    *dest <- ex, return (orig *dest)
 562   address generate_atomic_xchg() {
 563     StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
 564     address start = __ pc();
 565 
 566     __ movl(rax, c_rarg0); // Copy to eax; we need a return value anyhow
 567     __ xchgl(rax, Address(c_rarg1, 0)); // automatic LOCK
 568     __ ret(0);
 569 
 570     return start;
 571   }
 572 
 573   // Support for intptr_t atomic::xchg_long(jlong exchange_value, volatile jlong* dest)
 574   //
 575   // Arguments :
 576   //    c_rarg0: exchange_value
 577   //    c_rarg1: dest
 578   //
 579   // Result:
 580   //    *dest <- ex, return (orig *dest)
 581   address generate_atomic_xchg_long() {
 582     StubCodeMark mark(this, "StubRoutines", "atomic_xchg_long");
 583     address start = __ pc();
 584 
 585     __ movptr(rax, c_rarg0); // Copy to rax; we need a return value anyhow
 586     __ xchgptr(rax, Address(c_rarg1, 0)); // automatic LOCK
 587     __ ret(0);
 588 
 589     return start;
 590   }
 591 
 592   // Support for jint atomic::atomic_cmpxchg(jint exchange_value, volatile jint* dest,
 593   //                                         jint compare_value)
 594   //
 595   // Arguments :
 596   //    c_rarg0: exchange_value
 597   //    c_rarg1: dest
 598   //    c_rarg2: compare_value
 599   //
 600   // Result:
 601   //    if ( compare_value == *dest ) {
 602   //       *dest = exchange_value
 603   //       return compare_value;
 604   //    } else
 605   //       return *dest;
 606   address generate_atomic_cmpxchg() {
 607     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
 608     address start = __ pc();
 609 
 610     __ movl(rax, c_rarg2);
 611     if (os::is_MP()) __ lock();
 612     __ cmpxchgl(c_rarg0, Address(c_rarg1, 0));
 613     __ ret(0);
 614 
 615     return start;
 616   }
 617 
 618   // Support for int8_t atomic::atomic_cmpxchg(int8_t exchange_value, volatile int8_t* dest,
 619   //                                           int8_t compare_value)
 620   //
 621   // Arguments :
 622   //    c_rarg0: exchange_value
 623   //    c_rarg1: dest
 624   //    c_rarg2: compare_value
 625   //
 626   // Result:
 627   //    if ( compare_value == *dest ) {
 628   //       *dest = exchange_value
 629   //       return compare_value;
 630   //    } else
 631   //       return *dest;
 632   address generate_atomic_cmpxchg_byte() {
 633     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_byte");
 634     address start = __ pc();
 635 
 636     __ movsbq(rax, c_rarg2);
 637     if (os::is_MP()) __ lock();
 638     __ cmpxchgb(c_rarg0, Address(c_rarg1, 0));
 639     __ ret(0);
 640 
 641     return start;
 642   }
 643 
 644   // Support for int64_t atomic::atomic_cmpxchg(int64_t exchange_value,
 645   //                                            volatile int64_t* dest,
 646   //                                            int64_t compare_value)
 647   // Arguments :
 648   //    c_rarg0: exchange_value
 649   //    c_rarg1: dest
 650   //    c_rarg2: compare_value
 651   //
 652   // Result:
 653   //    if ( compare_value == *dest ) {
 654   //       *dest = exchange_value
 655   //       return compare_value;
 656   //    } else
 657   //       return *dest;
 658   address generate_atomic_cmpxchg_long() {
 659     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
 660     address start = __ pc();
 661 
 662     __ movq(rax, c_rarg2);
 663     if (os::is_MP()) __ lock();
 664     __ cmpxchgq(c_rarg0, Address(c_rarg1, 0));
 665     __ ret(0);
 666 
 667     return start;
 668   }
 669 
 670   // Support for jint atomic::add(jint add_value, volatile jint* dest)
 671   //
 672   // Arguments :
 673   //    c_rarg0: add_value
 674   //    c_rarg1: dest
 675   //
 676   // Result:
 677   //    *dest += add_value
 678   //    return *dest;
 679   address generate_atomic_add() {
 680     StubCodeMark mark(this, "StubRoutines", "atomic_add");
 681     address start = __ pc();
 682 
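         // xadd leaves the previous value of *dest in c_rarg0; adding the
         // saved add_value (rax) reproduces the new value of *dest, which is
         // what this stub must return.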
 683     __ movl(rax, c_rarg0);
 684     if (os::is_MP()) __ lock();
 685     __ xaddl(Address(c_rarg1, 0), c_rarg0);
 686     __ addl(rax, c_rarg0);
 687     __ ret(0);
 688 
 689     return start;
 690   }
 691 
 692   // Support for intptr_t atomic::add_ptr(intptr_t add_value, volatile intptr_t* dest)
 693   //
 694   // Arguments :
 695   //    c_rarg0: add_value
 696   //    c_rarg1: dest
 697   //
 698   // Result:
 699   //    *dest += add_value
 700   //    return *dest;
 701   address generate_atomic_add_long() {
 702     StubCodeMark mark(this, "StubRoutines", "atomic_add_long");
 703     address start = __ pc();
 704 
 705     __ movptr(rax, c_rarg0); // Copy to rax; we need a return value anyhow
 706     if (os::is_MP()) __ lock();
 707     __ xaddptr(Address(c_rarg1, 0), c_rarg0);
 708     __ addptr(rax, c_rarg0);
 709     __ ret(0);
 710 
 711     return start;
 712   }
 713 
 714   // Support for intptr_t OrderAccess::fence()
 715   //
 716   // Arguments :
 717   //
 718   // Result:
 719   address generate_orderaccess_fence() {
 720     StubCodeMark mark(this, "StubRoutines", "orderaccess_fence");
 721     address start = __ pc();
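         // On x86 only StoreLoad reordering is observable, so a single
         // StoreLoad barrier is sufficient for a full fence here.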
 722     __ membar(Assembler::StoreLoad);
 723     __ ret(0);
 724 
 725     return start;
 726   }
 727 
 728   // Support for intptr_t get_previous_fp()
 729   //
 730   // This routine is used to find the previous frame pointer for the
 731   // caller (current_frame_guess). This is used as part of debugging
 732   // when ps() is seemingly lost trying to find frames.
 733   // This code assumes that the caller (current_frame_guess) has a frame.
 734   address generate_get_previous_fp() {
 735     StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
 736     const Address old_fp(rbp, 0);
 737     const Address older_fp(rax, 0);
 738     address start = __ pc();
 739 
 740     __ enter();
 741     __ movptr(rax, old_fp); // callers fp
 742     __ movptr(rax, older_fp); // the frame for ps()
 743     __ pop(rbp);
 744     __ ret(0);
 745 
 746     return start;
 747   }
 748 
 749   // Support for intptr_t get_previous_sp()
 750   //
 751   // This routine is used to find the previous stack pointer for the
 752   // caller.
 753   address generate_get_previous_sp() {
 754     StubCodeMark mark(this, "StubRoutines", "get_previous_sp");
 755     address start = __ pc();
 756 
 757     __ movptr(rax, rsp);
 758     __ addptr(rax, 8); // return address is at the top of the stack.
 759     __ ret(0);
 760 
 761     return start;
 762   }
 763 
 764   //----------------------------------------------------------------------------------------------------
 765   // Support for void verify_mxcsr()
 766   //
 767   // This routine is used with -Xcheck:jni to verify that native
 768   // JNI code does not return to Java code without restoring the
 769   // MXCSR register to our expected state.
 770 
 771   address generate_verify_mxcsr() {
 772     StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
 773     address start = __ pc();
 774 
 775     const Address mxcsr_save(rsp, 0);
 776 
 777     if (CheckJNICalls) {
 778       Label ok_ret;
 779       ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
 780       __ push(rax);
 781       __ subptr(rsp, wordSize);      // allocate a temp location
 782       __ stmxcsr(mxcsr_save);
 783       __ movl(rax, mxcsr_save);
 784       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 785       __ cmp32(rax, mxcsr_std);
 786       __ jcc(Assembler::equal, ok_ret);
 787 
 788       __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall");
 789 
 790       __ ldmxcsr(mxcsr_std);
 791 
 792       __ bind(ok_ret);
 793       __ addptr(rsp, wordSize);
 794       __ pop(rax);
 795     }
 796 
 797     __ ret(0);
 798 
 799     return start;
 800   }
 801 
 802   address generate_shenandoah_wb(bool c_abi, bool do_cset_test) {
 803     StubCodeMark mark(this, "StubRoutines", "shenandoah_wb");
 804     address start = __ pc();
 805 
 806     Label not_done, done, slow_case, not_an_instance, is_array;
 807 
 808     // We use RDI, which also serves as argument register for slow call.
 809     // RAX always holds the src object ptr, except after the slow call and
 810     // the cmpxchg, then it holds the result.
 811     // R8 and RCX are used as temporary registers.
 812     if (!c_abi) {
 813       __ push(rdi);
 814       __ push(r8);
 815     }
 816 
 817     // Check for object being in the collection set.
 818     // TODO: Can we use only 1 register here?
 819     // The source object arrives here in rax.
 820     // live: rax
 821     // live: rdi
 822     if (!c_abi) {
 823       __ mov(rdi, rax);
 824     } else {
 825       if (rax != c_rarg0) {
 826         __ mov(rax, c_rarg0);
 827       }
 828     }
 829     if (do_cset_test) {
 830       __ shrptr(rdi, ShenandoahHeapRegion::region_size_bytes_shift_jint());
 831       // live: r8
 832       __ movptr(r8, (intptr_t) ShenandoahHeap::in_cset_fast_test_addr());
 833       __ movbool(r8, Address(r8, rdi, Address::times_1));
 834       // unlive: rdi
 835       __ testbool(r8);
 836       // unlive: r8
 837       __ jccb(Assembler::notZero, not_done);
 838 
 839       if (!c_abi) {
 840         __ pop(r8);
 841         __ pop(rdi);
 842       }
 843       __ ret(0);
 844 
 845       __ bind(not_done);
 846     }
 847 
 848     if (!c_abi) {
 849       __ push(rcx);
 850     }
 851 
 852     if (UseTLAB && ShenandoahAsmWB) {
 853 
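           // Fast path: allocate the forwarded copy from the thread's GCLAB,
           // copy the object, then CAS the Brooks forwarding pointer of the
           // source object. If the GCLAB cannot satisfy the allocation we fall
           // back to the runtime call (slow_case); if the CAS loses the race
           // we return the forwardee installed by the winner.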
 854       Register new_obj = r8;
 855       __ movptr(new_obj, Address(r15_thread, JavaThread::gclab_top_offset()));
 856       __ testptr(new_obj, new_obj);
 857       __ jcc(Assembler::zero, slow_case); // No TLAB.
 858 
 859       __ load_klass(rcx, rax);
 860 
 861       // Figure out object size.
 862       __ movl(rcx, Address(rcx, Klass::layout_helper_offset()));
 863       __ testl(rcx, Klass::_lh_instance_slow_path_bit);
 864       // test to see if it has a finalizer or is malformed in some way
 865       __ jcc(Assembler::notZero, slow_case);
 866       __ cmpl(rcx, Klass::_lh_neutral_value); // Make sure it's an instance (LH > 0)
 867       __ jcc(Assembler::lessEqual, not_an_instance); // Thrashes rcx, returns size in rcx. Uses rax.
 868       __ bind(is_array);
 869 
 870       // Size in rcx, new_obj in r8, src obj in rax
 871 
 872       Register new_obj_end = rdi;
 873       int oop_extra_words = Universe::heap()->oop_extra_words();
 874       __ addq(rcx, oop_extra_words * HeapWordSize);
 875       __ lea(new_obj_end, Address(new_obj, rcx, Address::times_1));
 876       __ cmpptr(new_obj_end, Address(r15_thread, JavaThread::gclab_end_offset()));
 877       __ jcc(Assembler::above, slow_case);
 878       __ subq(rcx, oop_extra_words * HeapWordSize);
 879 
 880       // Store Brooks pointer and adjust start of newobj.
 881       Universe::heap()->compile_prepare_oop(_masm, new_obj);
 882 
 883       // Size in rcx, new_obj in r8, src obj in rax
 884 
 885       // Copy object.
 886       Label loop;
 887       if (!c_abi) {
 888         __ push(rdi); // Save new_obj_end
 889         __ push(rsi);
 890       } else {
 891         __ mov(r9, rdi); // Save new_obj_end
 892       }
 893       __ shrl(rcx, 3);   // Make it num-64-bit-words
 894       __ mov(rdi, r8); // Mov dst into rdi
 895       __ mov(rsi, rax); // Src into rsi.
 896       __ rep_mov();
 897       if (!c_abi) {
 898         __ pop(rsi); // Restore rsi.
 899         __ pop(rdi); // Restore new_obj_end
 900       } else {
 901         __ mov(rdi, r9); // Restore new_obj_end
 902       }
 903 
 904       // Src obj still in rax.
 905       if (os::is_MP()) {
 906         __ lock();
 907       }
 908       __ cmpxchgptr(new_obj, Address(rax, BrooksPointer::byte_offset(), Address::times_1));
 909       __ jccb(Assembler::notEqual, done); // Failed. Updated object in rax.
 910       // Otherwise, we succeeded.
 911       __ mov(rax, new_obj);
 912       __ movptr(Address(r15_thread, JavaThread::gclab_top_offset()), new_obj_end);
 913       __ bind(done);
 914 
 915       if (!c_abi) {
 916         __ pop(rcx);
 917         __ pop(r8);
 918         __ pop(rdi);
 919       }
 920 
 921       __ ret(0);
 922 
 923       __ bind(not_an_instance);
 924       if (!c_abi) {
 925         __ push(rdx);
 926       }
 927       // Layout_helper bits are in rcx
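           // Array size in bytes = header_size + (length << log2_element_size),
           // all decoded from the layout helper, then rounded up to a
           // HeapWord multiple.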
 928       __ movl(rdx, rcx); // Move layout_helper bits to rdx
 929       __ movl(rdi, Address(rax, arrayOopDesc::length_offset_in_bytes()));
 930       __ shrl(rcx, Klass::_lh_log2_element_size_shift);
 931       __ andl(rcx, Klass::_lh_log2_element_size_mask);
 932       __ shll(rdi); // Shifts left by number of bits in rcx (CL)
 933       __ shrl(rdx, Klass::_lh_header_size_shift);
 934       __ andl(rdx, Klass::_lh_header_size_mask);
 935       __ addl(rdi, rdx);
 936       // Round up.
 937       __ addl(rdi, HeapWordSize-1);
 938       __ andl(rdi, -HeapWordSize);
 939       if (!c_abi) {
 940         __ pop(rdx);
 941       }
 942       // Move size (rdi) into rcx
 943       __ movl(rcx, rdi);
 944       __ jmp(is_array);
 945 
 946       __ bind(slow_case);
 947     }
 948 
 949     if (!c_abi) {
 950       __ push(rdx);
 951       __ push(rdi);
 952       __ push(rsi);
 953       __ push(r8);
 954       __ push(r9);
 955       __ push(r10);
 956       __ push(r11);
 957       __ push(r12);
 958       __ push(r13);
 959       __ push(r14);
 960       __ push(r15);
 961     }
 962     __ save_vector_registers();
 963     __ movptr(rdi, rax);
 964     __ call_VM_leaf(CAST_FROM_FN_PTR(address, ShenandoahBarrierSet::write_barrier_JRT), rdi);
 965     __ restore_vector_registers();
 966     if (!c_abi) {
 967       __ pop(r15);
 968       __ pop(r14);
 969       __ pop(r13);
 970       __ pop(r12);
 971       __ pop(r11);
 972       __ pop(r10);
 973       __ pop(r9);
 974       __ pop(r8);
 975       __ pop(rsi);
 976       __ pop(rdi);
 977       __ pop(rdx);
 978 
 979       __ pop(rcx);
 980       __ pop(r8);
 981       __ pop(rdi);
 982     }
 983     __ ret(0);
 984 
 985     return start;
 986   }
 987 
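       // The *_fixup stubs below are called by compiled code when a
       // cvttss2si/cvttsd2si conversion produced the "integer indefinite"
       // value: they replace it with 0 for NaN inputs, or with the min/max
       // value of the target type for out-of-range inputs, as required by
       // Java semantics.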
 988   address generate_f2i_fixup() {
 989     StubCodeMark mark(this, "StubRoutines", "f2i_fixup");
 990     Address inout(rsp, 5 * wordSize); // return address + 4 saves
 991 
 992     address start = __ pc();
 993 
 994     Label L;
 995 
 996     __ push(rax);
 997     __ push(c_rarg3);
 998     __ push(c_rarg2);
 999     __ push(c_rarg1);
1000 
1001     __ movl(rax, 0x7f800000);
1002     __ xorl(c_rarg3, c_rarg3);
1003     __ movl(c_rarg2, inout);
1004     __ movl(c_rarg1, c_rarg2);
1005     __ andl(c_rarg1, 0x7fffffff);
1006     __ cmpl(rax, c_rarg1); // NaN? -> 0
1007     __ jcc(Assembler::negative, L);
1008     __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint
1009     __ movl(c_rarg3, 0x80000000);
1010     __ movl(rax, 0x7fffffff);
1011     __ cmovl(Assembler::positive, c_rarg3, rax);
1012 
1013     __ bind(L);
1014     __ movptr(inout, c_rarg3);
1015 
1016     __ pop(c_rarg1);
1017     __ pop(c_rarg2);
1018     __ pop(c_rarg3);
1019     __ pop(rax);
1020 
1021     __ ret(0);
1022 
1023     return start;
1024   }
1025 
1026   address generate_f2l_fixup() {
1027     StubCodeMark mark(this, "StubRoutines", "f2l_fixup");
1028     Address inout(rsp, 5 * wordSize); // return address + 4 saves
1029     address start = __ pc();
1030 
1031     Label L;
1032 
1033     __ push(rax);
1034     __ push(c_rarg3);
1035     __ push(c_rarg2);
1036     __ push(c_rarg1);
1037 
1038     __ movl(rax, 0x7f800000);
1039     __ xorl(c_rarg3, c_rarg3);
1040     __ movl(c_rarg2, inout);
1041     __ movl(c_rarg1, c_rarg2);
1042     __ andl(c_rarg1, 0x7fffffff);
1043     __ cmpl(rax, c_rarg1); // NaN? -> 0
1044     __ jcc(Assembler::negative, L);
1045     __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong
1046     __ mov64(c_rarg3, 0x8000000000000000);
1047     __ mov64(rax, 0x7fffffffffffffff);
1048     __ cmov(Assembler::positive, c_rarg3, rax);
1049 
1050     __ bind(L);
1051     __ movptr(inout, c_rarg3);
1052 
1053     __ pop(c_rarg1);
1054     __ pop(c_rarg2);
1055     __ pop(c_rarg3);
1056     __ pop(rax);
1057 
1058     __ ret(0);
1059 
1060     return start;
1061   }
1062 
1063   address generate_d2i_fixup() {
1064     StubCodeMark mark(this, "StubRoutines", "d2i_fixup");
1065     Address inout(rsp, 6 * wordSize); // return address + 5 saves
1066 
1067     address start = __ pc();
1068 
1069     Label L;
1070 
1071     __ push(rax);
1072     __ push(c_rarg3);
1073     __ push(c_rarg2);
1074     __ push(c_rarg1);
1075     __ push(c_rarg0);
1076 
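         // NaN detection: fold "low word is non-zero" into bit 0 of the
         // sign-cleared high word and compare against the exponent mask
         // 0x7ff00000. Strictly greater means NaN (result 0); otherwise the
         // input is out of range (including infinities) and its sign selects
         // the min or max result. d2l_fixup below uses the same scheme.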
1077     __ movl(rax, 0x7ff00000);
1078     __ movq(c_rarg2, inout);
1079     __ movl(c_rarg3, c_rarg2);
1080     __ mov(c_rarg1, c_rarg2);
1081     __ mov(c_rarg0, c_rarg2);
1082     __ negl(c_rarg3);
1083     __ shrptr(c_rarg1, 0x20);
1084     __ orl(c_rarg3, c_rarg2);
1085     __ andl(c_rarg1, 0x7fffffff);
1086     __ xorl(c_rarg2, c_rarg2);
1087     __ shrl(c_rarg3, 0x1f);
1088     __ orl(c_rarg1, c_rarg3);
1089     __ cmpl(rax, c_rarg1);
1090     __ jcc(Assembler::negative, L); // NaN -> 0
1091     __ testptr(c_rarg0, c_rarg0); // signed ? min_jint : max_jint
1092     __ movl(c_rarg2, 0x80000000);
1093     __ movl(rax, 0x7fffffff);
1094     __ cmov(Assembler::positive, c_rarg2, rax);
1095 
1096     __ bind(L);
1097     __ movptr(inout, c_rarg2);
1098 
1099     __ pop(c_rarg0);
1100     __ pop(c_rarg1);
1101     __ pop(c_rarg2);
1102     __ pop(c_rarg3);
1103     __ pop(rax);
1104 
1105     __ ret(0);
1106 
1107     return start;
1108   }
1109 
1110   address generate_d2l_fixup() {
1111     StubCodeMark mark(this, "StubRoutines", "d2l_fixup");
1112     Address inout(rsp, 6 * wordSize); // return address + 5 saves
1113 
1114     address start = __ pc();
1115 
1116     Label L;
1117 
1118     __ push(rax);
1119     __ push(c_rarg3);
1120     __ push(c_rarg2);
1121     __ push(c_rarg1);
1122     __ push(c_rarg0);
1123 
1124     __ movl(rax, 0x7ff00000);
1125     __ movq(c_rarg2, inout);
1126     __ movl(c_rarg3, c_rarg2);
1127     __ mov(c_rarg1, c_rarg2);
1128     __ mov(c_rarg0, c_rarg2);
1129     __ negl(c_rarg3);
1130     __ shrptr(c_rarg1, 0x20);
1131     __ orl(c_rarg3, c_rarg2);
1132     __ andl(c_rarg1, 0x7fffffff);
1133     __ xorl(c_rarg2, c_rarg2);
1134     __ shrl(c_rarg3, 0x1f);
1135     __ orl(c_rarg1, c_rarg3);
1136     __ cmpl(rax, c_rarg1);
1137     __ jcc(Assembler::negative, L); // NaN -> 0
1138     __ testq(c_rarg0, c_rarg0); // signed ? min_jlong : max_jlong
1139     __ mov64(c_rarg2, 0x8000000000000000);
1140     __ mov64(rax, 0x7fffffffffffffff);
1141     __ cmovq(Assembler::positive, c_rarg2, rax);
1142 
1143     __ bind(L);
1144     __ movq(inout, c_rarg2);
1145 
1146     __ pop(c_rarg0);
1147     __ pop(c_rarg1);
1148     __ pop(c_rarg2);
1149     __ pop(c_rarg3);
1150     __ pop(rax);
1151 
1152     __ ret(0);
1153 
1154     return start;
1155   }
1156 
1157   address generate_fp_mask(const char *stub_name, int64_t mask) {
1158     __ align(CodeEntryAlignment);
1159     StubCodeMark mark(this, "StubRoutines", stub_name);
1160     address start = __ pc();
1161 
1162     __ emit_data64( mask, relocInfo::none );
1163     __ emit_data64( mask, relocInfo::none );
1164 
1165     return start;
1166   }
1167 
1168   // Non-destructive plausibility checks for oops
1169   //
1170   // Arguments:
1171   //    all args on stack!
1172   //
1173   // Stack after saving c_rarg3:
1174   //    [tos + 0]: saved c_rarg3
1175   //    [tos + 1]: saved c_rarg2
1176   //    [tos + 2]: saved r12 (several TemplateTable methods use it)
1177   //    [tos + 3]: saved flags
1178   //    [tos + 4]: return address
1179   //  * [tos + 5]: error message (char*)
1180   //  * [tos + 6]: object to verify (oop)
1181   //  * [tos + 7]: saved rax - saved by caller and bashed
1182   //  * [tos + 8]: saved r10 (rscratch1) - saved by caller
1183   //  * = popped on exit
1184   address generate_verify_oop() {
1185     StubCodeMark mark(this, "StubRoutines", "verify_oop");
1186     address start = __ pc();
1187 
1188     Label exit, error;
1189 
1190     __ pushf();
1191     __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
1192 
1193     __ push(r12);
1194 
1195     // save c_rarg2 and c_rarg3
1196     __ push(c_rarg2);
1197     __ push(c_rarg3);
1198 
1199     enum {
1200            // After previous pushes.
1201            oop_to_verify = 6 * wordSize,
1202            saved_rax     = 7 * wordSize,
1203            saved_r10     = 8 * wordSize,
1204 
1205            // Before the call to MacroAssembler::debug(), see below.
1206            return_addr   = 16 * wordSize,
1207            error_msg     = 17 * wordSize
1208     };
1209 
1210     // get object
1211     __ movptr(rax, Address(rsp, oop_to_verify));
1212 
1213     // make sure object is 'reasonable'
1214     __ testptr(rax, rax);
1215     __ jcc(Assembler::zero, exit); // if obj is NULL it is OK
1216     // Check if the oop is in the right area of memory
1217     __ movptr(c_rarg2, rax);
1218     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
1219     __ andptr(c_rarg2, c_rarg3);
1220     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
1221     __ cmpptr(c_rarg2, c_rarg3);
1222     __ jcc(Assembler::notZero, error);
1223 
1224     // set r12 to heapbase for load_klass()
1225     __ reinit_heapbase();
1226 
1227     // make sure klass is 'reasonable', which is not zero.
1228     __ load_klass(rax, rax);  // get klass
1229     __ testptr(rax, rax);
1230     __ jcc(Assembler::zero, error); // if klass is NULL it is broken
1231 
1232     // return if everything seems ok
1233     __ bind(exit);
1234     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1235     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1236     __ pop(c_rarg3);                             // restore c_rarg3
1237     __ pop(c_rarg2);                             // restore c_rarg2
1238     __ pop(r12);                                 // restore r12
1239     __ popf();                                   // restore flags
1240     __ ret(4 * wordSize);                        // pop caller saved stuff
1241 
1242     // handle errors
1243     __ bind(error);
1244     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1245     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1246     __ pop(c_rarg3);                             // get saved c_rarg3 back
1247     __ pop(c_rarg2);                             // get saved c_rarg2 back
1248     __ pop(r12);                                 // get saved r12 back
1249     __ popf();                                   // get saved flags off stack --
1250                                                  // will be ignored
1251 
1252     __ pusha();                                  // push registers
1253                                                  // (rip is already
1254                                                  // pushed)
1255     // debug(char* msg, int64_t pc, int64_t regs[])
1256     // We've popped the registers we'd saved (c_rarg3, c_rarg2, r12 and flags), and
1257     // pushed all the registers, so now the stack looks like:
1258     //     [tos +  0] 16 saved registers
1259     //     [tos + 16] return address
1260     //   * [tos + 17] error message (char*)
1261     //   * [tos + 18] object to verify (oop)
1262     //   * [tos + 19] saved rax - saved by caller and bashed
1263     //   * [tos + 20] saved r10 (rscratch1) - saved by caller
1264     //   * = popped on exit
1265 
1266     __ movptr(c_rarg0, Address(rsp, error_msg));    // pass address of error message
1267     __ movptr(c_rarg1, Address(rsp, return_addr));  // pass return address
1268     __ movq(c_rarg2, rsp);                          // pass address of regs on stack
1269     __ mov(r12, rsp);                               // remember rsp
1270     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
1271     __ andptr(rsp, -16);                            // align stack as required by ABI
1272     BLOCK_COMMENT("call MacroAssembler::debug");
1273     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
1274     __ mov(rsp, r12);                               // restore rsp
1275     __ popa();                                      // pop registers (includes r12)
1276     __ ret(4 * wordSize);                           // pop caller saved stuff
1277 
1278     return start;
1279   }
1280 
1281   //
1282   // Verify that a register contains a clean 32-bit positive value
1283   // (high 32 bits are 0) so it can be used in 64-bit shifts.
1284   //
1285   //  Input:
1286   //    Rint  -  32-bits value
1287   //    Rtmp  -  scratch
1288   //
1289   void assert_clean_int(Register Rint, Register Rtmp) {
1290 #ifdef ASSERT
1291     Label L;
1292     assert_different_registers(Rtmp, Rint);
1293     __ movslq(Rtmp, Rint);
1294     __ cmpq(Rtmp, Rint);
1295     __ jcc(Assembler::equal, L);
1296     __ stop("high 32-bits of int value are not 0");
1297     __ bind(L);
1298 #endif
1299   }
1300 
1301   //  Generate overlap test for array copy stubs
1302   //
1303   //  Input:
1304   //     c_rarg0 - from
1305   //     c_rarg1 - to
1306   //     c_rarg2 - element count
1307   //
1308   //  Output:
1309   //     rax   - &from[element count - 1]
1310   //
1311   void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
1312     assert(no_overlap_target != NULL, "must be generated");
1313     array_overlap_test(no_overlap_target, NULL, sf);
1314   }
1315   void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
1316     array_overlap_test(NULL, &L_no_overlap, sf);
1317   }
1318   void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
1319     const Register from     = c_rarg0;
1320     const Register to       = c_rarg1;
1321     const Register count    = c_rarg2;
1322     const Register end_from = rax;
1323 
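         // Jump to the no-overlap target when 'to' is at or below 'from', or
         // at or beyond the end of the source; otherwise fall through so the
         // caller uses the conjoint (backward) copy.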
1324     __ cmpptr(to, from);
1325     __ lea(end_from, Address(from, count, sf, 0));
1326     if (NOLp == NULL) {
1327       ExternalAddress no_overlap(no_overlap_target);
1328       __ jump_cc(Assembler::belowEqual, no_overlap);
1329       __ cmpptr(to, end_from);
1330       __ jump_cc(Assembler::aboveEqual, no_overlap);
1331     } else {
1332       __ jcc(Assembler::belowEqual, (*NOLp));
1333       __ cmpptr(to, end_from);
1334       __ jcc(Assembler::aboveEqual, (*NOLp));
1335     }
1336   }
1337 
1338   // Shuffle first three arg regs on Windows into Linux/Solaris locations.
1339   //
1340   // Outputs:
1341   //    rdi - rcx
1342   //    rsi - rdx
1343   //    rdx - r8
1344   //    rcx - r9
1345   //
1346   // Registers r9 and r10 are used on Windows to save rdi and rsi, which
1347   // are non-volatile there.  r9 and r10 should not be used by the caller.
1348   //
1349   void setup_arg_regs(int nargs = 3) {
1350     const Register saved_rdi = r9;
1351     const Register saved_rsi = r10;
1352     assert(nargs == 3 || nargs == 4, "else fix");
1353 #ifdef _WIN64
1354     assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1355            "unexpected argument registers");
1356     if (nargs >= 4)
1357       __ mov(rax, r9);  // r9 is also saved_rdi
1358     __ movptr(saved_rdi, rdi);
1359     __ movptr(saved_rsi, rsi);
1360     __ mov(rdi, rcx); // c_rarg0
1361     __ mov(rsi, rdx); // c_rarg1
1362     __ mov(rdx, r8);  // c_rarg2
1363     if (nargs >= 4)
1364       __ mov(rcx, rax); // c_rarg3 (via rax)
1365 #else
1366     assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1367            "unexpected argument registers");
1368 #endif
1369   }
1370 
1371   void restore_arg_regs() {
1372     const Register saved_rdi = r9;
1373     const Register saved_rsi = r10;
1374 #ifdef _WIN64
1375     __ movptr(rdi, saved_rdi);
1376     __ movptr(rsi, saved_rsi);
1377 #endif
1378   }
1379 
1380   // Generate code for an array write pre barrier
1381   //
1382   //     addr    -  starting address
1383   //     count   -  element count
1384   //     tmp     - scratch register
1385   //
1386   //     Destroy no registers!
1387   //
1388   void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
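         // SATB collectors (G1, Shenandoah) must record the oops that are
         // about to be overwritten, but only while concurrent marking is
         // active; the card-table barrier sets need no pre-barrier here.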
1389     BarrierSet* bs = Universe::heap()->barrier_set();
1390     switch (bs->kind()) {
1391       case BarrierSet::G1SATBCTLogging:
1392       case BarrierSet::Shenandoah:
1393         // With G1 and Shenandoah, don't generate the call if we statically know that the target is uninitialized
1394         if (!dest_uninitialized) {
1395           Label filtered;
1396           Address in_progress(r15_thread, in_bytes(JavaThread::satb_mark_queue_offset() +
1397                                                    SATBMarkQueue::byte_offset_of_active()));
1398           // Is marking active?
1399           if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
1400             __ cmpl(in_progress, 0);
1401           } else {
1402             assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
1403             __ cmpb(in_progress, 0);
1404           }
1405           __ jcc(Assembler::equal, filtered);
1406 
1407            __ pusha();                      // push registers
1408            if (count == c_rarg0) {
1409              if (addr == c_rarg1) {
1410                // exactly backwards!!
1411                __ xchgptr(c_rarg1, c_rarg0);
1412              } else {
1413                __ movptr(c_rarg1, count);
1414                __ movptr(c_rarg0, addr);
1415              }
1416            } else {
1417              __ movptr(c_rarg0, addr);
1418              __ movptr(c_rarg1, count);
1419            }
1420            __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
1421            __ popa();
1422 
1423            __ bind(filtered);
1424         }
1425          break;
1426       case BarrierSet::CardTableForRS:
1427       case BarrierSet::CardTableExtension:
1428       case BarrierSet::ModRef:
1429         break;
1430       default:
1431         ShouldNotReachHere();
1432 
1433     }
1434   }
1435 
1436   //
1437   // Generate code for an array write post barrier
1438   //
1439   //  Input:
1440   //     start    - register containing starting address of destination array
1441   //     count    - elements count
1442   //     scratch  - scratch register
1443   //
1444   //  The input registers are overwritten.
1445   //
1446   void  gen_write_ref_array_post_barrier(Register start, Register count, Register scratch) {
1447     assert_different_registers(start, count, scratch);
1448     BarrierSet* bs = Universe::heap()->barrier_set();
1449     switch (bs->kind()) {
1450       case BarrierSet::G1SATBCTLogging:
1451       case BarrierSet::Shenandoah:
1452         {
1453           __ pusha();             // push registers (overkill)
1454           if (c_rarg0 == count) { // On win64 c_rarg0 == rcx
1455             assert_different_registers(c_rarg1, start);
1456             __ mov(c_rarg1, count);
1457             __ mov(c_rarg0, start);
1458           } else {
1459             assert_different_registers(c_rarg0, count);
1460             __ mov(c_rarg0, start);
1461             __ mov(c_rarg1, count);
1462           }
1463           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
1464           __ popa();
1465         }
1466         break;
1467       case BarrierSet::CardTableForRS:
1468       case BarrierSet::CardTableExtension:
1469         {
1470           CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
1471           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
1472 
1473           Label L_loop, L_done;
1474           const Register end = count;
1475 
1476           __ testl(count, count);
1477           __ jcc(Assembler::zero, L_done); // zero count - nothing to do
1478 
1479           __ leaq(end, Address(start, count, TIMES_OOP, 0));  // end == start+count*oop_size
1480           __ subptr(end, BytesPerHeapOop); // end - 1 to make inclusive
1481           __ shrptr(start, CardTableModRefBS::card_shift);
1482           __ shrptr(end,   CardTableModRefBS::card_shift);
1483           __ subptr(end, start); // end --> cards count
1484 
1485           int64_t disp = (int64_t) ct->byte_map_base;
1486           __ mov64(scratch, disp);
1487           __ addptr(start, scratch);
1488         __ BIND(L_loop);
1489           __ movb(Address(start, count, Address::times_1), 0);
1490           __ decrement(count);
1491           __ jcc(Assembler::greaterEqual, L_loop);
1492         __ BIND(L_done);
1493         }
1494         break;
1495       default:
1496         ShouldNotReachHere();
1497 
1498     }
1499   }
1500 
1501 
1502   // Copy big chunks forward
1503   //
1504   // Inputs:
1505   //   end_from     - source array's end address
1506   //   end_to       - destination array's end address
1507   //   qword_count  - 64-bit element count, negative
1508   //   to           - scratch
1509   //   L_copy_bytes - entry label
1510   //   L_copy_8_bytes  - exit  label
1511   //
1512   void copy_bytes_forward(Register end_from, Register end_to,
1513                              Register qword_count, Register to,
1514                              Label& L_copy_bytes, Label& L_copy_8_bytes) {
1515     DEBUG_ONLY(__ stop("enter at entry label, not here"));
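         // qword_count arrives negative; the loop adds it back toward zero and
         // addresses both arrays with negative offsets from their end
         // addresses, so the copy proceeds from front to back.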
1516     Label L_loop;
1517     __ align(OptoLoopAlignment);
1518     if (UseUnalignedLoadStores) {
1519       Label L_end;
1520       if (UseAVX > 2) {
1521         __ movl(to, 0xffff);
1522         __ kmovwl(k1, to);
1523       }
1524       // Copy 64-bytes per iteration
1525       __ BIND(L_loop);
1526       if (UseAVX > 2) {
1527         __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
1528         __ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
1529       } else if (UseAVX == 2) {
1530         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1531         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1532         __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
1533         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
1534       } else {
1535         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1536         __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1537         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
1538         __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
1539         __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
1540         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
1541         __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
1542         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
1543       }
1544       __ BIND(L_copy_bytes);
1545       __ addptr(qword_count, 8);
1546       __ jcc(Assembler::lessEqual, L_loop);
1547       __ subptr(qword_count, 4);  // sub(8) and add(4)
1548       __ jccb(Assembler::greater, L_end);
1549       // Copy trailing 32 bytes
1550       if (UseAVX >= 2) {
1551         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1552         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1553       } else {
1554         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1555         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1556         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
1557         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
1558       }
1559       __ addptr(qword_count, 4);
1560       __ BIND(L_end);
1561       if (UseAVX >= 2) {
1562         // clean upper bits of YMM registers
1563         __ vpxor(xmm0, xmm0);
1564         __ vpxor(xmm1, xmm1);
1565       }
1566     } else {
1567       // Copy 32 bytes per iteration
1568       __ BIND(L_loop);
1569       __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
1570       __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
1571       __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
1572       __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
1573       __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
1574       __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
1575       __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
1576       __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
1577 
1578       __ BIND(L_copy_bytes);
1579       __ addptr(qword_count, 4);
1580       __ jcc(Assembler::lessEqual, L_loop);
1581     }
1582     __ subptr(qword_count, 4);
1583     __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
1584   }
1585 
1586   // Copy big chunks backward
1587   //
1588   // Inputs:
1589   //   from         - source array address
1590   //   dest         - destination array address
1591   //   qword_count  - 64-bit element count
1592   //   to           - scratch
1593   //   L_copy_bytes - entry label
1594   //   L_copy_8_bytes  - exit  label
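       //
       // Callers jump to L_copy_bytes to start; the loop steps the positive
       // qword_count down toward zero in chunk-sized steps and branches out
       // to the caller's L_copy_8_bytes once fewer than a full chunk remains.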
1595   //
1596   void copy_bytes_backward(Register from, Register dest,
1597                               Register qword_count, Register to,
1598                               Label& L_copy_bytes, Label& L_copy_8_bytes) {
1599     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1600     Label L_loop;
1601     __ align(OptoLoopAlignment);
1602     if (UseUnalignedLoadStores) {
1603       Label L_end;
1604       if (UseAVX > 2) {
1605         __ movl(to, 0xffff);
1606         __ kmovwl(k1, to);
1607       }
1608       // Copy 64 bytes per iteration
1609       __ BIND(L_loop);
1610       if (UseAVX > 2) {
1611         __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit);
1612         __ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit);
1613       } else if (UseAVX == 2) {
1614         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
1615         __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
1616         __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1617         __ vmovdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1618       } else {
1619         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
1620         __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
1621         __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
1622         __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
1623         __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
1624         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
1625         __ movdqu(xmm3, Address(from, qword_count, Address::times_8,  0));
1626         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm3);
1627       }
1628       __ BIND(L_copy_bytes);
1629       __ subptr(qword_count, 8);
1630       __ jcc(Assembler::greaterEqual, L_loop);
1631 
1632       __ addptr(qword_count, 4);  // add(8) and sub(4)
1633       __ jccb(Assembler::less, L_end);
1634       // Copy trailing 32 bytes
1635       if (UseAVX >= 2) {
1636         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
1637         __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
1638       } else {
1639         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
1640         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
1641         __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1642         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1643       }
1644       __ subptr(qword_count, 4);
1645       __ BIND(L_end);
1646       if (UseAVX >= 2) {
1647         // clean upper bits of YMM registers
1648         __ vpxor(xmm0, xmm0);
1649         __ vpxor(xmm1, xmm1);
1650       }
1651     } else {
1652       // Copy 32 bytes per iteration
1653       __ BIND(L_loop);
1654       __ movq(to, Address(from, qword_count, Address::times_8, 24));
1655       __ movq(Address(dest, qword_count, Address::times_8, 24), to);
1656       __ movq(to, Address(from, qword_count, Address::times_8, 16));
1657       __ movq(Address(dest, qword_count, Address::times_8, 16), to);
1658       __ movq(to, Address(from, qword_count, Address::times_8,  8));
1659       __ movq(Address(dest, qword_count, Address::times_8,  8), to);
1660       __ movq(to, Address(from, qword_count, Address::times_8,  0));
1661       __ movq(Address(dest, qword_count, Address::times_8,  0), to);
1662 
1663       __ BIND(L_copy_bytes);
1664       __ subptr(qword_count, 4);
1665       __ jcc(Assembler::greaterEqual, L_loop);
1666     }
1667     __ addptr(qword_count, 4);
1668     __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
1669   }
1670 
1671 
1672   // Arguments:
1673   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1674   //             ignored
1675   //   name    - stub name string
1676   //
1677   // Inputs:
1678   //   c_rarg0   - source array address
1679   //   c_rarg1   - destination array address
1680   //   c_rarg2   - element count, treated as ssize_t, can be zero
1681   //
1682   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1683   // we let the hardware handle it.  The one to eight bytes within words,
1684   // dwords or qwords that span cache line boundaries will still be loaded
1685   // and stored atomically.
1686   //
1687   // Side Effects:
1688   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1689   //   used by generate_conjoint_byte_copy().
1690   //
1691   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1692     __ align(CodeEntryAlignment);
1693     StubCodeMark mark(this, "StubRoutines", name);
1694     address start = __ pc();
1695 
1696     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1697     Label L_copy_byte, L_exit;
1698     const Register from        = rdi;  // source array address
1699     const Register to          = rsi;  // destination array address
1700     const Register count       = rdx;  // elements count
1701     const Register byte_count  = rcx;
1702     const Register qword_count = count;
1703     const Register end_from    = from; // source array end address
1704     const Register end_to      = to;   // destination array end address
1705     // End pointers are inclusive, and if count is not zero they point
1706     // to the last unit copied:  end_to[0] := end_from[0]
1707 
1708     __ enter(); // required for proper stackwalking of RuntimeStub frame
1709     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1710 
1711     if (entry != NULL) {
1712       *entry = __ pc();
1713        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1714       BLOCK_COMMENT("Entry:");
1715     }
1716 
1717     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1718                       // r9 and r10 may be used to save non-volatile registers
1719 
1720     // 'from', 'to' and 'count' are now valid
1721     __ movptr(byte_count, count);
1722     __ shrptr(count, 3); // count => qword_count
1723 
1724     // Copy from low to high addresses.  Use 'to' as scratch.
1725     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1726     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1727     __ negptr(qword_count); // make the count negative
1728     __ jmp(L_copy_bytes);
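         // The bulk copy loop (copy_bytes_forward, emitted after the exit
         // sequence below) starts at L_copy_bytes and branches back to
         // L_copy_8_bytes for any trailing qwords.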
1729 
1730     // Copy trailing qwords
1731   __ BIND(L_copy_8_bytes);
1732     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1733     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1734     __ increment(qword_count);
1735     __ jcc(Assembler::notZero, L_copy_8_bytes);
1736 
1737     // Check for and copy trailing dword
1738   __ BIND(L_copy_4_bytes);
1739     __ testl(byte_count, 4);
1740     __ jccb(Assembler::zero, L_copy_2_bytes);
1741     __ movl(rax, Address(end_from, 8));
1742     __ movl(Address(end_to, 8), rax);
1743 
1744     __ addptr(end_from, 4);
1745     __ addptr(end_to, 4);
1746 
1747     // Check for and copy trailing word
1748   __ BIND(L_copy_2_bytes);
1749     __ testl(byte_count, 2);
1750     __ jccb(Assembler::zero, L_copy_byte);
1751     __ movw(rax, Address(end_from, 8));
1752     __ movw(Address(end_to, 8), rax);
1753 
1754     __ addptr(end_from, 2);
1755     __ addptr(end_to, 2);
1756 
1757     // Check for and copy trailing byte
1758   __ BIND(L_copy_byte);
1759     __ testl(byte_count, 1);
1760     __ jccb(Assembler::zero, L_exit);
1761     __ movb(rax, Address(end_from, 8));
1762     __ movb(Address(end_to, 8), rax);
1763 
1764   __ BIND(L_exit);
1765     restore_arg_regs();
1766     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1767     __ xorptr(rax, rax); // return 0
1768     __ vzeroupper();
1769     __ leave(); // required for proper stackwalking of RuntimeStub frame
1770     __ ret(0);
1771 
1772     // Copy in multi-byte chunks
1773     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1774     __ jmp(L_copy_4_bytes);
1775 
1776     return start;
1777   }
1778 
1779   // Arguments:
1780   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1781   //             ignored
1782   //   name    - stub name string
1783   //
1784   // Inputs:
1785   //   c_rarg0   - source array address
1786   //   c_rarg1   - destination array address
1787   //   c_rarg2   - element count, treated as ssize_t, can be zero
1788   //
1789   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1790   // we let the hardware handle it.  The one to eight bytes within words,
1791   // dwords or qwords that span cache line boundaries will still be loaded
1792   // and stored atomically.
1793   //
1794   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1795                                       address* entry, const char *name) {
1796     __ align(CodeEntryAlignment);
1797     StubCodeMark mark(this, "StubRoutines", name);
1798     address start = __ pc();
1799 
1800     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1801     const Register from        = rdi;  // source array address
1802     const Register to          = rsi;  // destination array address
1803     const Register count       = rdx;  // elements count
1804     const Register byte_count  = rcx;
1805     const Register qword_count = count;
1806 
1807     __ enter(); // required for proper stackwalking of RuntimeStub frame
1808     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1809 
1810     if (entry != NULL) {
1811       *entry = __ pc();
1812       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1813       BLOCK_COMMENT("Entry:");
1814     }
1815 
1816     array_overlap_test(nooverlap_target, Address::times_1);
1817     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1818                       // r9 and r10 may be used to save non-volatile registers
1819 
1820     // 'from', 'to' and 'count' are now valid
1821     __ movptr(byte_count, count);
1822     __ shrptr(count, 3);   // count => qword_count
1823 
1824     // Copy from high to low addresses.
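         // Because the ranges may overlap, the sub-qword tail at the high end
         // is copied first, then the qwords, working downward.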
1825 
1826     // Check for and copy trailing byte
1827     __ testl(byte_count, 1);
1828     __ jcc(Assembler::zero, L_copy_2_bytes);
1829     __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1830     __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1831     __ decrement(byte_count); // Adjust for possible trailing word
1832 
1833     // Check for and copy trailing word
1834   __ BIND(L_copy_2_bytes);
1835     __ testl(byte_count, 2);
1836     __ jcc(Assembler::zero, L_copy_4_bytes);
1837     __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1838     __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1839 
1840     // Check for and copy trailing dword
1841   __ BIND(L_copy_4_bytes);
1842     __ testl(byte_count, 4);
1843     __ jcc(Assembler::zero, L_copy_bytes);
1844     __ movl(rax, Address(from, qword_count, Address::times_8));
1845     __ movl(Address(to, qword_count, Address::times_8), rax);
1846     __ jmp(L_copy_bytes);
1847 
1848     // Copy trailing qwords
1849   __ BIND(L_copy_8_bytes);
1850     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1851     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1852     __ decrement(qword_count);
1853     __ jcc(Assembler::notZero, L_copy_8_bytes);
1854 
1855     restore_arg_regs();
1856     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1857     __ xorptr(rax, rax); // return 0
1858     __ vzeroupper();
1859     __ leave(); // required for proper stackwalking of RuntimeStub frame
1860     __ ret(0);
1861 
1862     // Copy in multi-byte chunks
1863     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1864 
1865     restore_arg_regs();
1866     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1867     __ xorptr(rax, rax); // return 0
1868     __ vzeroupper();
1869     __ leave(); // required for proper stackwalking of RuntimeStub frame
1870     __ ret(0);
1871 
1872     return start;
1873   }
1874 
1875   // Arguments:
1876   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1877   //             ignored
1878   //   name    - stub name string
1879   //
1880   // Inputs:
1881   //   c_rarg0   - source array address
1882   //   c_rarg1   - destination array address
1883   //   c_rarg2   - element count, treated as ssize_t, can be zero
1884   //
1885   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1886   // let the hardware handle it.  The two or four words within dwords
1887   // or qwords that span cache line boundaries will still be loaded
1888   // and stored atomically.
1889   //
1890   // Side Effects:
1891   //   disjoint_short_copy_entry is set to the no-overlap entry point
1892   //   used by generate_conjoint_short_copy().
1893   //
1894   address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
1895     __ align(CodeEntryAlignment);
1896     StubCodeMark mark(this, "StubRoutines", name);
1897     address start = __ pc();
1898 
1899     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
1900     const Register from        = rdi;  // source array address
1901     const Register to          = rsi;  // destination array address
1902     const Register count       = rdx;  // elements count
1903     const Register word_count  = rcx;
1904     const Register qword_count = count;
1905     const Register end_from    = from; // source array end address
1906     const Register end_to      = to;   // destination array end address
1907     // End pointers are inclusive, and if count is not zero they point
1908     // to the last unit copied:  end_to[0] := end_from[0]
1909 
1910     __ enter(); // required for proper stackwalking of RuntimeStub frame
1911     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1912 
1913     if (entry != NULL) {
1914       *entry = __ pc();
1915       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1916       BLOCK_COMMENT("Entry:");
1917     }
1918 
1919     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1920                       // r9 and r10 may be used to save non-volatile registers
1921 
1922     // 'from', 'to' and 'count' are now valid
1923     __ movptr(word_count, count);
1924     __ shrptr(count, 2); // count => qword_count
1925 
1926     // Copy from low to high addresses.  Use 'to' as scratch.
1927     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1928     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1929     __ negptr(qword_count);
1930     __ jmp(L_copy_bytes);
1931 
1932     // Copy trailing qwords
1933   __ BIND(L_copy_8_bytes);
1934     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1935     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1936     __ increment(qword_count);
1937     __ jcc(Assembler::notZero, L_copy_8_bytes);
1938 
1939     // Original 'dest' is trashed, so we can't use it as a
1940     // base register for a possible trailing word copy
1941 
1942     // Check for and copy trailing dword
1943   __ BIND(L_copy_4_bytes);
1944     __ testl(word_count, 2);
1945     __ jccb(Assembler::zero, L_copy_2_bytes);
1946     __ movl(rax, Address(end_from, 8));
1947     __ movl(Address(end_to, 8), rax);
1948 
1949     __ addptr(end_from, 4);
1950     __ addptr(end_to, 4);
1951 
1952     // Check for and copy trailing word
1953   __ BIND(L_copy_2_bytes);
1954     __ testl(word_count, 1);
1955     __ jccb(Assembler::zero, L_exit);
1956     __ movw(rax, Address(end_from, 8));
1957     __ movw(Address(end_to, 8), rax);
1958 
1959   __ BIND(L_exit);
1960     restore_arg_regs();
1961     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1962     __ xorptr(rax, rax); // return 0
1963     __ vzeroupper();
1964     __ leave(); // required for proper stackwalking of RuntimeStub frame
1965     __ ret(0);
1966 
1967     // Copy in multi-byte chunks
1968     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1969     __ jmp(L_copy_4_bytes);
1970 
1971     return start;
1972   }
1973 
1974   address generate_fill(BasicType t, bool aligned, const char *name) {
1975     __ align(CodeEntryAlignment);
1976     StubCodeMark mark(this, "StubRoutines", name);
1977     address start = __ pc();
1978 
1979     BLOCK_COMMENT("Entry:");
1980 
1981     const Register to       = c_rarg0;  // destination array address
1982     const Register value    = c_rarg1;  // value
1983     const Register count    = c_rarg2;  // elements count
1984 
1985     __ enter(); // required for proper stackwalking of RuntimeStub frame
1986 
1987     __ generate_fill(t, aligned, to, value, count, rax, xmm0);
1988 
1989     __ vzeroupper();
1990     __ leave(); // required for proper stackwalking of RuntimeStub frame
1991     __ ret(0);
1992     return start;
1993   }
1994 
1995   // Arguments:
1996   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1997   //             ignored
1998   //   name    - stub name string
1999   //
2000   // Inputs:
2001   //   c_rarg0   - source array address
2002   //   c_rarg1   - destination array address
2003   //   c_rarg2   - element count, treated as ssize_t, can be zero
2004   //
2005   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
2006   // let the hardware handle it.  The two or four words within dwords
2007   // or qwords that span cache line boundaries will still be loaded
2008   // and stored atomically.
2009   //
2010   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
2011                                        address *entry, const char *name) {
2012     __ align(CodeEntryAlignment);
2013     StubCodeMark mark(this, "StubRoutines", name);
2014     address start = __ pc();
2015 
2016     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
2017     const Register from        = rdi;  // source array address
2018     const Register to          = rsi;  // destination array address
2019     const Register count       = rdx;  // elements count
2020     const Register word_count  = rcx;
2021     const Register qword_count = count;
2022 
2023     __ enter(); // required for proper stackwalking of RuntimeStub frame
2024     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2025 
2026     if (entry != NULL) {
2027       *entry = __ pc();
2028       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2029       BLOCK_COMMENT("Entry:");
2030     }
2031 
2032     array_overlap_test(nooverlap_target, Address::times_2);
2033     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2034                       // r9 and r10 may be used to save non-volatile registers
2035 
2036     // 'from', 'to' and 'count' are now valid
2037     __ movptr(word_count, count);
2038     __ shrptr(count, 2); // count => qword_count
2039 
2040     // Copy from high to low addresses.  Use 'to' as scratch.
2041 
2042     // Check for and copy trailing word
2043     __ testl(word_count, 1);
2044     __ jccb(Assembler::zero, L_copy_4_bytes);
2045     __ movw(rax, Address(from, word_count, Address::times_2, -2));
2046     __ movw(Address(to, word_count, Address::times_2, -2), rax);
2047 
2048     // Check for and copy trailing dword
2049   __ BIND(L_copy_4_bytes);
2050     __ testl(word_count, 2);
2051     __ jcc(Assembler::zero, L_copy_bytes);
2052     __ movl(rax, Address(from, qword_count, Address::times_8));
2053     __ movl(Address(to, qword_count, Address::times_8), rax);
2054     __ jmp(L_copy_bytes);
2055 
2056     // Copy trailing qwords
2057   __ BIND(L_copy_8_bytes);
2058     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2059     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2060     __ decrement(qword_count);
2061     __ jcc(Assembler::notZero, L_copy_8_bytes);
2062 
2063     restore_arg_regs();
2064     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2065     __ xorptr(rax, rax); // return 0
2066     __ vzeroupper();
2067     __ leave(); // required for proper stackwalking of RuntimeStub frame
2068     __ ret(0);
2069 
2070     // Copy in multi-byte chunks
2071     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2072 
2073     restore_arg_regs();
2074     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2075     __ xorptr(rax, rax); // return 0
2076     __ vzeroupper();
2077     __ leave(); // required for proper stackwalking of RuntimeStub frame
2078     __ ret(0);
2079 
2080     return start;
2081   }
2082 
2083   // Arguments:
2084   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2085   //             ignored
2086   //   is_oop  - true => oop array, so generate store check code
2087   //   name    - stub name string
2088   //
2089   // Inputs:
2090   //   c_rarg0   - source array address
2091   //   c_rarg1   - destination array address
2092   //   c_rarg2   - element count, treated as ssize_t, can be zero
2093   //
2094   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2095   // the hardware handle it.  The two dwords within qwords that span
2096   // cache line boundaries will still be loaded and stored atomically.
2097   //
2098   // Side Effects:
2099   //   disjoint_int_copy_entry is set to the no-overlap entry point
2100   //   used by generate_conjoint_int_oop_copy().
2101   //
2102   address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
2103                                          const char *name, bool dest_uninitialized = false) {
2104     __ align(CodeEntryAlignment);
2105     StubCodeMark mark(this, "StubRoutines", name);
2106     address start = __ pc();
2107 
2108     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
2109     const Register from        = rdi;  // source array address
2110     const Register to          = rsi;  // destination array address
2111     const Register count       = rdx;  // elements count
2112     const Register dword_count = rcx;
2113     const Register qword_count = count;
2114     const Register end_from    = from; // source array end address
2115     const Register end_to      = to;   // destination array end address
2116     const Register saved_to    = r11;  // saved destination array address
2117     // End pointers are inclusive, and if count is not zero they point
2118     // to the last unit copied:  end_to[0] := end_from[0]
2119 
2120     __ enter(); // required for proper stackwalking of RuntimeStub frame
2121     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2122 
2123     if (entry != NULL) {
2124       *entry = __ pc();
2125       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2126       BLOCK_COMMENT("Entry:");
2127     }
2128 
2129     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2130                       // r9 and r10 may be used to save non-volatile registers
2131     if (is_oop) {
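           // Save 'to' for the post barrier below; the copy loop reuses 'to' as end_to.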
2132       __ movq(saved_to, to);
2133       gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
2134     }
2135 
2136     // 'from', 'to' and 'count' are now valid
2137     __ movptr(dword_count, count);
2138     __ shrptr(count, 1); // count => qword_count
2139 
2140     // Copy from low to high addresses.  Use 'to' as scratch.
2141     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2142     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2143     __ negptr(qword_count);
2144     __ jmp(L_copy_bytes);
2145 
2146     // Copy trailing qwords
2147   __ BIND(L_copy_8_bytes);
2148     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2149     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2150     __ increment(qword_count);
2151     __ jcc(Assembler::notZero, L_copy_8_bytes);
2152 
2153     // Check for and copy trailing dword
2154   __ BIND(L_copy_4_bytes);
2155     __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
2156     __ jccb(Assembler::zero, L_exit);
2157     __ movl(rax, Address(end_from, 8));
2158     __ movl(Address(end_to, 8), rax);
2159 
2160   __ BIND(L_exit);
2161     if (is_oop) {
2162       gen_write_ref_array_post_barrier(saved_to, dword_count, rax);
2163     }
2164     restore_arg_regs();
2165     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2166     __ vzeroupper();
2167     __ xorptr(rax, rax); // return 0
2168     __ leave(); // required for proper stackwalking of RuntimeStub frame
2169     __ ret(0);
2170 
2171     // Copy in multi-byte chunks
2172     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2173     __ jmp(L_copy_4_bytes);
2174 
2175     return start;
2176   }
2177 
2178   // Arguments:
2179   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2180   //             ignored
2181   //   is_oop  - true => oop array, so generate store check code
2182   //   name    - stub name string
2183   //
2184   // Inputs:
2185   //   c_rarg0   - source array address
2186   //   c_rarg1   - destination array address
2187   //   c_rarg2   - element count, treated as ssize_t, can be zero
2188   //
2189   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2190   // the hardware handle it.  The two dwords within qwords that span
2191   // cache line boundaries will still be loaded and stored atomically.
2192   //
2193   address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
2194                                          address *entry, const char *name,
2195                                          bool dest_uninitialized = false) {
2196     __ align(CodeEntryAlignment);
2197     StubCodeMark mark(this, "StubRoutines", name);
2198     address start = __ pc();
2199 
2200     Label L_copy_bytes, L_copy_8_bytes, L_copy_2_bytes, L_exit;
2201     const Register from        = rdi;  // source array address
2202     const Register to          = rsi;  // destination array address
2203     const Register count       = rdx;  // elements count
2204     const Register dword_count = rcx;
2205     const Register qword_count = count;
2206 
2207     __ enter(); // required for proper stackwalking of RuntimeStub frame
2208     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2209 
2210     if (entry != NULL) {
2211       *entry = __ pc();
2212        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2213       BLOCK_COMMENT("Entry:");
2214     }
2215 
2216     array_overlap_test(nooverlap_target, Address::times_4);
2217     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2218                       // r9 and r10 may be used to save non-volatile registers
2219 
2220     if (is_oop) {
2221       // no registers are destroyed by this call
2222       gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
2223     }
2224 
2225     assert_clean_int(count, rax); // Make sure 'count' is clean int.
2226     // 'from', 'to' and 'count' are now valid
2227     __ movptr(dword_count, count);
2228     __ shrptr(count, 1); // count => qword_count
2229 
2230     // Copy from high to low addresses.  Use 'to' as scratch.
2231 
2232     // Check for and copy trailing dword
2233     __ testl(dword_count, 1);
2234     __ jcc(Assembler::zero, L_copy_bytes);
2235     __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2236     __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2237     __ jmp(L_copy_bytes);
2238 
2239     // Copy trailing qwords
2240   __ BIND(L_copy_8_bytes);
2241     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2242     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2243     __ decrement(qword_count);
2244     __ jcc(Assembler::notZero, L_copy_8_bytes);
2245 
2246     if (is_oop) {
2247       __ jmp(L_exit);
2248     }
2249     restore_arg_regs();
2250     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2251     __ xorptr(rax, rax); // return 0
2252     __ vzeroupper();
2253     __ leave(); // required for proper stackwalking of RuntimeStub frame
2254     __ ret(0);
2255 
2256     // Copy in multi-byte chunks
2257     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2258 
2259   __ BIND(L_exit);
2260     if (is_oop) {
2261       gen_write_ref_array_post_barrier(to, dword_count, rax);
2262     }
2263     restore_arg_regs();
2264     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2265     __ xorptr(rax, rax); // return 0
2266     __ vzeroupper();
2267     __ leave(); // required for proper stackwalking of RuntimeStub frame
2268     __ ret(0);
2269 
2270     return start;
2271   }
2272 
2273   // Arguments:
2274   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2275   //             ignored
2276   //   is_oop  - true => oop array, so generate store check code
2277   //   name    - stub name string
2278   //
2279   // Inputs:
2280   //   c_rarg0   - source array address
2281   //   c_rarg1   - destination array address
2282   //   c_rarg2   - element count, treated as ssize_t, can be zero
2283   //
2284   // Side Effects:
2285   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2286   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
2287   //
2288   address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
2289                                           const char *name, bool dest_uninitialized = false) {
2290     __ align(CodeEntryAlignment);
2291     StubCodeMark mark(this, "StubRoutines", name);
2292     address start = __ pc();
2293 
2294     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2295     const Register from        = rdi;  // source array address
2296     const Register to          = rsi;  // destination array address
2297     const Register qword_count = rdx;  // elements count
2298     const Register end_from    = from; // source array end address
2299     const Register end_to      = rcx;  // destination array end address
2300     const Register saved_to    = to;
2301     const Register saved_count = r11;
2302     // End pointers are inclusive, and if count is not zero they point
2303     // to the last unit copied:  end_to[0] := end_from[0]
2304 
2305     __ enter(); // required for proper stackwalking of RuntimeStub frame
2306     // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2307     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2308 
2309     if (entry != NULL) {
2310       *entry = __ pc();
2311       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2312       BLOCK_COMMENT("Entry:");
2313     }
2314 
2315     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2316                       // r9 and r10 may be used to save non-volatile registers
2317     // 'from', 'to' and 'qword_count' are now valid
2318     if (is_oop) {
2319       // Save to and count for store barrier
2320       __ movptr(saved_count, qword_count);
2321       // no registers are destroyed by this call
2322       gen_write_ref_array_pre_barrier(to, qword_count, dest_uninitialized);
2323     }
2324 
2325     // Copy from low to high addresses.  Use 'to' as scratch.
2326     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2327     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2328     __ negptr(qword_count);
2329     __ jmp(L_copy_bytes);
2330 
2331     // Copy trailing qwords
2332   __ BIND(L_copy_8_bytes);
2333     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2334     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2335     __ increment(qword_count);
2336     __ jcc(Assembler::notZero, L_copy_8_bytes);
2337 
2338     if (is_oop) {
2339       __ jmp(L_exit);
2340     } else {
2341       restore_arg_regs();
2342       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2343       __ xorptr(rax, rax); // return 0
2344       __ vzeroupper();
2345       __ leave(); // required for proper stackwalking of RuntimeStub frame
2346       __ ret(0);
2347     }
2348 
2349     // Copy in multi-byte chunks
2350     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2351 
2352     if (is_oop) {
2353     __ BIND(L_exit);
2354       gen_write_ref_array_post_barrier(saved_to, saved_count, rax);
2355     }
2356     restore_arg_regs();
2357     if (is_oop) {
2358       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2359     } else {
2360       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2361     }
2362     __ vzeroupper();
2363     __ xorptr(rax, rax); // return 0
2364     __ leave(); // required for proper stackwalking of RuntimeStub frame
2365     __ ret(0);
2366 
2367     return start;
2368   }
2369 
2370   // Arguments:
2371   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2372   //             ignored
2373   //   is_oop  - true => oop array, so generate store check code
2374   //   name    - stub name string
2375   //
2376   // Inputs:
2377   //   c_rarg0   - source array address
2378   //   c_rarg1   - destination array address
2379   //   c_rarg2   - element count, treated as ssize_t, can be zero
2380   //
2381   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
2382                                           address nooverlap_target, address *entry,
2383                                           const char *name, bool dest_uninitialized = false) {
2384     __ align(CodeEntryAlignment);
2385     StubCodeMark mark(this, "StubRoutines", name);
2386     address start = __ pc();
2387 
2388     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2389     const Register from        = rdi;  // source array address
2390     const Register to          = rsi;  // destination array address
2391     const Register qword_count = rdx;  // elements count
2392     const Register saved_count = rcx;
2393 
2394     __ enter(); // required for proper stackwalking of RuntimeStub frame
2395     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2396 
2397     if (entry != NULL) {
2398       *entry = __ pc();
2399       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2400       BLOCK_COMMENT("Entry:");
2401     }
2402 
2403     array_overlap_test(nooverlap_target, Address::times_8);
2404     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2405                       // r9 and r10 may be used to save non-volatile registers
2406     // 'from', 'to' and 'qword_count' are now valid
2407     if (is_oop) {
2408       // Save to and count for store barrier
2409       __ movptr(saved_count, qword_count);
2410       // No registers are destroyed by this call
2411       gen_write_ref_array_pre_barrier(to, saved_count, dest_uninitialized);
2412     }
2413 
2414     __ jmp(L_copy_bytes);
2415 
2416     // Copy trailing qwords
2417   __ BIND(L_copy_8_bytes);
2418     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2419     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2420     __ decrement(qword_count);
2421     __ jcc(Assembler::notZero, L_copy_8_bytes);
2422 
2423     if (is_oop) {
2424       __ jmp(L_exit);
2425     } else {
2426       restore_arg_regs();
2427       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2428       __ xorptr(rax, rax); // return 0
2429       __ vzeroupper();
2430       __ leave(); // required for proper stackwalking of RuntimeStub frame
2431       __ ret(0);
2432     }
2433 
2434     // Copy in multi-byte chunks
2435     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2436 
2437     if (is_oop) {
2438     __ BIND(L_exit);
2439       gen_write_ref_array_post_barrier(to, saved_count, rax);
2440     }
2441     restore_arg_regs();
2442     if (is_oop) {
2443       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2444     } else {
2445       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2446     }
2447     __ vzeroupper();
2448     __ xorptr(rax, rax); // return 0
2449     __ leave(); // required for proper stackwalking of RuntimeStub frame
2450     __ ret(0);
2451 
2452     return start;
2453   }
2454 
2455 
2456   // Helper for generating a dynamic type check.
2457   // Smashes no registers.
2458   void generate_type_check(Register sub_klass,
2459                            Register super_check_offset,
2460                            Register super_klass,
2461                            Label& L_success) {
2462     assert_different_registers(sub_klass, super_check_offset, super_klass);
2463 
2464     BLOCK_COMMENT("type_check:");
2465 
2466     Label L_miss;
2467 
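         // Try the cheap checks first (klass identity and the word at
         // super_check_offset); only fall into the slow path, which scans the
         // secondary supers list, when those are inconclusive.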
2468     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
2469                                      super_check_offset);
2470     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
2471 
2472     // Fall through on failure!
2473     __ BIND(L_miss);
2474   }
2475 
2476   //
2477   //  Generate checkcasting array copy stub
2478   //
2479   //  Input:
2480   //    c_rarg0   - source array address
2481   //    c_rarg1   - destination array address
2482   //    c_rarg2   - element count, treated as ssize_t, can be zero
2483   //    c_rarg3   - size_t ckoff (super_check_offset)
2484   // not Win64
2485   //    c_rarg4   - oop ckval (super_klass)
2486   // Win64
2487   //    rsp+40    - oop ckval (super_klass)
2488   //
2489   //  Output:
2490   //    rax ==  0  -  success
2491   //    rax == -1^K - failure, where K is partial transfer count
2492   //
2493   address generate_checkcast_copy(const char *name, address *entry,
2494                                   bool dest_uninitialized = false) {
2495 
2496     Label L_load_element, L_store_element, L_do_card_marks, L_done;
2497 
2498     // Input registers (after setup_arg_regs)
2499     const Register from        = rdi;   // source array address
2500     const Register to          = rsi;   // destination array address
2501     const Register length      = rdx;   // elements count
2502     const Register ckoff       = rcx;   // super_check_offset
2503     const Register ckval       = r8;    // super_klass
2504 
2505     // Registers used as temps (r13, r14 are save-on-entry)
2506     const Register end_from    = from;  // source array end address
2507     const Register end_to      = r13;   // destination array end address
2508     const Register count       = rdx;   // -(count_remaining)
2509     const Register r14_length  = r14;   // saved copy of length
2510     // End pointers are inclusive, and if length is not zero they point
2511     // to the last unit copied:  end_to[0] := end_from[0]
2512 
2513     const Register rax_oop    = rax;    // actual oop copied
2514     const Register r11_klass  = r11;    // oop._klass
2515 
2516     //---------------------------------------------------------------
2517     // Assembler stub will be used for this call to arraycopy
2518     // if the two arrays are subtypes of Object[] but the
2519     // destination array type is not equal to or a supertype
2520     // of the source type.  Each element must be separately
2521     // checked.
2522 
2523     __ align(CodeEntryAlignment);
2524     StubCodeMark mark(this, "StubRoutines", name);
2525     address start = __ pc();
2526 
2527     __ enter(); // required for proper stackwalking of RuntimeStub frame
2528 
2529 #ifdef ASSERT
2530     // caller guarantees that the arrays really are different
2531     // otherwise, we would have to make conjoint checks
2532     { Label L;
2533       array_overlap_test(L, TIMES_OOP);
2534       __ stop("checkcast_copy within a single array");
2535       __ bind(L);
2536     }
2537 #endif //ASSERT
2538 
2539     setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
2540                        // ckoff => rcx, ckval => r8
2541                        // r9 and r10 may be used to save non-volatile registers
2542 #ifdef _WIN64
2543     // last argument (#4) is on stack on Win64
2544     __ movptr(ckval, Address(rsp, 6 * wordSize));
2545 #endif
2546 
2547     // Caller of this entry point must set up the argument registers.
2548     if (entry != NULL) {
2549       *entry = __ pc();
2550       BLOCK_COMMENT("Entry:");
2551     }
2552 
2553     // allocate spill slots for r13, r14
2554     enum {
2555       saved_r13_offset,
2556       saved_r14_offset,
2557       saved_rbp_offset
2558     };
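         // rbp was already pushed by enter(), so saved_rbp_offset only marks
         // its slot; reserve two words below it for r13 and r14.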
2559     __ subptr(rsp, saved_rbp_offset * wordSize);
2560     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2561     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2562 
2563     // check that int operands are properly extended to size_t
2564     assert_clean_int(length, rax);
2565     assert_clean_int(ckoff, rax);
2566 
2567 #ifdef ASSERT
2568     BLOCK_COMMENT("assert consistent ckoff/ckval");
2569     // The ckoff and ckval must be mutually consistent,
2570     // even though caller generates both.
2571     { Label L;
2572       int sco_offset = in_bytes(Klass::super_check_offset_offset());
2573       __ cmpl(ckoff, Address(ckval, sco_offset));
2574       __ jcc(Assembler::equal, L);
2575       __ stop("super_check_offset inconsistent");
2576       __ bind(L);
2577     }
2578 #endif //ASSERT
2579 
2580     // Loop-invariant addresses.  They are exclusive end pointers.
2581     Address end_from_addr(from, length, TIMES_OOP, 0);
2582     Address   end_to_addr(to,   length, TIMES_OOP, 0);
2583     // Loop-variant addresses.  They assume post-incremented count < 0.
2584     Address from_element_addr(end_from, count, TIMES_OOP, 0);
2585     Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
2586 
2587     gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
2588 
2589     // Copy from low to high addresses, indexed from the end of each array.
2590     __ lea(end_from, end_from_addr);
2591     __ lea(end_to,   end_to_addr);
2592     __ movptr(r14_length, length);        // save a copy of the length
2593     assert(length == count, "");          // else fix next line:
2594     __ negptr(count);                     // negate and test the length
2595     __ jcc(Assembler::notZero, L_load_element);
2596 
2597     // Empty array:  Nothing to do.
2598     __ xorptr(rax, rax);                  // return 0 on (trivial) success
2599     __ jmp(L_done);
2600 
2601     // ======== begin loop ========
2602     // (Loop is rotated; its entry is L_load_element.)
2603     // Loop control:
2604     //   for (count = -count; count != 0; count++)
2605     // Base pointers src, dst are biased by 8*(count-1), to the last element.
2606     __ align(OptoLoopAlignment);
2607 
2608     __ BIND(L_store_element);
2609     __ store_heap_oop(to_element_addr, rax_oop);  // store the oop
2610     __ increment(count);               // increment the count toward zero
2611     __ jcc(Assembler::zero, L_do_card_marks);
2612 
2613     // ======== loop entry is here ========
2614     __ BIND(L_load_element);
2615     __ load_heap_oop(rax_oop, from_element_addr); // load the oop
2616     __ testptr(rax_oop, rax_oop);
2617     __ jcc(Assembler::zero, L_store_element);
2618 
2619     __ load_klass(r11_klass, rax_oop);// query the object klass
2620     generate_type_check(r11_klass, ckoff, ckval, L_store_element);
2621     // ======== end loop ========
2622 
2623     // It was a real error; we must depend on the caller to finish the job.
2624     // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
2625     // Emit GC store barriers for the oops we have copied (r14 + rdx),
2626     // and report their number to the caller.
2627     assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
2628     Label L_post_barrier;
2629     __ addptr(r14_length, count);     // K = (original - remaining) oops
2630     __ movptr(rax, r14_length);       // save the value
2631     __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
2632     __ jccb(Assembler::notZero, L_post_barrier);
2633     __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
2634 
2635     // Come here on success only.
2636     __ BIND(L_do_card_marks);
2637     __ xorptr(rax, rax);              // return 0 on success
2638 
2639     __ BIND(L_post_barrier);
2640     gen_write_ref_array_post_barrier(to, r14_length, rscratch1);
2641 
2642     // Common exit point (success or failure).
2643     __ BIND(L_done);
2644     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
2645     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
2646     restore_arg_regs();
2647     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); // Update counter after rscratch1 is free
2648     __ leave(); // required for proper stackwalking of RuntimeStub frame
2649     __ ret(0);
2650 
2651     return start;
2652   }
2653 
2654   //
2655   //  Generate 'unsafe' array copy stub
2656   //  Though just as safe as the other stubs, it takes an unscaled
2657   //  size_t argument instead of an element count.
2658   //
2659   //  Input:
2660   //    c_rarg0   - source array address
2661   //    c_rarg1   - destination array address
2662   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2663   //
2664   // Examines the alignment of the operands and dispatches
2665   // to a long, int, short, or byte copy loop.
2666   //
2667   address generate_unsafe_copy(const char *name,
2668                                address byte_copy_entry, address short_copy_entry,
2669                                address int_copy_entry, address long_copy_entry) {
2670 
2671     Label L_long_aligned, L_int_aligned, L_short_aligned;
2672 
2673     // Input registers (before setup_arg_regs)
2674     const Register from        = c_rarg0;  // source array address
2675     const Register to          = c_rarg1;  // destination array address
2676     const Register size        = c_rarg2;  // byte count (size_t)
2677 
2678     // Register used as a temp
2679     const Register bits        = rax;      // test copy of low bits
2680 
2681     __ align(CodeEntryAlignment);
2682     StubCodeMark mark(this, "StubRoutines", name);
2683     address start = __ pc();
2684 
2685     __ enter(); // required for proper stackwalking of RuntimeStub frame
2686 
2687     // bump this on entry, not on exit:
2688     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2689 
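         // OR the two addresses and the byte count together; the low bits of
         // the result give the strictest alignment common to all three and
         // select the widest element size we can dispatch to.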
2690     __ mov(bits, from);
2691     __ orptr(bits, to);
2692     __ orptr(bits, size);
2693 
2694     __ testb(bits, BytesPerLong-1);
2695     __ jccb(Assembler::zero, L_long_aligned);
2696 
2697     __ testb(bits, BytesPerInt-1);
2698     __ jccb(Assembler::zero, L_int_aligned);
2699 
2700     __ testb(bits, BytesPerShort-1);
2701     __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
2702 
2703     __ BIND(L_short_aligned);
2704     __ shrptr(size, LogBytesPerShort); // size => short_count
2705     __ jump(RuntimeAddress(short_copy_entry));
2706 
2707     __ BIND(L_int_aligned);
2708     __ shrptr(size, LogBytesPerInt); // size => int_count
2709     __ jump(RuntimeAddress(int_copy_entry));
2710 
2711     __ BIND(L_long_aligned);
2712     __ shrptr(size, LogBytesPerLong); // size => qword_count
2713     __ jump(RuntimeAddress(long_copy_entry));
2714 
2715     return start;
2716   }
2717 
2718   // Perform range checks on the proposed arraycopy.
2719   // Kills temp, but nothing else.
2720   // Also, clean the sign bits of src_pos and dst_pos.
2721   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2722                               Register src_pos, // source position (c_rarg1)
2723                               Register dst,     // destination array oop (c_rarg2)
2724                               Register dst_pos, // destination position (c_rarg3)
2725                               Register length,
2726                               Register temp,
2727                               Label& L_failed) {
2728     BLOCK_COMMENT("arraycopy_range_checks:");
2729 
2730     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2731     __ movl(temp, length);
2732     __ addl(temp, src_pos);             // src_pos + length
2733     __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2734     __ jcc(Assembler::above, L_failed);
2735 
2736     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2737     __ movl(temp, length);
2738     __ addl(temp, dst_pos);             // dst_pos + length
2739     __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2740     __ jcc(Assembler::above, L_failed);
2741 
2742     // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2743     // Move with sign extension can be used since they are positive.
2744     __ movslq(src_pos, src_pos);
2745     __ movslq(dst_pos, dst_pos);
2746 
2747     BLOCK_COMMENT("arraycopy_range_checks done");
2748   }
2749 
2750   //
2751   //  Generate generic array copy stubs
2752   //
2753   //  Input:
2754   //    c_rarg0    -  src oop
2755   //    c_rarg1    -  src_pos (32-bits)
2756   //    c_rarg2    -  dst oop
2757   //    c_rarg3    -  dst_pos (32-bits)
2758   // not Win64
2759   //    c_rarg4    -  element count (32-bits)
2760   // Win64
2761   //    rsp+40     -  element count (32-bits)
2762   //
2763   //  Output:
2764   //    rax ==  0  -  success
2765   //    rax == -1^K - failure, where K is partial transfer count
2766   //
2767   address generate_generic_copy(const char *name,
2768                                 address byte_copy_entry, address short_copy_entry,
2769                                 address int_copy_entry, address oop_copy_entry,
2770                                 address long_copy_entry, address checkcast_copy_entry) {
2771 
2772     Label L_failed, L_failed_0, L_objArray;
2773     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2774 
2775     // Input registers
2776     const Register src        = c_rarg0;  // source array oop
2777     const Register src_pos    = c_rarg1;  // source position
2778     const Register dst        = c_rarg2;  // destination array oop
2779     const Register dst_pos    = c_rarg3;  // destination position
2780 #ifndef _WIN64
2781     const Register length     = c_rarg4;
2782 #else
2783     const Address  length(rsp, 6 * wordSize);  // elements count is on stack on Win64
2784 #endif
2785 
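         // Pad with nops so that the 5-byte jmp at L_failed_0 below ends
         // exactly on a CodeEntryAlignment boundary, leaving the real entry
         // point that follows it aligned.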
2786     { int modulus = CodeEntryAlignment;
2787       int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
2788       int advance = target - (__ offset() % modulus);
2789       if (advance < 0)  advance += modulus;
2790       if (advance > 0)  __ nop(advance);
2791     }
2792     StubCodeMark mark(this, "StubRoutines", name);
2793 
2794     // Short-hop target to L_failed.  Makes for denser prologue code.
2795     __ BIND(L_failed_0);
2796     __ jmp(L_failed);
2797     assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
2798 
2799     __ align(CodeEntryAlignment);
2800     address start = __ pc();
2801 
2802     __ enter(); // required for proper stackwalking of RuntimeStub frame
2803 
2804     // bump this on entry, not on exit:
2805     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2806 
2807     //-----------------------------------------------------------------------
2808     // Assembler stub will be used for this call to arraycopy
2809     // if the following conditions are met:
2810     //
2811     // (1) src and dst must not be null.
2812     // (2) src_pos must not be negative.
2813     // (3) dst_pos must not be negative.
2814     // (4) length  must not be negative.
2815     // (5) src klass and dst klass should be the same and not NULL.
2816     // (6) src and dst should be arrays.
2817     // (7) src_pos + length must not exceed length of src.
2818     // (8) dst_pos + length must not exceed length of dst.
2819     //
2820 
2821     //  if (src == NULL) return -1;
2822     __ testptr(src, src);         // src oop
2823     size_t j1off = __ offset();
2824     __ jccb(Assembler::zero, L_failed_0);
2825 
2826     //  if (src_pos < 0) return -1;
2827     __ testl(src_pos, src_pos); // src_pos (32-bits)
2828     __ jccb(Assembler::negative, L_failed_0);
2829 
2830     //  if (dst == NULL) return -1;
2831     __ testptr(dst, dst);         // dst oop
2832     __ jccb(Assembler::zero, L_failed_0);
2833 
2834     //  if (dst_pos < 0) return -1;
2835     __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
2836     size_t j4off = __ offset();
2837     __ jccb(Assembler::negative, L_failed_0);
2838 
2839     // The first four tests are very dense code,
2840     // but not quite dense enough to put four
2841     // jumps in a 16-byte instruction fetch buffer.
2842     // That's good, because some branch predictors
2843     // do not like jumps so close together.
2844     // Make sure of this.
2845     guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
2846 
2847     // registers used as temp
2848     const Register r11_length    = r11; // elements count to copy
2849     const Register r10_src_klass = r10; // array klass
2850 
2851     //  if (length < 0) return -1;
2852     __ movl(r11_length, length);        // length (elements count, 32-bits value)
2853     __ testl(r11_length, r11_length);
2854     __ jccb(Assembler::negative, L_failed_0);
2855 
2856     __ load_klass(r10_src_klass, src);
2857 #ifdef ASSERT
2858     //  assert(src->klass() != NULL);
2859     {
2860       BLOCK_COMMENT("assert klasses not null {");
2861       Label L1, L2;
2862       __ testptr(r10_src_klass, r10_src_klass);
2863       __ jcc(Assembler::notZero, L2);   // it is broken if klass is NULL
2864       __ bind(L1);
2865       __ stop("broken null klass");
2866       __ bind(L2);
2867       __ load_klass(rax, dst);
2868       __ cmpq(rax, 0);
2869       __ jcc(Assembler::equal, L1);     // this would be broken also
2870       BLOCK_COMMENT("} assert klasses not null done");
2871     }
2872 #endif
2873 
2874     // Load layout helper (32-bits)
2875     //
2876     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2877     // 32        30    24            16              8     2                 0
2878     //
2879     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2880     //
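    // Rough C-level sketch of how the layout helper fields are decoded
    // (illustrative only - it mirrors the shift/mask sequence used further
    // down, with the constants taken from Klass):
    //   int tag        = lh >> Klass::_lh_array_tag_shift;            // 0x3 for typeArray
    //   int hsize      = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
    //   int log2_esize = lh & Klass::_lh_log2_element_size_mask;
    //   elem_addr      = (address)ary + hsize + (pos << log2_esize);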
2881 
2882     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2883 
2884     // Handle objArrays completely differently...
2885     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2886     __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
2887     __ jcc(Assembler::equal, L_objArray);
2888 
2889     //  if (src->klass() != dst->klass()) return -1;
2890     __ load_klass(rax, dst);
2891     __ cmpq(r10_src_klass, rax);
2892     __ jcc(Assembler::notEqual, L_failed);
2893 
2894     const Register rax_lh = rax;  // layout helper
2895     __ movl(rax_lh, Address(r10_src_klass, lh_offset));
2896 
2897     //  if (!src->is_Array()) return -1;
2898     __ cmpl(rax_lh, Klass::_lh_neutral_value);
2899     __ jcc(Assembler::greaterEqual, L_failed);
2900 
2901     // At this point, it is known to be a typeArray (array_tag 0x3).
2902 #ifdef ASSERT
2903     {
2904       BLOCK_COMMENT("assert primitive array {");
2905       Label L;
2906       __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
2907       __ jcc(Assembler::greaterEqual, L);
2908       __ stop("must be a primitive array");
2909       __ bind(L);
2910       BLOCK_COMMENT("} assert primitive array done");
2911     }
2912 #endif
2913 
2914     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2915                            r10, L_failed);
2916 
2917     // TypeArrayKlass
2918     //
2919     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2920     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2921     //
2922 
2923     const Register r10_offset = r10;    // array offset
2924     const Register rax_elsize = rax_lh; // element size
2925 
2926     __ movl(r10_offset, rax_lh);
2927     __ shrl(r10_offset, Klass::_lh_header_size_shift);
2928     __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
2929     __ addptr(src, r10_offset);           // src array offset
2930     __ addptr(dst, r10_offset);           // dst array offset
2931     BLOCK_COMMENT("choose copy loop based on element size");
2932     __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
2933 
2934     // The following registers must be set before jumping to the corresponding stub.
2935     const Register from     = c_rarg0;  // source array address
2936     const Register to       = c_rarg1;  // destination array address
2937     const Register count    = c_rarg2;  // elements count
2938 
2939     // 'from', 'to' and 'count' must be set in exactly this order, since they alias
2940     // 'src', 'src_pos' and 'dst' (c_rarg0..c_rarg2) and each input must be read before it is overwritten.
2941 
2942   __ BIND(L_copy_bytes);
2943     __ cmpl(rax_elsize, 0);
2944     __ jccb(Assembler::notEqual, L_copy_shorts);
2945     __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
2946     __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
2947     __ movl2ptr(count, r11_length); // length
2948     __ jump(RuntimeAddress(byte_copy_entry));
2949 
2950   __ BIND(L_copy_shorts);
2951     __ cmpl(rax_elsize, LogBytesPerShort);
2952     __ jccb(Assembler::notEqual, L_copy_ints);
2953     __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
2954     __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
2955     __ movl2ptr(count, r11_length); // length
2956     __ jump(RuntimeAddress(short_copy_entry));
2957 
2958   __ BIND(L_copy_ints);
2959     __ cmpl(rax_elsize, LogBytesPerInt);
2960     __ jccb(Assembler::notEqual, L_copy_longs);
2961     __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
2962     __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
2963     __ movl2ptr(count, r11_length); // length
2964     __ jump(RuntimeAddress(int_copy_entry));
2965 
2966   __ BIND(L_copy_longs);
2967 #ifdef ASSERT
2968     {
2969       BLOCK_COMMENT("assert long copy {");
2970       Label L;
2971       __ cmpl(rax_elsize, LogBytesPerLong);
2972       __ jcc(Assembler::equal, L);
2973       __ stop("must be long copy, but elsize is wrong");
2974       __ bind(L);
2975       BLOCK_COMMENT("} assert long copy done");
2976     }
2977 #endif
2978     __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
2979     __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
2980     __ movl2ptr(count, r11_length); // length
2981     __ jump(RuntimeAddress(long_copy_entry));
2982 
2983     // ObjArrayKlass
2984   __ BIND(L_objArray);
2985     // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]
2986 
2987     Label L_plain_copy, L_checkcast_copy;
2988     //  test array classes for subtyping
2989     __ load_klass(rax, dst);
2990     __ cmpq(r10_src_klass, rax); // usual case is exact equality
2991     __ jcc(Assembler::notEqual, L_checkcast_copy);
2992 
2993     // Identically typed arrays can be copied without element-wise checks.
2994     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2995                            r10, L_failed);
2996 
2997     __ lea(from, Address(src, src_pos, TIMES_OOP,
2998                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
2999     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3000                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
3001     __ movl2ptr(count, r11_length); // length
3002   __ BIND(L_plain_copy);
3003     __ jump(RuntimeAddress(oop_copy_entry));
3004 
3005   __ BIND(L_checkcast_copy);
3006     // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
3007     {
3008       // Before looking at dst.length, make sure dst is also an objArray.
3009       __ cmpl(Address(rax, lh_offset), objArray_lh);
3010       __ jcc(Assembler::notEqual, L_failed);
3011 
3012       // It is safe to examine both src.length and dst.length.
3013       arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3014                              rax, L_failed);
3015 
3016       const Register r11_dst_klass = r11;
3017       __ load_klass(r11_dst_klass, dst); // reload
3018 
3019       // Marshal the base address arguments now, freeing registers.
3020       __ lea(from, Address(src, src_pos, TIMES_OOP,
3021                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3022       __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3023                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3024       __ movl(count, length);           // length (reloaded)
3025       Register sco_temp = c_rarg3;      // this register is free now
3026       assert_different_registers(from, to, count, sco_temp,
3027                                  r11_dst_klass, r10_src_klass);
3028       assert_clean_int(count, sco_temp);
3029 
3030       // Generate the type check.
3031       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
3032       __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
3033       assert_clean_int(sco_temp, rax);
3034       generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
3035 
3036       // Fetch destination element klass from the ObjArrayKlass header.
3037       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3038       __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
3039       __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
3040       assert_clean_int(sco_temp, rax);
3041 
3042       // the checkcast_copy loop needs two extra arguments:
3043       assert(c_rarg3 == sco_temp, "#3 already in place");
3044       // Set up arguments for checkcast_copy_entry.
3045       setup_arg_regs(4);
3046       __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
3047       __ jump(RuntimeAddress(checkcast_copy_entry));
3048     }
3049 
3050   __ BIND(L_failed);
3051     __ xorptr(rax, rax);
3052     __ notptr(rax); // return -1
3053     __ leave();   // required for proper stackwalking of RuntimeStub frame
3054     __ ret(0);
3055 
3056     return start;
3057   }
3058 
3059   void generate_arraycopy_stubs() {
3060     address entry;
3061     address entry_jbyte_arraycopy;
3062     address entry_jshort_arraycopy;
3063     address entry_jint_arraycopy;
3064     address entry_oop_arraycopy;
3065     address entry_jlong_arraycopy;
3066     address entry_checkcast_arraycopy;
3067 
3068     StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, &entry,
3069                                                                            "jbyte_disjoint_arraycopy");
3070     StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
3071                                                                            "jbyte_arraycopy");
3072 
3073     StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
3074                                                                             "jshort_disjoint_arraycopy");
3075     StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
3076                                                                             "jshort_arraycopy");
3077 
3078     StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, &entry,
3079                                                                               "jint_disjoint_arraycopy");
3080     StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(false, false, entry,
3081                                                                               &entry_jint_arraycopy, "jint_arraycopy");
3082 
3083     StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, false, &entry,
3084                                                                                "jlong_disjoint_arraycopy");
3085     StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(false, false, entry,
3086                                                                                &entry_jlong_arraycopy, "jlong_arraycopy");
3087 
3088 
3089     if (UseCompressedOops) {
3090       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(false, true, &entry,
3091                                                                               "oop_disjoint_arraycopy");
3092       StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(false, true, entry,
3093                                                                               &entry_oop_arraycopy, "oop_arraycopy");
3094       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(false, true, &entry,
3095                                                                                      "oop_disjoint_arraycopy_uninit",
3096                                                                                      /*dest_uninitialized*/true);
3097       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(false, true, entry,
3098                                                                                      NULL, "oop_arraycopy_uninit",
3099                                                                                      /*dest_uninitialized*/true);
3100     } else {
3101       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, true, &entry,
3102                                                                                "oop_disjoint_arraycopy");
3103       StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(false, true, entry,
3104                                                                                &entry_oop_arraycopy, "oop_arraycopy");
3105       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(false, true, &entry,
3106                                                                                       "oop_disjoint_arraycopy_uninit",
3107                                                                                       /*dest_uninitialized*/true);
3108       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(false, true, entry,
3109                                                                                       NULL, "oop_arraycopy_uninit",
3110                                                                                       /*dest_uninitialized*/true);
3111     }
3112 
3113     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3114     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
3115                                                                         /*dest_uninitialized*/true);
3116 
3117     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
3118                                                               entry_jbyte_arraycopy,
3119                                                               entry_jshort_arraycopy,
3120                                                               entry_jint_arraycopy,
3121                                                               entry_jlong_arraycopy);
3122     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
3123                                                                entry_jbyte_arraycopy,
3124                                                                entry_jshort_arraycopy,
3125                                                                entry_jint_arraycopy,
3126                                                                entry_oop_arraycopy,
3127                                                                entry_jlong_arraycopy,
3128                                                                entry_checkcast_arraycopy);
3129 
3130     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3131     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3132     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3133     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3134     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3135     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
3136 
3137     // We don't generate specialized code for HeapWord-aligned source
3138     // arrays, so just use the code we've already generated.
3139     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
3140     StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
3141 
3142     StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
3143     StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
3144 
3145     StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
3146     StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
3147 
3148     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
3149     StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
3150 
3151     StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
3152     StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
3153 
3154     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
3155     StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
3156   }
3157 
3158   // AES intrinsic stubs
3159   enum {AESBlockSize = 16};
3160 
3161   address generate_key_shuffle_mask() {
3162     __ align(16);
3163     StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
3164     address start = __ pc();
3165     __ emit_data64( 0x0405060700010203, relocInfo::none );
3166     __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
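    // In memory the two qwords above form the pshufb selector
    // 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12, i.e. the mask reverses the
    // byte order within each 32-bit word of the expanded key (see load_key).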
3167     return start;
3168   }
3169 
3170   address generate_counter_shuffle_mask() {
3171     __ align(16);
3172     StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
3173     address start = __ pc();
3174     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3175     __ emit_data64(0x0001020304050607, relocInfo::none);
3176     return start;
3177   }
3178 
3179   // Utility routine for loading a 128-bit key word in little-endian format;
3180   // the shuffle mask may optionally already be loaded in an XMM register.
3181   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
3182     __ movdqu(xmmdst, Address(key, offset));
3183     if (xmm_shuf_mask != NULL) {
3184       __ pshufb(xmmdst, xmm_shuf_mask);
3185     } else {
3186       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3187     }
3188   }
3189 
3190   // Utility routine for incrementing the 128-bit counter (the IV in CTR mode)
3191   void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
3192     __ pextrq(reg, xmmdst, 0x0);
3193     __ addq(reg, inc_delta);
3194     __ pinsrq(xmmdst, reg, 0x0);
3195     __ jcc(Assembler::carryClear, next_block); // jump if no carry
3196     __ pextrq(reg, xmmdst, 0x01); // Carry
3197     __ addq(reg, 0x01);
3198     __ pinsrq(xmmdst, reg, 0x01); //Carry end
3199     __ BIND(next_block);          // next instruction
3200   }
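  // The increment above is roughly equivalent to this C sketch (illustrative
  // only; the counter actually lives in an XMM register and is handled one
  // 64-bit half at a time):
  //   uint64_t lo = ctr[0], hi = ctr[1];
  //   lo += inc_delta;
  //   if (lo < (uint64_t)inc_delta) hi += 1;   // propagate the carry
  //   ctr[0] = lo; ctr[1] = hi;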
3201 
3202   // Arguments:
3203   //
3204   // Inputs:
3205   //   c_rarg0   - source byte array address
3206   //   c_rarg1   - destination byte array address
3207   //   c_rarg2   - K (key) in little endian int array
3208   //
3209   address generate_aescrypt_encryptBlock() {
3210     assert(UseAES, "need AES instructions and misaligned SSE support");
3211     __ align(CodeEntryAlignment);
3212     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
3213     Label L_doLast;
3214     address start = __ pc();
3215 
3216     const Register from        = c_rarg0;  // source array address
3217     const Register to          = c_rarg1;  // destination array address
3218     const Register key         = c_rarg2;  // key array address
3219     const Register keylen      = rax;
3220 
3221     const XMMRegister xmm_result = xmm0;
3222     const XMMRegister xmm_key_shuf_mask = xmm1;
3223     // On win64 xmm6-xmm15 must be preserved so don't use them.
3224     const XMMRegister xmm_temp1  = xmm2;
3225     const XMMRegister xmm_temp2  = xmm3;
3226     const XMMRegister xmm_temp3  = xmm4;
3227     const XMMRegister xmm_temp4  = xmm5;
3228 
3229     __ enter(); // required for proper stackwalking of RuntimeStub frame
3230 
3231     // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
3232     // context for the registers used, where all instructions below are using 128-bit mode
3233     // On EVEX without VL and BW, these instructions will all be AVX.
3234     if (VM_Version::supports_avx512vlbw()) {
3235       __ movl(rax, 0xffff);
3236       __ kmovql(k1, rax);
3237     }
3238 
3239     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
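    // (AES-128/192/256 use 10/12/14 rounds, i.e. 11/13/15 round keys of four
    //  32-bit words each - hence 44, 52 or 60 ints.)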
3240     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3241 
3242     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3243     __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
3244 
3245     // For encryption, the Java expanded key ordering is just what we need.
3246     // We don't know whether the key is aligned, hence not using the load-execute form.
3247 
3248     load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
3249     __ pxor(xmm_result, xmm_temp1);
3250 
3251     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3252     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3253     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3254     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3255 
3256     __ aesenc(xmm_result, xmm_temp1);
3257     __ aesenc(xmm_result, xmm_temp2);
3258     __ aesenc(xmm_result, xmm_temp3);
3259     __ aesenc(xmm_result, xmm_temp4);
3260 
3261     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3262     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3263     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3264     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3265 
3266     __ aesenc(xmm_result, xmm_temp1);
3267     __ aesenc(xmm_result, xmm_temp2);
3268     __ aesenc(xmm_result, xmm_temp3);
3269     __ aesenc(xmm_result, xmm_temp4);
3270 
3271     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3272     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3273 
3274     __ cmpl(keylen, 44);
3275     __ jccb(Assembler::equal, L_doLast);
3276 
3277     __ aesenc(xmm_result, xmm_temp1);
3278     __ aesenc(xmm_result, xmm_temp2);
3279 
3280     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3281     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3282 
3283     __ cmpl(keylen, 52);
3284     __ jccb(Assembler::equal, L_doLast);
3285 
3286     __ aesenc(xmm_result, xmm_temp1);
3287     __ aesenc(xmm_result, xmm_temp2);
3288 
3289     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3290     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3291 
3292     __ BIND(L_doLast);
3293     __ aesenc(xmm_result, xmm_temp1);
3294     __ aesenclast(xmm_result, xmm_temp2);
3295     __ movdqu(Address(to, 0), xmm_result);        // store the result
3296     __ xorptr(rax, rax); // return 0
3297     __ leave(); // required for proper stackwalking of RuntimeStub frame
3298     __ ret(0);
3299 
3300     return start;
3301   }
3302 
3303 
3304   // Arguments:
3305   //
3306   // Inputs:
3307   //   c_rarg0   - source byte array address
3308   //   c_rarg1   - destination byte array address
3309   //   c_rarg2   - K (key) in little endian int array
3310   //
3311   address generate_aescrypt_decryptBlock() {
3312     assert(UseAES, "need AES instructions and misaligned SSE support");
3313     __ align(CodeEntryAlignment);
3314     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3315     Label L_doLast;
3316     address start = __ pc();
3317 
3318     const Register from        = c_rarg0;  // source array address
3319     const Register to          = c_rarg1;  // destination array address
3320     const Register key         = c_rarg2;  // key array address
3321     const Register keylen      = rax;
3322 
3323     const XMMRegister xmm_result = xmm0;
3324     const XMMRegister xmm_key_shuf_mask = xmm1;
3325     // On win64 xmm6-xmm15 must be preserved so don't use them.
3326     const XMMRegister xmm_temp1  = xmm2;
3327     const XMMRegister xmm_temp2  = xmm3;
3328     const XMMRegister xmm_temp3  = xmm4;
3329     const XMMRegister xmm_temp4  = xmm5;
3330 
3331     __ enter(); // required for proper stackwalking of RuntimeStub frame
3332 
3333     // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
3334     // context for the registers used, where all instructions below are using 128-bit mode
3335     // On EVEX without VL and BW, these instructions will all be AVX.
3336     if (VM_Version::supports_avx512vlbw()) {
3337       __ movl(rax, 0xffff);
3338       __ kmovql(k1, rax);
3339     }
3340 
3341     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
3342     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3343 
3344     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3345     __ movdqu(xmm_result, Address(from, 0));
3346 
3347     // For decryption, the Java expanded key ordering is rotated one position from what we want,
3348     // so we start from 0x10 here and hit 0x00 last.
3349     // We don't know whether the key is aligned, hence not using the load-execute form.
3350     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3351     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3352     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3353     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3354 
3355     __ pxor  (xmm_result, xmm_temp1);
3356     __ aesdec(xmm_result, xmm_temp2);
3357     __ aesdec(xmm_result, xmm_temp3);
3358     __ aesdec(xmm_result, xmm_temp4);
3359 
3360     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3361     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3362     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3363     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3364 
3365     __ aesdec(xmm_result, xmm_temp1);
3366     __ aesdec(xmm_result, xmm_temp2);
3367     __ aesdec(xmm_result, xmm_temp3);
3368     __ aesdec(xmm_result, xmm_temp4);
3369 
3370     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3371     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3372     load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
3373 
3374     __ cmpl(keylen, 44);
3375     __ jccb(Assembler::equal, L_doLast);
3376 
3377     __ aesdec(xmm_result, xmm_temp1);
3378     __ aesdec(xmm_result, xmm_temp2);
3379 
3380     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3381     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3382 
3383     __ cmpl(keylen, 52);
3384     __ jccb(Assembler::equal, L_doLast);
3385 
3386     __ aesdec(xmm_result, xmm_temp1);
3387     __ aesdec(xmm_result, xmm_temp2);
3388 
3389     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3390     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3391 
3392     __ BIND(L_doLast);
3393     __ aesdec(xmm_result, xmm_temp1);
3394     __ aesdec(xmm_result, xmm_temp2);
3395 
3396     // for decryption the aesdeclast operation is always on key+0x00
3397     __ aesdeclast(xmm_result, xmm_temp3);
3398     __ movdqu(Address(to, 0), xmm_result);  // store the result
3399     __ xorptr(rax, rax); // return 0
3400     __ leave(); // required for proper stackwalking of RuntimeStub frame
3401     __ ret(0);
3402 
3403     return start;
3404   }
3405 
3406 
3407   // Arguments:
3408   //
3409   // Inputs:
3410   //   c_rarg0   - source byte array address
3411   //   c_rarg1   - destination byte array address
3412   //   c_rarg2   - K (key) in little endian int array
3413   //   c_rarg3   - r vector byte array address
3414   //   c_rarg4   - input length
3415   //
3416   // Output:
3417   //   rax       - input length
3418   //
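  // Note: unlike CBC decryption (see the parallel variant further down), CBC
  // encryption is inherently sequential - each block's input is XORed with the
  // previous ciphertext block before it is encrypted:
  //   C[i] = AES_enc(P[i] ^ C[i-1])     (with C[-1] = the initial r vector)
  // so only one block can usefully be in flight at a time.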
3419   address generate_cipherBlockChaining_encryptAESCrypt() {
3420     assert(UseAES, "need AES instructions and misaligned SSE support");
3421     __ align(CodeEntryAlignment);
3422     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
3423     address start = __ pc();
3424 
3425     Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
3426     const Register from        = c_rarg0;  // source array address
3427     const Register to          = c_rarg1;  // destination array address
3428     const Register key         = c_rarg2;  // key array address
3429     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
3430                                            // and left with the results of the last encryption block
3431 #ifndef _WIN64
3432     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3433 #else
3434     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
3435     const Register len_reg     = r11;      // pick the volatile windows register
3436 #endif
3437     const Register pos         = rax;
3438 
3439     // xmm register assignments for the loops below
3440     const XMMRegister xmm_result = xmm0;
3441     const XMMRegister xmm_temp   = xmm1;
3442     // keys 0-10 preloaded into xmm2-xmm12
3443     const int XMM_REG_NUM_KEY_FIRST = 2;
3444     const int XMM_REG_NUM_KEY_LAST  = 15;
3445     const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3446     const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
3447     const XMMRegister xmm_key11  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
3448     const XMMRegister xmm_key12  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
3449     const XMMRegister xmm_key13  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);
3450 
3451     __ enter(); // required for proper stackwalking of RuntimeStub frame
3452 
3453     // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
3454     // context for the registers used, where all instructions below are using 128-bit mode
3455     // On EVEX without VL and BW, these instructions will all be AVX.
3456     if (VM_Version::supports_avx512vlbw()) {
3457       __ movl(rax, 0xffff);
3458       __ kmovql(k1, rax);
3459     }
3460 
3461 #ifdef _WIN64
3462     // on win64, fill len_reg from stack position
3463     __ movl(len_reg, len_mem);
3464 #else
3465     __ push(len_reg); // Save
3466 #endif
3467 
3468     const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
3469     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3470     // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
3471     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
3472       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3473       offset += 0x10;
3474     }
3475     __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
3476 
3477     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
3478     __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3479     __ cmpl(rax, 44);
3480     __ jcc(Assembler::notEqual, L_key_192_256);
3481 
3482     // 128 bit code follows here
3483     __ movptr(pos, 0);
3484     __ align(OptoLoopAlignment);
3485 
3486     __ BIND(L_loopTop_128);
3487     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3488     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3489     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3490     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
3491       __ aesenc(xmm_result, as_XMMRegister(rnum));
3492     }
3493     __ aesenclast(xmm_result, xmm_key10);
3494     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3495     // no need to store r to memory until we exit
3496     __ addptr(pos, AESBlockSize);
3497     __ subptr(len_reg, AESBlockSize);
3498     __ jcc(Assembler::notEqual, L_loopTop_128);
3499 
3500     __ BIND(L_exit);
3501     __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
3502 
3503 #ifdef _WIN64
3504     __ movl(rax, len_mem);
3505 #else
3506     __ pop(rax); // return length
3507 #endif
3508     __ leave(); // required for proper stackwalking of RuntimeStub frame
3509     __ ret(0);
3510 
3511     __ BIND(L_key_192_256);
3512     // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
3513     load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
3514     load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
3515     __ cmpl(rax, 52);
3516     __ jcc(Assembler::notEqual, L_key_256);
3517 
3518     // 192-bit code follows here (could be changed to use more xmm registers)
3519     __ movptr(pos, 0);
3520     __ align(OptoLoopAlignment);
3521 
3522     __ BIND(L_loopTop_192);
3523     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3524     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3525     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3526     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
3527       __ aesenc(xmm_result, as_XMMRegister(rnum));
3528     }
3529     __ aesenclast(xmm_result, xmm_key12);
3530     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3531     // no need to store r to memory until we exit
3532     __ addptr(pos, AESBlockSize);
3533     __ subptr(len_reg, AESBlockSize);
3534     __ jcc(Assembler::notEqual, L_loopTop_192);
3535     __ jmp(L_exit);
3536 
3537     __ BIND(L_key_256);
3538     // 256-bit code follows here (could be changed to use more xmm registers)
3539     load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
3540     __ movptr(pos, 0);
3541     __ align(OptoLoopAlignment);
3542 
3543     __ BIND(L_loopTop_256);
3544     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3545     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3546     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3547     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
3548       __ aesenc(xmm_result, as_XMMRegister(rnum));
3549     }
3550     load_key(xmm_temp, key, 0xe0);
3551     __ aesenclast(xmm_result, xmm_temp);
3552     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3553     // no need to store r to memory until we exit
3554     __ addptr(pos, AESBlockSize);
3555     __ subptr(len_reg, AESBlockSize);
3556     __ jcc(Assembler::notEqual, L_loopTop_256);
3557     __ jmp(L_exit);
3558 
3559     return start;
3560   }
3561 
3562   // Safefetch stubs.
3563   void generate_safefetch(const char* name, int size, address* entry,
3564                           address* fault_pc, address* continuation_pc) {
3565     // safefetch signatures:
3566     //   int      SafeFetch32(int*      adr, int      errValue);
3567     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3568     //
3569     // arguments:
3570     //   c_rarg0 = adr
3571     //   c_rarg1 = errValue
3572     //
3573     // result:
3574     //   rax      = *adr or errValue
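    //
    // Caller-side sketch (illustrative): SafeFetch32 lets VM code probe memory
    // that may be unmapped without crashing, e.g.
    //   int v = SafeFetch32((int*)addr, -1);   // yields -1 instead of faulting
    // If the load faults, the signal handler resumes at *continuation_pc with
    // the errValue still in c_rarg1, which is then returned in rax.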
3575 
3576     StubCodeMark mark(this, "StubRoutines", name);
3577 
3578     // Entry point, pc or function descriptor.
3579     *entry = __ pc();
3580 
3581     // Load *adr into c_rarg1, may fault.
3582     *fault_pc = __ pc();
3583     switch (size) {
3584       case 4:
3585         // int32_t
3586         __ movl(c_rarg1, Address(c_rarg0, 0));
3587         break;
3588       case 8:
3589         // int64_t
3590         __ movq(c_rarg1, Address(c_rarg0, 0));
3591         break;
3592       default:
3593         ShouldNotReachHere();
3594     }
3595 
3596     // return errValue or *adr
3597     *continuation_pc = __ pc();
3598     __ movq(rax, c_rarg1);
3599     __ ret(0);
3600   }
3601 
3602   // This is a version of CBC/AES Decrypt which processes 4 blocks per loop iteration
3603   // to hide instruction latency.
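  //
  // CBC decryption parallelizes because each plaintext block depends only on
  // ciphertext that is already available:
  //   P[i] = AES_dec(C[i]) ^ C[i-1]     (with C[-1] = the initial r vector)
  // Illustrative per-iteration sketch for the 4-block loop (pseudocode, not
  // the generated code):
  //   R[j]   = AES_dec_rounds(C[i+j])   for j = 0..3
  //   P[i+j] = R[j] ^ C[i+j-1]          for j = 0..3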
3604   //
3605   // Arguments:
3606   //
3607   // Inputs:
3608   //   c_rarg0   - source byte array address
3609   //   c_rarg1   - destination byte array address
3610   //   c_rarg2   - K (key) in little endian int array
3611   //   c_rarg3   - r vector byte array address
3612   //   c_rarg4   - input length
3613   //
3614   // Output:
3615   //   rax       - input length
3616   //
3617   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
3618     assert(UseAES, "need AES instructions and misaligned SSE support");
3619     __ align(CodeEntryAlignment);
3620     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3621     address start = __ pc();
3622 
3623     const Register from        = c_rarg0;  // source array address
3624     const Register to          = c_rarg1;  // destination array address
3625     const Register key         = c_rarg2;  // key array address
3626     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
3627                                            // and left with the results of the last encryption block
3628 #ifndef _WIN64
3629     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3630 #else
3631     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
3632     const Register len_reg     = r11;      // pick the volatile windows register
3633 #endif
3634     const Register pos         = rax;
3635 
3636     const int PARALLEL_FACTOR = 4;
3637     const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256
3638 
3639     Label L_exit;
3640     Label L_singleBlock_loopTopHead[3]; // 128, 192, 256
3641     Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256
3642     Label L_singleBlock_loopTop[3]; // 128, 192, 256
3643     Label L_multiBlock_loopTopHead[3]; // 128, 192, 256
3644     Label L_multiBlock_loopTop[3]; // 128, 192, 256
3645 
3646     // keys 0-10 preloaded into xmm5-xmm15
3647     const int XMM_REG_NUM_KEY_FIRST = 5;
3648     const int XMM_REG_NUM_KEY_LAST  = 15;
3649     const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3650     const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
3651 
3652     __ enter(); // required for proper stackwalking of RuntimeStub frame
3653 
3654     // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
3655     // context for the registers used, where all instructions below are using 128-bit mode
3656     // On EVEX without VL and BW, these instructions will all be AVX.
3657     if (VM_Version::supports_avx512vlbw()) {
3658       __ movl(rax, 0xffff);
3659       __ kmovql(k1, rax);
3660     }
3661 
3662 #ifdef _WIN64
3663     // on win64, fill len_reg from stack position
3664     __ movl(len_reg, len_mem);
3665 #else
3666     __ push(len_reg); // Save
3667 #endif
3668     __ push(rbx);
3669     // The Java expanded key ordering is rotated one position from what we want,
3670     // so we start from 0x10 here and hit 0x00 last.
3671     const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
3672     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3673     // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
3674     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
3675       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3676       offset += 0x10;
3677     }
3678     load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);
3679 
3680     const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
3681 
3682     // registers holding the four results in the parallelized loop
3683     const XMMRegister xmm_result0 = xmm0;
3684     const XMMRegister xmm_result1 = xmm2;
3685     const XMMRegister xmm_result2 = xmm3;
3686     const XMMRegister xmm_result3 = xmm4;
3687 
3688     __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));   // initialize with initial rvec
3689 
3690     __ xorptr(pos, pos);
3691 
3692     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
3693     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3694     __ cmpl(rbx, 52);
3695     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]);
3696     __ cmpl(rbx, 60);
3697     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]);
3698 
3699 #define DoFour(opc, src_reg)           \
3700   __ opc(xmm_result0, src_reg);         \
3701   __ opc(xmm_result1, src_reg);         \
3702   __ opc(xmm_result2, src_reg);         \
3703   __ opc(xmm_result3, src_reg);
3704 
3705     for (int k = 0; k < 3; ++k) {
3706       __ BIND(L_multiBlock_loopTopHead[k]);
3707       if (k != 0) {
3708         __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3709         __ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]);
3710       }
3711       if (k == 1) {
3712         __ subptr(rsp, 6 * wordSize);
3713         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
3714         load_key(xmm15, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
3715         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3716         load_key(xmm1, key, 0xc0);  // 0xc0;
3717         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3718       } else if (k == 2) {
3719         __ subptr(rsp, 10 * wordSize);
3720         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
3721         load_key(xmm15, key, 0xd0); // 0xd0; 256-bit key goes up to 0xe0
3722         __ movdqu(Address(rsp, 6 * wordSize), xmm15);
3723         load_key(xmm1, key, 0xe0);  // 0xe0;
3724         __ movdqu(Address(rsp, 8 * wordSize), xmm1);
3725         load_key(xmm15, key, 0xb0); // 0xb0;
3726         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3727         load_key(xmm1, key, 0xc0);  // 0xc0;
3728         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3729       }
3730       __ align(OptoLoopAlignment);
3731       __ BIND(L_multiBlock_loopTop[k]);
3732       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3733       __ jcc(Assembler::less, L_singleBlock_loopTopHead[k]);
3734 
3735       if  (k != 0) {
3736         __ movdqu(xmm15, Address(rsp, 2 * wordSize));
3737         __ movdqu(xmm1, Address(rsp, 4 * wordSize));
3738       }
3739 
3740       __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
3741       __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
3742       __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
3743       __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
3744 
3745       DoFour(pxor, xmm_key_first);
3746       if (k == 0) {
3747         for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
3748           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3749         }
3750         DoFour(aesdeclast, xmm_key_last);
3751       } else if (k == 1) {
3752         for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) {
3753           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3754         }
3755         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
3756         DoFour(aesdec, xmm1);  // key : 0xc0
3757         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
3758         DoFour(aesdeclast, xmm_key_last);
3759       } else if (k == 2) {
3760         for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) {
3761           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3762         }
3763         DoFour(aesdec, xmm1);  // key : 0xc0
3764         __ movdqu(xmm15, Address(rsp, 6 * wordSize));
3765         __ movdqu(xmm1, Address(rsp, 8 * wordSize));
3766         DoFour(aesdec, xmm15);  // key : 0xd0
3767         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
3768         DoFour(aesdec, xmm1);  // key : 0xe0
3769         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
3770         DoFour(aesdeclast, xmm_key_last);
3771       }
3772 
3773       // for each result, xor with the r vector of previous cipher block
3774       __ pxor(xmm_result0, xmm_prev_block_cipher);
3775       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
3776       __ pxor(xmm_result1, xmm_prev_block_cipher);
3777       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
3778       __ pxor(xmm_result2, xmm_prev_block_cipher);
3779       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
3780       __ pxor(xmm_result3, xmm_prev_block_cipher);
3781       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize));   // this will carry over to next set of blocks
3782       if (k != 0) {
3783         __ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher);
3784       }
3785 
3786       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);     // store 4 results into the next 64 bytes of output
3787       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
3788       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
3789       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
3790 
3791       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);
3792       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize);
3793       __ jmp(L_multiBlock_loopTop[k]);
3794 
3795       // registers used in the non-parallelized loops
3796       // xmm register assignments for the loops below
3797       const XMMRegister xmm_result = xmm0;
3798       const XMMRegister xmm_prev_block_cipher_save = xmm2;
3799       const XMMRegister xmm_key11 = xmm3;
3800       const XMMRegister xmm_key12 = xmm4;
3801       const XMMRegister key_tmp = xmm4;
3802 
3803       __ BIND(L_singleBlock_loopTopHead[k]);
3804       if (k == 1) {
3805         __ addptr(rsp, 6 * wordSize);
3806       } else if (k == 2) {
3807         __ addptr(rsp, 10 * wordSize);
3808       }
3809       __ cmpptr(len_reg, 0); // any blocks left??
3810       __ jcc(Assembler::equal, L_exit);
3811       __ BIND(L_singleBlock_loopTopHead2[k]);
3812       if (k == 1) {
3813         load_key(xmm_key11, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
3814         load_key(xmm_key12, key, 0xc0); // 0xc0; 192-bit key goes up to 0xc0
3815       }
3816       if (k == 2) {
3817         load_key(xmm_key11, key, 0xb0); // 0xb0; 256-bit key goes up to 0xe0
3818       }
3819       __ align(OptoLoopAlignment);
3820       __ BIND(L_singleBlock_loopTop[k]);
3821       __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
3822       __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
3823       __ pxor(xmm_result, xmm_key_first); // do the aes dec rounds
3824       for (int rnum = 1; rnum <= 9 ; rnum++) {
3825           __ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3826       }
3827       if (k == 1) {
3828         __ aesdec(xmm_result, xmm_key11);
3829         __ aesdec(xmm_result, xmm_key12);
3830       }
3831       if (k == 2) {
3832         __ aesdec(xmm_result, xmm_key11);
3833         load_key(key_tmp, key, 0xc0);
3834         __ aesdec(xmm_result, key_tmp);
3835         load_key(key_tmp, key, 0xd0);
3836         __ aesdec(xmm_result, key_tmp);
3837         load_key(key_tmp, key, 0xe0);
3838         __ aesdec(xmm_result, key_tmp);
3839       }
3840 
3841       __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0
3842       __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
3843       __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
3844       // no need to store r to memory until we exit
3845       __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
3846       __ addptr(pos, AESBlockSize);
3847       __ subptr(len_reg, AESBlockSize);
3848       __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
3849       if (k != 2) {
3850         __ jmp(L_exit);
3851       }
3852     } //for 128/192/256
3853 
3854     __ BIND(L_exit);
3855     __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
3856     __ pop(rbx);
3857 #ifdef _WIN64
3858     __ movl(rax, len_mem);
3859 #else
3860     __ pop(rax); // return length
3861 #endif
3862     __ leave(); // required for proper stackwalking of RuntimeStub frame
3863     __ ret(0);
3864     return start;
3865   }
3866 
3867   address generate_upper_word_mask() {
3868     __ align(64);
3869     StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
3870     address start = __ pc();
3871     __ emit_data64(0x0000000000000000, relocInfo::none);
3872     __ emit_data64(0xFFFFFFFF00000000, relocInfo::none);
3873     return start;
3874   }
3875 
3876   address generate_shuffle_byte_flip_mask() {
3877     __ align(64);
3878     StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
3879     address start = __ pc();
3880     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3881     __ emit_data64(0x0001020304050607, relocInfo::none);
3882     return start;
3883   }
3884 
3885   // 'ofs' and 'limit' are used for multi-block byte arrays.
3886   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
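  // Rough Java-level contract of the multi-block variant (illustrative; the
  // block size is 64 bytes for SHA-1):
  //   while (ofs <= limit) { implCompress(b, ofs); ofs += 64; }
  //   return ofs;   // returned in rax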
3887   address generate_sha1_implCompress(bool multi_block, const char *name) {
3888     __ align(CodeEntryAlignment);
3889     StubCodeMark mark(this, "StubRoutines", name);
3890     address start = __ pc();
3891 
3892     Register buf = c_rarg0;
3893     Register state = c_rarg1;
3894     Register ofs = c_rarg2;
3895     Register limit = c_rarg3;
3896 
3897     const XMMRegister abcd = xmm0;
3898     const XMMRegister e0 = xmm1;
3899     const XMMRegister e1 = xmm2;
3900     const XMMRegister msg0 = xmm3;
3901 
3902     const XMMRegister msg1 = xmm4;
3903     const XMMRegister msg2 = xmm5;
3904     const XMMRegister msg3 = xmm6;
3905     const XMMRegister shuf_mask = xmm7;
3906 
3907     __ enter();
3908 
3909     __ subptr(rsp, 4 * wordSize);
3910 
3911     __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
3912       buf, state, ofs, limit, rsp, multi_block);
3913 
3914     __ addptr(rsp, 4 * wordSize);
3915 
3916     __ leave();
3917     __ ret(0);
3918     return start;
3919   }
3920 
3921   address generate_pshuffle_byte_flip_mask() {
3922     __ align(64);
3923     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
3924     address start = __ pc();
3925     __ emit_data64(0x0405060700010203, relocInfo::none);
3926     __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
3927 
3928     if (VM_Version::supports_avx2()) {
3929       __ emit_data64(0x0405060700010203, relocInfo::none); // second copy
3930       __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
3931       // _SHUF_00BA
3932       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3933       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3934       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3935       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3936       // _SHUF_DC00
3937       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3938       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3939       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3940       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3941     }
3942 
3943     return start;
3944   }
3945 
3946   // Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
3947   address generate_pshuffle_byte_flip_mask_sha512() {
3948     __ align(32);
3949     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512");
3950     address start = __ pc();
3951     if (VM_Version::supports_avx2()) {
3952       __ emit_data64(0x0001020304050607, relocInfo::none); // PSHUFFLE_BYTE_FLIP_MASK
3953       __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3954       __ emit_data64(0x1011121314151617, relocInfo::none);
3955       __ emit_data64(0x18191a1b1c1d1e1f, relocInfo::none);
3956       __ emit_data64(0x0000000000000000, relocInfo::none); //MASK_YMM_LO
3957       __ emit_data64(0x0000000000000000, relocInfo::none);
3958       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3959       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3960     }
3961 
3962     return start;
3963   }
3964 
3965 // 'ofs' and 'limit' are used for multi-block byte arrays.
3966 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
3967   address generate_sha256_implCompress(bool multi_block, const char *name) {
3968     assert(VM_Version::supports_sha() || VM_Version::supports_avx2(), "");
3969     __ align(CodeEntryAlignment);
3970     StubCodeMark mark(this, "StubRoutines", name);
3971     address start = __ pc();
3972 
3973     Register buf = c_rarg0;
3974     Register state = c_rarg1;
3975     Register ofs = c_rarg2;
3976     Register limit = c_rarg3;
3977 
3978     const XMMRegister msg = xmm0;
3979     const XMMRegister state0 = xmm1;
3980     const XMMRegister state1 = xmm2;
3981     const XMMRegister msgtmp0 = xmm3;
3982 
3983     const XMMRegister msgtmp1 = xmm4;
3984     const XMMRegister msgtmp2 = xmm5;
3985     const XMMRegister msgtmp3 = xmm6;
3986     const XMMRegister msgtmp4 = xmm7;
3987 
3988     const XMMRegister shuf_mask = xmm8;
3989 
3990     __ enter();
3991 
3992     __ subptr(rsp, 4 * wordSize);
3993 
3994     if (VM_Version::supports_sha()) {
3995       __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3996         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3997     } else if (VM_Version::supports_avx2()) {
3998       __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3999         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4000     }
4001     __ addptr(rsp, 4 * wordSize);
4002     __ vzeroupper();
4003     __ leave();
4004     __ ret(0);
4005     return start;
4006   }
4007 
4008   address generate_sha512_implCompress(bool multi_block, const char *name) {
4009     assert(VM_Version::supports_avx2(), "");
4010     assert(VM_Version::supports_bmi2(), "");
4011     __ align(CodeEntryAlignment);
4012     StubCodeMark mark(this, "StubRoutines", name);
4013     address start = __ pc();
4014 
4015     Register buf = c_rarg0;
4016     Register state = c_rarg1;
4017     Register ofs = c_rarg2;
4018     Register limit = c_rarg3;
4019 
4020     const XMMRegister msg = xmm0;
4021     const XMMRegister state0 = xmm1;
4022     const XMMRegister state1 = xmm2;
4023     const XMMRegister msgtmp0 = xmm3;
4024     const XMMRegister msgtmp1 = xmm4;
4025     const XMMRegister msgtmp2 = xmm5;
4026     const XMMRegister msgtmp3 = xmm6;
4027     const XMMRegister msgtmp4 = xmm7;
4028 
4029     const XMMRegister shuf_mask = xmm8;
4030 
4031     __ enter();
4032 
4033     __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4034     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4035 
4036     __ vzeroupper();
4037     __ leave();
4038     __ ret(0);
4039     return start;
4040   }
4041 
4042   // This is a version of CTR/AES crypt which processes 6 blocks per loop iteration
4043   // to hide instruction latency.
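  //
  // CTR mode is fully parallel - every block's keystream depends only on the
  // running counter, never on other blocks:
  //   C[i] = P[i] ^ AES_enc(counter + i)
  // so six independent AESENC chains can be kept in flight per iteration, and
  // any partially consumed keystream block is tracked via the saved
  // encryptedCounter / used-length arguments below.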
4044   //
4045   // Arguments:
4046   //
4047   // Inputs:
4048   //   c_rarg0   - source byte array address
4049   //   c_rarg1   - destination byte array address
4050   //   c_rarg2   - K (key) in little endian int array
4051   //   c_rarg3   - counter vector byte array address
4052   //   Linux
4053   //     c_rarg4   -          input length
4054   //     c_rarg5   -          saved encryptedCounter start
4055   //     rbp + 6 * wordSize - saved used length
4056   //   Windows
4057   //     rbp + 6 * wordSize - input length
4058   //     rbp + 7 * wordSize - saved encryptedCounter start
4059   //     rbp + 8 * wordSize - saved used length
4060   //
4061   // Output:
4062   //   rax       - input length
4063   //
4064   address generate_counterMode_AESCrypt_Parallel() {
4065     assert(UseAES, "need AES instructions and misaligned SSE support");
4066     __ align(CodeEntryAlignment);
4067     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
4068     address start = __ pc();
4069     const Register from = c_rarg0; // source array address
4070     const Register to = c_rarg1; // destination array address
4071     const Register key = c_rarg2; // key array address
4072     const Register counter = c_rarg3; // counter byte array initialized from counter array address
4073                                       // and updated with the incremented counter in the end
4074 #ifndef _WIN64
4075     const Register len_reg = c_rarg4;
4076     const Register saved_encCounter_start = c_rarg5;
4077     const Register used_addr = r10;
4078     const Address  used_mem(rbp, 2 * wordSize);
4079     const Register used = r11;
4080 #else
4081     const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
4082     const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encryptedCounter start is on stack on Win64
4083     const Address used_mem(rbp, 8 * wordSize); // saved used length is on stack on Win64
4084     const Register len_reg = r10; // pick the first volatile windows register
4085     const Register saved_encCounter_start = r11;
4086     const Register used_addr = r13;
4087     const Register used = r14;
4088 #endif
4089     const Register pos = rax;
4090 
4091     const int PARALLEL_FACTOR = 6;
4092     const XMMRegister xmm_counter_shuf_mask = xmm0;
4093     const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
4094     const XMMRegister xmm_curr_counter = xmm2;
4095 
4096     const XMMRegister xmm_key_tmp0 = xmm3;
4097     const XMMRegister xmm_key_tmp1 = xmm4;
4098 
4099     // registers holding the six results in the parallelized loop
4100     const XMMRegister xmm_result0 = xmm5;
4101     const XMMRegister xmm_result1 = xmm6;
4102     const XMMRegister xmm_result2 = xmm7;
4103     const XMMRegister xmm_result3 = xmm8;
4104     const XMMRegister xmm_result4 = xmm9;
4105     const XMMRegister xmm_result5 = xmm10;
4106 
4107     const XMMRegister xmm_from0 = xmm11;
4108     const XMMRegister xmm_from1 = xmm12;
4109     const XMMRegister xmm_from2 = xmm13;
4110     const XMMRegister xmm_from3 = xmm14; // the last one is xmm14; we have to preserve it on WIN64.
4111     const XMMRegister xmm_from4 = xmm3; // reuse xmm3/xmm4: xmm_key_tmp0/1 are no longer needed once the input text is loaded
4112     const XMMRegister xmm_from5 = xmm4;
4113 
4114     //for key_128, key_192, key_256
4115     const int rounds[3] = {10, 12, 14};
4116     Label L_exit_preLoop, L_preLoop_start;
4117     Label L_multiBlock_loopTop[3];
4118     Label L_singleBlockLoopTop[3];
4119     Label L__incCounter[3][6]; //for 6 blocks
4120     Label L__incCounter_single[3]; //for single block, key128, key192, key256
4121     Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
4122     Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
4123 
4124     Label L_exit;
4125 
4126     __ enter(); // required for proper stackwalking of RuntimeStub frame
4127 
4128     // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
4129     // context for the registers used, where all instructions below are using 128-bit mode
4130     // On EVEX without VL and BW, these instructions will all be AVX.
4131     if (VM_Version::supports_avx512vlbw()) {
4132         __ movl(rax, 0xffff);
4133         __ kmovql(k1, rax);
4134     }
4135 
4136 #ifdef _WIN64
4137     // allocate spill slots for r13, r14
4138     enum {
4139         saved_r13_offset,
4140         saved_r14_offset
4141     };
4142     __ subptr(rsp, 2 * wordSize);
4143     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
4144     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
4145 
4146     // on win64, fill len_reg from stack position
4147     __ movl(len_reg, len_mem);
4148     __ movptr(saved_encCounter_start, saved_encCounter_mem);
4149     __ movptr(used_addr, used_mem);
4150     __ movl(used, Address(used_addr, 0));
4151 #else
4152     __ push(len_reg); // Save
4153     __ movptr(used_addr, used_mem);
4154     __ movl(used, Address(used_addr, 0));
4155 #endif
4156 
4157     __ push(rbx); // Save RBX
4158     __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
4159     __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()), pos); // pos as scratch
4160     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
4161     __ movptr(pos, 0);
4162 
4163     // Use the partially used encrypted counter from the last invocation
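         // (That is: consume keystream bytes left over in saved_encCounter from a
         //  previous call, XORing them with the input one byte at a time, until
         //  either 16 bytes of that block are used up or len_reg reaches zero.)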
4164     __ BIND(L_preLoop_start);
4165     __ cmpptr(used, 16);
4166     __ jcc(Assembler::aboveEqual, L_exit_preLoop);
4167       __ cmpptr(len_reg, 0);
4168       __ jcc(Assembler::lessEqual, L_exit_preLoop);
4169       __ movb(rbx, Address(saved_encCounter_start, used));
4170       __ xorb(rbx, Address(from, pos));
4171       __ movb(Address(to, pos), rbx);
4172       __ addptr(pos, 1);
4173       __ addptr(used, 1);
4174       __ subptr(len_reg, 1);
4175 
4176     __ jmp(L_preLoop_start);
4177 
4178     __ BIND(L_exit_preLoop);
4179     __ movl(Address(used_addr, 0), used);
4180 
4181     // key schedule length (in ints) can only be {11, 13, 15} * 4 = {44, 52, 60}, i.e. AES-128/192/256 with 10/12/14 rounds
4182     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx); // rbx as scratch
4183     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4184     __ cmpl(rbx, 52);
4185     __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
4186     __ cmpl(rbx, 60);
4187     __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
4188 
4189 #define CTR_DoSix(opc, src_reg)                \
4190     __ opc(xmm_result0, src_reg);              \
4191     __ opc(xmm_result1, src_reg);              \
4192     __ opc(xmm_result2, src_reg);              \
4193     __ opc(xmm_result3, src_reg);              \
4194     __ opc(xmm_result4, src_reg);              \
4195     __ opc(xmm_result5, src_reg);
4196 
4197     // k == 0 :  generate code for key_128
4198     // k == 1 :  generate code for key_192
4199     // k == 2 :  generate code for key_256
4200     for (int k = 0; k < 3; ++k) {
4201       // multi-block loop starts here
4202       __ align(OptoLoopAlignment);
4203       __ BIND(L_multiBlock_loopTop[k]);
4204       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
4205       __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
4206       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4207 
4208       // load counters, then increment them
4209       CTR_DoSix(movdqa, xmm_curr_counter);
4210       inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
4211       inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
4212       inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
4213       inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
4214       inc_counter(rbx, xmm_result5,  0x05, L__incCounter[k][4]);
4215       inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
4216       CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after incrementing, shuffle counters back for PXOR
4217       CTR_DoSix(pxor, xmm_key_tmp0);   //PXOR with Round 0 key
4218 
4219       //load two ROUND_KEYs at a time
4220       for (int i = 1; i < rounds[k]; ) {
4221         load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
4222         load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
4223         CTR_DoSix(aesenc, xmm_key_tmp1);
4224         i++;
4225         if (i != rounds[k]) {
4226           CTR_DoSix(aesenc, xmm_key_tmp0);
4227         } else {
4228           CTR_DoSix(aesenclast, xmm_key_tmp0);
4229         }
4230         i++;
4231       }
4232 
4233       // get next PARALLEL_FACTOR blocks into xmm_result registers
4234       __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4235       __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4236       __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4237       __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
4238       __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
4239       __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));
4240 
4241       __ pxor(xmm_result0, xmm_from0);
4242       __ pxor(xmm_result1, xmm_from1);
4243       __ pxor(xmm_result2, xmm_from2);
4244       __ pxor(xmm_result3, xmm_from3);
4245       __ pxor(xmm_result4, xmm_from4);
4246       __ pxor(xmm_result5, xmm_from5);
4247 
4248       // store 6 results into the next 96 bytes of output
4249       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4250       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
4251       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
4252       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
4253       __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
4254       __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);
4255 
4256       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // advance position past the blocks just processed
4257       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
4258       __ jmp(L_multiBlock_loopTop[k]);
4259 
4260       // singleBlock starts here
4261       __ align(OptoLoopAlignment);
4262       __ BIND(L_singleBlockLoopTop[k]);
4263       __ cmpptr(len_reg, 0);
4264       __ jcc(Assembler::lessEqual, L_exit);
4265       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4266       __ movdqa(xmm_result0, xmm_curr_counter);
4267       inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
4268       __ pshufb(xmm_result0, xmm_counter_shuf_mask);
4269       __ pxor(xmm_result0, xmm_key_tmp0);
4270       for (int i = 1; i < rounds[k]; i++) {
4271         load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
4272         __ aesenc(xmm_result0, xmm_key_tmp0);
4273       }
4274       load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
4275       __ aesenclast(xmm_result0, xmm_key_tmp0);
4276       __ cmpptr(len_reg, AESBlockSize);
4277       __ jcc(Assembler::less, L_processTail_insr[k]);
4278         __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4279         __ pxor(xmm_result0, xmm_from0);
4280         __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4281         __ addptr(pos, AESBlockSize);
4282         __ subptr(len_reg, AESBlockSize);
4283         __ jmp(L_singleBlockLoopTop[k]);
4284       __ BIND(L_processTail_insr[k]);                               // Process the tail part of the input array
4285         __ addptr(pos, len_reg);                                    // 1. Insert bytes from src array into xmm_from0 register
4286         __ testptr(len_reg, 8);
4287         __ jcc(Assembler::zero, L_processTail_4_insr[k]);
4288           __ subptr(pos,8);
4289           __ pinsrq(xmm_from0, Address(from, pos), 0);
4290         __ BIND(L_processTail_4_insr[k]);
4291         __ testptr(len_reg, 4);
4292         __ jcc(Assembler::zero, L_processTail_2_insr[k]);
4293           __ subptr(pos,4);
4294           __ pslldq(xmm_from0, 4);
4295           __ pinsrd(xmm_from0, Address(from, pos), 0);
4296         __ BIND(L_processTail_2_insr[k]);
4297         __ testptr(len_reg, 2);
4298         __ jcc(Assembler::zero, L_processTail_1_insr[k]);
4299           __ subptr(pos, 2);
4300           __ pslldq(xmm_from0, 2);
4301           __ pinsrw(xmm_from0, Address(from, pos), 0);
4302         __ BIND(L_processTail_1_insr[k]);
4303         __ testptr(len_reg, 1);
4304         __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
4305           __ subptr(pos, 1);
4306           __ pslldq(xmm_from0, 1);
4307           __ pinsrb(xmm_from0, Address(from, pos), 0);
4308         __ BIND(L_processTail_exit_insr[k]);
4309 
4310         __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);  // 2. Perform pxor of the encrypted counter and plaintext Bytes.
4311         __ pxor(xmm_result0, xmm_from0);                             //    Also the encrypted counter is saved for next invocation.
4312 
4313         __ testptr(len_reg, 8);
4314         __ jcc(Assembler::zero, L_processTail_4_extr[k]);            // 3. Extract bytes from xmm_result0 into the dest. array
4315           __ pextrq(Address(to, pos), xmm_result0, 0);
4316           __ psrldq(xmm_result0, 8);
4317           __ addptr(pos, 8);
4318         __ BIND(L_processTail_4_extr[k]);
4319         __ testptr(len_reg, 4);
4320         __ jcc(Assembler::zero, L_processTail_2_extr[k]);
4321           __ pextrd(Address(to, pos), xmm_result0, 0);
4322           __ psrldq(xmm_result0, 4);
4323           __ addptr(pos, 4);
4324         __ BIND(L_processTail_2_extr[k]);
4325         __ testptr(len_reg, 2);
4326         __ jcc(Assembler::zero, L_processTail_1_extr[k]);
4327           __ pextrw(Address(to, pos), xmm_result0, 0);
4328           __ psrldq(xmm_result0, 2);
4329           __ addptr(pos, 2);
4330         __ BIND(L_processTail_1_extr[k]);
4331         __ testptr(len_reg, 1);
4332         __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
4333           __ pextrb(Address(to, pos), xmm_result0, 0);
4334 
4335         __ BIND(L_processTail_exit_extr[k]);
4336         __ movl(Address(used_addr, 0), len_reg);
4337         __ jmp(L_exit);
4338 
4339     }
4340 
4341     __ BIND(L_exit);
4342     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
4343     __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
4344     __ pop(rbx); // pop the saved RBX.
4345 #ifdef _WIN64
4346     __ movl(rax, len_mem);
4347     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
4348     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
4349     __ addptr(rsp, 2 * wordSize);
4350 #else
4351     __ pop(rax); // return 'len'
4352 #endif
4353     __ leave(); // required for proper stackwalking of RuntimeStub frame
4354     __ ret(0);
4355     return start;
4356   }
4357 
4358   // byte swap x86 long
4359   address generate_ghash_long_swap_mask() {
4360     __ align(CodeEntryAlignment);
4361     StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
4362     address start = __ pc();
4363     __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
4364     __ emit_data64(0x0706050403020100, relocInfo::none );
4365     return start;
4366   }
4367 
4368   // byte swap x86 byte array
4369   address generate_ghash_byte_swap_mask() {
4370     __ align(CodeEntryAlignment);
4371     StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
4372     address start = __ pc();
4373     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
4374     __ emit_data64(0x0001020304050607, relocInfo::none );
4375     return start;
4376   }
4377 
4378   /* Single and multi-block ghash operations */
4379   address generate_ghash_processBlocks() {
4380     __ align(CodeEntryAlignment);
4381     Label L_ghash_loop, L_exit;
4382     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4383     address start = __ pc();
4384 
4385     const Register state        = c_rarg0;
4386     const Register subkeyH      = c_rarg1;
4387     const Register data         = c_rarg2;
4388     const Register blocks       = c_rarg3;
4389 
4390     const XMMRegister xmm_temp0 = xmm0;
4391     const XMMRegister xmm_temp1 = xmm1;
4392     const XMMRegister xmm_temp2 = xmm2;
4393     const XMMRegister xmm_temp3 = xmm3;
4394     const XMMRegister xmm_temp4 = xmm4;
4395     const XMMRegister xmm_temp5 = xmm5;
4396     const XMMRegister xmm_temp6 = xmm6;
4397     const XMMRegister xmm_temp7 = xmm7;
4398     const XMMRegister xmm_temp8 = xmm8;
4399     const XMMRegister xmm_temp9 = xmm9;
4400     const XMMRegister xmm_temp10 = xmm10;
4401 
4402     __ enter();
4403 
4404     // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
4405     // context for the registers used, where all instructions below are using 128-bit mode
4406     // On EVEX without VL and BW, these instructions will all be AVX.
4407     if (VM_Version::supports_avx512vlbw()) {
4408       __ movl(rax, 0xffff);
4409       __ kmovql(k1, rax);
4410     }
4411 
4412     __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
4413 
4414     __ movdqu(xmm_temp0, Address(state, 0));
4415     __ pshufb(xmm_temp0, xmm_temp10);
4416 
4417 
4418     __ BIND(L_ghash_loop);
4419     __ movdqu(xmm_temp2, Address(data, 0));
4420     __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
4421 
4422     __ movdqu(xmm_temp1, Address(subkeyH, 0));
4423     __ pshufb(xmm_temp1, xmm_temp10);
4424 
4425     __ pxor(xmm_temp0, xmm_temp2);
4426 
4427     //
4428     // Multiply with the hash key
4429     //
4430     __ movdqu(xmm_temp3, xmm_temp0);
4431     __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
4432     __ movdqu(xmm_temp4, xmm_temp0);
4433     __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
4434 
4435     __ movdqu(xmm_temp5, xmm_temp0);
4436     __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
4437     __ movdqu(xmm_temp6, xmm_temp0);
4438     __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
4439 
4440     __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
4441 
4442     __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
4443     __ psrldq(xmm_temp4, 8);    // shift xmm4 by 64 bits to the right
4444     __ pslldq(xmm_temp5, 8);    // shift xmm5 by 64 bits to the left
4445     __ pxor(xmm_temp3, xmm_temp5);
4446     __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
4447                                         // of the carry-less multiplication of
4448                                         // xmm0 by xmm1.
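         // (Standard schoolbook combine for a 128x128 -> 256 bit carry-less
         //  product: writing a = a1:a0 and b = b1:b0 as 64-bit halves,
         //  a*b = (a1*b1 << 128) ^ ((a0*b1 ^ a1*b0) << 64) ^ (a0*b0),
         //  where '*' is GF(2)[x] multiplication and '^' is XOR.)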
4449 
4450     // We shift the result of the multiplication by one bit position
4451     // to the left to compensate for the fact that the bits are reversed.
4452     __ movdqu(xmm_temp7, xmm_temp3);
4453     __ movdqu(xmm_temp8, xmm_temp6);
4454     __ pslld(xmm_temp3, 1);
4455     __ pslld(xmm_temp6, 1);
4456     __ psrld(xmm_temp7, 31);
4457     __ psrld(xmm_temp8, 31);
4458     __ movdqu(xmm_temp9, xmm_temp7);
4459     __ pslldq(xmm_temp8, 4);
4460     __ pslldq(xmm_temp7, 4);
4461     __ psrldq(xmm_temp9, 12);
4462     __ por(xmm_temp3, xmm_temp7);
4463     __ por(xmm_temp6, xmm_temp8);
4464     __ por(xmm_temp6, xmm_temp9);
4465 
4466     //
4467     // First phase of the reduction
4468     //
4469     // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
4470     // independently.
4471     __ movdqu(xmm_temp7, xmm_temp3);
4472     __ movdqu(xmm_temp8, xmm_temp3);
4473     __ movdqu(xmm_temp9, xmm_temp3);
4474     __ pslld(xmm_temp7, 31);    // packed left shift, << 31
4475     __ pslld(xmm_temp8, 30);    // packed left shift, << 30
4476     __ pslld(xmm_temp9, 25);    // packed left shift, << 25
4477     __ pxor(xmm_temp7, xmm_temp8);      // xor the shifted versions
4478     __ pxor(xmm_temp7, xmm_temp9);
4479     __ movdqu(xmm_temp8, xmm_temp7);
4480     __ pslldq(xmm_temp7, 12);
4481     __ psrldq(xmm_temp8, 4);
4482     __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
4483 
4484     //
4485     // Second phase of the reduction
4486     //
4487     // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
4488     // shift operations.
4489     __ movdqu(xmm_temp2, xmm_temp3);
4490     __ movdqu(xmm_temp4, xmm_temp3);
4491     __ movdqu(xmm_temp5, xmm_temp3);
4492     __ psrld(xmm_temp2, 1);     // packed right shift, >> 1
4493     __ psrld(xmm_temp4, 2);     // packed right shift, >> 2
4494     __ psrld(xmm_temp5, 7);     // packed right shift, >> 7
4495     __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
4496     __ pxor(xmm_temp2, xmm_temp5);
4497     __ pxor(xmm_temp2, xmm_temp8);
4498     __ pxor(xmm_temp3, xmm_temp2);
4499     __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
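         // (The two reduction phases above fold the 256-bit carry-less product
         //  back to 128 bits modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1,
         //  using only shifts and XORs; the mirrored shift counts come from the
         //  bit-reflected representation noted above.)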
4500 
4501     __ decrement(blocks);
4502     __ jcc(Assembler::zero, L_exit);
4503     __ movdqu(xmm_temp0, xmm_temp6);
4504     __ addptr(data, 16);
4505     __ jmp(L_ghash_loop);
4506 
4507     __ BIND(L_exit);
4508     __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
4509     __ movdqu(Address(state, 0), xmm_temp6);   // store the result
4510     __ leave();
4511     __ ret(0);
4512     return start;
4513   }
4514 
4515   /**
4516    *  Arguments:
4517    *
4518    * Inputs:
4519    *   c_rarg0   - int crc
4520    *   c_rarg1   - byte* buf
4521    *   c_rarg2   - int length
4522    *
4523    * Output:
4524    *       rax   - int crc result
4525    */
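       // Installed as StubRoutines::_updateBytesCRC32 when UseCRC32Intrinsics is
       // set (see generate_initial() below, which publishes the lookup table as
       // StubRoutines::_crc_table_adr before generating this stub).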
4526   address generate_updateBytesCRC32() {
4527     assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
4528 
4529     __ align(CodeEntryAlignment);
4530     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4531 
4532     address start = __ pc();
4533     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4534     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4535     // rscratch1: r10
4536     const Register crc   = c_rarg0;  // crc
4537     const Register buf   = c_rarg1;  // source java byte array address
4538     const Register len   = c_rarg2;  // length
4539     const Register table = c_rarg3;  // crc_table address (reuse register)
4540     const Register tmp   = r11;
4541     assert_different_registers(crc, buf, len, table, tmp, rax);
4542 
4543     BLOCK_COMMENT("Entry:");
4544     __ enter(); // required for proper stackwalking of RuntimeStub frame
4545 
4546     __ kernel_crc32(crc, buf, len, table, tmp);
4547 
4548     __ movl(rax, crc);
4549     __ vzeroupper();
4550     __ leave(); // required for proper stackwalking of RuntimeStub frame
4551     __ ret(0);
4552 
4553     return start;
4554   }
4555 
4556   /**
4557   *  Arguments:
4558   *
4559   * Inputs:
4560   *   c_rarg0   - int crc
4561   *   c_rarg1   - byte* buf
4562   *   c_rarg2   - long length
4563   *   c_rarg3   - table_start - optional (present only when doing a library_call,
4564   *              not used by x86 algorithm)
4565   *
4566   * Output:
4567   *       rax   - int crc result
4568   */
4569   address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
4570       assert(UseCRC32CIntrinsics, "need SSE4_2");
4571       __ align(CodeEntryAlignment);
4572       StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4573       address start = __ pc();
4574       //reg.arg        int#0        int#1        int#2        int#3        int#4        int#5        float regs
4575       //Windows        RCX          RDX          R8           R9           none         none         XMM0..XMM3
4576       //Lin / Sol      RDI          RSI          RDX          RCX          R8           R9           XMM0..XMM7
4577       const Register crc = c_rarg0;  // crc
4578       const Register buf = c_rarg1;  // source java byte array address
4579       const Register len = c_rarg2;  // length
4580       const Register a = rax;
4581       const Register j = r9;
4582       const Register k = r10;
4583       const Register l = r11;
4584 #ifdef _WIN64
4585       const Register y = rdi;
4586       const Register z = rsi;
4587 #else
4588       const Register y = rcx;
4589       const Register z = r8;
4590 #endif
4591       assert_different_registers(crc, buf, len, a, j, k, l, y, z);
4592 
4593       BLOCK_COMMENT("Entry:");
4594       __ enter(); // required for proper stackwalking of RuntimeStub frame
4595 #ifdef _WIN64
4596       __ push(y);
4597       __ push(z);
4598 #endif
4599       __ crc32c_ipl_alg2_alt2(crc, buf, len,
4600                               a, j, k,
4601                               l, y, z,
4602                               c_farg0, c_farg1, c_farg2,
4603                               is_pclmulqdq_supported);
4604       __ movl(rax, crc);
4605 #ifdef _WIN64
4606       __ pop(z);
4607       __ pop(y);
4608 #endif
4609       __ vzeroupper();
4610       __ leave(); // required for proper stackwalking of RuntimeStub frame
4611       __ ret(0);
4612 
4613       return start;
4614   }
4615 
4616   /**
4617    *  Arguments:
4618    *
4619    *  Input:
4620    *    c_rarg0   - x address
4621    *    c_rarg1   - x length
4622    *    c_rarg2   - y address
4623    *    c_rarg3   - y length
4624    * not Win64
4625    *    c_rarg4   - z address
4626    *    c_rarg5   - z length
4627    * Win64
4628    *    rsp+40    - z address
4629    *    rsp+48    - z length
4630    */
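       // Installed as StubRoutines::_multiplyToLen (see generate_all() below,
       // under UseMultiplyToLenIntrinsic). For orientation, a scalar sketch of
       // the schoolbook product this stub computes, assuming x, y and z are
       // arrays of 32-bit limbs, most significant limb first, with z
       // zero-initialized to xlen + ylen limbs (illustration only; the actual
       // limb handling lives in MacroAssembler::multiply_to_len):
       //
       //   for (int i = xlen - 1; i >= 0; i--) {
       //     uint64_t carry = 0;
       //     for (int j = ylen - 1; j >= 0; j--) {
       //       int k = i + j + 1;
       //       uint64_t p = (uint64_t)x[i] * y[j] + z[k] + carry;
       //       z[k]  = (uint32_t)p;
       //       carry = p >> 32;
       //     }
       //     z[i] = (uint32_t)carry;
       //   }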
4631   address generate_multiplyToLen() {
4632     __ align(CodeEntryAlignment);
4633     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4634 
4635     address start = __ pc();
4636     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4637     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4638     const Register x     = rdi;
4639     const Register xlen  = rax;
4640     const Register y     = rsi;
4641     const Register ylen  = rcx;
4642     const Register z     = r8;
4643     const Register zlen  = r11;
4644 
4645     // Next registers will be saved on stack in multiply_to_len().
4646     const Register tmp1  = r12;
4647     const Register tmp2  = r13;
4648     const Register tmp3  = r14;
4649     const Register tmp4  = r15;
4650     const Register tmp5  = rbx;
4651 
4652     BLOCK_COMMENT("Entry:");
4653     __ enter(); // required for proper stackwalking of RuntimeStub frame
4654 
4655 #ifndef _WIN64
4656     __ movptr(zlen, r9); // Save r9 in r11 - zlen
4657 #endif
4658     setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx
4659                        // ylen => rcx, z => r8, zlen => r11
4660                        // r9 and r10 may be used to save non-volatile registers
4661 #ifdef _WIN64
4662     // last 2 arguments (#4, #5) are on stack on Win64
4663     __ movptr(z, Address(rsp, 6 * wordSize));
4664     __ movptr(zlen, Address(rsp, 7 * wordSize));
4665 #endif
4666 
4667     __ movptr(xlen, rsi);
4668     __ movptr(y,    rdx);
4669     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5);
4670 
4671     restore_arg_regs();
4672 
4673     __ leave(); // required for proper stackwalking of RuntimeStub frame
4674     __ ret(0);
4675 
4676     return start;
4677   }
4678 
4679   /**
4680   *  Arguments:
4681   *
4682   *  Input:
4683   *    c_rarg0   - obja     address
4684   *    c_rarg1   - objb     address
4685   *    c_rarg2   - length   length
4686   *    c_rarg3   - scale    log2_array_index_scale
4687   *
4688   *  Output:
4689   *        rax   - int: if >= 0, index of the first mismatch; if < 0, bitwise complement of the unprocessed tail length
4690   */
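       // Installed as StubRoutines::_vectorizedMismatch (see generate_all() below,
       // under UseVectorizedMismatchIntrinsic); the element width is 1 << scale
       // bytes, and any tail the vectorized loop does not cover is reported back
       // to the caller through the complemented (negative) return value.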
4691   address generate_vectorizedMismatch() {
4692     __ align(CodeEntryAlignment);
4693     StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch");
4694     address start = __ pc();
4695 
4696     BLOCK_COMMENT("Entry:");
4697     __ enter();
4698 
4699 #ifdef _WIN64  // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4700     const Register scale = c_rarg0;  //rcx, will exchange with r9
4701     const Register objb = c_rarg1;   //rdx
4702     const Register length = c_rarg2; //r8
4703     const Register obja = c_rarg3;   //r9
4704     __ xchgq(obja, scale);  //now obja and scale contains the correct contents
4705 
4706     const Register tmp1 = r10;
4707     const Register tmp2 = r11;
4708 #endif
4709 #ifndef _WIN64 // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4710     const Register obja = c_rarg0;   //U:rdi
4711     const Register objb = c_rarg1;   //U:rsi
4712     const Register length = c_rarg2; //U:rdx
4713     const Register scale = c_rarg3;  //U:rcx
4714     const Register tmp1 = r8;
4715     const Register tmp2 = r9;
4716 #endif
4717     const Register result = rax; //return value
4718     const XMMRegister vec0 = xmm0;
4719     const XMMRegister vec1 = xmm1;
4720     const XMMRegister vec2 = xmm2;
4721 
4722     __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
4723 
4724     __ vzeroupper();
4725     __ leave();
4726     __ ret(0);
4727 
4728     return start;
4729   }
4730 
4731   /**
4732    *  Arguments:
4733    *
4734    *  Input:
4735    *    c_rarg0   - x address
4736    *    c_rarg1   - x length
4737    *    c_rarg2   - z address
4738    *    c_rarg3   - z length
4739    *
4740    */
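       // Installed as StubRoutines::_squareToLen (see generate_all() below, under
       // UseSquareToLenIntrinsic); it computes z = x * x using the same limb
       // layout as multiplyToLen above, with the squaring itself handled by
       // MacroAssembler::square_to_len.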
4741   address generate_squareToLen() {
4742 
4743     __ align(CodeEntryAlignment);
4744     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4745 
4746     address start = __ pc();
4747     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4748     // Unix:  rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
4749     const Register x      = rdi;
4750     const Register len    = rsi;
4751     const Register z      = r8;
4752     const Register zlen   = rcx;
4753 
4754     const Register tmp1      = r12;
4755     const Register tmp2      = r13;
4756     const Register tmp3      = r14;
4757     const Register tmp4      = r15;
4758     const Register tmp5      = rbx;
4759 
4760     BLOCK_COMMENT("Entry:");
4761     __ enter(); // required for proper stackwalking of RuntimeStub frame
4762 
4763     setup_arg_regs(4); // x => rdi, len => rsi, z => rdx
4764                        // zlen => rcx
4765                        // r9 and r10 may be used to save non-volatile registers
4766     __ movptr(r8, rdx);
4767     __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
4768 
4769     restore_arg_regs();
4770 
4771     __ leave(); // required for proper stackwalking of RuntimeStub frame
4772     __ ret(0);
4773 
4774     return start;
4775   }
4776 
4777   /**
4778    *  Arguments:
4779    *
4780    *  Input:
4781    *    c_rarg0   - out address
4782    *    c_rarg1   - in address
4783    *    c_rarg2   - offset
4784    *    c_rarg3   - len
4785    * not Win64
4786    *    c_rarg4   - k
4787    * Win64
4788    *    rsp+40    - k
4789    */
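       // Installed as StubRoutines::_mulAdd (see generate_all() below, under
       // UseMulAddIntrinsic). A rough scalar sketch of the single-word
       // multiply-accumulate this stub performs, assuming 32-bit limbs
       // (illustration only; exact limb order and offset handling live in
       // MacroAssembler::mul_add):
       //
       //   uint64_t carry = 0;
       //   for (each of the len limbs of 'in') {
       //     uint64_t p = (uint64_t)in_limb * k + corresponding_out_limb + carry;
       //     corresponding_out_limb = (uint32_t)p;
       //     carry = p >> 32;
       //   }
       //   // the final carry is the stub's int return value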
4790   address generate_mulAdd() {
4791     __ align(CodeEntryAlignment);
4792     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4793 
4794     address start = __ pc();
4795     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4796     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4797     const Register out     = rdi;
4798     const Register in      = rsi;
4799     const Register offset  = r11;
4800     const Register len     = rcx;
4801     const Register k       = r8;
4802 
4803     // Next registers will be saved on stack in mul_add().
4804     const Register tmp1  = r12;
4805     const Register tmp2  = r13;
4806     const Register tmp3  = r14;
4807     const Register tmp4  = r15;
4808     const Register tmp5  = rbx;
4809 
4810     BLOCK_COMMENT("Entry:");
4811     __ enter(); // required for proper stackwalking of RuntimeStub frame
4812 
4813     setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx
4814                        // len => rcx, k => r8
4815                        // r9 and r10 may be used to save non-volatile registers
4816 #ifdef _WIN64
4817     // last argument is on stack on Win64
4818     __ movl(k, Address(rsp, 6 * wordSize));
4819 #endif
4820     __ movptr(r11, rdx);  // move offset in rdx to offset(r11)
4821     __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
4822 
4823     restore_arg_regs();
4824 
4825     __ leave(); // required for proper stackwalking of RuntimeStub frame
4826     __ ret(0);
4827 
4828     return start;
4829   }
4830 
4831   address generate_libmExp() {
4832     StubCodeMark mark(this, "StubRoutines", "libmExp");
4833 
4834     address start = __ pc();
4835 
4836     const XMMRegister x0  = xmm0;
4837     const XMMRegister x1  = xmm1;
4838     const XMMRegister x2  = xmm2;
4839     const XMMRegister x3  = xmm3;
4840 
4841     const XMMRegister x4  = xmm4;
4842     const XMMRegister x5  = xmm5;
4843     const XMMRegister x6  = xmm6;
4844     const XMMRegister x7  = xmm7;
4845 
4846     const Register tmp   = r11;
4847 
4848     BLOCK_COMMENT("Entry:");
4849     __ enter(); // required for proper stackwalking of RuntimeStub frame
4850 
4851     __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
4852 
4853     __ leave(); // required for proper stackwalking of RuntimeStub frame
4854     __ ret(0);
4855 
4856     return start;
4857 
4858   }
4859 
4860   address generate_libmLog() {
4861     StubCodeMark mark(this, "StubRoutines", "libmLog");
4862 
4863     address start = __ pc();
4864 
4865     const XMMRegister x0 = xmm0;
4866     const XMMRegister x1 = xmm1;
4867     const XMMRegister x2 = xmm2;
4868     const XMMRegister x3 = xmm3;
4869 
4870     const XMMRegister x4 = xmm4;
4871     const XMMRegister x5 = xmm5;
4872     const XMMRegister x6 = xmm6;
4873     const XMMRegister x7 = xmm7;
4874 
4875     const Register tmp1 = r11;
4876     const Register tmp2 = r8;
4877 
4878     BLOCK_COMMENT("Entry:");
4879     __ enter(); // required for proper stackwalking of RuntimeStub frame
4880 
4881     __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2);
4882 
4883     __ leave(); // required for proper stackwalking of RuntimeStub frame
4884     __ ret(0);
4885 
4886     return start;
4887 
4888   }
4889 
4890   address generate_libmLog10() {
4891     StubCodeMark mark(this, "StubRoutines", "libmLog10");
4892 
4893     address start = __ pc();
4894 
4895     const XMMRegister x0 = xmm0;
4896     const XMMRegister x1 = xmm1;
4897     const XMMRegister x2 = xmm2;
4898     const XMMRegister x3 = xmm3;
4899 
4900     const XMMRegister x4 = xmm4;
4901     const XMMRegister x5 = xmm5;
4902     const XMMRegister x6 = xmm6;
4903     const XMMRegister x7 = xmm7;
4904 
4905     const Register tmp = r11;
4906 
4907     BLOCK_COMMENT("Entry:");
4908     __ enter(); // required for proper stackwalking of RuntimeStub frame
4909 
4910     __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
4911 
4912     __ leave(); // required for proper stackwalking of RuntimeStub frame
4913     __ ret(0);
4914 
4915     return start;
4916 
4917   }
4918 
4919   address generate_libmPow() {
4920     StubCodeMark mark(this, "StubRoutines", "libmPow");
4921 
4922     address start = __ pc();
4923 
4924     const XMMRegister x0 = xmm0;
4925     const XMMRegister x1 = xmm1;
4926     const XMMRegister x2 = xmm2;
4927     const XMMRegister x3 = xmm3;
4928 
4929     const XMMRegister x4 = xmm4;
4930     const XMMRegister x5 = xmm5;
4931     const XMMRegister x6 = xmm6;
4932     const XMMRegister x7 = xmm7;
4933 
4934     const Register tmp1 = r8;
4935     const Register tmp2 = r9;
4936     const Register tmp3 = r10;
4937     const Register tmp4 = r11;
4938 
4939     BLOCK_COMMENT("Entry:");
4940     __ enter(); // required for proper stackwalking of RuntimeStub frame
4941 
4942     __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
4943 
4944     __ leave(); // required for proper stackwalking of RuntimeStub frame
4945     __ ret(0);
4946 
4947     return start;
4948 
4949   }
4950 
4951   address generate_libmSin() {
4952     StubCodeMark mark(this, "StubRoutines", "libmSin");
4953 
4954     address start = __ pc();
4955 
4956     const XMMRegister x0 = xmm0;
4957     const XMMRegister x1 = xmm1;
4958     const XMMRegister x2 = xmm2;
4959     const XMMRegister x3 = xmm3;
4960 
4961     const XMMRegister x4 = xmm4;
4962     const XMMRegister x5 = xmm5;
4963     const XMMRegister x6 = xmm6;
4964     const XMMRegister x7 = xmm7;
4965 
4966     const Register tmp1 = r8;
4967     const Register tmp2 = r9;
4968     const Register tmp3 = r10;
4969     const Register tmp4 = r11;
4970 
4971     BLOCK_COMMENT("Entry:");
4972     __ enter(); // required for proper stackwalking of RuntimeStub frame
4973 
4974 #ifdef _WIN64
4975     __ push(rsi);
4976     __ push(rdi);
4977 #endif
4978     __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
4979 
4980 #ifdef _WIN64
4981     __ pop(rdi);
4982     __ pop(rsi);
4983 #endif
4984 
4985     __ leave(); // required for proper stackwalking of RuntimeStub frame
4986     __ ret(0);
4987 
4988     return start;
4989 
4990   }
4991 
4992   address generate_libmCos() {
4993     StubCodeMark mark(this, "StubRoutines", "libmCos");
4994 
4995     address start = __ pc();
4996 
4997     const XMMRegister x0 = xmm0;
4998     const XMMRegister x1 = xmm1;
4999     const XMMRegister x2 = xmm2;
5000     const XMMRegister x3 = xmm3;
5001 
5002     const XMMRegister x4 = xmm4;
5003     const XMMRegister x5 = xmm5;
5004     const XMMRegister x6 = xmm6;
5005     const XMMRegister x7 = xmm7;
5006 
5007     const Register tmp1 = r8;
5008     const Register tmp2 = r9;
5009     const Register tmp3 = r10;
5010     const Register tmp4 = r11;
5011 
5012     BLOCK_COMMENT("Entry:");
5013     __ enter(); // required for proper stackwalking of RuntimeStub frame
5014 
5015 #ifdef _WIN64
5016     __ push(rsi);
5017     __ push(rdi);
5018 #endif
5019     __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5020 
5021 #ifdef _WIN64
5022     __ pop(rdi);
5023     __ pop(rsi);
5024 #endif
5025 
5026     __ leave(); // required for proper stackwalking of RuntimeStub frame
5027     __ ret(0);
5028 
5029     return start;
5030 
5031   }
5032 
5033   address generate_libmTan() {
5034     StubCodeMark mark(this, "StubRoutines", "libmTan");
5035 
5036     address start = __ pc();
5037 
5038     const XMMRegister x0 = xmm0;
5039     const XMMRegister x1 = xmm1;
5040     const XMMRegister x2 = xmm2;
5041     const XMMRegister x3 = xmm3;
5042 
5043     const XMMRegister x4 = xmm4;
5044     const XMMRegister x5 = xmm5;
5045     const XMMRegister x6 = xmm6;
5046     const XMMRegister x7 = xmm7;
5047 
5048     const Register tmp1 = r8;
5049     const Register tmp2 = r9;
5050     const Register tmp3 = r10;
5051     const Register tmp4 = r11;
5052 
5053     BLOCK_COMMENT("Entry:");
5054     __ enter(); // required for proper stackwalking of RuntimeStub frame
5055 
5056 #ifdef _WIN64
5057     __ push(rsi);
5058     __ push(rdi);
5059 #endif
5060     __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5061 
5062 #ifdef _WIN64
5063     __ pop(rdi);
5064     __ pop(rsi);
5065 #endif
5066 
5067     __ leave(); // required for proper stackwalking of RuntimeStub frame
5068     __ ret(0);
5069 
5070     return start;
5071 
5072   }
5073 
5074 #undef __
5075 #define __ masm->
5076 
5077   // Continuation point for throwing of implicit exceptions that are
5078   // not handled in the current activation. Fabricates an exception
5079   // oop and initiates normal exception dispatching in this
5080   // frame. Since we need to preserve callee-saved values (currently
5081   // only for C2, but done for C1 as well) we need a callee-saved oop
5082   // map and therefore have to make these stubs into RuntimeStubs
5083   // rather than BufferBlobs.  If the compiler needs all registers to
5084   // be preserved between the fault point and the exception handler
5085   // then it must assume responsibility for that in
5086   // AbstractCompiler::continuation_for_implicit_null_exception or
5087   // continuation_for_implicit_division_by_zero_exception. All other
5088   // implicit exceptions (e.g., NullPointerException or
5089   // AbstractMethodError on entry) are either at call sites or
5090   // otherwise assume that stack unwinding will be initiated, so
5091   // caller saved registers were assumed volatile in the compiler.
5092   address generate_throw_exception(const char* name,
5093                                    address runtime_entry,
5094                                    Register arg1 = noreg,
5095                                    Register arg2 = noreg) {
5096     // Information about frame layout at time of blocking runtime call.
5097     // Note that we only have to preserve callee-saved registers since
5098     // the compilers are responsible for supplying a continuation point
5099     // if they expect all registers to be preserved.
5100     enum layout {
5101       rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
5102       rbp_off2,
5103       return_off,
5104       return_off2,
5105       framesize // inclusive of return address
5106     };
5107 
5108     int insts_size = 512;
5109     int locs_size  = 64;
5110 
5111     CodeBuffer code(name, insts_size, locs_size);
5112     OopMapSet* oop_maps  = new OopMapSet();
5113     MacroAssembler* masm = new MacroAssembler(&code);
5114 
5115     address start = __ pc();
5116 
5117     // This is an inlined and slightly modified version of call_VM
5118     // which has the ability to fetch the return PC out of
5119     // thread-local storage and also sets up last_Java_sp slightly
5120     // differently than the real call_VM
5121 
5122     __ enter(); // required for proper stackwalking of RuntimeStub frame
5123 
5124     assert(is_even(framesize/2), "sp not 16-byte aligned");
5125 
5126     // return address and rbp are already in place
5127     __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
5128 
5129     int frame_complete = __ pc() - start;
5130 
5131     // Set up last_Java_sp and last_Java_fp
5132     address the_pc = __ pc();
5133     __ set_last_Java_frame(rsp, rbp, the_pc);
5134     __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
5135 
5136     // Call runtime
5137     if (arg1 != noreg) {
5138       assert(arg2 != c_rarg1, "clobbered");
5139       __ movptr(c_rarg1, arg1);
5140     }
5141     if (arg2 != noreg) {
5142       __ movptr(c_rarg2, arg2);
5143     }
5144     __ movptr(c_rarg0, r15_thread);
5145     BLOCK_COMMENT("call runtime_entry");
5146     __ call(RuntimeAddress(runtime_entry));
5147 
5148     // Generate oop map
5149     OopMap* map = new OopMap(framesize, 0);
5150 
5151     oop_maps->add_gc_map(the_pc - start, map);
5152 
5153     __ reset_last_Java_frame(true);
5154 
5155     __ leave(); // required for proper stackwalking of RuntimeStub frame
5156 
5157     // check for pending exceptions
5158 #ifdef ASSERT
5159     Label L;
5160     __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()),
5161             (int32_t) NULL_WORD);
5162     __ jcc(Assembler::notEqual, L);
5163     __ should_not_reach_here();
5164     __ bind(L);
5165 #endif // ASSERT
5166     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
5167 
5168 
5169     // codeBlob framesize is in words (not VMRegImpl::slot_size)
5170     RuntimeStub* stub =
5171       RuntimeStub::new_runtime_stub(name,
5172                                     &code,
5173                                     frame_complete,
5174                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
5175                                     oop_maps, false);
5176     return stub->entry_point();
5177   }
5178 
5179   void create_control_words() {
5180     // Round to nearest, 53-bit mode, exceptions masked
5181     StubRoutines::_fpu_cntrl_wrd_std   = 0x027F;
5182     // Round to zero, 53-bit mode, exceptions masked
5183     StubRoutines::_fpu_cntrl_wrd_trunc = 0x0D7F;
5184     // Round to nearest, 24-bit mode, exceptions masked
5185     StubRoutines::_fpu_cntrl_wrd_24    = 0x007F;
5186     // Round to nearest, 64-bit mode, exceptions masked
5187     StubRoutines::_fpu_cntrl_wrd_64    = 0x037F;
5188     // Round to nearest, all exceptions masked (MXCSR has no precision control)
5189     StubRoutines::_mxcsr_std           = 0x1F80;
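         // (0x1F80 is also the processor's power-on default MXCSR value: bits 7..12
         //  mask the six SSE exceptions, bits 13..14 == 00 select round-to-nearest.)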
5190     // Note: the following two constants are 80-bit values whose
5191     //       layout is critical for correct loading by the FPU.
5192     // Bias for strict fp multiply/divide
5193     StubRoutines::_fpu_subnormal_bias1[0]= 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000
5194     StubRoutines::_fpu_subnormal_bias1[1]= 0x80000000;
5195     StubRoutines::_fpu_subnormal_bias1[2]= 0x03ff;
5196     // Un-Bias for strict fp multiply/divide
5197     StubRoutines::_fpu_subnormal_bias2[0]= 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000
5198     StubRoutines::_fpu_subnormal_bias2[1]= 0x80000000;
5199     StubRoutines::_fpu_subnormal_bias2[2]= 0x7bff;
5200   }
5201 
5202   // Initialization
5203   void generate_initial() {
5204     // Generates all stubs and initializes the entry points
5205 
5206     // These platform-specific settings are needed by generate_call_stub()
5207     create_control_words();
5208 
5209     // entry points that exist in all platforms. Note: This is code
5210     // that could be shared among different platforms - however the
5211     // benefit seems to be smaller than the disadvantage of having a
5212     // much more complicated generator structure. See also comment in
5213     // stubRoutines.hpp.
5214 
5215     StubRoutines::_forward_exception_entry = generate_forward_exception();
5216 
5217     StubRoutines::_call_stub_entry =
5218       generate_call_stub(StubRoutines::_call_stub_return_address);
5219 
5220     // is referenced by megamorphic call
5221     StubRoutines::_catch_exception_entry = generate_catch_exception();
5222 
5223     // atomic calls
5224     StubRoutines::_atomic_xchg_entry          = generate_atomic_xchg();
5225     StubRoutines::_atomic_xchg_long_entry     = generate_atomic_xchg_long();
5226     StubRoutines::_atomic_cmpxchg_entry       = generate_atomic_cmpxchg();
5227     StubRoutines::_atomic_cmpxchg_byte_entry  = generate_atomic_cmpxchg_byte();
5228     StubRoutines::_atomic_cmpxchg_long_entry  = generate_atomic_cmpxchg_long();
5229     StubRoutines::_atomic_add_entry           = generate_atomic_add();
5230     StubRoutines::_atomic_add_long_entry      = generate_atomic_add_long();
5231     StubRoutines::_fence_entry                = generate_orderaccess_fence();
5232 
5233     // platform dependent
5234     StubRoutines::x86::_get_previous_fp_entry = generate_get_previous_fp();
5235     StubRoutines::x86::_get_previous_sp_entry = generate_get_previous_sp();
5236 
5237     StubRoutines::x86::_verify_mxcsr_entry    = generate_verify_mxcsr();
5238 
5239     // Build this early so it's available for the interpreter.
5240     StubRoutines::_throw_StackOverflowError_entry =
5241       generate_throw_exception("StackOverflowError throw_exception",
5242                                CAST_FROM_FN_PTR(address,
5243                                                 SharedRuntime::
5244                                                 throw_StackOverflowError));
5245     StubRoutines::_throw_delayed_StackOverflowError_entry =
5246       generate_throw_exception("delayed StackOverflowError throw_exception",
5247                                CAST_FROM_FN_PTR(address,
5248                                                 SharedRuntime::
5249                                                 throw_delayed_StackOverflowError));
5250     if (UseCRC32Intrinsics) {
5251       // set table address before stub generation which use it
5252       StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
5253       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5254     }
5255 
5256     if (UseCRC32CIntrinsics) {
5257       bool supports_clmul = VM_Version::supports_clmul();
5258       StubRoutines::x86::generate_CRC32C_table(supports_clmul);
5259       StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
5260       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
5261     }
5262     if (VM_Version::supports_sse2() && UseLibmIntrinsic && InlineIntrinsics) {
5263       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
5264           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
5265           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
5266         StubRoutines::x86::_ONEHALF_adr = (address)StubRoutines::x86::_ONEHALF;
5267         StubRoutines::x86::_P_2_adr = (address)StubRoutines::x86::_P_2;
5268         StubRoutines::x86::_SC_4_adr = (address)StubRoutines::x86::_SC_4;
5269         StubRoutines::x86::_Ctable_adr = (address)StubRoutines::x86::_Ctable;
5270         StubRoutines::x86::_SC_2_adr = (address)StubRoutines::x86::_SC_2;
5271         StubRoutines::x86::_SC_3_adr = (address)StubRoutines::x86::_SC_3;
5272         StubRoutines::x86::_SC_1_adr = (address)StubRoutines::x86::_SC_1;
5273         StubRoutines::x86::_PI_INV_TABLE_adr = (address)StubRoutines::x86::_PI_INV_TABLE;
5274         StubRoutines::x86::_PI_4_adr = (address)StubRoutines::x86::_PI_4;
5275         StubRoutines::x86::_PI32INV_adr = (address)StubRoutines::x86::_PI32INV;
5276         StubRoutines::x86::_SIGN_MASK_adr = (address)StubRoutines::x86::_SIGN_MASK;
5277         StubRoutines::x86::_P_1_adr = (address)StubRoutines::x86::_P_1;
5278         StubRoutines::x86::_P_3_adr = (address)StubRoutines::x86::_P_3;
5279         StubRoutines::x86::_NEG_ZERO_adr = (address)StubRoutines::x86::_NEG_ZERO;
5280       }
5281       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
5282         StubRoutines::_dexp = generate_libmExp();
5283       }
5284       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5285         StubRoutines::_dlog = generate_libmLog();
5286       }
5287       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
5288         StubRoutines::_dlog10 = generate_libmLog10();
5289       }
5290       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
5291         StubRoutines::_dpow = generate_libmPow();
5292       }
5293       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5294         StubRoutines::_dsin = generate_libmSin();
5295       }
5296       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5297         StubRoutines::_dcos = generate_libmCos();
5298       }
5299       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
5300         StubRoutines::_dtan = generate_libmTan();
5301       }
5302     }
5303   }
5304 
5305   void generate_all() {
5306     // Generates all stubs and initializes the entry points
5307 
5308     // These entry points require SharedInfo::stack0 to be set up in
5309     // non-core builds and need to be relocatable, so they each
5310     // fabricate a RuntimeStub internally.
5311     StubRoutines::_throw_AbstractMethodError_entry =
5312       generate_throw_exception("AbstractMethodError throw_exception",
5313                                CAST_FROM_FN_PTR(address,
5314                                                 SharedRuntime::
5315                                                 throw_AbstractMethodError));
5316 
5317     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5318       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5319                                CAST_FROM_FN_PTR(address,
5320                                                 SharedRuntime::
5321                                                 throw_IncompatibleClassChangeError));
5322 
5323     StubRoutines::_throw_NullPointerException_at_call_entry =
5324       generate_throw_exception("NullPointerException at call throw_exception",
5325                                CAST_FROM_FN_PTR(address,
5326                                                 SharedRuntime::
5327                                                 throw_NullPointerException_at_call));
5328 
5329     // entry points that are platform specific
5330     if (UseShenandoahGC && (ShenandoahWriteBarrier || ShenandoahStoreValWriteBarrier || ShenandoahStoreValEnqueueBarrier)) {
5331          StubRoutines::x86::_shenandoah_wb = generate_shenandoah_wb(false, true);
5332          StubRoutines::_shenandoah_wb_C = generate_shenandoah_wb(true, !ShenandoahWriteBarrierCsetTestInIR);
5333     }
5334     StubRoutines::x86::_f2i_fixup = generate_f2i_fixup();
5335     StubRoutines::x86::_f2l_fixup = generate_f2l_fixup();
5336     StubRoutines::x86::_d2i_fixup = generate_d2i_fixup();
5337     StubRoutines::x86::_d2l_fixup = generate_d2l_fixup();
5338 
5339     StubRoutines::x86::_float_sign_mask  = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
5340     StubRoutines::x86::_float_sign_flip  = generate_fp_mask("float_sign_flip",  0x8000000080000000);
5341     StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
5342     StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
5343 
5344     // support for verify_oop (must happen after universe_init)
5345     StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
5346 
5347     // arraycopy stubs used by compilers
5348     generate_arraycopy_stubs();
5349 
5350     // don't bother generating these AES intrinsic stubs unless global flag is set
5351     if (UseAESIntrinsics) {
5352       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
5353       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5354       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5355       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5356       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
5357     }
5358     if (UseAESCTRIntrinsics){
5359       StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
5360       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
5361     }
5362 
5363     if (UseSHA1Intrinsics) {
5364       StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
5365       StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
5366       StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
5367       StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
5368     }
5369     if (UseSHA256Intrinsics) {
5370       StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
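           // Duplicate each 16-byte group of the SHA-256 round constants into
           // _k256_W so that a 256-bit (AVX2) load sees the same constants in
           // both 128-bit lanes (layout assumed by sha256_AVX2 above).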
5371       char* dst = (char*)StubRoutines::x86::_k256_W;
5372       char* src = (char*)StubRoutines::x86::_k256;
5373       for (int ii = 0; ii < 16; ++ii) {
5374         memcpy(dst + 32 * ii,      src + 16 * ii, 16);
5375         memcpy(dst + 32 * ii + 16, src + 16 * ii, 16);
5376       }
5377       StubRoutines::x86::_k256_W_adr = (address)StubRoutines::x86::_k256_W;
5378       StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
5379       StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
5380       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
5381     }
5382     if (UseSHA512Intrinsics) {
5383       StubRoutines::x86::_k512_W_addr = (address)StubRoutines::x86::_k512_W;
5384       StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = generate_pshuffle_byte_flip_mask_sha512();
5385       StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
5386       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
5387     }
5388 
5389     // Generate GHASH intrinsics code
5390     if (UseGHASHIntrinsics) {
5391       StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
5392       StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
5393       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5394     }
5395 
5396     // Safefetch stubs.
5397     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5398                                                        &StubRoutines::_safefetch32_fault_pc,
5399                                                        &StubRoutines::_safefetch32_continuation_pc);
5400     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5401                                                        &StubRoutines::_safefetchN_fault_pc,
5402                                                        &StubRoutines::_safefetchN_continuation_pc);
5403 #ifdef COMPILER2
5404     if (UseMultiplyToLenIntrinsic) {
5405       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5406     }
5407     if (UseSquareToLenIntrinsic) {
5408       StubRoutines::_squareToLen = generate_squareToLen();
5409     }
5410     if (UseMulAddIntrinsic) {
5411       StubRoutines::_mulAdd = generate_mulAdd();
5412     }
5413 #ifndef _WINDOWS
5414     if (UseMontgomeryMultiplyIntrinsic) {
5415       StubRoutines::_montgomeryMultiply
5416         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
5417     }
5418     if (UseMontgomerySquareIntrinsic) {
5419       StubRoutines::_montgomerySquare
5420         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
5421     }
5422 #endif // !_WINDOWS
5423 #endif // COMPILER2
5424 
5425     if (UseVectorizedMismatchIntrinsic) {
5426       StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
5427     }
5428   }
5429 
5430  public:
5431   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5432     if (all) {
5433       generate_all();
5434     } else {
5435       generate_initial();
5436     }
5437   }
5438 }; // end class declaration
5439 
5440 void StubGenerator_generate(CodeBuffer* code, bool all) {
5441   StubGenerator g(code, all);
5442 }