1 /*
   2  * Copyright (c) 2003, 2017, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/macroAssembler.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "interpreter/interpreter.hpp"
  29 #include "nativeInst_x86.hpp"
  30 #include "oops/instanceOop.hpp"
  31 #include "oops/method.hpp"
  32 #include "oops/objArrayKlass.hpp"
  33 #include "oops/oop.inline.hpp"
  34 #include "prims/methodHandles.hpp"
  35 #include "runtime/frame.inline.hpp"
  36 #include "runtime/handles.inline.hpp"
  37 #include "runtime/sharedRuntime.hpp"
  38 #include "runtime/stubCodeGenerator.hpp"
  39 #include "runtime/stubRoutines.hpp"
  40 #include "runtime/thread.inline.hpp"
  41 #ifdef COMPILER2
  42 #include "opto/runtime.hpp"
  43 #endif
  44 
  45 // Declaration and definition of StubGenerator (no .hpp file).
  46 // For a more detailed description of the stub routine structure
  47 // see the comment in stubRoutines.hpp
  48 
  49 #define __ _masm->
  50 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
  51 #define a__ ((Assembler*)_masm)->
  52 
  53 #ifdef PRODUCT
  54 #define BLOCK_COMMENT(str) /* nothing */
  55 #else
  56 #define BLOCK_COMMENT(str) __ block_comment(str)
  57 #endif
  58 
  59 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  60 const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
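// The low six MXCSR bits are the sticky exception-status flags; masking them out
// leaves only the control bits (exception masks, rounding control, FTZ/DAZ), so two
// MXCSR values can be compared for identical control settings regardless of any
// pending exceptions.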
  61 
  62 // Stub Code definitions
  63 
  64 class StubGenerator: public StubCodeGenerator {
  65  private:
  66 
  67 #ifdef PRODUCT
  68 #define inc_counter_np(counter) ((void)0)
  69 #else
  70   void inc_counter_np_(int& counter) {
  71     // This can destroy rscratch1 if counter is far from the code cache
  72     __ incrementl(ExternalAddress((address)&counter));
  73   }
  74 #define inc_counter_np(counter) \
  75   BLOCK_COMMENT("inc_counter " #counter); \
  76   inc_counter_np_(counter);
  77 #endif
  78 
  79   // Call stubs are used to call Java from C
  80   //
  81   // Linux Arguments:
  82   //    c_rarg0:   call wrapper address                   address
  83   //    c_rarg1:   result                                 address
  84   //    c_rarg2:   result type                            BasicType
  85   //    c_rarg3:   method                                 Method*
  86   //    c_rarg4:   (interpreter) entry point              address
  87   //    c_rarg5:   parameters                             intptr_t*
  88   //    16(rbp): parameter size (in words)              int
  89   //    24(rbp): thread                                 Thread*
  90   //
  91   //     [ return_from_Java     ] <--- rsp
  92   //     [ argument word n      ]
  93   //      ...
  94   // -12 [ argument word 1      ]
  95   // -11 [ saved r15            ] <--- rsp_after_call
  96   // -10 [ saved r14            ]
  97   //  -9 [ saved r13            ]
  98   //  -8 [ saved r12            ]
  99   //  -7 [ saved rbx            ]
 100   //  -6 [ call wrapper         ]
 101   //  -5 [ result               ]
 102   //  -4 [ result type          ]
 103   //  -3 [ method               ]
 104   //  -2 [ entry point          ]
 105   //  -1 [ parameters           ]
 106   //   0 [ saved rbp            ] <--- rbp
 107   //   1 [ return address       ]
 108   //   2 [ parameter size       ]
 109   //   3 [ thread               ]
 110   //
 111   // Windows Arguments:
 112   //    c_rarg0:   call wrapper address                   address
 113   //    c_rarg1:   result                                 address
 114   //    c_rarg2:   result type                            BasicType
 115   //    c_rarg3:   method                                 Method*
 116   //    48(rbp): (interpreter) entry point              address
 117   //    56(rbp): parameters                             intptr_t*
 118   //    64(rbp): parameter size (in words)              int
 119   //    72(rbp): thread                                 Thread*
 120   //
 121   //     [ return_from_Java     ] <--- rsp
 122   //     [ argument word n      ]
 123   //      ...
 124   // -60 [ argument word 1      ]
  // -59 [ saved xmm31          ] <--- rsp_after_call
 126   //     [ saved xmm16-xmm30    ] (EVEX enabled, else the space is blank)
 127   // -27 [ saved xmm15          ]
 128   //     [ saved xmm7-xmm14     ]
 129   //  -9 [ saved xmm6           ] (each xmm register takes 2 slots)
 130   //  -7 [ saved r15            ]
 131   //  -6 [ saved r14            ]
 132   //  -5 [ saved r13            ]
 133   //  -4 [ saved r12            ]
 134   //  -3 [ saved rdi            ]
 135   //  -2 [ saved rsi            ]
 136   //  -1 [ saved rbx            ]
 137   //   0 [ saved rbp            ] <--- rbp
 138   //   1 [ return address       ]
 139   //   2 [ call wrapper         ]
 140   //   3 [ result               ]
 141   //   4 [ result type          ]
 142   //   5 [ method               ]
 143   //   6 [ entry point          ]
 144   //   7 [ parameters           ]
 145   //   8 [ parameter size       ]
 146   //   9 [ thread               ]
 147   //
  //    Windows reserves the caller's stack space for arguments 1-4.
 149   //    We spill c_rarg0-c_rarg3 to this space.
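  //
  //    For orientation, the C side reaches this stub through the CallStub function
  //    pointer type declared in stubRoutines.hpp; paraphrased from memory (see that
  //    header for the authoritative form), it is roughly:
  //
  //      void (*CallStub)(address   call_wrapper,
  //                       intptr_t* result,
  //                       BasicType result_type,
  //                       Method*   method,
  //                       address   entry_point,
  //                       intptr_t* parameters,
  //                       int       size_of_parameters,
  //                       TRAPS);
  //
  //    On Linux the first six of these arrive in c_rarg0-c_rarg5 and the rest on
  //    the stack, which is exactly the argument list documented above.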
 150 
 151   // Call stub stack layout word offsets from rbp
 152   enum call_stub_layout {
 153 #ifdef _WIN64
 154     xmm_save_first     = 6,  // save from xmm6
 155     xmm_save_last      = 31, // to xmm31
 156     xmm_save_base      = -9,
    rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -59
 158     r15_off            = -7,
 159     r14_off            = -6,
 160     r13_off            = -5,
 161     r12_off            = -4,
 162     rdi_off            = -3,
 163     rsi_off            = -2,
 164     rbx_off            = -1,
 165     rbp_off            =  0,
 166     retaddr_off        =  1,
 167     call_wrapper_off   =  2,
 168     result_off         =  3,
 169     result_type_off    =  4,
 170     method_off         =  5,
 171     entry_point_off    =  6,
 172     parameters_off     =  7,
 173     parameter_size_off =  8,
 174     thread_off         =  9
 175 #else
 176     rsp_after_call_off = -12,
 177     mxcsr_off          = rsp_after_call_off,
 178     r15_off            = -11,
 179     r14_off            = -10,
 180     r13_off            = -9,
 181     r12_off            = -8,
 182     rbx_off            = -7,
 183     call_wrapper_off   = -6,
 184     result_off         = -5,
 185     result_type_off    = -4,
 186     method_off         = -3,
 187     entry_point_off    = -2,
 188     parameters_off     = -1,
 189     rbp_off            =  0,
 190     retaddr_off        =  1,
 191     parameter_size_off =  2,
 192     thread_off         =  3
 193 #endif
 194   };
 195 
 196 #ifdef _WIN64
 197   Address xmm_save(int reg) {
 198     assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range");
 199     return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize);
 200   }
 201 #endif
 202 
 203   address generate_call_stub(address& return_address) {
 204     assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 &&
 205            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 206            "adjust this code");
 207     StubCodeMark mark(this, "StubRoutines", "call_stub");
 208     address start = __ pc();
 209 
 210     // same as in generate_catch_exception()!
 211     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 212 
 213     const Address call_wrapper  (rbp, call_wrapper_off   * wordSize);
 214     const Address result        (rbp, result_off         * wordSize);
 215     const Address result_type   (rbp, result_type_off    * wordSize);
 216     const Address method        (rbp, method_off         * wordSize);
 217     const Address entry_point   (rbp, entry_point_off    * wordSize);
 218     const Address parameters    (rbp, parameters_off     * wordSize);
 219     const Address parameter_size(rbp, parameter_size_off * wordSize);
 220 
 221     // same as in generate_catch_exception()!
 222     const Address thread        (rbp, thread_off         * wordSize);
 223 
 224     const Address r15_save(rbp, r15_off * wordSize);
 225     const Address r14_save(rbp, r14_off * wordSize);
 226     const Address r13_save(rbp, r13_off * wordSize);
 227     const Address r12_save(rbp, r12_off * wordSize);
 228     const Address rbx_save(rbp, rbx_off * wordSize);
 229 
 230     // stub code
 231     __ enter();
 232     __ subptr(rsp, -rsp_after_call_off * wordSize);
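    // rsp_after_call_off is negative, so this reserves the whole register-save area
    // laid out above in one step: 12 words on Linux, 59 words on Windows (which
    // needs room for xmm6-xmm31).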
 233 
 234     // save register parameters
 235 #ifndef _WIN64
 236     __ movptr(parameters,   c_rarg5); // parameters
 237     __ movptr(entry_point,  c_rarg4); // entry_point
 238 #endif
 239 
 240     __ movptr(method,       c_rarg3); // method
 241     __ movl(result_type,  c_rarg2);   // result type
 242     __ movptr(result,       c_rarg1); // result
 243     __ movptr(call_wrapper, c_rarg0); // call wrapper
 244 
 245     // save regs belonging to calling function
 246     __ movptr(rbx_save, rbx);
 247     __ movptr(r12_save, r12);
 248     __ movptr(r13_save, r13);
 249     __ movptr(r14_save, r14);
 250     __ movptr(r15_save, r15);
 251     if (UseAVX > 2) {
 252       __ movl(rbx, 0xffff);
 253       __ kmovwl(k1, rbx);
 254     }
 255 #ifdef _WIN64
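    // The Windows x64 ABI treats xmm6-xmm15 as non-volatile, so they must survive
    // the Java call; when AVX-512 is in use (UseAVX > 2) this stub additionally
    // preserves xmm16-xmm31 (see the stack layout comment above).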
 256     int last_reg = 15;
 257     if (UseAVX > 2) {
 258       last_reg = 31;
 259     }
 260     if (VM_Version::supports_evex()) {
 261       for (int i = xmm_save_first; i <= last_reg; i++) {
 262         __ vextractf32x4(xmm_save(i), as_XMMRegister(i), 0);
 263       }
 264     } else {
 265       for (int i = xmm_save_first; i <= last_reg; i++) {
 266         __ movdqu(xmm_save(i), as_XMMRegister(i));
 267       }
 268     }
 269 
 270     const Address rdi_save(rbp, rdi_off * wordSize);
 271     const Address rsi_save(rbp, rsi_off * wordSize);
 272 
 273     __ movptr(rsi_save, rsi);
 274     __ movptr(rdi_save, rdi);
 275 #else
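    // On Linux there are no non-volatile xmm registers to save; instead, capture the
    // caller's MXCSR and, if its control bits differ from the VM's standard value,
    // install the standard value for the duration of the Java call.  The saved value
    // is restored with ldmxcsr just before this stub returns.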
 276     const Address mxcsr_save(rbp, mxcsr_off * wordSize);
 277     {
 278       Label skip_ldmx;
 279       __ stmxcsr(mxcsr_save);
 280       __ movl(rax, mxcsr_save);
 281       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 282       ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
 283       __ cmp32(rax, mxcsr_std);
 284       __ jcc(Assembler::equal, skip_ldmx);
 285       __ ldmxcsr(mxcsr_std);
 286       __ bind(skip_ldmx);
 287     }
 288 #endif
 289 
 290     // Load up thread register
 291     __ movptr(r15_thread, thread);
 292     __ reinit_heapbase();
 293 
 294 #ifdef ASSERT
 295     // make sure we have no pending exceptions
 296     {
 297       Label L;
 298       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 299       __ jcc(Assembler::equal, L);
 300       __ stop("StubRoutines::call_stub: entered with pending exception");
 301       __ bind(L);
 302     }
 303 #endif
 304 
 305     // pass parameters if any
 306     BLOCK_COMMENT("pass parameters if any");
 307     Label parameters_done;
 308     __ movl(c_rarg3, parameter_size);
 309     __ testl(c_rarg3, c_rarg3);
 310     __ jcc(Assembler::zero, parameters_done);
 311 
 312     Label loop;
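    // Walk forward through the parameters array and push each word onto the stack;
    // c_rarg1 counts the remaining words, so the loop exits once every parameter
    // has been passed.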
 313     __ movptr(c_rarg2, parameters);       // parameter pointer
 314     __ movl(c_rarg1, c_rarg3);            // parameter counter is in c_rarg1
 315     __ BIND(loop);
 316     __ movptr(rax, Address(c_rarg2, 0));// get parameter
 317     __ addptr(c_rarg2, wordSize);       // advance to next parameter
 318     __ decrementl(c_rarg1);             // decrement counter
 319     __ push(rax);                       // pass parameter
 320     __ jcc(Assembler::notZero, loop);
 321 
 322     // call Java function
 323     __ BIND(parameters_done);
 324     __ movptr(rbx, method);             // get Method*
 325     __ movptr(c_rarg1, entry_point);    // get entry_point
 326     __ mov(r13, rsp);                   // set sender sp
 327     BLOCK_COMMENT("call Java function");
 328     __ call(c_rarg1);
 329 
 330     BLOCK_COMMENT("call_stub_return_address:");
 331     return_address = __ pc();
 332 
 333     // store result depending on type (everything that is not
 334     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 335     __ movptr(c_rarg0, result);
 336     Label is_long, is_float, is_double, exit;
 337     __ movl(c_rarg1, result_type);
 338     __ cmpl(c_rarg1, T_OBJECT);
 339     __ jcc(Assembler::equal, is_long);
 340     __ cmpl(c_rarg1, T_LONG);
 341     __ jcc(Assembler::equal, is_long);
 342     __ cmpl(c_rarg1, T_FLOAT);
 343     __ jcc(Assembler::equal, is_float);
 344     __ cmpl(c_rarg1, T_DOUBLE);
 345     __ jcc(Assembler::equal, is_double);
 346 
 347     // handle T_INT case
 348     __ movl(Address(c_rarg0, 0), rax);
 349 
 350     __ BIND(exit);
 351 
 352     // pop parameters
 353     __ lea(rsp, rsp_after_call);
 354 
 355 #ifdef ASSERT
 356     // verify that threads correspond
 357     {
      Label L1, L2, L3;
 359       __ cmpptr(r15_thread, thread);
 360       __ jcc(Assembler::equal, L1);
 361       __ stop("StubRoutines::call_stub: r15_thread is corrupted");
 362       __ bind(L1);
 363       __ get_thread(rbx);
 364       __ cmpptr(r15_thread, thread);
 365       __ jcc(Assembler::equal, L2);
 366       __ stop("StubRoutines::call_stub: r15_thread is modified by call");
 367       __ bind(L2);
 368       __ cmpptr(r15_thread, rbx);
 369       __ jcc(Assembler::equal, L3);
 370       __ stop("StubRoutines::call_stub: threads must correspond");
 371       __ bind(L3);
 372     }
 373 #endif
 374 
 375     // restore regs belonging to calling function
 376 #ifdef _WIN64
 377     // emit the restores for xmm regs
 378     if (VM_Version::supports_evex()) {
 379       for (int i = xmm_save_first; i <= last_reg; i++) {
 380         __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);
 381       }
 382     } else {
 383       for (int i = xmm_save_first; i <= last_reg; i++) {
 384         __ movdqu(as_XMMRegister(i), xmm_save(i));
 385       }
 386     }
 387 #endif
 388     __ movptr(r15, r15_save);
 389     __ movptr(r14, r14_save);
 390     __ movptr(r13, r13_save);
 391     __ movptr(r12, r12_save);
 392     __ movptr(rbx, rbx_save);
 393 
 394 #ifdef _WIN64
 395     __ movptr(rdi, rdi_save);
 396     __ movptr(rsi, rsi_save);
 397 #else
 398     __ ldmxcsr(mxcsr_save);
 399 #endif
 400 
 401     // restore rsp
 402     __ addptr(rsp, -rsp_after_call_off * wordSize);
 403 
 404     // return
 405     __ vzeroupper();
 406     __ pop(rbp);
 407     __ ret(0);
 408 
 409     // handle return types different from T_INT
 410     __ BIND(is_long);
 411     __ movq(Address(c_rarg0, 0), rax);
 412     __ jmp(exit);
 413 
 414     __ BIND(is_float);
 415     __ movflt(Address(c_rarg0, 0), xmm0);
 416     __ jmp(exit);
 417 
 418     __ BIND(is_double);
 419     __ movdbl(Address(c_rarg0, 0), xmm0);
 420     __ jmp(exit);
 421 
 422     return start;
 423   }
 424 
 425   // Return point for a Java call if there's an exception thrown in
 426   // Java code.  The exception is caught and transformed into a
 427   // pending exception stored in JavaThread that can be tested from
 428   // within the VM.
 429   //
 430   // Note: Usually the parameters are removed by the callee. In case
 431   // of an exception crossing an activation frame boundary, that is
 432   // not the case if the callee is compiled code => need to setup the
 433   // rsp.
 434   //
 435   // rax: exception oop
 436 
 437   address generate_catch_exception() {
 438     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 439     address start = __ pc();
 440 
 441     // same as in generate_call_stub():
 442     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 443     const Address thread        (rbp, thread_off         * wordSize);
 444 
 445 #ifdef ASSERT
 446     // verify that threads correspond
 447     {
 448       Label L1, L2, L3;
 449       __ cmpptr(r15_thread, thread);
 450       __ jcc(Assembler::equal, L1);
 451       __ stop("StubRoutines::catch_exception: r15_thread is corrupted");
 452       __ bind(L1);
 453       __ get_thread(rbx);
 454       __ cmpptr(r15_thread, thread);
 455       __ jcc(Assembler::equal, L2);
 456       __ stop("StubRoutines::catch_exception: r15_thread is modified by call");
 457       __ bind(L2);
 458       __ cmpptr(r15_thread, rbx);
 459       __ jcc(Assembler::equal, L3);
 460       __ stop("StubRoutines::catch_exception: threads must correspond");
 461       __ bind(L3);
 462     }
 463 #endif
 464 
 465     // set pending exception
 466     __ verify_oop(rax);
 467 
 468     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax);
 469     __ lea(rscratch1, ExternalAddress((address)__FILE__));
 470     __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
 471     __ movl(Address(r15_thread, Thread::exception_line_offset()), (int)  __LINE__);
 472 
 473     // complete return to VM
 474     assert(StubRoutines::_call_stub_return_address != NULL,
 475            "_call_stub_return_address must have been generated before");
 476     __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
 477 
 478     return start;
 479   }
 480 
 481   // Continuation point for runtime calls returning with a pending
 482   // exception.  The pending exception check happened in the runtime
 483   // or native call stub.  The pending exception in Thread is
 484   // converted into a Java-level exception.
 485   //
 486   // Contract with Java-level exception handlers:
 487   // rax: exception
 488   // rdx: throwing pc
 489   //
 490   // NOTE: At entry of this stub, exception-pc must be on stack !!
 491 
 492   address generate_forward_exception() {
 493     StubCodeMark mark(this, "StubRoutines", "forward exception");
 494     address start = __ pc();
 495 
 496     // Upon entry, the sp points to the return address returning into
 497     // Java (interpreted or compiled) code; i.e., the return address
 498     // becomes the throwing pc.
 499     //
 500     // Arguments pushed before the runtime call are still on the stack
 501     // but the exception handler will reset the stack pointer ->
 502     // ignore them.  A potential result in registers can be ignored as
 503     // well.
 504 
 505 #ifdef ASSERT
 506     // make sure this code is only executed if there is a pending exception
 507     {
 508       Label L;
 509       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t) NULL);
 510       __ jcc(Assembler::notEqual, L);
 511       __ stop("StubRoutines::forward exception: no pending exception (1)");
 512       __ bind(L);
 513     }
 514 #endif
 515 
 516     // compute exception handler into rbx
 517     __ movptr(c_rarg0, Address(rsp, 0));
 518     BLOCK_COMMENT("call exception_handler_for_return_address");
 519     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 520                          SharedRuntime::exception_handler_for_return_address),
 521                     r15_thread, c_rarg0);
 522     __ mov(rbx, rax);
 523 
 524     // setup rax & rdx, remove return address & clear pending exception
 525     __ pop(rdx);
 526     __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 527     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 528 
 529 #ifdef ASSERT
 530     // make sure exception is set
 531     {
 532       Label L;
 533       __ testptr(rax, rax);
 534       __ jcc(Assembler::notEqual, L);
 535       __ stop("StubRoutines::forward exception: no pending exception (2)");
 536       __ bind(L);
 537     }
 538 #endif
 539 
 540     // continue at exception handler (return address removed)
 541     // rax: exception
 542     // rbx: exception handler
 543     // rdx: throwing pc
 544     __ verify_oop(rax);
 545     __ jmp(rbx);
 546 
 547     return start;
 548   }
 549 
 550   // Support for jint atomic::xchg(jint exchange_value, volatile jint* dest)
 551   //
 552   // Arguments :
 553   //    c_rarg0: exchange_value
  //    c_rarg1: dest
 555   //
 556   // Result:
 557   //    *dest <- ex, return (orig *dest)
 558   address generate_atomic_xchg() {
 559     StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
 560     address start = __ pc();
 561 
    __ movl(rax, c_rarg0); // Copy to eax; we need a return value anyhow
 563     __ xchgl(rax, Address(c_rarg1, 0)); // automatic LOCK
 564     __ ret(0);
 565 
 566     return start;
 567   }
 568 
 569   // Support for intptr_t atomic::xchg_long(jlong exchange_value, volatile jlong* dest)
 570   //
 571   // Arguments :
 572   //    c_rarg0: exchange_value
 573   //    c_rarg1: dest
 574   //
 575   // Result:
 576   //    *dest <- ex, return (orig *dest)
 577   address generate_atomic_xchg_long() {
 578     StubCodeMark mark(this, "StubRoutines", "atomic_xchg_long");
 579     address start = __ pc();
 580 
    __ movptr(rax, c_rarg0); // Copy to eax; we need a return value anyhow
 582     __ xchgptr(rax, Address(c_rarg1, 0)); // automatic LOCK
 583     __ ret(0);
 584 
 585     return start;
 586   }
 587 
 588   // Support for jint atomic::atomic_cmpxchg(jint exchange_value, volatile jint* dest,
 589   //                                         jint compare_value)
 590   //
 591   // Arguments :
 592   //    c_rarg0: exchange_value
 593   //    c_rarg1: dest
 594   //    c_rarg2: compare_value
 595   //
 596   // Result:
  //    if (compare_value == *dest) {
  //      *dest = exchange_value;
  //      return compare_value;
  //    } else {
  //      return *dest;
  //    }
 602   address generate_atomic_cmpxchg() {
 603     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
 604     address start = __ pc();
 605 
 606     __ movl(rax, c_rarg2);
    if (os::is_MP()) __ lock();
 608     __ cmpxchgl(c_rarg0, Address(c_rarg1, 0));
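    // CMPXCHG compares rax (the compare_value loaded above) with *dest: on a match
    // *dest is set to c_rarg0, otherwise rax is loaded with *dest.  Either way rax
    // ends up holding the original *dest value, which is exactly the documented
    // return value.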
 609     __ ret(0);
 610 
 611     return start;
 612   }
 613 
 614   // Support for int8_t atomic::atomic_cmpxchg(int8_t exchange_value, volatile int8_t* dest,
 615   //                                           int8_t compare_value)
 616   //
 617   // Arguments :
 618   //    c_rarg0: exchange_value
 619   //    c_rarg1: dest
 620   //    c_rarg2: compare_value
 621   //
 622   // Result:
  //    if (compare_value == *dest) {
  //      *dest = exchange_value;
  //      return compare_value;
  //    } else {
  //      return *dest;
  //    }
 628   address generate_atomic_cmpxchg_byte() {
 629     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_byte");
 630     address start = __ pc();
 631 
 632     __ movsbq(rax, c_rarg2);
    if (os::is_MP()) __ lock();
 634     __ cmpxchgb(c_rarg0, Address(c_rarg1, 0));
 635     __ ret(0);
 636 
 637     return start;
 638   }
 639 
 640   // Support for int64_t atomic::atomic_cmpxchg(int64_t exchange_value,
 641   //                                            volatile int64_t* dest,
 642   //                                            int64_t compare_value)
 643   // Arguments :
 644   //    c_rarg0: exchange_value
 645   //    c_rarg1: dest
 646   //    c_rarg2: compare_value
 647   //
 648   // Result:
  //    if (compare_value == *dest) {
  //      *dest = exchange_value;
  //      return compare_value;
  //    } else {
  //      return *dest;
  //    }
 654   address generate_atomic_cmpxchg_long() {
 655     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
 656     address start = __ pc();
 657 
 658     __ movq(rax, c_rarg2);
    if (os::is_MP()) __ lock();
 660     __ cmpxchgq(c_rarg0, Address(c_rarg1, 0));
 661     __ ret(0);
 662 
 663     return start;
 664   }
 665 
 666   // Support for jint atomic::add(jint add_value, volatile jint* dest)
 667   //
 668   // Arguments :
 669   //    c_rarg0: add_value
 670   //    c_rarg1: dest
 671   //
 672   // Result:
 673   //    *dest += add_value
 674   //    return *dest;
 675   address generate_atomic_add() {
 676     StubCodeMark mark(this, "StubRoutines", "atomic_add");
 677     address start = __ pc();
 678 
 679     __ movl(rax, c_rarg0);
    if (os::is_MP()) __ lock();
 681     __ xaddl(Address(c_rarg1, 0), c_rarg0);
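    // XADD stores old + add_value to *dest and leaves the old *dest value in
    // c_rarg0; adding it to rax (which still holds add_value) reconstructs the
    // new value that this stub returns.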
 682     __ addl(rax, c_rarg0);
 683     __ ret(0);
 684 
 685     return start;
 686   }
 687 
 688   // Support for intptr_t atomic::add_ptr(intptr_t add_value, volatile intptr_t* dest)
 689   //
 690   // Arguments :
 691   //    c_rarg0: add_value
 692   //    c_rarg1: dest
 693   //
 694   // Result:
 695   //    *dest += add_value
 696   //    return *dest;
 697   address generate_atomic_add_long() {
 698     StubCodeMark mark(this, "StubRoutines", "atomic_add_long");
 699     address start = __ pc();
 700 
    __ movptr(rax, c_rarg0); // Copy to eax; we need a return value anyhow
    if (os::is_MP()) __ lock();
 703     __ xaddptr(Address(c_rarg1, 0), c_rarg0);
 704     __ addptr(rax, c_rarg0);
 705     __ ret(0);
 706 
 707     return start;
 708   }
 709 
 710   // Support for intptr_t OrderAccess::fence()
 711   //
 712   // Arguments :
 713   //
 714   // Result:
 715   address generate_orderaccess_fence() {
 716     StubCodeMark mark(this, "StubRoutines", "orderaccess_fence");
 717     address start = __ pc();
 718     __ membar(Assembler::StoreLoad);
 719     __ ret(0);
 720 
 721     return start;
 722   }
 723 
 724   // Support for intptr_t get_previous_fp()
 725   //
  // This routine is used to find the previous frame pointer for the
  // caller (current_frame_guess). It is used as part of debugging
  // when ps() is seemingly lost trying to find frames.
  // This code assumes that the caller (current_frame_guess) has a frame.
 730   address generate_get_previous_fp() {
 731     StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
 732     const Address old_fp(rbp, 0);
 733     const Address older_fp(rax, 0);
 734     address start = __ pc();
 735 
 736     __ enter();
 737     __ movptr(rax, old_fp); // callers fp
 738     __ movptr(rax, older_fp); // the frame for ps()
 739     __ pop(rbp);
 740     __ ret(0);
 741 
 742     return start;
 743   }
 744 
 745   // Support for intptr_t get_previous_sp()
 746   //
 747   // This routine is used to find the previous stack pointer for the
 748   // caller.
 749   address generate_get_previous_sp() {
 750     StubCodeMark mark(this, "StubRoutines", "get_previous_sp");
 751     address start = __ pc();
 752 
 753     __ movptr(rax, rsp);
 754     __ addptr(rax, 8); // return address is at the top of the stack.
 755     __ ret(0);
 756 
 757     return start;
 758   }
 759 
 760   //----------------------------------------------------------------------------------------------------
 761   // Support for void verify_mxcsr()
 762   //
 763   // This routine is used with -Xcheck:jni to verify that native
 764   // JNI code does not return to Java code without restoring the
 765   // MXCSR register to our expected state.
 766 
 767   address generate_verify_mxcsr() {
 768     StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
 769     address start = __ pc();
 770 
 771     const Address mxcsr_save(rsp, 0);
 772 
 773     if (CheckJNICalls) {
 774       Label ok_ret;
 775       ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
 776       __ push(rax);
 777       __ subptr(rsp, wordSize);      // allocate a temp location
 778       __ stmxcsr(mxcsr_save);
 779       __ movl(rax, mxcsr_save);
 780       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 781       __ cmp32(rax, mxcsr_std);
 782       __ jcc(Assembler::equal, ok_ret);
 783 
 784       __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall");
 785 
 786       __ ldmxcsr(mxcsr_std);
 787 
 788       __ bind(ok_ret);
 789       __ addptr(rsp, wordSize);
 790       __ pop(rax);
 791     }
 792 
 793     __ ret(0);
 794 
 795     return start;
 796   }
 797 
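  // The *_fixup stubs below correct the result of a hardware float/double to
  // int/long conversion.  cvttss2si/cvttsd2si return the "integer indefinite"
  // value (min_jint/min_jlong) for NaN and out-of-range inputs; when that happens
  // the caller spills its registers and the original operand (the 'inout' slot)
  // and calls one of these stubs, which stores back the Java-mandated answer:
  // 0 for NaN, otherwise min or max according to the operand's sign.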
 798   address generate_f2i_fixup() {
 799     StubCodeMark mark(this, "StubRoutines", "f2i_fixup");
 800     Address inout(rsp, 5 * wordSize); // return address + 4 saves
 801 
 802     address start = __ pc();
 803 
 804     Label L;
 805 
 806     __ push(rax);
 807     __ push(c_rarg3);
 808     __ push(c_rarg2);
 809     __ push(c_rarg1);
 810 
 811     __ movl(rax, 0x7f800000);
 812     __ xorl(c_rarg3, c_rarg3);
 813     __ movl(c_rarg2, inout);
 814     __ movl(c_rarg1, c_rarg2);
 815     __ andl(c_rarg1, 0x7fffffff);
 816     __ cmpl(rax, c_rarg1); // NaN? -> 0
 817     __ jcc(Assembler::negative, L);
 818     __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint
 819     __ movl(c_rarg3, 0x80000000);
 820     __ movl(rax, 0x7fffffff);
 821     __ cmovl(Assembler::positive, c_rarg3, rax);
 822 
 823     __ bind(L);
 824     __ movptr(inout, c_rarg3);
 825 
 826     __ pop(c_rarg1);
 827     __ pop(c_rarg2);
 828     __ pop(c_rarg3);
 829     __ pop(rax);
 830 
 831     __ ret(0);
 832 
 833     return start;
 834   }
 835 
 836   address generate_f2l_fixup() {
 837     StubCodeMark mark(this, "StubRoutines", "f2l_fixup");
 838     Address inout(rsp, 5 * wordSize); // return address + 4 saves
 839     address start = __ pc();
 840 
 841     Label L;
 842 
 843     __ push(rax);
 844     __ push(c_rarg3);
 845     __ push(c_rarg2);
 846     __ push(c_rarg1);
 847 
 848     __ movl(rax, 0x7f800000);
 849     __ xorl(c_rarg3, c_rarg3);
 850     __ movl(c_rarg2, inout);
 851     __ movl(c_rarg1, c_rarg2);
 852     __ andl(c_rarg1, 0x7fffffff);
 853     __ cmpl(rax, c_rarg1); // NaN? -> 0
 854     __ jcc(Assembler::negative, L);
 855     __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong
 856     __ mov64(c_rarg3, 0x8000000000000000);
 857     __ mov64(rax, 0x7fffffffffffffff);
 858     __ cmov(Assembler::positive, c_rarg3, rax);
 859 
 860     __ bind(L);
 861     __ movptr(inout, c_rarg3);
 862 
 863     __ pop(c_rarg1);
 864     __ pop(c_rarg2);
 865     __ pop(c_rarg3);
 866     __ pop(rax);
 867 
 868     __ ret(0);
 869 
 870     return start;
 871   }
 872 
 873   address generate_d2i_fixup() {
 874     StubCodeMark mark(this, "StubRoutines", "d2i_fixup");
 875     Address inout(rsp, 6 * wordSize); // return address + 5 saves
 876 
 877     address start = __ pc();
 878 
 879     Label L;
 880 
 881     __ push(rax);
 882     __ push(c_rarg3);
 883     __ push(c_rarg2);
 884     __ push(c_rarg1);
 885     __ push(c_rarg0);
 886 
 887     __ movl(rax, 0x7ff00000);
 888     __ movq(c_rarg2, inout);
 889     __ movl(c_rarg3, c_rarg2);
 890     __ mov(c_rarg1, c_rarg2);
 891     __ mov(c_rarg0, c_rarg2);
 892     __ negl(c_rarg3);
 893     __ shrptr(c_rarg1, 0x20);
 894     __ orl(c_rarg3, c_rarg2);
 895     __ andl(c_rarg1, 0x7fffffff);
 896     __ xorl(c_rarg2, c_rarg2);
 897     __ shrl(c_rarg3, 0x1f);
 898     __ orl(c_rarg1, c_rarg3);
 899     __ cmpl(rax, c_rarg1);
 900     __ jcc(Assembler::negative, L); // NaN -> 0
 901     __ testptr(c_rarg0, c_rarg0); // signed ? min_jint : max_jint
 902     __ movl(c_rarg2, 0x80000000);
 903     __ movl(rax, 0x7fffffff);
 904     __ cmov(Assembler::positive, c_rarg2, rax);
 905 
 906     __ bind(L);
 907     __ movptr(inout, c_rarg2);
 908 
 909     __ pop(c_rarg0);
 910     __ pop(c_rarg1);
 911     __ pop(c_rarg2);
 912     __ pop(c_rarg3);
 913     __ pop(rax);
 914 
 915     __ ret(0);
 916 
 917     return start;
 918   }
 919 
 920   address generate_d2l_fixup() {
 921     StubCodeMark mark(this, "StubRoutines", "d2l_fixup");
 922     Address inout(rsp, 6 * wordSize); // return address + 5 saves
 923 
 924     address start = __ pc();
 925 
 926     Label L;
 927 
 928     __ push(rax);
 929     __ push(c_rarg3);
 930     __ push(c_rarg2);
 931     __ push(c_rarg1);
 932     __ push(c_rarg0);
 933 
 934     __ movl(rax, 0x7ff00000);
 935     __ movq(c_rarg2, inout);
 936     __ movl(c_rarg3, c_rarg2);
 937     __ mov(c_rarg1, c_rarg2);
 938     __ mov(c_rarg0, c_rarg2);
 939     __ negl(c_rarg3);
 940     __ shrptr(c_rarg1, 0x20);
 941     __ orl(c_rarg3, c_rarg2);
 942     __ andl(c_rarg1, 0x7fffffff);
 943     __ xorl(c_rarg2, c_rarg2);
 944     __ shrl(c_rarg3, 0x1f);
 945     __ orl(c_rarg1, c_rarg3);
 946     __ cmpl(rax, c_rarg1);
 947     __ jcc(Assembler::negative, L); // NaN -> 0
 948     __ testq(c_rarg0, c_rarg0); // signed ? min_jlong : max_jlong
 949     __ mov64(c_rarg2, 0x8000000000000000);
 950     __ mov64(rax, 0x7fffffffffffffff);
 951     __ cmovq(Assembler::positive, c_rarg2, rax);
 952 
 953     __ bind(L);
 954     __ movq(inout, c_rarg2);
 955 
 956     __ pop(c_rarg0);
 957     __ pop(c_rarg1);
 958     __ pop(c_rarg2);
 959     __ pop(c_rarg3);
 960     __ pop(rax);
 961 
 962     __ ret(0);
 963 
 964     return start;
 965   }
 966 
 967   address generate_fp_mask(const char *stub_name, int64_t mask) {
 968     __ align(CodeEntryAlignment);
 969     StubCodeMark mark(this, "StubRoutines", stub_name);
 970     address start = __ pc();
 971 
 972     __ emit_data64( mask, relocInfo::none );
 973     __ emit_data64( mask, relocInfo::none );
 974 
 975     return start;
 976   }
 977 
 978   // Non-destructive plausibility checks for oops
 979   //
 980   // Arguments:
 981   //    all args on stack!
 982   //
 983   // Stack after saving c_rarg3:
 984   //    [tos + 0]: saved c_rarg3
 985   //    [tos + 1]: saved c_rarg2
 986   //    [tos + 2]: saved r12 (several TemplateTable methods use it)
 987   //    [tos + 3]: saved flags
 988   //    [tos + 4]: return address
 989   //  * [tos + 5]: error message (char*)
 990   //  * [tos + 6]: object to verify (oop)
 991   //  * [tos + 7]: saved rax - saved by caller and bashed
 992   //  * [tos + 8]: saved r10 (rscratch1) - saved by caller
 993   //  * = popped on exit
 994   address generate_verify_oop() {
 995     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 996     address start = __ pc();
 997 
 998     Label exit, error;
 999 
1000     __ pushf();
1001     __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
1002 
1003     __ push(r12);
1004 
1005     // save c_rarg2 and c_rarg3
1006     __ push(c_rarg2);
1007     __ push(c_rarg3);
1008 
1009     enum {
1010            // After previous pushes.
1011            oop_to_verify = 6 * wordSize,
1012            saved_rax     = 7 * wordSize,
1013            saved_r10     = 8 * wordSize,
1014 
1015            // Before the call to MacroAssembler::debug(), see below.
1016            return_addr   = 16 * wordSize,
1017            error_msg     = 17 * wordSize
1018     };
1019 
1020     // get object
1021     __ movptr(rax, Address(rsp, oop_to_verify));
1022 
1023     // make sure object is 'reasonable'
1024     __ testptr(rax, rax);
1025     __ jcc(Assembler::zero, exit); // if obj is NULL it is OK
1026     // Check if the oop is in the right area of memory
1027     __ movptr(c_rarg2, rax);
1028     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
1029     __ andptr(c_rarg2, c_rarg3);
1030     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
1031     __ cmpptr(c_rarg2, c_rarg3);
1032     __ jcc(Assembler::notZero, error);
1033 
1034     // set r12 to heapbase for load_klass()
1035     __ reinit_heapbase();
1036 
1037     // make sure klass is 'reasonable', which is not zero.
1038     __ load_klass(rax, rax);  // get klass
1039     __ testptr(rax, rax);
1040     __ jcc(Assembler::zero, error); // if klass is NULL it is broken
1041 
1042     // return if everything seems ok
1043     __ bind(exit);
1044     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1045     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1046     __ pop(c_rarg3);                             // restore c_rarg3
1047     __ pop(c_rarg2);                             // restore c_rarg2
1048     __ pop(r12);                                 // restore r12
1049     __ popf();                                   // restore flags
1050     __ ret(4 * wordSize);                        // pop caller saved stuff
1051 
1052     // handle errors
1053     __ bind(error);
1054     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1055     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1056     __ pop(c_rarg3);                             // get saved c_rarg3 back
1057     __ pop(c_rarg2);                             // get saved c_rarg2 back
1058     __ pop(r12);                                 // get saved r12 back
1059     __ popf();                                   // get saved flags off stack --
1060                                                  // will be ignored
1061 
1062     __ pusha();                                  // push registers
                                                 // (rip is already pushed)
1065     // debug(char* msg, int64_t pc, int64_t regs[])
    // We've popped the registers we'd saved (c_rarg3, c_rarg2, r12 and flags), and
1067     // pushed all the registers, so now the stack looks like:
1068     //     [tos +  0] 16 saved registers
1069     //     [tos + 16] return address
1070     //   * [tos + 17] error message (char*)
1071     //   * [tos + 18] object to verify (oop)
1072     //   * [tos + 19] saved rax - saved by caller and bashed
1073     //   * [tos + 20] saved r10 (rscratch1) - saved by caller
1074     //   * = popped on exit
1075 
1076     __ movptr(c_rarg0, Address(rsp, error_msg));    // pass address of error message
1077     __ movptr(c_rarg1, Address(rsp, return_addr));  // pass return address
1078     __ movq(c_rarg2, rsp);                          // pass address of regs on stack
1079     __ mov(r12, rsp);                               // remember rsp
1080     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
1081     __ andptr(rsp, -16);                            // align stack as required by ABI
1082     BLOCK_COMMENT("call MacroAssembler::debug");
1083     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
1084     __ mov(rsp, r12);                               // restore rsp
1085     __ popa();                                      // pop registers (includes r12)
1086     __ ret(4 * wordSize);                           // pop caller saved stuff
1087 
1088     return start;
1089   }
1090 
1091   //
  // Verify that a register contains a clean 32-bit positive value
  // (the high 32 bits are 0) so it can be used in 64-bit shifts.
  //
  //  Input:
  //    Rint  -  32-bit value
1097   //    Rtmp  -  scratch
1098   //
1099   void assert_clean_int(Register Rint, Register Rtmp) {
1100 #ifdef ASSERT
1101     Label L;
1102     assert_different_registers(Rtmp, Rint);
1103     __ movslq(Rtmp, Rint);
1104     __ cmpq(Rtmp, Rint);
1105     __ jcc(Assembler::equal, L);
1106     __ stop("high 32-bits of int value are not 0");
1107     __ bind(L);
1108 #endif
1109   }
1110 
1111   //  Generate overlap test for array copy stubs
1112   //
1113   //  Input:
1114   //     c_rarg0 - from
1115   //     c_rarg1 - to
1116   //     c_rarg2 - element count
1117   //
1118   //  Output:
  //     rax   - &from[element count], i.e. one past the last source element
1120   //
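  //  The test branches to the no-overlap target when 'to' <= 'from' (a forward copy
  //  cannot overwrite unread source data) or when 'to' >= &from[count] (the regions
  //  are disjoint); otherwise it falls through to the conjoint (backward) copy code.
  //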
1121   void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
1122     assert(no_overlap_target != NULL, "must be generated");
1123     array_overlap_test(no_overlap_target, NULL, sf);
1124   }
1125   void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
1126     array_overlap_test(NULL, &L_no_overlap, sf);
1127   }
1128   void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
1129     const Register from     = c_rarg0;
1130     const Register to       = c_rarg1;
1131     const Register count    = c_rarg2;
1132     const Register end_from = rax;
1133 
1134     __ cmpptr(to, from);
1135     __ lea(end_from, Address(from, count, sf, 0));
1136     if (NOLp == NULL) {
1137       ExternalAddress no_overlap(no_overlap_target);
1138       __ jump_cc(Assembler::belowEqual, no_overlap);
1139       __ cmpptr(to, end_from);
1140       __ jump_cc(Assembler::aboveEqual, no_overlap);
1141     } else {
1142       __ jcc(Assembler::belowEqual, (*NOLp));
1143       __ cmpptr(to, end_from);
1144       __ jcc(Assembler::aboveEqual, (*NOLp));
1145     }
1146   }
1147 
1148   // Shuffle first three arg regs on Windows into Linux/Solaris locations.
1149   //
1150   // Outputs:
1151   //    rdi - rcx
1152   //    rsi - rdx
1153   //    rdx - r8
1154   //    rcx - r9
1155   //
  // Registers r9 and r10 are used to save rdi and rsi on Windows, where the
  // latter two are non-volatile.  r9 and r10 should not be used by the caller.
1158   //
1159   void setup_arg_regs(int nargs = 3) {
1160     const Register saved_rdi = r9;
1161     const Register saved_rsi = r10;
1162     assert(nargs == 3 || nargs == 4, "else fix");
1163 #ifdef _WIN64
1164     assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1165            "unexpected argument registers");
1166     if (nargs >= 4)
1167       __ mov(rax, r9);  // r9 is also saved_rdi
1168     __ movptr(saved_rdi, rdi);
1169     __ movptr(saved_rsi, rsi);
1170     __ mov(rdi, rcx); // c_rarg0
1171     __ mov(rsi, rdx); // c_rarg1
1172     __ mov(rdx, r8);  // c_rarg2
1173     if (nargs >= 4)
1174       __ mov(rcx, rax); // c_rarg3 (via rax)
1175 #else
1176     assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1177            "unexpected argument registers");
1178 #endif
1179   }
1180 
1181   void restore_arg_regs() {
1182     const Register saved_rdi = r9;
1183     const Register saved_rsi = r10;
1184 #ifdef _WIN64
1185     __ movptr(rdi, saved_rdi);
1186     __ movptr(rsi, saved_rsi);
1187 #endif
1188   }
1189 
1190   // Generate code for an array write pre barrier
1191   //
1192   //     addr    -  starting address
1193   //     count   -  element count
1194   //     tmp     - scratch register
1195   //
1196   //     Destroy no registers!
1197   //
1198   void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
1199     BarrierSet* bs = Universe::heap()->barrier_set();
1200     switch (bs->kind()) {
1201       case BarrierSet::G1SATBCTLogging:
        // With G1, don't generate the call if we statically know that the target is uninitialized
1203         if (!dest_uninitialized) {
1204            __ pusha();                      // push registers
1205            if (count == c_rarg0) {
1206              if (addr == c_rarg1) {
1207                // exactly backwards!!
1208                __ xchgptr(c_rarg1, c_rarg0);
1209              } else {
1210                __ movptr(c_rarg1, count);
1211                __ movptr(c_rarg0, addr);
1212              }
1213            } else {
1214              __ movptr(c_rarg0, addr);
1215              __ movptr(c_rarg1, count);
1216            }
1217            __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
1218            __ popa();
1219         }
1220          break;
1221       case BarrierSet::CardTableForRS:
1222       case BarrierSet::CardTableExtension:
1223       case BarrierSet::ModRef:
1224         break;
1225       default:
1226         ShouldNotReachHere();
1227 
1228     }
1229   }
1230 
1231   //
1232   // Generate code for an array write post barrier
1233   //
1234   //  Input:
1235   //     start    - register containing starting address of destination array
1236   //     count    - elements count
1237   //     scratch  - scratch register
1238   //
1239   //  The input registers are overwritten.
1240   //
1241   void  gen_write_ref_array_post_barrier(Register start, Register count, Register scratch) {
1242     assert_different_registers(start, count, scratch);
1243     BarrierSet* bs = Universe::heap()->barrier_set();
1244     switch (bs->kind()) {
1245       case BarrierSet::G1SATBCTLogging:
1246         {
1247           __ pusha();             // push registers (overkill)
1248           if (c_rarg0 == count) { // On win64 c_rarg0 == rcx
1249             assert_different_registers(c_rarg1, start);
1250             __ mov(c_rarg1, count);
1251             __ mov(c_rarg0, start);
1252           } else {
1253             assert_different_registers(c_rarg0, count);
1254             __ mov(c_rarg0, start);
1255             __ mov(c_rarg1, count);
1256           }
1257           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
1258           __ popa();
1259         }
1260         break;
1261       case BarrierSet::CardTableForRS:
1262       case BarrierSet::CardTableExtension:
1263         {
1264           CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
1265           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
1266 
1267           Label L_loop, L_done;
1268           const Register end = count;
1269 
1270           __ testl(count, count);
1271           __ jcc(Assembler::zero, L_done); // zero count - nothing to do
1272 
1273           __ leaq(end, Address(start, count, TIMES_OOP, 0));  // end == start+count*oop_size
1274           __ subptr(end, BytesPerHeapOop); // end - 1 to make inclusive
1275           __ shrptr(start, CardTableModRefBS::card_shift);
1276           __ shrptr(end,   CardTableModRefBS::card_shift);
1277           __ subptr(end, start); // end --> cards count
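          // start/end are now card indices.  Each card covers 2^card_shift bytes of
          // heap (512 with the usual shift of 9); the loop below writes a zero
          // ("dirty") byte into the card table entry for every card spanned by the
          // destination range.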
1278 
1279           int64_t disp = (int64_t) ct->byte_map_base;
1280           __ mov64(scratch, disp);
1281           __ addptr(start, scratch);
1282         __ BIND(L_loop);
1283           __ movb(Address(start, count, Address::times_1), 0);
1284           __ decrement(count);
1285           __ jcc(Assembler::greaterEqual, L_loop);
1286         __ BIND(L_done);
1287         }
1288         break;
1289       default:
1290         ShouldNotReachHere();
1291 
1292     }
1293   }
1294 
1295 
1296   // Copy big chunks forward
1297   //
1298   // Inputs:
  //   end_from     - source array's end address
  //   end_to       - destination array's end address
  //   qword_count  - 64-bit element count, negative
1302   //   to           - scratch
1303   //   L_copy_bytes - entry label
1304   //   L_copy_8_bytes  - exit  label
1305   //
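  //  qword_count is negative and is incremented toward zero, so addressing of the
  //  form Address(end_from, qword_count, times_8, disp) starts below the end
  //  pointers and walks forward through the data without a separate index register.
  //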
1306   void copy_bytes_forward(Register end_from, Register end_to,
1307                              Register qword_count, Register to,
1308                              Label& L_copy_bytes, Label& L_copy_8_bytes) {
1309     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1310     Label L_loop;
1311     __ align(OptoLoopAlignment);
1312     if (UseUnalignedLoadStores) {
1313       Label L_end;
1314       if (UseAVX > 2) {
1315         __ movl(to, 0xffff);
1316         __ kmovwl(k1, to);
1317       }
1318       // Copy 64-bytes per iteration
1319       __ BIND(L_loop);
1320       if (UseAVX > 2) {
1321         __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
1322         __ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
1323       } else if (UseAVX == 2) {
1324         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1325         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1326         __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
1327         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
1328       } else {
1329         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1330         __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1331         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
1332         __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
1333         __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
1334         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
1335         __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
1336         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
1337       }
1338       __ BIND(L_copy_bytes);
1339       __ addptr(qword_count, 8);
1340       __ jcc(Assembler::lessEqual, L_loop);
1341       __ subptr(qword_count, 4);  // sub(8) and add(4)
1342       __ jccb(Assembler::greater, L_end);
1343       // Copy trailing 32 bytes
1344       if (UseAVX >= 2) {
1345         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1346         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1347       } else {
1348         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1349         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1350         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
1351         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
1352       }
1353       __ addptr(qword_count, 4);
1354       __ BIND(L_end);
1355       if (UseAVX >= 2) {
1356         // clean upper bits of YMM registers
1357         __ vpxor(xmm0, xmm0);
1358         __ vpxor(xmm1, xmm1);
1359       }
1360     } else {
1361       // Copy 32-bytes per iteration
1362       __ BIND(L_loop);
1363       __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
1364       __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
1365       __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
1366       __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
1367       __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
1368       __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
1369       __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
1370       __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
1371 
1372       __ BIND(L_copy_bytes);
1373       __ addptr(qword_count, 4);
1374       __ jcc(Assembler::lessEqual, L_loop);
1375     }
1376     __ subptr(qword_count, 4);
1377     __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
1378   }
1379 
1380   // Copy big chunks backward
1381   //
1382   // Inputs:
  //   from         - source array's address
  //   dest         - destination array's address
  //   qword_count  - 64-bit element count
1386   //   to           - scratch
1387   //   L_copy_bytes - entry label
1388   //   L_copy_8_bytes  - exit  label
1389   //
1390   void copy_bytes_backward(Register from, Register dest,
1391                               Register qword_count, Register to,
1392                               Label& L_copy_bytes, Label& L_copy_8_bytes) {
1393     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1394     Label L_loop;
1395     __ align(OptoLoopAlignment);
1396     if (UseUnalignedLoadStores) {
1397       Label L_end;
1398       if (UseAVX > 2) {
1399         __ movl(to, 0xffff);
1400         __ kmovwl(k1, to);
1401       }
1402       // Copy 64-bytes per iteration
1403       __ BIND(L_loop);
1404       if (UseAVX > 2) {
1405         __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit);
1406         __ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit);
1407       } else if (UseAVX == 2) {
1408         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
1409         __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
1410         __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1411         __ vmovdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1412       } else {
1413         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
1414         __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
1415         __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
1416         __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
1417         __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
1418         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
1419         __ movdqu(xmm3, Address(from, qword_count, Address::times_8,  0));
1420         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm3);
1421       }
1422       __ BIND(L_copy_bytes);
1423       __ subptr(qword_count, 8);
1424       __ jcc(Assembler::greaterEqual, L_loop);
1425 
1426       __ addptr(qword_count, 4);  // add(8) and sub(4)
1427       __ jccb(Assembler::less, L_end);
1428       // Copy trailing 32 bytes
1429       if (UseAVX >= 2) {
1430         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
1431         __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
1432       } else {
1433         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
1434         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
1435         __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1436         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1437       }
1438       __ subptr(qword_count, 4);
1439       __ BIND(L_end);
1440       if (UseAVX >= 2) {
1441         // clean upper bits of YMM registers
1442         __ vpxor(xmm0, xmm0);
1443         __ vpxor(xmm1, xmm1);
1444       }
1445     } else {
1446       // Copy 32-bytes per iteration
1447       __ BIND(L_loop);
1448       __ movq(to, Address(from, qword_count, Address::times_8, 24));
1449       __ movq(Address(dest, qword_count, Address::times_8, 24), to);
1450       __ movq(to, Address(from, qword_count, Address::times_8, 16));
1451       __ movq(Address(dest, qword_count, Address::times_8, 16), to);
1452       __ movq(to, Address(from, qword_count, Address::times_8,  8));
1453       __ movq(Address(dest, qword_count, Address::times_8,  8), to);
1454       __ movq(to, Address(from, qword_count, Address::times_8,  0));
1455       __ movq(Address(dest, qword_count, Address::times_8,  0), to);
1456 
1457       __ BIND(L_copy_bytes);
1458       __ subptr(qword_count, 4);
1459       __ jcc(Assembler::greaterEqual, L_loop);
1460     }
1461     __ addptr(qword_count, 4);
1462     __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
1463   }
1464 
1465 
1466   // Arguments:
1467   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1468   //             ignored
1469   //   name    - stub name string
1470   //
1471   // Inputs:
1472   //   c_rarg0   - source array address
1473   //   c_rarg1   - destination array address
1474   //   c_rarg2   - element count, treated as ssize_t, can be zero
1475   //
1476   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1477   // we let the hardware handle it.  The one to eight bytes within words,
1478   // dwords or qwords that span cache line boundaries will still be loaded
1479   // and stored atomically.
1480   //
1481   // Side Effects:
1482   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1483   //   used by generate_conjoint_byte_copy().
1484   //
1485   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1486     __ align(CodeEntryAlignment);
1487     StubCodeMark mark(this, "StubRoutines", name);
1488     address start = __ pc();
1489 
1490     Label L_copy_qwords, L_copy_7bytes_or_less, L_copy_3bytes_or_less, L_copy_1byte_or_less, L_exit;
1491     const Register from        = rdi;  // source array address
1492     const Register to          = rsi;  // destination array address
1493     const Register count       = rdx;  // elements count
1494     const Register byte_count  = rcx;
1495     const Register qword_count = count;
1496     const Register end_from    = from; // source array end address
1497     const Register end_to      = to;   // destination array end address
1498     // End pointers are inclusive, and if count is not zero they point
1499     // to the last unit copied:  end_to[0] := end_from[0]
1500 
1501     __ enter(); // required for proper stackwalking of RuntimeStub frame
1502     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1503 
1504     if (entry != NULL) {
1505       *entry = __ pc();
1506        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1507       BLOCK_COMMENT("Entry:");
1508     }
1509 
1510     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1511                       // r9 and r10 may be used to save non-volatile registers
1512 
1513     // 'from', 'to' and 'count' are now valid
1514 
1515     guarantee(UseAVX >= 2, "Experimental code");
1516 
1517     Label L_prepare_bulk_align;
1518     Label L_tail_32, L_tail_16, L_tail_8, L_tail_4, L_tail_end;
1519     Label L_tail_nozero_32, L_tail_nozero_16, L_tail_nozero_8, L_tail_nozero_4, L_tail_nozero_end;
1520 
1521     __ movptr(byte_count, count);
1522 
1523     // If there is less than a qword to copy, don't bother with any magic;
1524     // fall through to the byte tail copy.
1525     __ cmpptr(byte_count, 8);
1526     __ jcc(Assembler::less, L_copy_7bytes_or_less);
1527 
1528     // If greater than 64 bytes, then it makes sense to prepare and go to
1529     // the aligned copy.
1530     __ cmpptr(byte_count, 64);
1531     __ jccb(Assembler::greater, L_prepare_bulk_align);
1532 
1533     // At most 64 bytes (8 qwords) => jump to the qword copy tail.
1534     // This requires preparing the qword_count and src/dst addresses:
1535     __ movptr(qword_count, byte_count);
1536     __ shrptr(qword_count, 3);
1537     __ lea(end_from, Address(from, qword_count, Address::times_8));
1538     __ lea(end_to,   Address(to,   qword_count, Address::times_8));
1539     __ negptr(qword_count);
1540     __ jmp(L_copy_qwords);
1541 
1542     // Pre-align slide: do enough individual copies to align the destination to 32 bytes.
1543     // At this point we know there are enough elements to reach the required
1544     // alignment, so there is no need to check byte_count.
1545     __ BIND(L_prepare_bulk_align);
1546 
1547     Label L_adjust_2byte, L_adjust_4byte, L_adjust_8byte, L_adjust_16byte, L_adjust_done;
1548 
1549     __ lea(rscratch2, Address(to, 0));
1550     __ andptr(rscratch2, 31);
1551     __ subptr(rscratch2, 32);
1552     __ negptr(rscratch2);
1553     __ andptr(rscratch2, 31);
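         // rscratch2 = (32 - (to & 31)) & 31: the number of bytes needed to make
         // 'to' 32-byte aligned (zero if it already is).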
1554 
1555     // The pre-slide below consumes these excess bytes. Adjust the byte count here;
1556     // from/to get adjusted during the pre-slide itself.
1557     __ subptr(byte_count, rscratch2);
1558 
1559     __ testptr(rscratch2, 1);
1560     __ jccb(Assembler::zero, L_adjust_2byte);
1561     __ movb(rax, Address(from, 0));
1562     __ movb(Address(to, 0), rax);
1563     __ addptr(from, 1);
1564     __ addptr(to, 1);
1565 
1566     __ BIND(L_adjust_2byte);
1567     __ testptr(rscratch2, 2);
1568     __ jccb(Assembler::zero, L_adjust_4byte);
1569     __ movw(rax, Address(from, 0));
1570     __ movw(Address(to, 0), rax);
1571     __ addptr(from, 2);
1572     __ addptr(to, 2);
1573 
1574     __ BIND(L_adjust_4byte);
1575     __ testptr(rscratch2, 4);
1576     __ jccb(Assembler::zero, L_adjust_8byte);
1577     __ movl(rax, Address(from, 0));
1578     __ movl(Address(to, 0), rax);
1579     __ addptr(from, 4);
1580     __ addptr(to, 4);
1581 
1582     __ BIND(L_adjust_8byte);
1583     __ testptr(rscratch2, 8);
1584     __ jccb(Assembler::zero, L_adjust_16byte);
1585     __ movq(rax, Address(from, 0));
1586     __ movq(Address(to, 0), rax);
1587     __ addptr(from, 8);
1588     __ addptr(to, 8);
1589 
1590     __ BIND(L_adjust_16byte);
1591     __ testptr(rscratch2, 16);
1592     __ jccb(Assembler::zero, L_adjust_done);
1593     __ movq(rax, Address(from, 0));
1594     __ movq(Address(to, 0), rax);
1595     __ movq(rax, Address(from, 8));
1596     __ movq(Address(to, 8), rax);
1597     __ addptr(from, 16);
1598     __ addptr(to, 16);
1599 
1600     __ BIND(L_adjust_done);
1601 
1602     // Pre-slide done! At this point, the destination is guaranteed to be 32-byte
1603     // aligned, which allows the bulk copies below to use aligned stores.
1604 
1605     // Prepare qword count and src/dst addresses
1606     __ movptr(qword_count, byte_count);
1607     __ shrptr(qword_count, 3);
1608     __ lea(end_from, Address(from, qword_count, Address::times_8));
1609     __ lea(end_to,   Address(to,   qword_count, Address::times_8));
1610     __ negptr(qword_count);
1611 
1612     // Medium-sized arrays benefit from skipping the larger bulk stores.
1613     // Try to enter at the appropriate bulk tail: this avoids rushing through
1614     // a size-checking maze, and avoids unnecessary zeroing of the xmm/ymm
1615     // registers.
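         // qword_count is negative here. The adds below progressively bias it by 4, 8,
         // 16, 32 and 64 qwords; a 'greater' result means fewer than that many qwords
         // remain, so the copy enters at the matching tail and skips the larger stages.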
1616     __ addptr(qword_count, 4);
1617     __ jcc(Assembler::greater, L_tail_nozero_end);
1618 
1619     __ addptr(qword_count, 4); // sub(4), add(8)
1620     __ jcc(Assembler::greater, L_tail_nozero_4);
1621 
1622     __ addptr(qword_count, 8); // sub(8), add(16)
1623     __ jcc(Assembler::greater, L_tail_nozero_8);
1624 
1625     __ addptr(qword_count, 16); // sub(16), add(32)
1626     __ jcc(Assembler::greater, L_tail_nozero_16);
1627 
1628     __ addptr(qword_count, 32); // sub(32), add(64)
1629     __ jcc(Assembler::greater, L_tail_nozero_32);
1630 
1631     // Massively parallel copy (default): moves 512 bytes (64 qwords) per iteration
1632     Label L_bulk_loop_default;
1633     __ align(OptoLoopAlignment);
1634     __ BIND(L_bulk_loop_default);
1635       __ lea(rscratch1, Address(end_from, qword_count, Address::times_8));
1636 
1637       // Remarkably, doing a single pair of 16-byte accesses helps performance:
1638       // RESOURCE_STALLS falls abruptly. Extending this trick to all other loads
1639       // degrades performance. :/
1640       __ movdqu(xmm15, Address(rscratch1, -512));
1641       __ vinserti128(xmm15, xmm15, Address(rscratch1, -496), 1);
1642 
1643                                                    __ vmovdqu(xmm14, Address(rscratch1, -480));
1644       __ vmovdqu(xmm13, Address(rscratch1, -448)); __ vmovdqu(xmm12, Address(rscratch1, -416));
1645       __ vmovdqu(xmm11, Address(rscratch1, -384)); __ vmovdqu(xmm10, Address(rscratch1, -352));
1646       __ vmovdqu(xmm9,  Address(rscratch1, -320)); __ vmovdqu(xmm8,  Address(rscratch1, -288));
1647       __ vmovdqu(xmm7,  Address(rscratch1, -256)); __ vmovdqu(xmm6,  Address(rscratch1, -224));
1648       __ vmovdqu(xmm5,  Address(rscratch1, -192)); __ vmovdqu(xmm4,  Address(rscratch1, -160));
1649       __ vmovdqu(xmm3,  Address(rscratch1, -128)); __ vmovdqu(xmm2,  Address(rscratch1,  -96));
1650       __ vmovdqu(xmm1,  Address(rscratch1,  -64)); __ vmovdqu(xmm0,  Address(rscratch1,  -32));
1651 
1652       __ lea(rscratch2, Address(end_to,   qword_count, Address::times_8));
1653       __ vmovdqa(Address(rscratch2, -512), xmm15); __ vmovdqa(Address(rscratch2, -480), xmm14);
1654       __ vmovdqa(Address(rscratch2, -448), xmm13); __ vmovdqa(Address(rscratch2, -416), xmm12);
1655       __ vmovdqa(Address(rscratch2, -384), xmm11); __ vmovdqa(Address(rscratch2, -352), xmm10);
1656       __ vmovdqa(Address(rscratch2, -320), xmm9);  __ vmovdqa(Address(rscratch2, -288), xmm8);
1657       __ vmovdqa(Address(rscratch2, -256), xmm7);  __ vmovdqa(Address(rscratch2, -224), xmm6);
1658       __ vmovdqa(Address(rscratch2, -192), xmm5);  __ vmovdqa(Address(rscratch2, -160), xmm4);
1659       __ vmovdqa(Address(rscratch2, -128), xmm3);  __ vmovdqa(Address(rscratch2,  -96), xmm2);
1660       __ vmovdqa(Address(rscratch2,  -64), xmm1);  __ vmovdqa(Address(rscratch2,  -32), xmm0);
1661 
1662       __ addptr(qword_count, 64);
1663       __ jcc(Assembler::lessEqual, L_bulk_loop_default);
1664 
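         // Each L_tail_N entry zeroes the ymm registers dirtied by the larger copy
         // stages above it; the L_tail_nozero_N entries are used when entering straight
         // from the size dispatch, where those registers are still clean.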
1665     __ BIND(L_tail_32);
1666     __ vpxor(xmm15, xmm15);
1667     __ vpxor(xmm14, xmm14);
1668     __ vpxor(xmm13, xmm13);
1669     __ vpxor(xmm12, xmm12);
1670     __ vpxor(xmm11, xmm11);
1671     __ vpxor(xmm10, xmm10);
1672     __ vpxor(xmm9, xmm9);
1673     __ vpxor(xmm8, xmm8);
1674     __ BIND(L_tail_nozero_32);
1675 
1676     // Copy the remaining qwords in progressively smaller bulk chunks:
1677     __ subptr(qword_count, 32); // sub(64), add(32)
1678     __ jcc(Assembler::greater, L_tail_16);
1679     __ lea(rscratch1, Address(end_from, qword_count, Address::times_8));
1680     __ lea(rscratch2, Address(end_to,   qword_count, Address::times_8));
1681     __ vmovdqu(xmm7, Address(rscratch1, -256)); __ vmovdqu(xmm6, Address(rscratch1, -224));
1682     __ vmovdqu(xmm5, Address(rscratch1, -192)); __ vmovdqu(xmm4, Address(rscratch1, -160));
1683     __ vmovdqu(xmm3, Address(rscratch1, -128)); __ vmovdqu(xmm2, Address(rscratch1,  -96));
1684     __ vmovdqu(xmm1, Address(rscratch1,  -64)); __ vmovdqu(xmm0, Address(rscratch1,  -32));
1685     __ vmovdqa(Address(rscratch2, -256), xmm7); __ vmovdqa(Address(rscratch2, -224), xmm6);
1686     __ vmovdqa(Address(rscratch2, -192), xmm5); __ vmovdqa(Address(rscratch2, -160), xmm4);
1687     __ vmovdqa(Address(rscratch2, -128), xmm3); __ vmovdqa(Address(rscratch2,  -96), xmm2);
1688     __ vmovdqa(Address(rscratch2,  -64), xmm1); __ vmovdqa(Address(rscratch2,  -32), xmm0);
1689     __ addptr(qword_count, 32);
1690 
1691     __ BIND(L_tail_16);
1692     __ vpxor(xmm7, xmm7);
1693     __ vpxor(xmm6, xmm6);
1694     __ vpxor(xmm5, xmm5);
1695     __ vpxor(xmm4, xmm4);
1696     __ BIND(L_tail_nozero_16);
1697 
1698     __ subptr(qword_count, 16); // sub(32), add(16)
1699     __ jcc(Assembler::greater, L_tail_8);
1700     __ lea(rscratch1, Address(end_from, qword_count, Address::times_8));
1701     __ lea(rscratch2, Address(end_to,   qword_count, Address::times_8));
1702     __ vmovdqu(xmm3, Address(rscratch1, -128)); __ vmovdqu(xmm2, Address(rscratch1,  -96));
1703     __ vmovdqu(xmm1, Address(rscratch1,  -64)); __ vmovdqu(xmm0, Address(rscratch1,  -32));
1704     __ vmovdqa(Address(rscratch2, -128), xmm3); __ vmovdqa(Address(rscratch2,  -96), xmm2);
1705     __ vmovdqa(Address(rscratch2,  -64), xmm1); __ vmovdqa(Address(rscratch2,  -32), xmm0);
1706     __ addptr(qword_count, 16);
1707 
1708     __ BIND(L_tail_8);
1709     __ vpxor(xmm3, xmm3);
1710     __ vpxor(xmm2, xmm2);
1711     __ BIND(L_tail_nozero_8);
1712 
1713     __ subptr(qword_count, 8); // sub(16), add(8)
1714     __ jcc(Assembler::greater, L_tail_4);
1715     __ lea(rscratch1, Address(end_from, qword_count, Address::times_8));
1716     __ lea(rscratch2, Address(end_to,   qword_count, Address::times_8));
1717     __ vmovdqu(xmm1, Address(rscratch1, -64)); __ vmovdqu(xmm0, Address(rscratch1, -32));
1718     __ vmovdqa(Address(rscratch2, -64), xmm1); __ vmovdqa(Address(rscratch2, -32), xmm0);
1719     __ addptr(qword_count, 8);
1720 
1721     __ BIND(L_tail_4);
1722     __ vpxor(xmm1, xmm1);
1723     __ BIND(L_tail_nozero_4);
1724 
1725     __ subptr(qword_count, 4); // sub(8), add(4)
1726     __ jcc(Assembler::greater, L_tail_end);
1727     __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -32));
1728     __ vmovdqa(Address(end_to, qword_count, Address::times_8, -32), xmm0);
1729     __ addptr(qword_count, 4);
1730 
1731     __ BIND(L_tail_end);
1732     __ vpxor(xmm0, xmm0);
1733     __ BIND(L_tail_nozero_end);
1734 
1735     __ subptr(qword_count, 4);
1736     __ jcc(Assembler::zero, L_copy_7bytes_or_less);
1737 
1738     // Copy trailing qwords
1739   __ BIND(L_copy_qwords);
1740     __ movq(rax, Address(end_from, qword_count, Address::times_8));
1741     __ movq(Address(end_to, qword_count, Address::times_8), rax);
1742     __ increment(qword_count);
1743     __ jccb(Assembler::notZero, L_copy_qwords);
1744 
1745     // Check for and copy trailing dword
1746   __ BIND(L_copy_7bytes_or_less);
1747     __ testptr(byte_count, 4);
1748     __ jccb(Assembler::zero, L_copy_3bytes_or_less);
1749     __ movl(rax, Address(end_from, 0));
1750     __ movl(Address(end_to, 0), rax);
1751     __ addptr(end_from, 4);
1752     __ addptr(end_to, 4);
1753 
1754     // Check for and copy trailing word
1755   __ BIND(L_copy_3bytes_or_less);
1756     __ testptr(byte_count, 2);
1757     __ jccb(Assembler::zero, L_copy_1byte_or_less);
1758     __ movw(rax, Address(end_from, 0));
1759     __ movw(Address(end_to, 0), rax);
1760     __ addptr(end_from, 2);
1761     __ addptr(end_to, 2);
1762 
1763     // Check for and copy trailing byte
1764   __ BIND(L_copy_1byte_or_less);
1765     __ testptr(byte_count, 1);
1766     __ jccb(Assembler::zero, L_exit);
1767     __ movb(rax, Address(end_from, 0));
1768     __ movb(Address(end_to, 0), rax);
1769 
1770   __ BIND(L_exit);
1771     restore_arg_regs();
1772     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1773     __ xorptr(rax, rax); // return 0
1774     __ vzeroupper();
1775     __ leave(); // required for proper stackwalking of RuntimeStub frame
1776     __ ret(0);
1777 
1778     return start;
1779   }
1780 
1781   // Arguments:
1782   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1783   //             ignored
1784   //   name    - stub name string
1785   //
1786   // Inputs:
1787   //   c_rarg0   - source array address
1788   //   c_rarg1   - destination array address
1789   //   c_rarg2   - element count, treated as ssize_t, can be zero
1790   //
1791   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1792   // we let the hardware handle it.  The one to eight bytes within words,
1793   // dwords or qwords that span cache line boundaries will still be loaded
1794   // and stored atomically.
1795   //
1796   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1797                                       address* entry, const char *name) {
1798     __ align(CodeEntryAlignment);
1799     StubCodeMark mark(this, "StubRoutines", name);
1800     address start = __ pc();
1801 
1802     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1803     const Register from        = rdi;  // source array address
1804     const Register to          = rsi;  // destination array address
1805     const Register count       = rdx;  // elements count
1806     const Register byte_count  = rcx;
1807     const Register qword_count = count;
1808 
1809     __ enter(); // required for proper stackwalking of RuntimeStub frame
1810     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1811 
1812     if (entry != NULL) {
1813       *entry = __ pc();
1814       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1815       BLOCK_COMMENT("Entry:");
1816     }
1817 
1818     array_overlap_test(nooverlap_target, Address::times_1);
1819     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1820                       // r9 and r10 may be used to save non-volatile registers
1821 
1822     // 'from', 'to' and 'count' are now valid
1823     __ movptr(byte_count, count);
1824     __ shrptr(count, 3);   // count => qword_count
1825 
1826     // Copy from high to low addresses.
1827 
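         // Because the copy runs backward, the 1-, 2- and 4-byte remainders at the
         // high end are peeled off first; the remaining qwords are then copied in bulk
         // chunks with a single-qword tail.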
1828     // Check for and copy trailing byte
1829     __ testl(byte_count, 1);
1830     __ jcc(Assembler::zero, L_copy_2_bytes);
1831     __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1832     __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1833     __ decrement(byte_count); // Adjust for possible trailing word
1834 
1835     // Check for and copy trailing word
1836   __ BIND(L_copy_2_bytes);
1837     __ testl(byte_count, 2);
1838     __ jcc(Assembler::zero, L_copy_4_bytes);
1839     __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1840     __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1841 
1842     // Check for and copy trailing dword
1843   __ BIND(L_copy_4_bytes);
1844     __ testl(byte_count, 4);
1845     __ jcc(Assembler::zero, L_copy_bytes);
1846     __ movl(rax, Address(from, qword_count, Address::times_8));
1847     __ movl(Address(to, qword_count, Address::times_8), rax);
1848     __ jmp(L_copy_bytes);
1849 
1850     // Copy trailing qwords
1851   __ BIND(L_copy_8_bytes);
1852     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1853     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1854     __ decrement(qword_count);
1855     __ jcc(Assembler::notZero, L_copy_8_bytes);
1856 
1857     restore_arg_regs();
1858     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1859     __ xorptr(rax, rax); // return 0
1860     __ vzeroupper();
1861     __ leave(); // required for proper stackwalking of RuntimeStub frame
1862     __ ret(0);
1863 
1864     // Copy in multi-byte chunks
1865     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1866 
1867     restore_arg_regs();
1868     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1869     __ xorptr(rax, rax); // return 0
1870     __ vzeroupper();
1871     __ leave(); // required for proper stackwalking of RuntimeStub frame
1872     __ ret(0);
1873 
1874     return start;
1875   }
1876 
1877   // Arguments:
1878   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1879   //             ignored
1880   //   name    - stub name string
1881   //
1882   // Inputs:
1883   //   c_rarg0   - source array address
1884   //   c_rarg1   - destination array address
1885   //   c_rarg2   - element count, treated as ssize_t, can be zero
1886   //
1887   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1888   // let the hardware handle it.  The two or four words within dwords
1889   // or qwords that span cache line boundaries will still be loaded
1890   // and stored atomically.
1891   //
1892   // Side Effects:
1893   //   disjoint_short_copy_entry is set to the no-overlap entry point
1894   //   used by generate_conjoint_short_copy().
1895   //
1896   address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
1897     __ align(CodeEntryAlignment);
1898     StubCodeMark mark(this, "StubRoutines", name);
1899     address start = __ pc();
1900 
1901     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
1902     const Register from        = rdi;  // source array address
1903     const Register to          = rsi;  // destination array address
1904     const Register count       = rdx;  // elements count
1905     const Register word_count  = rcx;
1906     const Register qword_count = count;
1907     const Register end_from    = from; // source array end address
1908     const Register end_to      = to;   // destination array end address
1909     // End pointers are inclusive, and if count is not zero they point
1910     // to the last unit copied:  end_to[0] := end_from[0]
1911 
1912     __ enter(); // required for proper stackwalking of RuntimeStub frame
1913     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1914 
1915     if (entry != NULL) {
1916       *entry = __ pc();
1917       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1918       BLOCK_COMMENT("Entry:");
1919     }
1920 
1921     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1922                       // r9 and r10 may be used to save non-volatile registers
1923 
1924     // 'from', 'to' and 'count' are now valid
1925     __ movptr(word_count, count);
1926     __ shrptr(count, 2); // count => qword_count
1927 
1928     // Copy from low to high addresses.  Use 'to' as scratch.
1929     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1930     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1931     __ negptr(qword_count);
1932     __ jmp(L_copy_bytes);
1933 
1934     // Copy trailing qwords
1935   __ BIND(L_copy_8_bytes);
1936     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1937     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1938     __ increment(qword_count);
1939     __ jcc(Assembler::notZero, L_copy_8_bytes);
1940 
1941     // Original 'dest' is trashed, so we can't use it as a
1942     // base register for a possible trailing word copy
1943 
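         // After the qword loop, end_from/end_to point at the last qword copied (they
         // were biased by -8 above), so the trailing dword/word are found at offset 8.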
1944     // Check for and copy trailing dword
1945   __ BIND(L_copy_4_bytes);
1946     __ testl(word_count, 2);
1947     __ jccb(Assembler::zero, L_copy_2_bytes);
1948     __ movl(rax, Address(end_from, 8));
1949     __ movl(Address(end_to, 8), rax);
1950 
1951     __ addptr(end_from, 4);
1952     __ addptr(end_to, 4);
1953 
1954     // Check for and copy trailing word
1955   __ BIND(L_copy_2_bytes);
1956     __ testl(word_count, 1);
1957     __ jccb(Assembler::zero, L_exit);
1958     __ movw(rax, Address(end_from, 8));
1959     __ movw(Address(end_to, 8), rax);
1960 
1961   __ BIND(L_exit);
1962     restore_arg_regs();
1963     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1964     __ xorptr(rax, rax); // return 0
1965     __ vzeroupper();
1966     __ leave(); // required for proper stackwalking of RuntimeStub frame
1967     __ ret(0);
1968 
1969     // Copy in multi-byte chunks
1970     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1971     __ jmp(L_copy_4_bytes);
1972 
1973     return start;
1974   }
1975 
1976   address generate_fill(BasicType t, bool aligned, const char *name) {
1977     __ align(CodeEntryAlignment);
1978     StubCodeMark mark(this, "StubRoutines", name);
1979     address start = __ pc();
1980 
1981     BLOCK_COMMENT("Entry:");
1982 
1983     const Register to       = c_rarg0;  // destination array address
1984     const Register value    = c_rarg1;  // value
1985     const Register count    = c_rarg2;  // elements count
1986 
1987     __ enter(); // required for proper stackwalking of RuntimeStub frame
1988 
1989     __ generate_fill(t, aligned, to, value, count, rax, xmm0);
1990 
1991     __ vzeroupper();
1992     __ leave(); // required for proper stackwalking of RuntimeStub frame
1993     __ ret(0);
1994     return start;
1995   }
1996 
1997   // Arguments:
1998   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1999   //             ignored
2000   //   name    - stub name string
2001   //
2002   // Inputs:
2003   //   c_rarg0   - source array address
2004   //   c_rarg1   - destination array address
2005   //   c_rarg2   - element count, treated as ssize_t, can be zero
2006   //
2007   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
2008   // let the hardware handle it.  The two or four words within dwords
2009   // or qwords that span cache line boundaries will still be loaded
2010   // and stored atomically.
2011   //
2012   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
2013                                        address *entry, const char *name) {
2014     __ align(CodeEntryAlignment);
2015     StubCodeMark mark(this, "StubRoutines", name);
2016     address start = __ pc();
2017 
2018     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
2019     const Register from        = rdi;  // source array address
2020     const Register to          = rsi;  // destination array address
2021     const Register count       = rdx;  // elements count
2022     const Register word_count  = rcx;
2023     const Register qword_count = count;
2024 
2025     __ enter(); // required for proper stackwalking of RuntimeStub frame
2026     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2027 
2028     if (entry != NULL) {
2029       *entry = __ pc();
2030       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2031       BLOCK_COMMENT("Entry:");
2032     }
2033 
2034     array_overlap_test(nooverlap_target, Address::times_2);
2035     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2036                       // r9 and r10 may be used to save non-volatile registers
2037 
2038     // 'from', 'to' and 'count' are now valid
2039     __ movptr(word_count, count);
2040     __ shrptr(count, 2); // count => qword_count
2041 
2042     // Copy from high to low addresses.  Use 'to' as scratch.
2043 
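         // Copying backward: the trailing word and dword at the high end are peeled
         // off first; the remaining qwords are then copied in bulk chunks with a
         // single-qword tail.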
2044     // Check for and copy trailing word
2045     __ testl(word_count, 1);
2046     __ jccb(Assembler::zero, L_copy_4_bytes);
2047     __ movw(rax, Address(from, word_count, Address::times_2, -2));
2048     __ movw(Address(to, word_count, Address::times_2, -2), rax);
2049 
2050     // Check for and copy trailing dword
2051   __ BIND(L_copy_4_bytes);
2052     __ testl(word_count, 2);
2053     __ jcc(Assembler::zero, L_copy_bytes);
2054     __ movl(rax, Address(from, qword_count, Address::times_8));
2055     __ movl(Address(to, qword_count, Address::times_8), rax);
2056     __ jmp(L_copy_bytes);
2057 
2058     // Copy trailing qwords
2059   __ BIND(L_copy_8_bytes);
2060     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2061     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2062     __ decrement(qword_count);
2063     __ jcc(Assembler::notZero, L_copy_8_bytes);
2064 
2065     restore_arg_regs();
2066     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2067     __ xorptr(rax, rax); // return 0
2068     __ vzeroupper();
2069     __ leave(); // required for proper stackwalking of RuntimeStub frame
2070     __ ret(0);
2071 
2072     // Copy in multi-byte chunks
2073     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2074 
2075     restore_arg_regs();
2076     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2077     __ xorptr(rax, rax); // return 0
2078     __ vzeroupper();
2079     __ leave(); // required for proper stackwalking of RuntimeStub frame
2080     __ ret(0);
2081 
2082     return start;
2083   }
2084 
2085   // Arguments:
2086   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2087   //             ignored
2088   //   is_oop  - true => oop array, so generate store check code
2089   //   name    - stub name string
2090   //
2091   // Inputs:
2092   //   c_rarg0   - source array address
2093   //   c_rarg1   - destination array address
2094   //   c_rarg2   - element count, treated as ssize_t, can be zero
2095   //
2096   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2097   // the hardware handle it.  The two dwords within qwords that span
2098   // cache line boundaries will still be loaded and stored atomically.
2099   //
2100   // Side Effects:
2101   //   disjoint_int_copy_entry is set to the no-overlap entry point
2102   //   used by generate_conjoint_int_oop_copy().
2103   //
2104   address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
2105                                          const char *name, bool dest_uninitialized = false) {
2106     __ align(CodeEntryAlignment);
2107     StubCodeMark mark(this, "StubRoutines", name);
2108     address start = __ pc();
2109 
2110     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
2111     const Register from        = rdi;  // source array address
2112     const Register to          = rsi;  // destination array address
2113     const Register count       = rdx;  // elements count
2114     const Register dword_count = rcx;
2115     const Register qword_count = count;
2116     const Register end_from    = from; // source array end address
2117     const Register end_to      = to;   // destination array end address
2118     const Register saved_to    = r11;  // saved destination array address
2119     // End pointers are inclusive, and if count is not zero they point
2120     // to the last unit copied:  end_to[0] := end_from[0]
2121 
2122     __ enter(); // required for proper stackwalking of RuntimeStub frame
2123     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2124 
2125     if (entry != NULL) {
2126       *entry = __ pc();
2127       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2128       BLOCK_COMMENT("Entry:");
2129     }
2130 
2131     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2132                       // r9 and r10 may be used to save non-volatile registers
2133     if (is_oop) {
2134       __ movq(saved_to, to);
2135       gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
2136     }
2137 
2138     // 'from', 'to' and 'count' are now valid
2139     __ movptr(dword_count, count);
2140     __ shrptr(count, 1); // count => qword_count
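         // Two dwords per qword: the qword loop below copies elements in pairs and
         // any odd trailing dword is handled at L_copy_4_bytes.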
2141 
2142     // Copy from low to high addresses.  Use 'to' as scratch.
2143     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2144     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2145     __ negptr(qword_count);
2146     __ jmp(L_copy_bytes);
2147 
2148     // Copy trailing qwords
2149   __ BIND(L_copy_8_bytes);
2150     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2151     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2152     __ increment(qword_count);
2153     __ jcc(Assembler::notZero, L_copy_8_bytes);
2154 
2155     // Check for and copy trailing dword
2156   __ BIND(L_copy_4_bytes);
2157     __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
2158     __ jccb(Assembler::zero, L_exit);
2159     __ movl(rax, Address(end_from, 8));
2160     __ movl(Address(end_to, 8), rax);
2161 
2162   __ BIND(L_exit);
2163     if (is_oop) {
2164       gen_write_ref_array_post_barrier(saved_to, dword_count, rax);
2165     }
2166     restore_arg_regs();
2167     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2168     __ vzeroupper();
2169     __ xorptr(rax, rax); // return 0
2170     __ leave(); // required for proper stackwalking of RuntimeStub frame
2171     __ ret(0);
2172 
2173     // Copy in multi-byte chunks
2174     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2175     __ jmp(L_copy_4_bytes);
2176 
2177     return start;
2178   }
2179 
2180   // Arguments:
2181   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2182   //             ignored
2183   //   is_oop  - true => oop array, so generate store check code
2184   //   name    - stub name string
2185   //
2186   // Inputs:
2187   //   c_rarg0   - source array address
2188   //   c_rarg1   - destination array address
2189   //   c_rarg2   - element count, treated as ssize_t, can be zero
2190   //
2191   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2192   // the hardware handle it.  The two dwords within qwords that span
2193   // cache line boundaries will still be loaded and stored atomically.
2194   //
2195   address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
2196                                          address *entry, const char *name,
2197                                          bool dest_uninitialized = false) {
2198     __ align(CodeEntryAlignment);
2199     StubCodeMark mark(this, "StubRoutines", name);
2200     address start = __ pc();
2201 
2202     Label L_copy_bytes, L_copy_8_bytes, L_copy_2_bytes, L_exit;
2203     const Register from        = rdi;  // source array address
2204     const Register to          = rsi;  // destination array address
2205     const Register count       = rdx;  // elements count
2206     const Register dword_count = rcx;
2207     const Register qword_count = count;
2208 
2209     __ enter(); // required for proper stackwalking of RuntimeStub frame
2210     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2211 
2212     if (entry != NULL) {
2213       *entry = __ pc();
2214        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2215       BLOCK_COMMENT("Entry:");
2216     }
2217 
2218     array_overlap_test(nooverlap_target, Address::times_4);
2219     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2220                       // r9 and r10 may be used to save non-volatile registers
2221 
2222     if (is_oop) {
2223       // no registers are destroyed by this call
2224       gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
2225     }
2226 
2227     assert_clean_int(count, rax); // Make sure 'count' is clean int.
2228     // 'from', 'to' and 'count' are now valid
2229     __ movptr(dword_count, count);
2230     __ shrptr(count, 1); // count => qword_count
2231 
2232     // Copy from high to low addresses.  Use 'to' as scratch.
2233 
2234     // Check for and copy trailing dword
2235     __ testl(dword_count, 1);
2236     __ jcc(Assembler::zero, L_copy_bytes);
2237     __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2238     __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2239     __ jmp(L_copy_bytes);
2240 
2241     // Copy trailing qwords
2242   __ BIND(L_copy_8_bytes);
2243     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2244     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2245     __ decrement(qword_count);
2246     __ jcc(Assembler::notZero, L_copy_8_bytes);
2247 
2248     if (is_oop) {
2249       __ jmp(L_exit);
2250     }
2251     restore_arg_regs();
2252     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2253     __ xorptr(rax, rax); // return 0
2254     __ vzeroupper();
2255     __ leave(); // required for proper stackwalking of RuntimeStub frame
2256     __ ret(0);
2257 
2258     // Copy in multi-byte chunks
2259     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2260 
2261   __ BIND(L_exit);
2262     if (is_oop) {
2263       gen_write_ref_array_post_barrier(to, dword_count, rax);
2264     }
2265     restore_arg_regs();
2266     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2267     __ xorptr(rax, rax); // return 0
2268     __ vzeroupper();
2269     __ leave(); // required for proper stackwalking of RuntimeStub frame
2270     __ ret(0);
2271 
2272     return start;
2273   }
2274 
2275   // Arguments:
2276   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2277   //             ignored
2278   //   is_oop  - true => oop array, so generate store check code
2279   //   name    - stub name string
2280   //
2281   // Inputs:
2282   //   c_rarg0   - source array address
2283   //   c_rarg1   - destination array address
2284   //   c_rarg2   - element count, treated as ssize_t, can be zero
2285   //
2286   // Side Effects:
2287   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2288   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
2289   //
2290   address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
2291                                           const char *name, bool dest_uninitialized = false) {
2292     __ align(CodeEntryAlignment);
2293     StubCodeMark mark(this, "StubRoutines", name);
2294     address start = __ pc();
2295 
2296     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2297     const Register from        = rdi;  // source array address
2298     const Register to          = rsi;  // destination array address
2299     const Register qword_count = rdx;  // elements count
2300     const Register end_from    = from; // source array end address
2301     const Register end_to      = rcx;  // destination array end address
2302     const Register saved_to    = to;
2303     const Register saved_count = r11;
2304     // End pointers are inclusive, and if count is not zero they point
2305     // to the last unit copied:  end_to[0] := end_from[0]
2306 
2307     __ enter(); // required for proper stackwalking of RuntimeStub frame
2308     // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2309     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2310 
2311     if (entry != NULL) {
2312       *entry = __ pc();
2313       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2314       BLOCK_COMMENT("Entry:");
2315     }
2316 
2317     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2318                       // r9 and r10 may be used to save non-volatile registers
2319     // 'from', 'to' and 'qword_count' are now valid
2320     if (is_oop) {
2321       // Save to and count for store barrier
2322       __ movptr(saved_count, qword_count);
2323       // no registers are destroyed by this call
2324       gen_write_ref_array_pre_barrier(to, qword_count, dest_uninitialized);
2325     }
2326 
2327     // Copy from low to high addresses.  Use 'to' as scratch.
2328     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2329     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2330     __ negptr(qword_count);
2331     __ jmp(L_copy_bytes);
2332 
2333     // Copy trailing qwords
2334   __ BIND(L_copy_8_bytes);
2335     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2336     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2337     __ increment(qword_count);
2338     __ jcc(Assembler::notZero, L_copy_8_bytes);
2339 
2340     if (is_oop) {
2341       __ jmp(L_exit);
2342     } else {
2343       restore_arg_regs();
2344       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2345       __ xorptr(rax, rax); // return 0
2346       __ vzeroupper();
2347       __ leave(); // required for proper stackwalking of RuntimeStub frame
2348       __ ret(0);
2349     }
2350 
2351     // Copy in multi-byte chunks
2352     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2353 
2354     if (is_oop) {
2355     __ BIND(L_exit);
2356       gen_write_ref_array_post_barrier(saved_to, saved_count, rax);
2357     }
2358     restore_arg_regs();
2359     if (is_oop) {
2360       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2361     } else {
2362       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2363     }
2364     __ vzeroupper();
2365     __ xorptr(rax, rax); // return 0
2366     __ leave(); // required for proper stackwalking of RuntimeStub frame
2367     __ ret(0);
2368 
2369     return start;
2370   }
2371 
2372   // Arguments:
2373   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2374   //             ignored
2375   //   is_oop  - true => oop array, so generate store check code
2376   //   name    - stub name string
2377   //
2378   // Inputs:
2379   //   c_rarg0   - source array address
2380   //   c_rarg1   - destination array address
2381   //   c_rarg2   - element count, treated as ssize_t, can be zero
2382   //
2383   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
2384                                           address nooverlap_target, address *entry,
2385                                           const char *name, bool dest_uninitialized = false) {
2386     __ align(CodeEntryAlignment);
2387     StubCodeMark mark(this, "StubRoutines", name);
2388     address start = __ pc();
2389 
2390     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2391     const Register from        = rdi;  // source array address
2392     const Register to          = rsi;  // destination array address
2393     const Register qword_count = rdx;  // elements count
2394     const Register saved_count = rcx;
2395 
2396     __ enter(); // required for proper stackwalking of RuntimeStub frame
2397     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2398 
2399     if (entry != NULL) {
2400       *entry = __ pc();
2401       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2402       BLOCK_COMMENT("Entry:");
2403     }
2404 
2405     array_overlap_test(nooverlap_target, Address::times_8);
2406     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2407                       // r9 and r10 may be used to save non-volatile registers
2408     // 'from', 'to' and 'qword_count' are now valid
2409     if (is_oop) {
2410       // Save to and count for store barrier
2411       __ movptr(saved_count, qword_count);
2412       // No registers are destroyed by this call
2413       gen_write_ref_array_pre_barrier(to, saved_count, dest_uninitialized);
2414     }
2415 
2416     __ jmp(L_copy_bytes);
2417 
2418     // Copy trailing qwords
2419   __ BIND(L_copy_8_bytes);
2420     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2421     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2422     __ decrement(qword_count);
2423     __ jcc(Assembler::notZero, L_copy_8_bytes);
2424 
2425     if (is_oop) {
2426       __ jmp(L_exit);
2427     } else {
2428       restore_arg_regs();
2429       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2430       __ xorptr(rax, rax); // return 0
2431       __ vzeroupper();
2432       __ leave(); // required for proper stackwalking of RuntimeStub frame
2433       __ ret(0);
2434     }
2435 
2436     // Copy in multi-byte chunks
2437     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2438 
2439     if (is_oop) {
2440     __ BIND(L_exit);
2441       gen_write_ref_array_post_barrier(to, saved_count, rax);
2442     }
2443     restore_arg_regs();
2444     if (is_oop) {
2445       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2446     } else {
2447       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2448     }
2449     __ vzeroupper();
2450     __ xorptr(rax, rax); // return 0
2451     __ leave(); // required for proper stackwalking of RuntimeStub frame
2452     __ ret(0);
2453 
2454     return start;
2455   }
2456 
2457 
2458   // Helper for generating a dynamic type check.
2459   // Smashes no registers.
2460   void generate_type_check(Register sub_klass,
2461                            Register super_check_offset,
2462                            Register super_klass,
2463                            Label& L_success) {
2464     assert_different_registers(sub_klass, super_check_offset, super_klass);
2465 
2466     BLOCK_COMMENT("type_check:");
2467 
2468     Label L_miss;
2469 
2470     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
2471                                      super_check_offset);
2472     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
2473 
2474     // Fall through on failure!
2475     __ BIND(L_miss);
2476   }
2477 
2478   //
2479   //  Generate checkcasting array copy stub
2480   //
2481   //  Input:
2482   //    c_rarg0   - source array address
2483   //    c_rarg1   - destination array address
2484   //    c_rarg2   - element count, treated as ssize_t, can be zero
2485   //    c_rarg3   - size_t ckoff (super_check_offset)
2486   // not Win64
2487   //    c_rarg4   - oop ckval (super_klass)
2488   // Win64
2489   //    rsp+40    - oop ckval (super_klass)
2490   //
2491   //  Output:
2492   //    rax ==  0  -  success
2493   //    rax == -1^K - failure, where K is partial transfer count
2494   //
2495   address generate_checkcast_copy(const char *name, address *entry,
2496                                   bool dest_uninitialized = false) {
2497 
2498     Label L_load_element, L_store_element, L_do_card_marks, L_done;
2499 
2500     // Input registers (after setup_arg_regs)
2501     const Register from        = rdi;   // source array address
2502     const Register to          = rsi;   // destination array address
2503     const Register length      = rdx;   // elements count
2504     const Register ckoff       = rcx;   // super_check_offset
2505     const Register ckval       = r8;    // super_klass
2506 
2507     // Registers used as temps (r13, r14 are save-on-entry)
2508     const Register end_from    = from;  // source array end address
2509     const Register end_to      = r13;   // destination array end address
2510     const Register count       = rdx;   // -(count_remaining)
2511     const Register r14_length  = r14;   // saved copy of length
2512     // End pointers are inclusive, and if length is not zero they point
2513     // to the last unit copied:  end_to[0] := end_from[0]
2514 
2515     const Register rax_oop    = rax;    // actual oop copied
2516     const Register r11_klass  = r11;    // oop._klass
2517 
2518     //---------------------------------------------------------------
2519     // Assembler stub will be used for this call to arraycopy
2520     // if the two arrays are subtypes of Object[] but the
2521     // destination array type is not equal to or a supertype
2522     // of the source type.  Each element must be separately
2523     // checked.
2524 
2525     __ align(CodeEntryAlignment);
2526     StubCodeMark mark(this, "StubRoutines", name);
2527     address start = __ pc();
2528 
2529     __ enter(); // required for proper stackwalking of RuntimeStub frame
2530 
2531 #ifdef ASSERT
2532     // caller guarantees that the arrays really are different
2533     // otherwise, we would have to make conjoint checks
2534     { Label L;
2535       array_overlap_test(L, TIMES_OOP);
2536       __ stop("checkcast_copy within a single array");
2537       __ bind(L);
2538     }
2539 #endif //ASSERT
2540 
2541     setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
2542                        // ckoff => rcx, ckval => r8
2543                        // r9 and r10 may be used to save non-volatile registers
2544 #ifdef _WIN64
2545     // last argument (#4) is on stack on Win64
2546     __ movptr(ckval, Address(rsp, 6 * wordSize));
2547 #endif
2548 
2549     // Caller of this entry point must set up the argument registers.
2550     if (entry != NULL) {
2551       *entry = __ pc();
2552       BLOCK_COMMENT("Entry:");
2553     }
2554 
2555     // allocate spill slots for r13, r14
2556     enum {
2557       saved_r13_offset,
2558       saved_r14_offset,
2559       saved_rbp_offset
2560     };
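         // saved_rbp_offset doubles as the number of spill slots to allocate below
         // (rbp itself was already saved by enter()).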
2561     __ subptr(rsp, saved_rbp_offset * wordSize);
2562     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2563     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2564 
2565     // check that int operands are properly extended to size_t
2566     assert_clean_int(length, rax);
2567     assert_clean_int(ckoff, rax);
2568 
2569 #ifdef ASSERT
2570     BLOCK_COMMENT("assert consistent ckoff/ckval");
2571     // The ckoff and ckval must be mutually consistent,
2572     // even though caller generates both.
2573     { Label L;
2574       int sco_offset = in_bytes(Klass::super_check_offset_offset());
2575       __ cmpl(ckoff, Address(ckval, sco_offset));
2576       __ jcc(Assembler::equal, L);
2577       __ stop("super_check_offset inconsistent");
2578       __ bind(L);
2579     }
2580 #endif //ASSERT
2581 
2582     // Loop-invariant addresses.  They are exclusive end pointers.
2583     Address end_from_addr(from, length, TIMES_OOP, 0);
2584     Address   end_to_addr(to,   length, TIMES_OOP, 0);
2585     // Loop-variant addresses.  They assume post-incremented count < 0.
2586     Address from_element_addr(end_from, count, TIMES_OOP, 0);
2587     Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
2588 
2589     gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
2590 
2591     // Copy from low to high addresses, indexed from the end of each array.
2592     __ lea(end_from, end_from_addr);
2593     __ lea(end_to,   end_to_addr);
2594     __ movptr(r14_length, length);        // save a copy of the length
2595     assert(length == count, "");          // else fix next line:
2596     __ negptr(count);                     // negate and test the length
2597     __ jcc(Assembler::notZero, L_load_element);
2598 
2599     // Empty array:  Nothing to do.
2600     __ xorptr(rax, rax);                  // return 0 on (trivial) success
2601     __ jmp(L_done);
2602 
2603     // ======== begin loop ========
2604     // (Loop is rotated; its entry is L_load_element.)
2605     // Loop control:
2606     //   for (count = -count; count != 0; count++)
2607     // Base pointers src, dst are biased by 8*(count-1), to last element.
2608     __ align(OptoLoopAlignment);
2609 
2610     __ BIND(L_store_element);
2611     __ store_heap_oop(to_element_addr, rax_oop);  // store the oop
2612     __ increment(count);               // increment the count toward zero
2613     __ jcc(Assembler::zero, L_do_card_marks);
2614 
2615     // ======== loop entry is here ========
2616     __ BIND(L_load_element);
2617     __ load_heap_oop(rax_oop, from_element_addr); // load the oop
2618     __ testptr(rax_oop, rax_oop);
2619     __ jcc(Assembler::zero, L_store_element);
2620 
2621     __ load_klass(r11_klass, rax_oop);// query the object klass
2622     generate_type_check(r11_klass, ckoff, ckval, L_store_element);
2623     // ======== end loop ========
2624 
2625     // It was a real error; we must depend on the caller to finish the job.
2626     // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
2627     // Emit GC store barriers for the oops we have copied (r14 + rdx),
2628     // and report their number to the caller.
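         // On this failure path rax is set to ~K (= -1 ^ K), so the caller can
         // recover K as ~rax.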
2629     assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
2630     Label L_post_barrier;
2631     __ addptr(r14_length, count);     // K = (original - remaining) oops
2632     __ movptr(rax, r14_length);       // save the value
2633     __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
2634     __ jccb(Assembler::notZero, L_post_barrier);
2635     __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
2636 
2637     // Come here on success only.
2638     __ BIND(L_do_card_marks);
2639     __ xorptr(rax, rax);              // return 0 on success
2640 
2641     __ BIND(L_post_barrier);
2642     gen_write_ref_array_post_barrier(to, r14_length, rscratch1);
2643 
2644     // Common exit point (success or failure).
2645     __ BIND(L_done);
2646     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
2647     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
2648     restore_arg_regs();
2649     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); // Update counter after rscratch1 is free
2650     __ leave(); // required for proper stackwalking of RuntimeStub frame
2651     __ ret(0);
2652 
2653     return start;
2654   }
2655 
2656   //
2657   //  Generate 'unsafe' array copy stub
2658   //  Though just as safe as the other stubs, it takes an unscaled
2659   //  size_t argument instead of an element count.
2660   //
2661   //  Input:
2662   //    c_rarg0   - source array address
2663   //    c_rarg1   - destination array address
2664   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2665   //
2666   // Examines the alignment of the operands and dispatches
2667   // to a long, int, short, or byte copy loop.
2668   //
2669   address generate_unsafe_copy(const char *name,
2670                                address byte_copy_entry, address short_copy_entry,
2671                                address int_copy_entry, address long_copy_entry) {
2672 
2673     Label L_long_aligned, L_int_aligned, L_short_aligned;
2674 
2675     // Input registers (before setup_arg_regs)
2676     const Register from        = c_rarg0;  // source array address
2677     const Register to          = c_rarg1;  // destination array address
2678     const Register size        = c_rarg2;  // byte count (size_t)
2679 
2680     // Register used as a temp
2681     const Register bits        = rax;      // test copy of low bits
2682 
2683     __ align(CodeEntryAlignment);
2684     StubCodeMark mark(this, "StubRoutines", name);
2685     address start = __ pc();
2686 
2687     __ enter(); // required for proper stackwalking of RuntimeStub frame
2688 
2689     // bump this on entry, not on exit:
2690     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2691 
2692     __ mov(bits, from);
2693     __ orptr(bits, to);
2694     __ orptr(bits, size);
2695 
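         // bits = from | to | size; if the combined value is a multiple of 8, 4 or 2,
         // all three operands share that alignment and the wider copy can be used.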
2696     __ testb(bits, BytesPerLong-1);
2697     __ jccb(Assembler::zero, L_long_aligned);
2698 
2699     __ testb(bits, BytesPerInt-1);
2700     __ jccb(Assembler::zero, L_int_aligned);
2701 
2702     __ testb(bits, BytesPerShort-1);
2703     __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
2704 
2705     __ BIND(L_short_aligned);
2706     __ shrptr(size, LogBytesPerShort); // size => short_count
2707     __ jump(RuntimeAddress(short_copy_entry));
2708 
2709     __ BIND(L_int_aligned);
2710     __ shrptr(size, LogBytesPerInt); // size => int_count
2711     __ jump(RuntimeAddress(int_copy_entry));
2712 
2713     __ BIND(L_long_aligned);
2714     __ shrptr(size, LogBytesPerLong); // size => qword_count
2715     __ jump(RuntimeAddress(long_copy_entry));
2716 
2717     return start;
2718   }
2719 
2720   // Perform range checks on the proposed arraycopy.
2721   // Kills temp, but nothing else.
2722   // Also, clean the sign bits of src_pos and dst_pos.
2723   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2724                               Register src_pos, // source position (c_rarg1)
2725                               Register dst,     // destination array oop (c_rarg2)
2726                               Register dst_pos, // destination position (c_rarg3)
2727                               Register length,
2728                               Register temp,
2729                               Label& L_failed) {
2730     BLOCK_COMMENT("arraycopy_range_checks:");
2731 
2732     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2733     __ movl(temp, length);
2734     __ addl(temp, src_pos);             // src_pos + length
2735     __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2736     __ jcc(Assembler::above, L_failed);
2737 
2738     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2739     __ movl(temp, length);
2740     __ addl(temp, dst_pos);             // dst_pos + length
2741     __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2742     __ jcc(Assembler::above, L_failed);
2743 
2744     // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2745     // Moves with sign extension can be used since they are positive.
2746     __ movslq(src_pos, src_pos);
2747     __ movslq(dst_pos, dst_pos);
2748 
2749     BLOCK_COMMENT("arraycopy_range_checks done");
2750   }
2751 
2752   //
2753   //  Generate generic array copy stubs
2754   //
2755   //  Input:
2756   //    c_rarg0    -  src oop
2757   //    c_rarg1    -  src_pos (32-bits)
2758   //    c_rarg2    -  dst oop
2759   //    c_rarg3    -  dst_pos (32-bits)
2760   // not Win64
2761   //    c_rarg4    -  element count (32-bits)
2762   // Win64
2763   //    rsp+40     -  element count (32-bits)
2764   //
2765   //  Output:
2766   //    rax ==  0  -  success
2767   //    rax == -1^K - failure, where K is partial transfer count
2768   //
2769   address generate_generic_copy(const char *name,
2770                                 address byte_copy_entry, address short_copy_entry,
2771                                 address int_copy_entry, address oop_copy_entry,
2772                                 address long_copy_entry, address checkcast_copy_entry) {
2773 
2774     Label L_failed, L_failed_0, L_objArray;
2775     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2776 
2777     // Input registers
2778     const Register src        = c_rarg0;  // source array oop
2779     const Register src_pos    = c_rarg1;  // source position
2780     const Register dst        = c_rarg2;  // destination array oop
2781     const Register dst_pos    = c_rarg3;  // destination position
2782 #ifndef _WIN64
2783     const Register length     = c_rarg4;
2784 #else
2785     const Address  length(rsp, 6 * wordSize);  // elements count is on stack on Win64
2786 #endif
2787 
2788     { int modulus = CodeEntryAlignment;
2789       int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
2790       int advance = target - (__ offset() % modulus);
2791       if (advance < 0)  advance += modulus;
2792       if (advance > 0)  __ nop(advance);
2793     }
2794     StubCodeMark mark(this, "StubRoutines", name);
2795 
2796     // Short-hop target to L_failed.  Makes for denser prologue code.
2797     __ BIND(L_failed_0);
2798     __ jmp(L_failed);
2799     assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
2800 
2801     __ align(CodeEntryAlignment);
2802     address start = __ pc();
2803 
2804     __ enter(); // required for proper stackwalking of RuntimeStub frame
2805 
2806     // bump this on entry, not on exit:
2807     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2808 
2809     //-----------------------------------------------------------------------
2810     // Assembler stub will be used for this call to arraycopy
2811     // if the following conditions are met:
2812     //
2813     // (1) src and dst must not be null.
2814     // (2) src_pos must not be negative.
2815     // (3) dst_pos must not be negative.
2816     // (4) length  must not be negative.
2817     // (5) src klass and dst klass should be the same and not NULL.
2818     // (6) src and dst should be arrays.
2819     // (7) src_pos + length must not exceed length of src.
2820     // (8) dst_pos + length must not exceed length of dst.
2821     //
2822 
2823     //  if (src == NULL) return -1;
2824     __ testptr(src, src);         // src oop
2825     size_t j1off = __ offset();
2826     __ jccb(Assembler::zero, L_failed_0);
2827 
2828     //  if (src_pos < 0) return -1;
2829     __ testl(src_pos, src_pos); // src_pos (32-bits)
2830     __ jccb(Assembler::negative, L_failed_0);
2831 
2832     //  if (dst == NULL) return -1;
2833     __ testptr(dst, dst);         // dst oop
2834     __ jccb(Assembler::zero, L_failed_0);
2835 
2836     //  if (dst_pos < 0) return -1;
2837     __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
2838     size_t j4off = __ offset();
2839     __ jccb(Assembler::negative, L_failed_0);
2840 
2841     // The first four tests are very dense code,
2842     // but not quite dense enough to put four
2843     // jumps in a 16-byte instruction fetch buffer.
2844     // That's good, because some branch predictors
2845     // do not like jumps so close together.
2846     // Make sure of this.
2847     guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
2848 
2849     // registers used as temp
2850     const Register r11_length    = r11; // elements count to copy
2851     const Register r10_src_klass = r10; // array klass
2852 
2853     //  if (length < 0) return -1;
2854     __ movl(r11_length, length);        // length (elements count, 32-bits value)
2855     __ testl(r11_length, r11_length);
2856     __ jccb(Assembler::negative, L_failed_0);
2857 
2858     __ load_klass(r10_src_klass, src);
2859 #ifdef ASSERT
2860     //  assert(src->klass() != NULL);
2861     {
2862       BLOCK_COMMENT("assert klasses not null {");
2863       Label L1, L2;
2864       __ testptr(r10_src_klass, r10_src_klass);
2865       __ jcc(Assembler::notZero, L2);   // it is broken if klass is NULL
2866       __ bind(L1);
2867       __ stop("broken null klass");
2868       __ bind(L2);
2869       __ load_klass(rax, dst);
2870       __ cmpq(rax, 0);
2871       __ jcc(Assembler::equal, L1);     // this would be broken also
2872       BLOCK_COMMENT("} assert klasses not null done");
2873     }
2874 #endif
2875 
2876     // Load layout helper (32-bits)
2877     //
2878     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2879     // 32        30    24            16              8     2                 0
2880     //
2881     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2882     //
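    // An illustrative C-level decode of the fields used below (not generated code):
    //
    //   jint  lh          = klass->layout_helper();
    //   juint array_tag   = (juint) lh >> Klass::_lh_array_tag_shift;      // 0x3 typeArray, 0x2 objArray
    //   int   hdr_size    = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask; // bytes
    //   int   log2_esize  = lh & Klass::_lh_log2_element_size_mask;
    //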
2883 
2884     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2885 
2886     // Handle objArrays completely differently...
2887     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2888     __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
2889     __ jcc(Assembler::equal, L_objArray);
2890 
2891     //  if (src->klass() != dst->klass()) return -1;
2892     __ load_klass(rax, dst);
2893     __ cmpq(r10_src_klass, rax);
2894     __ jcc(Assembler::notEqual, L_failed);
2895 
2896     const Register rax_lh = rax;  // layout helper
2897     __ movl(rax_lh, Address(r10_src_klass, lh_offset));
2898 
2899     //  if (!src->is_Array()) return -1;
2900     __ cmpl(rax_lh, Klass::_lh_neutral_value);
2901     __ jcc(Assembler::greaterEqual, L_failed);
2902 
2903     // At this point, it is known to be a typeArray (array_tag 0x3).
2904 #ifdef ASSERT
2905     {
2906       BLOCK_COMMENT("assert primitive array {");
2907       Label L;
2908       __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
2909       __ jcc(Assembler::greaterEqual, L);
2910       __ stop("must be a primitive array");
2911       __ bind(L);
2912       BLOCK_COMMENT("} assert primitive array done");
2913     }
2914 #endif
2915 
2916     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2917                            r10, L_failed);
2918 
2919     // TypeArrayKlass
2920     //
2921     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2922     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2923     //
2924 
2925     const Register r10_offset = r10;    // array offset
2926     const Register rax_elsize = rax_lh; // element size
2927 
2928     __ movl(r10_offset, rax_lh);
2929     __ shrl(r10_offset, Klass::_lh_header_size_shift);
2930     __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
2931     __ addptr(src, r10_offset);           // src array offset
2932     __ addptr(dst, r10_offset);           // dst array offset
2933     BLOCK_COMMENT("choose copy loop based on element size");
2934     __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
2935 
2936     // The following registers must be set before the jump to the corresponding stub.
2937     const Register from     = c_rarg0;  // source array address
2938     const Register to       = c_rarg1;  // destination array address
2939     const Register count    = c_rarg2;  // elements count
2940 
2941     // 'from', 'to' and 'count' must be set in exactly this order,
2942     // since they alias 'src', 'src_pos' and 'dst' respectively.
2943 
2944   __ BIND(L_copy_bytes);
2945     __ cmpl(rax_elsize, 0);
2946     __ jccb(Assembler::notEqual, L_copy_shorts);
2947     __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
2948     __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
2949     __ movl2ptr(count, r11_length); // length
2950     __ jump(RuntimeAddress(byte_copy_entry));
2951 
2952   __ BIND(L_copy_shorts);
2953     __ cmpl(rax_elsize, LogBytesPerShort);
2954     __ jccb(Assembler::notEqual, L_copy_ints);
2955     __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
2956     __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
2957     __ movl2ptr(count, r11_length); // length
2958     __ jump(RuntimeAddress(short_copy_entry));
2959 
2960   __ BIND(L_copy_ints);
2961     __ cmpl(rax_elsize, LogBytesPerInt);
2962     __ jccb(Assembler::notEqual, L_copy_longs);
2963     __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
2964     __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
2965     __ movl2ptr(count, r11_length); // length
2966     __ jump(RuntimeAddress(int_copy_entry));
2967 
2968   __ BIND(L_copy_longs);
2969 #ifdef ASSERT
2970     {
2971       BLOCK_COMMENT("assert long copy {");
2972       Label L;
2973       __ cmpl(rax_elsize, LogBytesPerLong);
2974       __ jcc(Assembler::equal, L);
2975       __ stop("must be long copy, but elsize is wrong");
2976       __ bind(L);
2977       BLOCK_COMMENT("} assert long copy done");
2978     }
2979 #endif
2980     __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
2981     __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
2982     __ movl2ptr(count, r11_length); // length
2983     __ jump(RuntimeAddress(long_copy_entry));
2984 
2985     // ObjArrayKlass
2986   __ BIND(L_objArray);
2987     // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]
2988 
2989     Label L_plain_copy, L_checkcast_copy;
2990     //  test array classes for subtyping
2991     __ load_klass(rax, dst);
2992     __ cmpq(r10_src_klass, rax); // usual case is exact equality
2993     __ jcc(Assembler::notEqual, L_checkcast_copy);
2994 
2995     // Identically typed arrays can be copied without element-wise checks.
2996     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2997                            r10, L_failed);
2998 
2999     __ lea(from, Address(src, src_pos, TIMES_OOP,
3000                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
3001     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3002                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
3003     __ movl2ptr(count, r11_length); // length
3004   __ BIND(L_plain_copy);
3005     __ jump(RuntimeAddress(oop_copy_entry));
3006 
3007   __ BIND(L_checkcast_copy);
3008     // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
3009     {
3010       // Before looking at dst.length, make sure dst is also an objArray.
3011       __ cmpl(Address(rax, lh_offset), objArray_lh);
3012       __ jcc(Assembler::notEqual, L_failed);
3013 
3014       // It is safe to examine both src.length and dst.length.
3015       arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3016                              rax, L_failed);
3017 
3018       const Register r11_dst_klass = r11;
3019       __ load_klass(r11_dst_klass, dst); // reload
3020 
3021       // Marshal the base address arguments now, freeing registers.
3022       __ lea(from, Address(src, src_pos, TIMES_OOP,
3023                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3024       __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3025                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3026       __ movl(count, length);           // length (reloaded)
3027       Register sco_temp = c_rarg3;      // this register is free now
3028       assert_different_registers(from, to, count, sco_temp,
3029                                  r11_dst_klass, r10_src_klass);
3030       assert_clean_int(count, sco_temp);
3031 
3032       // Generate the type check.
3033       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
3034       __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
3035       assert_clean_int(sco_temp, rax);
3036       generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
3037 
3038       // Fetch destination element klass from the ObjArrayKlass header.
3039       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3040       __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
3041       __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
3042       assert_clean_int(sco_temp, rax);
3043 
3044       // the checkcast_copy loop needs two extra arguments:
3045       assert(c_rarg3 == sco_temp, "#3 already in place");
3046       // Set up arguments for checkcast_copy_entry.
3047       setup_arg_regs(4);
3048       __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
3049       __ jump(RuntimeAddress(checkcast_copy_entry));
3050     }
3051 
3052   __ BIND(L_failed);
3053     __ xorptr(rax, rax);
3054     __ notptr(rax); // return -1
3055     __ leave();   // required for proper stackwalking of RuntimeStub frame
3056     __ ret(0);
3057 
3058     return start;
3059   }
3060 
3061   void generate_arraycopy_stubs() {
3062     address entry;
3063     address entry_jbyte_arraycopy;
3064     address entry_jshort_arraycopy;
3065     address entry_jint_arraycopy;
3066     address entry_oop_arraycopy;
3067     address entry_jlong_arraycopy;
3068     address entry_checkcast_arraycopy;
3069 
3070     StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, &entry,
3071                                                                            "jbyte_disjoint_arraycopy");
3072     StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
3073                                                                            "jbyte_arraycopy");
3074 
3075     StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
3076                                                                             "jshort_disjoint_arraycopy");
3077     StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
3078                                                                             "jshort_arraycopy");
3079 
3080     StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, &entry,
3081                                                                               "jint_disjoint_arraycopy");
3082     StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(false, false, entry,
3083                                                                               &entry_jint_arraycopy, "jint_arraycopy");
3084 
3085     StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, false, &entry,
3086                                                                                "jlong_disjoint_arraycopy");
3087     StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(false, false, entry,
3088                                                                                &entry_jlong_arraycopy, "jlong_arraycopy");
3089 
3090 
3091     if (UseCompressedOops) {
3092       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(false, true, &entry,
3093                                                                               "oop_disjoint_arraycopy");
3094       StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(false, true, entry,
3095                                                                               &entry_oop_arraycopy, "oop_arraycopy");
3096       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(false, true, &entry,
3097                                                                                      "oop_disjoint_arraycopy_uninit",
3098                                                                                      /*dest_uninitialized*/true);
3099       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(false, true, entry,
3100                                                                                      NULL, "oop_arraycopy_uninit",
3101                                                                                      /*dest_uninitialized*/true);
3102     } else {
3103       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, true, &entry,
3104                                                                                "oop_disjoint_arraycopy");
3105       StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(false, true, entry,
3106                                                                                &entry_oop_arraycopy, "oop_arraycopy");
3107       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(false, true, &entry,
3108                                                                                       "oop_disjoint_arraycopy_uninit",
3109                                                                                       /*dest_uninitialized*/true);
3110       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(false, true, entry,
3111                                                                                       NULL, "oop_arraycopy_uninit",
3112                                                                                       /*dest_uninitialized*/true);
3113     }
3114 
3115     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3116     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
3117                                                                         /*dest_uninitialized*/true);
3118 
3119     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
3120                                                               entry_jbyte_arraycopy,
3121                                                               entry_jshort_arraycopy,
3122                                                               entry_jint_arraycopy,
3123                                                               entry_jlong_arraycopy);
3124     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
3125                                                                entry_jbyte_arraycopy,
3126                                                                entry_jshort_arraycopy,
3127                                                                entry_jint_arraycopy,
3128                                                                entry_oop_arraycopy,
3129                                                                entry_jlong_arraycopy,
3130                                                                entry_checkcast_arraycopy);
3131 
3132     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3133     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3134     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3135     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3136     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3137     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
3138 
3139     // We don't generate specialized code for HeapWord-aligned source
3140     // arrays, so just use the code we've already generated
3141     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
3142     StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
3143 
3144     StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
3145     StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
3146 
3147     StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
3148     StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
3149 
3150     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
3151     StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
3152 
3153     StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
3154     StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
3155 
3156     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
3157     StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
3158   }
3159 
3160   // AES intrinsic stubs
3161   enum {AESBlockSize = 16};
3162 
3163   address generate_key_shuffle_mask() {
3164     __ align(16);
3165     StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
3166     address start = __ pc();
3167     __ emit_data64( 0x0405060700010203, relocInfo::none );
3168     __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
3169     return start;
3170   }
3171 
3172   address generate_counter_shuffle_mask() {
3173     __ align(16);
3174     StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
3175     address start = __ pc();
3176     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3177     __ emit_data64(0x0001020304050607, relocInfo::none);
3178     return start;
3179   }
3180 
3181   // Utility routine for loading a 128-bit key word in little-endian format;
3182   // the shuffle mask may optionally already be supplied in an xmm register.
3183   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
3184     __ movdqu(xmmdst, Address(key, offset));
3185     if (xmm_shuf_mask != NULL) {
3186       __ pshufb(xmmdst, xmm_shuf_mask);
3187     } else {
3188       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3189     }
3190   }
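
  // Illustrative sketch of what the pshufb with the key shuffle mask does (not generated code):
  //
  //   for (int i = 0; i < 16; i++) {
  //     dst[i] = src[mask[i] & 0x0f];    // byte i of the result is byte mask[i] of the source
  //   }
  //
  // With the key_shuffle_mask bytes {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12} this reverses
  // the bytes within each 32-bit word, i.e. it turns each big-endian int of the Java expanded
  // key into the byte order the AES instructions expect.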
3191 
3192   // Utility routine for incrementing the 128-bit counter (the IV in CTR mode).
3193   void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
3194     __ pextrq(reg, xmmdst, 0x0);
3195     __ addq(reg, inc_delta);
3196     __ pinsrq(xmmdst, reg, 0x0);
3197     __ jcc(Assembler::carryClear, next_block); // jump if no carry
3198     __ pextrq(reg, xmmdst, 0x01); // Carry
3199     __ addq(reg, 0x01);
3200     __ pinsrq(xmmdst, reg, 0x01); //Carry end
3201     __ BIND(next_block);          // next instruction
3202   }
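
  // Equivalent C-level computation of the increment above (illustrative only):
  //
  //   uint64_t lo = ctr[0], hi = ctr[1];      // 128-bit counter as two little-endian qwords
  //   if (lo + inc_delta < lo) hi += 1;       // carry out of the low qword
  //   lo += inc_delta;
  //   ctr[0] = lo; ctr[1] = hi;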
3203 
3204   // Arguments:
3205   //
3206   // Inputs:
3207   //   c_rarg0   - source byte array address
3208   //   c_rarg1   - destination byte array address
3209   //   c_rarg2   - K (key) in little endian int array
3210   //
3211   address generate_aescrypt_encryptBlock() {
3212     assert(UseAES, "need AES instructions and misaligned SSE support");
3213     __ align(CodeEntryAlignment);
3214     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
3215     Label L_doLast;
3216     address start = __ pc();
3217 
3218     const Register from        = c_rarg0;  // source array address
3219     const Register to          = c_rarg1;  // destination array address
3220     const Register key         = c_rarg2;  // key array address
3221     const Register keylen      = rax;
3222 
3223     const XMMRegister xmm_result = xmm0;
3224     const XMMRegister xmm_key_shuf_mask = xmm1;
3225     // On win64 xmm6-xmm15 must be preserved so don't use them.
3226     const XMMRegister xmm_temp1  = xmm2;
3227     const XMMRegister xmm_temp2  = xmm3;
3228     const XMMRegister xmm_temp3  = xmm4;
3229     const XMMRegister xmm_temp4  = xmm5;
3230 
3231     __ enter(); // required for proper stackwalking of RuntimeStub frame
3232 
3233     // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
3234     // context for the registers used, since all instructions below use 128-bit mode.
3235     // On EVEX without VL and BW, these instructions will all be AVX.
3236     if (VM_Version::supports_avx512vlbw()) {
3237       __ movl(rax, 0xffff);
3238       __ kmovql(k1, rax);
3239     }
3240 
3241     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
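    // (Illustrative note.)  The expanded-key length in ints determines the AES round count:
    //   rounds = keylen/4 - 1   ->  44 = 10 rounds (AES-128), 52 = 12 (AES-192), 60 = 14 (AES-256),
    // which is why the code below peels off two extra rounds at each of the 44 and 52 checks.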
3242     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3243 
3244     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3245     __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
3246 
3247     // For encryption, the java expanded key ordering is just what we need
3248     // we don't know if the key is aligned, hence not using load-execute form
3249 
3250     load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
3251     __ pxor(xmm_result, xmm_temp1);
3252 
3253     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3254     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3255     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3256     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3257 
3258     __ aesenc(xmm_result, xmm_temp1);
3259     __ aesenc(xmm_result, xmm_temp2);
3260     __ aesenc(xmm_result, xmm_temp3);
3261     __ aesenc(xmm_result, xmm_temp4);
3262 
3263     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3264     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3265     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3266     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3267 
3268     __ aesenc(xmm_result, xmm_temp1);
3269     __ aesenc(xmm_result, xmm_temp2);
3270     __ aesenc(xmm_result, xmm_temp3);
3271     __ aesenc(xmm_result, xmm_temp4);
3272 
3273     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3274     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3275 
3276     __ cmpl(keylen, 44);
3277     __ jccb(Assembler::equal, L_doLast);
3278 
3279     __ aesenc(xmm_result, xmm_temp1);
3280     __ aesenc(xmm_result, xmm_temp2);
3281 
3282     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3283     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3284 
3285     __ cmpl(keylen, 52);
3286     __ jccb(Assembler::equal, L_doLast);
3287 
3288     __ aesenc(xmm_result, xmm_temp1);
3289     __ aesenc(xmm_result, xmm_temp2);
3290 
3291     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3292     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3293 
3294     __ BIND(L_doLast);
3295     __ aesenc(xmm_result, xmm_temp1);
3296     __ aesenclast(xmm_result, xmm_temp2);
3297     __ movdqu(Address(to, 0), xmm_result);        // store the result
3298     __ xorptr(rax, rax); // return 0
3299     __ leave(); // required for proper stackwalking of RuntimeStub frame
3300     __ ret(0);
3301 
3302     return start;
3303   }
3304 
3305 
3306   // Arguments:
3307   //
3308   // Inputs:
3309   //   c_rarg0   - source byte array address
3310   //   c_rarg1   - destination byte array address
3311   //   c_rarg2   - K (key) in little endian int array
3312   //
3313   address generate_aescrypt_decryptBlock() {
3314     assert(UseAES, "need AES instructions and misaligned SSE support");
3315     __ align(CodeEntryAlignment);
3316     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3317     Label L_doLast;
3318     address start = __ pc();
3319 
3320     const Register from        = c_rarg0;  // source array address
3321     const Register to          = c_rarg1;  // destination array address
3322     const Register key         = c_rarg2;  // key array address
3323     const Register keylen      = rax;
3324 
3325     const XMMRegister xmm_result = xmm0;
3326     const XMMRegister xmm_key_shuf_mask = xmm1;
3327     // On win64 xmm6-xmm15 must be preserved so don't use them.
3328     const XMMRegister xmm_temp1  = xmm2;
3329     const XMMRegister xmm_temp2  = xmm3;
3330     const XMMRegister xmm_temp3  = xmm4;
3331     const XMMRegister xmm_temp4  = xmm5;
3332 
3333     __ enter(); // required for proper stackwalking of RuntimeStub frame
3334 
3335     // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
3336     // context for the registers used, since all instructions below use 128-bit mode.
3337     // On EVEX without VL and BW, these instructions will all be AVX.
3338     if (VM_Version::supports_avx512vlbw()) {
3339       __ movl(rax, 0xffff);
3340       __ kmovql(k1, rax);
3341     }
3342 
3343     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
3344     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3345 
3346     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3347     __ movdqu(xmm_result, Address(from, 0));
3348 
3349     // for decryption java expanded key ordering is rotated one position from what we want
3350     // so we start from 0x10 here and hit 0x00 last
3351     // we don't know if the key is aligned, hence not using load-execute form
3352     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3353     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3354     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3355     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3356 
3357     __ pxor  (xmm_result, xmm_temp1);
3358     __ aesdec(xmm_result, xmm_temp2);
3359     __ aesdec(xmm_result, xmm_temp3);
3360     __ aesdec(xmm_result, xmm_temp4);
3361 
3362     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3363     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3364     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3365     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3366 
3367     __ aesdec(xmm_result, xmm_temp1);
3368     __ aesdec(xmm_result, xmm_temp2);
3369     __ aesdec(xmm_result, xmm_temp3);
3370     __ aesdec(xmm_result, xmm_temp4);
3371 
3372     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3373     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3374     load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
3375 
3376     __ cmpl(keylen, 44);
3377     __ jccb(Assembler::equal, L_doLast);
3378 
3379     __ aesdec(xmm_result, xmm_temp1);
3380     __ aesdec(xmm_result, xmm_temp2);
3381 
3382     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3383     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3384 
3385     __ cmpl(keylen, 52);
3386     __ jccb(Assembler::equal, L_doLast);
3387 
3388     __ aesdec(xmm_result, xmm_temp1);
3389     __ aesdec(xmm_result, xmm_temp2);
3390 
3391     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3392     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3393 
3394     __ BIND(L_doLast);
3395     __ aesdec(xmm_result, xmm_temp1);
3396     __ aesdec(xmm_result, xmm_temp2);
3397 
3398     // for decryption the aesdeclast operation is always on key+0x00
3399     __ aesdeclast(xmm_result, xmm_temp3);
3400     __ movdqu(Address(to, 0), xmm_result);  // store the result
3401     __ xorptr(rax, rax); // return 0
3402     __ leave(); // required for proper stackwalking of RuntimeStub frame
3403     __ ret(0);
3404 
3405     return start;
3406   }
3407 
3408 
3409   // Arguments:
3410   //
3411   // Inputs:
3412   //   c_rarg0   - source byte array address
3413   //   c_rarg1   - destination byte array address
3414   //   c_rarg2   - K (key) in little endian int array
3415   //   c_rarg3   - r vector byte array address
3416   //   c_rarg4   - input length
3417   //
3418   // Output:
3419   //   rax       - input length
3420   //
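  //  A C-level sketch of the CBC chaining implemented below (illustrative only; E_K is one
  //  AES block encryption with the expanded key):
  //
  //    r = rvec;                               // initial r vector (IV)
  //    for (pos = 0; pos < len; pos += 16) {
  //      r = E_K(in[pos .. pos+15] ^ r);       // each block depends on the previous ciphertext
  //      out[pos .. pos+15] = r;
  //    }
  //    rvec = r;                               // final r vector written back on exit
  //
  //  Because of that dependency the encrypt loop cannot be parallelized across blocks,
  //  unlike the decrypt stub further below.
  //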
3421   address generate_cipherBlockChaining_encryptAESCrypt() {
3422     assert(UseAES, "need AES instructions and misaligned SSE support");
3423     __ align(CodeEntryAlignment);
3424     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
3425     address start = __ pc();
3426 
3427     Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
3428     const Register from        = c_rarg0;  // source array address
3429     const Register to          = c_rarg1;  // destination array address
3430     const Register key         = c_rarg2;  // key array address
3431     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
3432                                            // and left with the results of the last encryption block
3433 #ifndef _WIN64
3434     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3435 #else
3436     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
3437     const Register len_reg     = r11;      // pick the volatile windows register
3438 #endif
3439     const Register pos         = rax;
3440 
3441     // xmm register assignments for the loops below
3442     const XMMRegister xmm_result = xmm0;
3443     const XMMRegister xmm_temp   = xmm1;
3444     // keys 0-10 preloaded into xmm2-xmm12
3445     const int XMM_REG_NUM_KEY_FIRST = 2;
3446     const int XMM_REG_NUM_KEY_LAST  = 15;
3447     const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3448     const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
3449     const XMMRegister xmm_key11  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
3450     const XMMRegister xmm_key12  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
3451     const XMMRegister xmm_key13  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);
3452 
3453     __ enter(); // required for proper stackwalking of RuntimeStub frame
3454 
3455     // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
3456     // context for the registers used, since all instructions below use 128-bit mode.
3457     // On EVEX without VL and BW, these instructions will all be AVX.
3458     if (VM_Version::supports_avx512vlbw()) {
3459       __ movl(rax, 0xffff);
3460       __ kmovql(k1, rax);
3461     }
3462 
3463 #ifdef _WIN64
3464     // on win64, fill len_reg from stack position
3465     __ movl(len_reg, len_mem);
3466 #else
3467     __ push(len_reg); // Save
3468 #endif
3469 
3470     const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
3471     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3472     // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
3473     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
3474       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3475       offset += 0x10;
3476     }
3477     __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
3478 
3479     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
3480     __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3481     __ cmpl(rax, 44);
3482     __ jcc(Assembler::notEqual, L_key_192_256);
3483 
3484     // 128 bit code follows here
3485     __ movptr(pos, 0);
3486     __ align(OptoLoopAlignment);
3487 
3488     __ BIND(L_loopTop_128);
3489     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3490     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3491     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3492     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
3493       __ aesenc(xmm_result, as_XMMRegister(rnum));
3494     }
3495     __ aesenclast(xmm_result, xmm_key10);
3496     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3497     // no need to store r to memory until we exit
3498     __ addptr(pos, AESBlockSize);
3499     __ subptr(len_reg, AESBlockSize);
3500     __ jcc(Assembler::notEqual, L_loopTop_128);
3501 
3502     __ BIND(L_exit);
3503     __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
3504 
3505 #ifdef _WIN64
3506     __ movl(rax, len_mem);
3507 #else
3508     __ pop(rax); // return length
3509 #endif
3510     __ leave(); // required for proper stackwalking of RuntimeStub frame
3511     __ ret(0);
3512 
3513     __ BIND(L_key_192_256);
3514     // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
3515     load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
3516     load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
3517     __ cmpl(rax, 52);
3518     __ jcc(Assembler::notEqual, L_key_256);
3519 
3520     // 192-bit code follows here (could be changed to use more xmm registers)
3521     __ movptr(pos, 0);
3522     __ align(OptoLoopAlignment);
3523 
3524     __ BIND(L_loopTop_192);
3525     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3526     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3527     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3528     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
3529       __ aesenc(xmm_result, as_XMMRegister(rnum));
3530     }
3531     __ aesenclast(xmm_result, xmm_key12);
3532     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3533     // no need to store r to memory until we exit
3534     __ addptr(pos, AESBlockSize);
3535     __ subptr(len_reg, AESBlockSize);
3536     __ jcc(Assembler::notEqual, L_loopTop_192);
3537     __ jmp(L_exit);
3538 
3539     __ BIND(L_key_256);
3540     // 256-bit code follows here (could be changed to use more xmm registers)
3541     load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
3542     __ movptr(pos, 0);
3543     __ align(OptoLoopAlignment);
3544 
3545     __ BIND(L_loopTop_256);
3546     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3547     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3548     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3549     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
3550       __ aesenc(xmm_result, as_XMMRegister(rnum));
3551     }
3552     load_key(xmm_temp, key, 0xe0);
3553     __ aesenclast(xmm_result, xmm_temp);
3554     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3555     // no need to store r to memory until we exit
3556     __ addptr(pos, AESBlockSize);
3557     __ subptr(len_reg, AESBlockSize);
3558     __ jcc(Assembler::notEqual, L_loopTop_256);
3559     __ jmp(L_exit);
3560 
3561     return start;
3562   }
3563 
3564   // Safefetch stubs.
3565   void generate_safefetch(const char* name, int size, address* entry,
3566                           address* fault_pc, address* continuation_pc) {
3567     // safefetch signatures:
3568     //   int      SafeFetch32(int*      adr, int      errValue);
3569     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3570     //
3571     // arguments:
3572     //   c_rarg0 = adr
3573     //   c_rarg1 = errValue
3574     //
3575     // result:
3576     //   rax      = *adr or errValue
3577 
3578     StubCodeMark mark(this, "StubRoutines", name);
3579 
3580     // Entry point, pc or function descriptor.
3581     *entry = __ pc();
3582 
3583     // Load *adr into c_rarg1, may fault.
3584     *fault_pc = __ pc();
3585     switch (size) {
3586       case 4:
3587         // int32_t
3588         __ movl(c_rarg1, Address(c_rarg0, 0));
3589         break;
3590       case 8:
3591         // int64_t
3592         __ movq(c_rarg1, Address(c_rarg0, 0));
3593         break;
3594       default:
3595         ShouldNotReachHere();
3596     }
3597 
3598     // return errValue or *adr
3599     *continuation_pc = __ pc();
3600     __ movq(rax, c_rarg1);
3601     __ ret(0);
3602   }
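
  // Illustrative use of the stubs generated above (SafeFetch32/SafeFetchN are the VM entry
  // points declared in stubRoutines.hpp).  If the load at fault_pc traps, the signal handler
  // resumes at continuation_pc with errValue still in c_rarg1:
  //
  //   int v = SafeFetch32((int*) maybe_bad_ptr, 0xBAD);
  //   if (v == 0xBAD) {
  //     // either the load faulted or the word really contained 0xBAD
  //   }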
3603 
3604   // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
3605   // to hide instruction latency
3606   //
3607   // Arguments:
3608   //
3609   // Inputs:
3610   //   c_rarg0   - source byte array address
3611   //   c_rarg1   - destination byte array address
3612   //   c_rarg2   - K (key) in little endian int array
3613   //   c_rarg3   - r vector byte array address
3614   //   c_rarg4   - input length
3615   //
3616   // Output:
3617   //   rax       - input length
3618   //
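  //  A C-level sketch of CBC decryption (illustrative only; D_K is one AES block decryption):
  //
  //    prev = rvec;                            // IV
  //    for (pos = 0; pos < len; pos += 16) {
  //      out[pos .. pos+15] = D_K(in[pos .. pos+15]) ^ prev;
  //      prev = in[pos .. pos+15];             // chaining value is the ciphertext itself
  //    }
  //    rvec = prev;
  //
  //  Since D_K needs only ciphertext inputs, the blocks are independent and the loop below
  //  keeps PARALLEL_FACTOR (4) decryptions in flight to hide aesdec latency.
  //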
3619   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
3620     assert(UseAES, "need AES instructions and misaligned SSE support");
3621     __ align(CodeEntryAlignment);
3622     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3623     address start = __ pc();
3624 
3625     const Register from        = c_rarg0;  // source array address
3626     const Register to          = c_rarg1;  // destination array address
3627     const Register key         = c_rarg2;  // key array address
3628     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
3629                                            // and left with the results of the last encryption block
3630 #ifndef _WIN64
3631     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3632 #else
3633     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
3634     const Register len_reg     = r11;      // pick the volatile windows register
3635 #endif
3636     const Register pos         = rax;
3637 
3638     const int PARALLEL_FACTOR = 4;
3639     const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256
3640 
3641     Label L_exit;
3642     Label L_singleBlock_loopTopHead[3]; // 128, 192, 256
3643     Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256
3644     Label L_singleBlock_loopTop[3]; // 128, 192, 256
3645     Label L_multiBlock_loopTopHead[3]; // 128, 192, 256
3646     Label L_multiBlock_loopTop[3]; // 128, 192, 256
3647 
3648     // keys 0-10 preloaded into xmm5-xmm15
3649     const int XMM_REG_NUM_KEY_FIRST = 5;
3650     const int XMM_REG_NUM_KEY_LAST  = 15;
3651     const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3652     const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
3653 
3654     __ enter(); // required for proper stackwalking of RuntimeStub frame
3655 
3656     // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
3657     // context for the registers used, since all instructions below use 128-bit mode.
3658     // On EVEX without VL and BW, these instructions will all be AVX.
3659     if (VM_Version::supports_avx512vlbw()) {
3660       __ movl(rax, 0xffff);
3661       __ kmovql(k1, rax);
3662     }
3663 
3664 #ifdef _WIN64
3665     // on win64, fill len_reg from stack position
3666     __ movl(len_reg, len_mem);
3667 #else
3668     __ push(len_reg); // Save
3669 #endif
3670     __ push(rbx);
3671     // the java expanded key ordering is rotated one position from what we want
3672     // so we start from 0x10 here and hit 0x00 last
3673     const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
3674     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3675     // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
3676     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
3677       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3678       offset += 0x10;
3679     }
3680     load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);
3681 
3682     const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
3683 
3684     // registers holding the four results in the parallelized loop
3685     const XMMRegister xmm_result0 = xmm0;
3686     const XMMRegister xmm_result1 = xmm2;
3687     const XMMRegister xmm_result2 = xmm3;
3688     const XMMRegister xmm_result3 = xmm4;
3689 
3690     __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));   // initialize with initial rvec
3691 
3692     __ xorptr(pos, pos);
3693 
3694     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
3695     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3696     __ cmpl(rbx, 52);
3697     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]);
3698     __ cmpl(rbx, 60);
3699     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]);
3700 
3701 #define DoFour(opc, src_reg)           \
3702   __ opc(xmm_result0, src_reg);         \
3703   __ opc(xmm_result1, src_reg);         \
3704   __ opc(xmm_result2, src_reg);         \
3705   __ opc(xmm_result3, src_reg);         \
3706 
3707     for (int k = 0; k < 3; ++k) {
3708       __ BIND(L_multiBlock_loopTopHead[k]);
3709       if (k != 0) {
3710         __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3711         __ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]);
3712       }
3713       if (k == 1) {
3714         __ subptr(rsp, 6 * wordSize);
3715         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
3716         load_key(xmm15, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
3717         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3718         load_key(xmm1, key, 0xc0);  // 0xc0;
3719         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3720       } else if (k == 2) {
3721         __ subptr(rsp, 10 * wordSize);
3722         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
3723         load_key(xmm15, key, 0xd0); // 0xd0; 256-bit key goes up to 0xe0
3724         __ movdqu(Address(rsp, 6 * wordSize), xmm15);
3725         load_key(xmm1, key, 0xe0);  // 0xe0;
3726         __ movdqu(Address(rsp, 8 * wordSize), xmm1);
3727         load_key(xmm15, key, 0xb0); // 0xb0;
3728         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3729         load_key(xmm1, key, 0xc0);  // 0xc0;
3730         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3731       }
3732       __ align(OptoLoopAlignment);
3733       __ BIND(L_multiBlock_loopTop[k]);
3734       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3735       __ jcc(Assembler::less, L_singleBlock_loopTopHead[k]);
3736 
3737       if  (k != 0) {
3738         __ movdqu(xmm15, Address(rsp, 2 * wordSize));
3739         __ movdqu(xmm1, Address(rsp, 4 * wordSize));
3740       }
3741 
3742       __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
3743       __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
3744       __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
3745       __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
3746 
3747       DoFour(pxor, xmm_key_first);
3748       if (k == 0) {
3749         for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
3750           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3751         }
3752         DoFour(aesdeclast, xmm_key_last);
3753       } else if (k == 1) {
3754         for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) {
3755           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3756         }
3757         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
3758         DoFour(aesdec, xmm1);  // key : 0xc0
3759         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
3760         DoFour(aesdeclast, xmm_key_last);
3761       } else if (k == 2) {
3762         for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) {
3763           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3764         }
3765         DoFour(aesdec, xmm1);  // key : 0xc0
3766         __ movdqu(xmm15, Address(rsp, 6 * wordSize));
3767         __ movdqu(xmm1, Address(rsp, 8 * wordSize));
3768         DoFour(aesdec, xmm15);  // key : 0xd0
3769         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
3770         DoFour(aesdec, xmm1);  // key : 0xe0
3771         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
3772         DoFour(aesdeclast, xmm_key_last);
3773       }
3774 
3775       // for each result, xor with the r vector of previous cipher block
3776       __ pxor(xmm_result0, xmm_prev_block_cipher);
3777       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
3778       __ pxor(xmm_result1, xmm_prev_block_cipher);
3779       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
3780       __ pxor(xmm_result2, xmm_prev_block_cipher);
3781       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
3782       __ pxor(xmm_result3, xmm_prev_block_cipher);
3783       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize));   // this will carry over to next set of blocks
3784       if (k != 0) {
3785         __ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher);
3786       }
3787 
3788       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);     // store 4 results into the next 64 bytes of output
3789       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
3790       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
3791       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
3792 
3793       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);
3794       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize);
3795       __ jmp(L_multiBlock_loopTop[k]);
3796 
3797       // registers used in the non-parallelized loops
3798       // xmm register assignments for the loops below
3799       const XMMRegister xmm_result = xmm0;
3800       const XMMRegister xmm_prev_block_cipher_save = xmm2;
3801       const XMMRegister xmm_key11 = xmm3;
3802       const XMMRegister xmm_key12 = xmm4;
3803       const XMMRegister key_tmp = xmm4;
3804 
3805       __ BIND(L_singleBlock_loopTopHead[k]);
3806       if (k == 1) {
3807         __ addptr(rsp, 6 * wordSize);
3808       } else if (k == 2) {
3809         __ addptr(rsp, 10 * wordSize);
3810       }
3811       __ cmpptr(len_reg, 0); // any blocks left??
3812       __ jcc(Assembler::equal, L_exit);
3813       __ BIND(L_singleBlock_loopTopHead2[k]);
3814       if (k == 1) {
3815         load_key(xmm_key11, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
3816         load_key(xmm_key12, key, 0xc0); // 0xc0; 192-bit key goes up to 0xc0
3817       }
3818       if (k == 2) {
3819         load_key(xmm_key11, key, 0xb0); // 0xb0; 256-bit key goes up to 0xe0
3820       }
3821       __ align(OptoLoopAlignment);
3822       __ BIND(L_singleBlock_loopTop[k]);
3823       __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
3824       __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
3825       __ pxor(xmm_result, xmm_key_first); // do the aes dec rounds
3826       for (int rnum = 1; rnum <= 9 ; rnum++) {
3827           __ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3828       }
3829       if (k == 1) {
3830         __ aesdec(xmm_result, xmm_key11);
3831         __ aesdec(xmm_result, xmm_key12);
3832       }
3833       if (k == 2) {
3834         __ aesdec(xmm_result, xmm_key11);
3835         load_key(key_tmp, key, 0xc0);
3836         __ aesdec(xmm_result, key_tmp);
3837         load_key(key_tmp, key, 0xd0);
3838         __ aesdec(xmm_result, key_tmp);
3839         load_key(key_tmp, key, 0xe0);
3840         __ aesdec(xmm_result, key_tmp);
3841       }
3842 
3843       __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0
3844       __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
3845       __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
3846       // no need to store r to memory until we exit
3847       __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
3848       __ addptr(pos, AESBlockSize);
3849       __ subptr(len_reg, AESBlockSize);
3850       __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
3851       if (k != 2) {
3852         __ jmp(L_exit);
3853       }
3854     } //for 128/192/256
3855 
3856     __ BIND(L_exit);
3857     __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
3858     __ pop(rbx);
3859 #ifdef _WIN64
3860     __ movl(rax, len_mem);
3861 #else
3862     __ pop(rax); // return length
3863 #endif
3864     __ leave(); // required for proper stackwalking of RuntimeStub frame
3865     __ ret(0);
3866     return start;
3867   }
3868 
3869   address generate_upper_word_mask() {
3870     __ align(64);
3871     StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
3872     address start = __ pc();
3873     __ emit_data64(0x0000000000000000, relocInfo::none);
3874     __ emit_data64(0xFFFFFFFF00000000, relocInfo::none);
3875     return start;
3876   }
3877 
3878   address generate_shuffle_byte_flip_mask() {
3879     __ align(64);
3880     StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
3881     address start = __ pc();
3882     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3883     __ emit_data64(0x0001020304050607, relocInfo::none);
3884     return start;
3885   }
3886 
3887   // ofs and limit are used for the multi-block byte array.
3888   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
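  // A sketch of the multi_block contract (illustrative; mirrors the Java fallback, with a
  // 64-byte block size for SHA-1/SHA-256):
  //
  //   while (ofs <= limit) {
  //     implCompress(b, ofs);   // consume one block starting at b[ofs]
  //     ofs += 64;
  //   }
  //   return ofs;               // first unprocessed offset
  //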
3889   address generate_sha1_implCompress(bool multi_block, const char *name) {
3890     __ align(CodeEntryAlignment);
3891     StubCodeMark mark(this, "StubRoutines", name);
3892     address start = __ pc();
3893 
3894     Register buf = c_rarg0;
3895     Register state = c_rarg1;
3896     Register ofs = c_rarg2;
3897     Register limit = c_rarg3;
3898 
3899     const XMMRegister abcd = xmm0;
3900     const XMMRegister e0 = xmm1;
3901     const XMMRegister e1 = xmm2;
3902     const XMMRegister msg0 = xmm3;
3903 
3904     const XMMRegister msg1 = xmm4;
3905     const XMMRegister msg2 = xmm5;
3906     const XMMRegister msg3 = xmm6;
3907     const XMMRegister shuf_mask = xmm7;
3908 
3909     __ enter();
3910 
3911     __ subptr(rsp, 4 * wordSize);
3912 
3913     __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
3914       buf, state, ofs, limit, rsp, multi_block);
3915 
3916     __ addptr(rsp, 4 * wordSize);
3917 
3918     __ leave();
3919     __ ret(0);
3920     return start;
3921   }
3922 
3923   address generate_pshuffle_byte_flip_mask() {
3924     __ align(64);
3925     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
3926     address start = __ pc();
3927     __ emit_data64(0x0405060700010203, relocInfo::none);
3928     __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
3929 
3930     if (VM_Version::supports_avx2()) {
3931       __ emit_data64(0x0405060700010203, relocInfo::none); // second copy
3932       __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
3933       // _SHUF_00BA
3934       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3935       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3936       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3937       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3938       // _SHUF_DC00
3939       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3940       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3941       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3942       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3943     }
3944 
3945     return start;
3946   }
3947 
3948   // Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
3949   address generate_pshuffle_byte_flip_mask_sha512() {
3950     __ align(32);
3951     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512");
3952     address start = __ pc();
3953     if (VM_Version::supports_avx2()) {
3954       __ emit_data64(0x0001020304050607, relocInfo::none); // PSHUFFLE_BYTE_FLIP_MASK
3955       __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3956       __ emit_data64(0x1011121314151617, relocInfo::none);
3957       __ emit_data64(0x18191a1b1c1d1e1f, relocInfo::none);
3958       __ emit_data64(0x0000000000000000, relocInfo::none); //MASK_YMM_LO
3959       __ emit_data64(0x0000000000000000, relocInfo::none);
3960       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3961       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3962     }
3963 
3964     return start;
3965   }
3966 
3967   // ofs and limit are used for the multi-block byte array.
3968   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
3969   address generate_sha256_implCompress(bool multi_block, const char *name) {
3970     assert(VM_Version::supports_sha() || VM_Version::supports_avx2(), "");
3971     __ align(CodeEntryAlignment);
3972     StubCodeMark mark(this, "StubRoutines", name);
3973     address start = __ pc();
3974 
3975     Register buf = c_rarg0;
3976     Register state = c_rarg1;
3977     Register ofs = c_rarg2;
3978     Register limit = c_rarg3;
3979 
3980     const XMMRegister msg = xmm0;
3981     const XMMRegister state0 = xmm1;
3982     const XMMRegister state1 = xmm2;
3983     const XMMRegister msgtmp0 = xmm3;
3984 
3985     const XMMRegister msgtmp1 = xmm4;
3986     const XMMRegister msgtmp2 = xmm5;
3987     const XMMRegister msgtmp3 = xmm6;
3988     const XMMRegister msgtmp4 = xmm7;
3989 
3990     const XMMRegister shuf_mask = xmm8;
3991 
3992     __ enter();
3993 
3994     __ subptr(rsp, 4 * wordSize);
3995 
3996     if (VM_Version::supports_sha()) {
3997       __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3998         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3999     } else if (VM_Version::supports_avx2()) {
4000       __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4001         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4002     }
4003     __ addptr(rsp, 4 * wordSize);
4004     __ vzeroupper();
4005     __ leave();
4006     __ ret(0);
4007     return start;
4008   }
4009 
4010   address generate_sha512_implCompress(bool multi_block, const char *name) {
4011     assert(VM_Version::supports_avx2(), "");
4012     assert(VM_Version::supports_bmi2(), "");
4013     __ align(CodeEntryAlignment);
4014     StubCodeMark mark(this, "StubRoutines", name);
4015     address start = __ pc();
4016 
4017     Register buf = c_rarg0;
4018     Register state = c_rarg1;
4019     Register ofs = c_rarg2;
4020     Register limit = c_rarg3;
4021 
4022     const XMMRegister msg = xmm0;
4023     const XMMRegister state0 = xmm1;
4024     const XMMRegister state1 = xmm2;
4025     const XMMRegister msgtmp0 = xmm3;
4026     const XMMRegister msgtmp1 = xmm4;
4027     const XMMRegister msgtmp2 = xmm5;
4028     const XMMRegister msgtmp3 = xmm6;
4029     const XMMRegister msgtmp4 = xmm7;
4030 
4031     const XMMRegister shuf_mask = xmm8;
4032 
4033     __ enter();
4034 
4035     __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4036     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4037 
4038     __ vzeroupper();
4039     __ leave();
4040     __ ret(0);
4041     return start;
4042   }
4043 
4044   // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
4045   // to hide instruction latency
4046   //
4047   // Arguments:
4048   //
4049   // Inputs:
4050   //   c_rarg0   - source byte array address
4051   //   c_rarg1   - destination byte array address
4052   //   c_rarg2   - K (key) in little endian int array
4053   //   c_rarg3   - counter vector byte array address
4054   //   Linux
4055   //     c_rarg4   -          input length
4056   //     c_rarg5   -          saved encryptedCounter start
4057   //     rbp + 2 * wordSize - saved used length
4058   //   Windows
4059   //     rbp + 6 * wordSize - input length
4060   //     rbp + 7 * wordSize - saved encryptedCounter start
4061   //     rbp + 8 * wordSize - saved used length
4062   //
4063   // Output:
4064   //   rax       - input length
4065   //
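       // Reference model (illustrative sketch, not the generated code): in CTR mode
       //   out[i] = in[i] ^ AES_encrypt(key, counter_block(i))
       // for each 16-byte block, with the counter incremented once per block. Bytes
       // remaining from the previously encrypted counter block (used < 16) are
       // consumed first, and the last partially consumed encrypted counter is saved
       // for the next invocation.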
4066   address generate_counterMode_AESCrypt_Parallel() {
4067     assert(UseAES, "need AES instructions and misaligned SSE support");
4068     __ align(CodeEntryAlignment);
4069     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
4070     address start = __ pc();
4071     const Register from = c_rarg0; // source array address
4072     const Register to = c_rarg1; // destination array address
4073     const Register key = c_rarg2; // key array address
4074     const Register counter = c_rarg3; // counter byte array initialized from counter array address
4075                                       // and updated with the incremented counter in the end
4076 #ifndef _WIN64
4077     const Register len_reg = c_rarg4;
4078     const Register saved_encCounter_start = c_rarg5;
4079     const Register used_addr = r10;
4080     const Address  used_mem(rbp, 2 * wordSize);
4081     const Register used = r11;
4082 #else
4083     const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
4084     const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encryptedCounter start is on stack on Win64
4085     const Address used_mem(rbp, 8 * wordSize); // used length is on stack on Win64
4086     const Register len_reg = r10; // pick the first volatile windows register
4087     const Register saved_encCounter_start = r11;
4088     const Register used_addr = r13;
4089     const Register used = r14;
4090 #endif
4091     const Register pos = rax;
4092 
4093     const int PARALLEL_FACTOR = 6;
4094     const XMMRegister xmm_counter_shuf_mask = xmm0;
4095     const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
4096     const XMMRegister xmm_curr_counter = xmm2;
4097 
4098     const XMMRegister xmm_key_tmp0 = xmm3;
4099     const XMMRegister xmm_key_tmp1 = xmm4;
4100 
4101     // registers holding the six results in the parallelized loop
4102     const XMMRegister xmm_result0 = xmm5;
4103     const XMMRegister xmm_result1 = xmm6;
4104     const XMMRegister xmm_result2 = xmm7;
4105     const XMMRegister xmm_result3 = xmm8;
4106     const XMMRegister xmm_result4 = xmm9;
4107     const XMMRegister xmm_result5 = xmm10;
4108 
4109     const XMMRegister xmm_from0 = xmm11;
4110     const XMMRegister xmm_from1 = xmm12;
4111     const XMMRegister xmm_from2 = xmm13;
4112     const XMMRegister xmm_from3 = xmm14; //the last one is xmm14. we have to preserve it on WIN64.
4113     const XMMRegister xmm_from4 = xmm3; //reuse xmm3~4. Because xmm_key_tmp0~1 are useless when loading input text
4114     const XMMRegister xmm_from5 = xmm4;
4115 
4116     //for key_128, key_192, key_256
4117     const int rounds[3] = {10, 12, 14};
4118     Label L_exit_preLoop, L_preLoop_start;
4119     Label L_multiBlock_loopTop[3];
4120     Label L_singleBlockLoopTop[3];
4121     Label L__incCounter[3][6]; //for 6 blocks
4122     Label L__incCounter_single[3]; //for single block, key128, key192, key256
4123     Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
4124     Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
4125 
4126     Label L_exit;
4127 
4128     __ enter(); // required for proper stackwalking of RuntimeStub frame
4129 
4130     // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
4131     // context for the registers used, where all instructions below are using 128-bit mode.
4132     // On EVEX without VL and BW, these instructions will all be AVX.
4133     if (VM_Version::supports_avx512vlbw()) {
4134         __ movl(rax, 0xffff);
4135         __ kmovql(k1, rax);
4136     }
4137 
4138 #ifdef _WIN64
4139     // allocate spill slots for r13, r14
4140     enum {
4141         saved_r13_offset,
4142         saved_r14_offset
4143     };
4144     __ subptr(rsp, 2 * wordSize);
4145     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
4146     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
4147 
4148     // on win64, fill len_reg from stack position
4149     __ movl(len_reg, len_mem);
4150     __ movptr(saved_encCounter_start, saved_encCounter_mem);
4151     __ movptr(used_addr, used_mem);
4152     __ movl(used, Address(used_addr, 0));
4153 #else
4154     __ push(len_reg); // Save
4155     __ movptr(used_addr, used_mem);
4156     __ movl(used, Address(used_addr, 0));
4157 #endif
4158 
4159     __ push(rbx); // Save RBX
4160     __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
4161     __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()), pos); // pos as scratch
4162     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
4163     __ movptr(pos, 0);
4164 
4165     // Use the partially used encrypted counter from the last invocation
4166     __ BIND(L_preLoop_start);
4167     __ cmpptr(used, 16);
4168     __ jcc(Assembler::aboveEqual, L_exit_preLoop);
4169       __ cmpptr(len_reg, 0);
4170       __ jcc(Assembler::lessEqual, L_exit_preLoop);
4171       __ movb(rbx, Address(saved_encCounter_start, used));
4172       __ xorb(rbx, Address(from, pos));
4173       __ movb(Address(to, pos), rbx);
4174       __ addptr(pos, 1);
4175       __ addptr(used, 1);
4176       __ subptr(len_reg, 1);
4177 
4178     __ jmp(L_preLoop_start);
4179 
4180     __ BIND(L_exit_preLoop);
4181     __ movl(Address(used_addr, 0), used);
4182 
4183     // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
4184     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx); // rbx as scratch
4185     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4186     __ cmpl(rbx, 52);
4187     __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
4188     __ cmpl(rbx, 60);
4189     __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
4190 
4191 #define CTR_DoSix(opc, src_reg)                \
4192     __ opc(xmm_result0, src_reg);              \
4193     __ opc(xmm_result1, src_reg);              \
4194     __ opc(xmm_result2, src_reg);              \
4195     __ opc(xmm_result3, src_reg);              \
4196     __ opc(xmm_result4, src_reg);              \
4197     __ opc(xmm_result5, src_reg);
4198 
4199     // k == 0 :  generate code for key_128
4200     // k == 1 :  generate code for key_192
4201     // k == 2 :  generate code for key_256
4202     for (int k = 0; k < 3; ++k) {
4203       // multi-block loop starts here
4204       __ align(OptoLoopAlignment);
4205       __ BIND(L_multiBlock_loopTop[k]);
4206       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
4207       __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
4208       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4209 
4210       // load, then increment counters
4211       CTR_DoSix(movdqa, xmm_curr_counter);
4212       inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
4213       inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
4214       inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
4215       inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
4216       inc_counter(rbx, xmm_result5,  0x05, L__incCounter[k][4]);
4217       inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
4218       CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after incrementing, shuffle counters back for PXOR
4219       CTR_DoSix(pxor, xmm_key_tmp0);   //PXOR with Round 0 key
4220 
4221       //load two ROUND_KEYs at a time
4222       for (int i = 1; i < rounds[k]; ) {
4223         load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
4224         load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
4225         CTR_DoSix(aesenc, xmm_key_tmp1);
4226         i++;
4227         if (i != rounds[k]) {
4228           CTR_DoSix(aesenc, xmm_key_tmp0);
4229         } else {
4230           CTR_DoSix(aesenclast, xmm_key_tmp0);
4231         }
4232         i++;
4233       }
4234 
4235       // get next PARALLEL_FACTOR blocks into xmm_result registers
4236       __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4237       __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4238       __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4239       __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
4240       __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
4241       __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));
4242 
4243       __ pxor(xmm_result0, xmm_from0);
4244       __ pxor(xmm_result1, xmm_from1);
4245       __ pxor(xmm_result2, xmm_from2);
4246       __ pxor(xmm_result3, xmm_from3);
4247       __ pxor(xmm_result4, xmm_from4);
4248       __ pxor(xmm_result5, xmm_from5);
4249 
4250       // store 6 results into the next 96 bytes of output
4251       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4252       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
4253       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
4254       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
4255       __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
4256       __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);
4257 
4258       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // advance position past the encrypted blocks
4259       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
4260       __ jmp(L_multiBlock_loopTop[k]);
4261 
4262       // singleBlock starts here
4263       __ align(OptoLoopAlignment);
4264       __ BIND(L_singleBlockLoopTop[k]);
4265       __ cmpptr(len_reg, 0);
4266       __ jcc(Assembler::lessEqual, L_exit);
4267       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4268       __ movdqa(xmm_result0, xmm_curr_counter);
4269       inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
4270       __ pshufb(xmm_result0, xmm_counter_shuf_mask);
4271       __ pxor(xmm_result0, xmm_key_tmp0);
4272       for (int i = 1; i < rounds[k]; i++) {
4273         load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
4274         __ aesenc(xmm_result0, xmm_key_tmp0);
4275       }
4276       load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
4277       __ aesenclast(xmm_result0, xmm_key_tmp0);
4278       __ cmpptr(len_reg, AESBlockSize);
4279       __ jcc(Assembler::less, L_processTail_insr[k]);
4280         __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4281         __ pxor(xmm_result0, xmm_from0);
4282         __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4283         __ addptr(pos, AESBlockSize);
4284         __ subptr(len_reg, AESBlockSize);
4285         __ jmp(L_singleBlockLoopTop[k]);
4286       __ BIND(L_processTail_insr[k]);                               // Process the tail part of the input array
4287         __ addptr(pos, len_reg);                                    // 1. Insert bytes from src array into xmm_from0 register
4288         __ testptr(len_reg, 8);
4289         __ jcc(Assembler::zero, L_processTail_4_insr[k]);
4290           __ subptr(pos,8);
4291           __ pinsrq(xmm_from0, Address(from, pos), 0);
4292         __ BIND(L_processTail_4_insr[k]);
4293         __ testptr(len_reg, 4);
4294         __ jcc(Assembler::zero, L_processTail_2_insr[k]);
4295           __ subptr(pos,4);
4296           __ pslldq(xmm_from0, 4);
4297           __ pinsrd(xmm_from0, Address(from, pos), 0);
4298         __ BIND(L_processTail_2_insr[k]);
4299         __ testptr(len_reg, 2);
4300         __ jcc(Assembler::zero, L_processTail_1_insr[k]);
4301           __ subptr(pos, 2);
4302           __ pslldq(xmm_from0, 2);
4303           __ pinsrw(xmm_from0, Address(from, pos), 0);
4304         __ BIND(L_processTail_1_insr[k]);
4305         __ testptr(len_reg, 1);
4306         __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
4307           __ subptr(pos, 1);
4308           __ pslldq(xmm_from0, 1);
4309           __ pinsrb(xmm_from0, Address(from, pos), 0);
4310         __ BIND(L_processTail_exit_insr[k]);
4311 
4312         __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);  // 2. Perform pxor of the encrypted counter and plaintext Bytes.
4313         __ pxor(xmm_result0, xmm_from0);                             //    Also the encrypted counter is saved for next invocation.
4314 
4315         __ testptr(len_reg, 8);
4316         __ jcc(Assembler::zero, L_processTail_4_extr[k]);            // 3. Extract bytes from xmm_result0 into the dest. array
4317           __ pextrq(Address(to, pos), xmm_result0, 0);
4318           __ psrldq(xmm_result0, 8);
4319           __ addptr(pos, 8);
4320         __ BIND(L_processTail_4_extr[k]);
4321         __ testptr(len_reg, 4);
4322         __ jcc(Assembler::zero, L_processTail_2_extr[k]);
4323           __ pextrd(Address(to, pos), xmm_result0, 0);
4324           __ psrldq(xmm_result0, 4);
4325           __ addptr(pos, 4);
4326         __ BIND(L_processTail_2_extr[k]);
4327         __ testptr(len_reg, 2);
4328         __ jcc(Assembler::zero, L_processTail_1_extr[k]);
4329           __ pextrw(Address(to, pos), xmm_result0, 0);
4330           __ psrldq(xmm_result0, 2);
4331           __ addptr(pos, 2);
4332         __ BIND(L_processTail_1_extr[k]);
4333         __ testptr(len_reg, 1);
4334         __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
4335           __ pextrb(Address(to, pos), xmm_result0, 0);
4336 
4337         __ BIND(L_processTail_exit_extr[k]);
4338         __ movl(Address(used_addr, 0), len_reg);
4339         __ jmp(L_exit);
4340 
4341     }
4342 
4343     __ BIND(L_exit);
4344     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
4345     __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
4346     __ pop(rbx); // pop the saved RBX.
4347 #ifdef _WIN64
4348     __ movl(rax, len_mem);
4349     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
4350     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
4351     __ addptr(rsp, 2 * wordSize);
4352 #else
4353     __ pop(rax); // return 'len'
4354 #endif
4355     __ leave(); // required for proper stackwalking of RuntimeStub frame
4356     __ ret(0);
4357     return start;
4358   }
4359 
4360   // byte swap x86 long
4361   address generate_ghash_long_swap_mask() {
4362     __ align(CodeEntryAlignment);
4363     StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
4364     address start = __ pc();
4365     __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
4366     __ emit_data64(0x0706050403020100, relocInfo::none );
4367     return start;
4368   }
4369 
4370   // byte swap x86 byte array
4371   address generate_ghash_byte_swap_mask() {
4372     __ align(CodeEntryAlignment);
4373     StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
4374     address start = __ pc();
4375     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
4376     __ emit_data64(0x0001020304050607, relocInfo::none );
4377     return start;
4378   }
4379 
4380   /* Single and multi-block ghash operations */
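       // For each 16-byte block the loop below computes the GHASH recurrence
       //   state = (state ^ block) * H      (carry-less multiply in GF(2^128))
       // where H is the hash subkey; the multiplication and the two-phase
       // reduction that follow implement this field multiplication.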
4381   address generate_ghash_processBlocks() {
4382     __ align(CodeEntryAlignment);
4383     Label L_ghash_loop, L_exit;
4384     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4385     address start = __ pc();
4386 
4387     const Register state        = c_rarg0;
4388     const Register subkeyH      = c_rarg1;
4389     const Register data         = c_rarg2;
4390     const Register blocks       = c_rarg3;
4391 
4392     const XMMRegister xmm_temp0 = xmm0;
4393     const XMMRegister xmm_temp1 = xmm1;
4394     const XMMRegister xmm_temp2 = xmm2;
4395     const XMMRegister xmm_temp3 = xmm3;
4396     const XMMRegister xmm_temp4 = xmm4;
4397     const XMMRegister xmm_temp5 = xmm5;
4398     const XMMRegister xmm_temp6 = xmm6;
4399     const XMMRegister xmm_temp7 = xmm7;
4400     const XMMRegister xmm_temp8 = xmm8;
4401     const XMMRegister xmm_temp9 = xmm9;
4402     const XMMRegister xmm_temp10 = xmm10;
4403 
4404     __ enter();
4405 
4406     // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
4407     // context for the registers used, where all instructions below are using 128-bit mode.
4408     // On EVEX without VL and BW, these instructions will all be AVX.
4409     if (VM_Version::supports_avx512vlbw()) {
4410       __ movl(rax, 0xffff);
4411       __ kmovql(k1, rax);
4412     }
4413 
4414     __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
4415 
4416     __ movdqu(xmm_temp0, Address(state, 0));
4417     __ pshufb(xmm_temp0, xmm_temp10);
4418 
4419 
4420     __ BIND(L_ghash_loop);
4421     __ movdqu(xmm_temp2, Address(data, 0));
4422     __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
4423 
4424     __ movdqu(xmm_temp1, Address(subkeyH, 0));
4425     __ pshufb(xmm_temp1, xmm_temp10);
4426 
4427     __ pxor(xmm_temp0, xmm_temp2);
4428 
4429     //
4430     // Multiply with the hash key
4431     //
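         // Schoolbook decomposition of the 128x128-bit carry-less product of
         // a = xmm0 (a1:a0) and b = xmm1 (b1:b0):
         //   a*b = (a1*b1) << 128  ^  (a1*b0 ^ a0*b1) << 64  ^  (a0*b0)
         // The four pclmulqdq results below are combined exactly this way.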
4432     __ movdqu(xmm_temp3, xmm_temp0);
4433     __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
4434     __ movdqu(xmm_temp4, xmm_temp0);
4435     __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
4436 
4437     __ movdqu(xmm_temp5, xmm_temp0);
4438     __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
4439     __ movdqu(xmm_temp6, xmm_temp0);
4440     __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
4441 
4442     __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
4443 
4444     __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
4445     __ psrldq(xmm_temp4, 8);    // shift xmm4 right by 64 bits
4446     __ pslldq(xmm_temp5, 8);    // shift xmm5 left by 64 bits
4447     __ pxor(xmm_temp3, xmm_temp5);
4448     __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
4449                                         // of the carry-less multiplication of
4450                                         // xmm0 by xmm1.
4451 
4452     // We shift the result of the multiplication by one bit position
4453     // to the left to compensate for the fact that the bits are reversed.
4454     __ movdqu(xmm_temp7, xmm_temp3);
4455     __ movdqu(xmm_temp8, xmm_temp6);
4456     __ pslld(xmm_temp3, 1);
4457     __ pslld(xmm_temp6, 1);
4458     __ psrld(xmm_temp7, 31);
4459     __ psrld(xmm_temp8, 31);
4460     __ movdqu(xmm_temp9, xmm_temp7);
4461     __ pslldq(xmm_temp8, 4);
4462     __ pslldq(xmm_temp7, 4);
4463     __ psrldq(xmm_temp9, 12);
4464     __ por(xmm_temp3, xmm_temp7);
4465     __ por(xmm_temp6, xmm_temp8);
4466     __ por(xmm_temp6, xmm_temp9);
4467 
4468     //
4469     // First phase of the reduction
4470     //
4471     // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
4472     // independently.
4473     __ movdqu(xmm_temp7, xmm_temp3);
4474     __ movdqu(xmm_temp8, xmm_temp3);
4475     __ movdqu(xmm_temp9, xmm_temp3);
4476     __ pslld(xmm_temp7, 31);    // packed left shift, shifting << 31
4477     __ pslld(xmm_temp8, 30);    // packed left shift, shifting << 30
4478     __ pslld(xmm_temp9, 25);    // packed left shift, shifting << 25
4479     __ pxor(xmm_temp7, xmm_temp8);      // xor the shifted versions
4480     __ pxor(xmm_temp7, xmm_temp9);
4481     __ movdqu(xmm_temp8, xmm_temp7);
4482     __ pslldq(xmm_temp7, 12);
4483     __ psrldq(xmm_temp8, 4);
4484     __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
4485 
4486     //
4487     // Second phase of the reduction
4488     //
4489     // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
4490     // shift operations.
4491     __ movdqu(xmm_temp2, xmm_temp3);
4492     __ movdqu(xmm_temp4, xmm_temp3);
4493     __ movdqu(xmm_temp5, xmm_temp3);
4494     __ psrld(xmm_temp2, 1);     // packed right shift, shifting >> 1
4495     __ psrld(xmm_temp4, 2);     // packed right shift, shifting >> 2
4496     __ psrld(xmm_temp5, 7);     // packed right shift, shifting >> 7
4497     __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
4498     __ pxor(xmm_temp2, xmm_temp5);
4499     __ pxor(xmm_temp2, xmm_temp8);
4500     __ pxor(xmm_temp3, xmm_temp2);
4501     __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
4502 
4503     __ decrement(blocks);
4504     __ jcc(Assembler::zero, L_exit);
4505     __ movdqu(xmm_temp0, xmm_temp6);
4506     __ addptr(data, 16);
4507     __ jmp(L_ghash_loop);
4508 
4509     __ BIND(L_exit);
4510     __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
4511     __ movdqu(Address(state, 0), xmm_temp6);   // store the result
4512     __ leave();
4513     __ ret(0);
4514     return start;
4515   }
4516 
4517   /**
4518    *  Arguments:
4519    *
4520    * Inputs:
4521    *   c_rarg0   - int crc
4522    *   c_rarg1   - byte* buf
4523    *   c_rarg2   - int length
4524    *
4525    * Output:
4526    *       rax   - int crc result
4527    */
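       // Scalar reference model (sketch only) of what the accelerated kernel
       // computes -- the standard reflected CRC-32 used by java.util.zip.CRC32
       // (polynomial 0xEDB88320). Ignoring the initial/final inversion of crc,
       // each byte is folded in as
       //   crc = crc_table[(crc ^ b) & 0xff] ^ (crc >>> 8);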
4528   address generate_updateBytesCRC32() {
4529     assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
4530 
4531     __ align(CodeEntryAlignment);
4532     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4533 
4534     address start = __ pc();
4535     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4536     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4537     // rscratch1: r10
4538     const Register crc   = c_rarg0;  // crc
4539     const Register buf   = c_rarg1;  // source java byte array address
4540     const Register len   = c_rarg2;  // length
4541     const Register table = c_rarg3;  // crc_table address (reuse register)
4542     const Register tmp   = r11;
4543     assert_different_registers(crc, buf, len, table, tmp, rax);
4544 
4545     BLOCK_COMMENT("Entry:");
4546     __ enter(); // required for proper stackwalking of RuntimeStub frame
4547 
4548     __ kernel_crc32(crc, buf, len, table, tmp);
4549 
4550     __ movl(rax, crc);
4551     __ vzeroupper();
4552     __ leave(); // required for proper stackwalking of RuntimeStub frame
4553     __ ret(0);
4554 
4555     return start;
4556   }
4557 
4558   /**
4559   *  Arguments:
4560   *
4561   * Inputs:
4562   *   c_rarg0   - int crc
4563   *   c_rarg1   - byte* buf
4564   *   c_rarg2   - long length
4565   *   c_rarg3   - table_start - optional (present only when doing a library_call,
4566   *              not used by x86 algorithm)
4567   *
4568   * Output:
4569   *       rax   - int crc result
4570   */
4571   address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
4572       assert(UseCRC32CIntrinsics, "need SSE4_2");
4573       __ align(CodeEntryAlignment);
4574       StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4575       address start = __ pc();
4576       //reg.arg        int#0        int#1        int#2        int#3        int#4        int#5        float regs
4577       //Windows        RCX          RDX          R8           R9           none         none         XMM0..XMM3
4578       //Lin / Sol      RDI          RSI          RDX          RCX          R8           R9           XMM0..XMM7
4579       const Register crc = c_rarg0;  // crc
4580       const Register buf = c_rarg1;  // source java byte array address
4581       const Register len = c_rarg2;  // length
4582       const Register a = rax;
4583       const Register j = r9;
4584       const Register k = r10;
4585       const Register l = r11;
4586 #ifdef _WIN64
4587       const Register y = rdi;
4588       const Register z = rsi;
4589 #else
4590       const Register y = rcx;
4591       const Register z = r8;
4592 #endif
4593       assert_different_registers(crc, buf, len, a, j, k, l, y, z);
4594 
4595       BLOCK_COMMENT("Entry:");
4596       __ enter(); // required for proper stackwalking of RuntimeStub frame
4597 #ifdef _WIN64
4598       __ push(y);
4599       __ push(z);
4600 #endif
4601       __ crc32c_ipl_alg2_alt2(crc, buf, len,
4602                               a, j, k,
4603                               l, y, z,
4604                               c_farg0, c_farg1, c_farg2,
4605                               is_pclmulqdq_supported);
4606       __ movl(rax, crc);
4607 #ifdef _WIN64
4608       __ pop(z);
4609       __ pop(y);
4610 #endif
4611       __ vzeroupper();
4612       __ leave(); // required for proper stackwalking of RuntimeStub frame
4613       __ ret(0);
4614 
4615       return start;
4616   }
4617 
4618   /**
4619    *  Arguments:
4620    *
4621    *  Input:
4622    *    c_rarg0   - x address
4623    *    c_rarg1   - x length
4624    *    c_rarg2   - y address
4625    *    c_rarg3   - y length
4626    * not Win64
4627    *    c_rarg4   - z address
4628    *    c_rarg5   - z length
4629    * Win64
4630    *    rsp+40    - z address
4631    *    rsp+48    - z length
4632    */
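       // Conceptually (illustrative sketch, not the generated kernel): z = x * y
       // on arrays of 32-bit limbs in BigInteger magnitude order, accumulating the
       // schoolbook partial products x[i] * y[j] with carries into a z of
       // xlen + ylen limbs.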
4633   address generate_multiplyToLen() {
4634     __ align(CodeEntryAlignment);
4635     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4636 
4637     address start = __ pc();
4638     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4639     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4640     const Register x     = rdi;
4641     const Register xlen  = rax;
4642     const Register y     = rsi;
4643     const Register ylen  = rcx;
4644     const Register z     = r8;
4645     const Register zlen  = r11;
4646 
4647     // Next registers will be saved on stack in multiply_to_len().
4648     const Register tmp1  = r12;
4649     const Register tmp2  = r13;
4650     const Register tmp3  = r14;
4651     const Register tmp4  = r15;
4652     const Register tmp5  = rbx;
4653 
4654     BLOCK_COMMENT("Entry:");
4655     __ enter(); // required for proper stackwalking of RuntimeStub frame
4656 
4657 #ifndef _WIN64
4658     __ movptr(zlen, r9); // Save r9 in r11 - zlen
4659 #endif
4660     setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx
4661                        // ylen => rcx, z => r8, zlen => r11
4662                        // r9 and r10 may be used to save non-volatile registers
4663 #ifdef _WIN64
4664     // last 2 arguments (#4, #5) are on stack on Win64
4665     __ movptr(z, Address(rsp, 6 * wordSize));
4666     __ movptr(zlen, Address(rsp, 7 * wordSize));
4667 #endif
4668 
4669     __ movptr(xlen, rsi);
4670     __ movptr(y,    rdx);
4671     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5);
4672 
4673     restore_arg_regs();
4674 
4675     __ leave(); // required for proper stackwalking of RuntimeStub frame
4676     __ ret(0);
4677 
4678     return start;
4679   }
4680 
4681   /**
4682   *  Arguments:
4683   *
4684   *  Input:
4685   *    c_rarg0   - obja     address
4686   *    c_rarg1   - objb     address
4687   *    c_rarg3   - length   length
4688   *    c_rarg4   - scale    log2_array_indxscale
4689   *
4690   *  Output:
4691   *        rax   - int: >= 0 index of first mismatch, < 0 bitwise complement of tail
4692   */
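       // Reference semantics (sketch): elements are 2^scale bytes wide. The stub
       // returns the index of the first differing element, or, when the vectorized
       // comparison finds no difference, the bitwise complement of the number of
       // tail elements left for the caller to compare.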
4693   address generate_vectorizedMismatch() {
4694     __ align(CodeEntryAlignment);
4695     StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch");
4696     address start = __ pc();
4697 
4698     BLOCK_COMMENT("Entry:");
4699     __ enter();
4700 
4701 #ifdef _WIN64  // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4702     const Register scale = c_rarg0;  //rcx, will exchange with r9
4703     const Register objb = c_rarg1;   //rdx
4704     const Register length = c_rarg2; //r8
4705     const Register obja = c_rarg3;   //r9
4706     __ xchgq(obja, scale);  //now obja and scale contains the correct contents
4707 
4708     const Register tmp1 = r10;
4709     const Register tmp2 = r11;
4710 #endif
4711 #ifndef _WIN64 // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4712     const Register obja = c_rarg0;   //U:rdi
4713     const Register objb = c_rarg1;   //U:rsi
4714     const Register length = c_rarg2; //U:rdx
4715     const Register scale = c_rarg3;  //U:rcx
4716     const Register tmp1 = r8;
4717     const Register tmp2 = r9;
4718 #endif
4719     const Register result = rax; //return value
4720     const XMMRegister vec0 = xmm0;
4721     const XMMRegister vec1 = xmm1;
4722     const XMMRegister vec2 = xmm2;
4723 
4724     __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
4725 
4726     __ vzeroupper();
4727     __ leave();
4728     __ ret(0);
4729 
4730     return start;
4731   }
4732 
4733   /**
4734    *  Arguments:
4735    *
4736    *  Input:
4737    *    c_rarg0   - x address
4738    *    c_rarg1   - x length
4739    *    c_rarg2   - z address
4740    *    c_rarg3   - z length
4741    *
4742    */
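       // Conceptually (sketch only): z = x * x on 32-bit limbs in BigInteger
       // magnitude order, with z sized 2 * len limbs; a squaring kernel can halve
       // the multiply count by squaring each limb and doubling the cross products.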
4743   address generate_squareToLen() {
4744 
4745     __ align(CodeEntryAlignment);
4746     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4747 
4748     address start = __ pc();
4749     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4750     // Unix:  rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
4751     const Register x      = rdi;
4752     const Register len    = rsi;
4753     const Register z      = r8;
4754     const Register zlen   = rcx;
4755 
4756     const Register tmp1      = r12;
4757     const Register tmp2      = r13;
4758     const Register tmp3      = r14;
4759     const Register tmp4      = r15;
4760     const Register tmp5      = rbx;
4761 
4762     BLOCK_COMMENT("Entry:");
4763     __ enter(); // required for proper stackwalking of RuntimeStub frame
4764 
4765     setup_arg_regs(4); // x => rdi, len => rsi, z => rdx
4766                        // zlen => rcx
4767                        // r9 and r10 may be used to save non-volatile registers
4768     __ movptr(r8, rdx);
4769     __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
4770 
4771     restore_arg_regs();
4772 
4773     __ leave(); // required for proper stackwalking of RuntimeStub frame
4774     __ ret(0);
4775 
4776     return start;
4777   }
4778 
4779    /**
4780    *  Arguments:
4781    *
4782    *  Input:
4783    *    c_rarg0   - out address
4784    *    c_rarg1   - in address
4785    *    c_rarg2   - offset
4786    *    c_rarg3   - len
4787    * not Win64
4788    *    c_rarg4   - k
4789    * Win64
4790    *    rsp+40    - k
4791    */
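       // Reference semantics (sketch, mirroring BigInteger.mulAdd): add the len-limb
       // quantity in * k into out at position offset, propagating carries; the
       // final carry is returned in rax.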
4792   address generate_mulAdd() {
4793     __ align(CodeEntryAlignment);
4794     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4795 
4796     address start = __ pc();
4797     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4798     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4799     const Register out     = rdi;
4800     const Register in      = rsi;
4801     const Register offset  = r11;
4802     const Register len     = rcx;
4803     const Register k       = r8;
4804 
4805     // Next registers will be saved on stack in mul_add().
4806     const Register tmp1  = r12;
4807     const Register tmp2  = r13;
4808     const Register tmp3  = r14;
4809     const Register tmp4  = r15;
4810     const Register tmp5  = rbx;
4811 
4812     BLOCK_COMMENT("Entry:");
4813     __ enter(); // required for proper stackwalking of RuntimeStub frame
4814 
4815     setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx
4816                        // len => rcx, k => r8
4817                        // r9 and r10 may be used to save non-volatile registers
4818 #ifdef _WIN64
4819     // last argument is on stack on Win64
4820     __ movl(k, Address(rsp, 6 * wordSize));
4821 #endif
4822     __ movptr(r11, rdx);  // move offset in rdx to offset(r11)
4823     __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
4824 
4825     restore_arg_regs();
4826 
4827     __ leave(); // required for proper stackwalking of RuntimeStub frame
4828     __ ret(0);
4829 
4830     return start;
4831   }
4832 
4833   address generate_libmExp() {
4834     StubCodeMark mark(this, "StubRoutines", "libmExp");
4835 
4836     address start = __ pc();
4837 
4838     const XMMRegister x0  = xmm0;
4839     const XMMRegister x1  = xmm1;
4840     const XMMRegister x2  = xmm2;
4841     const XMMRegister x3  = xmm3;
4842 
4843     const XMMRegister x4  = xmm4;
4844     const XMMRegister x5  = xmm5;
4845     const XMMRegister x6  = xmm6;
4846     const XMMRegister x7  = xmm7;
4847 
4848     const Register tmp   = r11;
4849 
4850     BLOCK_COMMENT("Entry:");
4851     __ enter(); // required for proper stackwalking of RuntimeStub frame
4852 
4853     __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
4854 
4855     __ leave(); // required for proper stackwalking of RuntimeStub frame
4856     __ ret(0);
4857 
4858     return start;
4859 
4860   }
4861 
4862   address generate_libmLog() {
4863     StubCodeMark mark(this, "StubRoutines", "libmLog");
4864 
4865     address start = __ pc();
4866 
4867     const XMMRegister x0 = xmm0;
4868     const XMMRegister x1 = xmm1;
4869     const XMMRegister x2 = xmm2;
4870     const XMMRegister x3 = xmm3;
4871 
4872     const XMMRegister x4 = xmm4;
4873     const XMMRegister x5 = xmm5;
4874     const XMMRegister x6 = xmm6;
4875     const XMMRegister x7 = xmm7;
4876 
4877     const Register tmp1 = r11;
4878     const Register tmp2 = r8;
4879 
4880     BLOCK_COMMENT("Entry:");
4881     __ enter(); // required for proper stackwalking of RuntimeStub frame
4882 
4883     __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2);
4884 
4885     __ leave(); // required for proper stackwalking of RuntimeStub frame
4886     __ ret(0);
4887 
4888     return start;
4889 
4890   }
4891 
4892   address generate_libmLog10() {
4893     StubCodeMark mark(this, "StubRoutines", "libmLog10");
4894 
4895     address start = __ pc();
4896 
4897     const XMMRegister x0 = xmm0;
4898     const XMMRegister x1 = xmm1;
4899     const XMMRegister x2 = xmm2;
4900     const XMMRegister x3 = xmm3;
4901 
4902     const XMMRegister x4 = xmm4;
4903     const XMMRegister x5 = xmm5;
4904     const XMMRegister x6 = xmm6;
4905     const XMMRegister x7 = xmm7;
4906 
4907     const Register tmp = r11;
4908 
4909     BLOCK_COMMENT("Entry:");
4910     __ enter(); // required for proper stackwalking of RuntimeStub frame
4911 
4912     __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
4913 
4914     __ leave(); // required for proper stackwalking of RuntimeStub frame
4915     __ ret(0);
4916 
4917     return start;
4918 
4919   }
4920 
4921   address generate_libmPow() {
4922     StubCodeMark mark(this, "StubRoutines", "libmPow");
4923 
4924     address start = __ pc();
4925 
4926     const XMMRegister x0 = xmm0;
4927     const XMMRegister x1 = xmm1;
4928     const XMMRegister x2 = xmm2;
4929     const XMMRegister x3 = xmm3;
4930 
4931     const XMMRegister x4 = xmm4;
4932     const XMMRegister x5 = xmm5;
4933     const XMMRegister x6 = xmm6;
4934     const XMMRegister x7 = xmm7;
4935 
4936     const Register tmp1 = r8;
4937     const Register tmp2 = r9;
4938     const Register tmp3 = r10;
4939     const Register tmp4 = r11;
4940 
4941     BLOCK_COMMENT("Entry:");
4942     __ enter(); // required for proper stackwalking of RuntimeStub frame
4943 
4944     __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
4945 
4946     __ leave(); // required for proper stackwalking of RuntimeStub frame
4947     __ ret(0);
4948 
4949     return start;
4950 
4951   }
4952 
4953   address generate_libmSin() {
4954     StubCodeMark mark(this, "StubRoutines", "libmSin");
4955 
4956     address start = __ pc();
4957 
4958     const XMMRegister x0 = xmm0;
4959     const XMMRegister x1 = xmm1;
4960     const XMMRegister x2 = xmm2;
4961     const XMMRegister x3 = xmm3;
4962 
4963     const XMMRegister x4 = xmm4;
4964     const XMMRegister x5 = xmm5;
4965     const XMMRegister x6 = xmm6;
4966     const XMMRegister x7 = xmm7;
4967 
4968     const Register tmp1 = r8;
4969     const Register tmp2 = r9;
4970     const Register tmp3 = r10;
4971     const Register tmp4 = r11;
4972 
4973     BLOCK_COMMENT("Entry:");
4974     __ enter(); // required for proper stackwalking of RuntimeStub frame
4975 
4976 #ifdef _WIN64
4977     __ push(rsi);
4978     __ push(rdi);
4979 #endif
4980     __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
4981 
4982 #ifdef _WIN64
4983     __ pop(rdi);
4984     __ pop(rsi);
4985 #endif
4986 
4987     __ leave(); // required for proper stackwalking of RuntimeStub frame
4988     __ ret(0);
4989 
4990     return start;
4991 
4992   }
4993 
4994   address generate_libmCos() {
4995     StubCodeMark mark(this, "StubRoutines", "libmCos");
4996 
4997     address start = __ pc();
4998 
4999     const XMMRegister x0 = xmm0;
5000     const XMMRegister x1 = xmm1;
5001     const XMMRegister x2 = xmm2;
5002     const XMMRegister x3 = xmm3;
5003 
5004     const XMMRegister x4 = xmm4;
5005     const XMMRegister x5 = xmm5;
5006     const XMMRegister x6 = xmm6;
5007     const XMMRegister x7 = xmm7;
5008 
5009     const Register tmp1 = r8;
5010     const Register tmp2 = r9;
5011     const Register tmp3 = r10;
5012     const Register tmp4 = r11;
5013 
5014     BLOCK_COMMENT("Entry:");
5015     __ enter(); // required for proper stackwalking of RuntimeStub frame
5016 
5017 #ifdef _WIN64
5018     __ push(rsi);
5019     __ push(rdi);
5020 #endif
5021     __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5022 
5023 #ifdef _WIN64
5024     __ pop(rdi);
5025     __ pop(rsi);
5026 #endif
5027 
5028     __ leave(); // required for proper stackwalking of RuntimeStub frame
5029     __ ret(0);
5030 
5031     return start;
5032 
5033   }
5034 
5035   address generate_libmTan() {
5036     StubCodeMark mark(this, "StubRoutines", "libmTan");
5037 
5038     address start = __ pc();
5039 
5040     const XMMRegister x0 = xmm0;
5041     const XMMRegister x1 = xmm1;
5042     const XMMRegister x2 = xmm2;
5043     const XMMRegister x3 = xmm3;
5044 
5045     const XMMRegister x4 = xmm4;
5046     const XMMRegister x5 = xmm5;
5047     const XMMRegister x6 = xmm6;
5048     const XMMRegister x7 = xmm7;
5049 
5050     const Register tmp1 = r8;
5051     const Register tmp2 = r9;
5052     const Register tmp3 = r10;
5053     const Register tmp4 = r11;
5054 
5055     BLOCK_COMMENT("Entry:");
5056     __ enter(); // required for proper stackwalking of RuntimeStub frame
5057 
5058 #ifdef _WIN64
5059     __ push(rsi);
5060     __ push(rdi);
5061 #endif
5062     __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5063 
5064 #ifdef _WIN64
5065     __ pop(rdi);
5066     __ pop(rsi);
5067 #endif
5068 
5069     __ leave(); // required for proper stackwalking of RuntimeStub frame
5070     __ ret(0);
5071 
5072     return start;
5073 
5074   }
5075 
5076 #undef __
5077 #define __ masm->
5078 
5079   // Continuation point for throwing of implicit exceptions that are
5080   // not handled in the current activation. Fabricates an exception
5081   // oop and initiates normal exception dispatching in this
5082   // frame. Since we need to preserve callee-saved values (currently
5083   // only for C2, but done for C1 as well) we need a callee-saved oop
5084   // map and therefore have to make these stubs into RuntimeStubs
5085   // rather than BufferBlobs.  If the compiler needs all registers to
5086   // be preserved between the fault point and the exception handler
5087   // then it must assume responsibility for that in
5088   // AbstractCompiler::continuation_for_implicit_null_exception or
5089   // continuation_for_implicit_division_by_zero_exception. All other
5090   // implicit exceptions (e.g., NullPointerException or
5091   // AbstractMethodError on entry) are either at call sites or
5092   // otherwise assume that stack unwinding will be initiated, so
5093   // caller saved registers were assumed volatile in the compiler.
5094   address generate_throw_exception(const char* name,
5095                                    address runtime_entry,
5096                                    Register arg1 = noreg,
5097                                    Register arg2 = noreg) {
5098     // Information about frame layout at time of blocking runtime call.
5099     // Note that we only have to preserve callee-saved registers since
5100     // the compilers are responsible for supplying a continuation point
5101     // if they expect all registers to be preserved.
5102     enum layout {
5103       rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
5104       rbp_off2,
5105       return_off,
5106       return_off2,
5107       framesize // inclusive of return address
5108     };
5109 
5110     int insts_size = 512;
5111     int locs_size  = 64;
5112 
5113     CodeBuffer code(name, insts_size, locs_size);
5114     OopMapSet* oop_maps  = new OopMapSet();
5115     MacroAssembler* masm = new MacroAssembler(&code);
5116 
5117     address start = __ pc();
5118 
5119     // This is an inlined and slightly modified version of call_VM
5120     // which has the ability to fetch the return PC out of
5121     // thread-local storage and also sets up last_Java_sp slightly
5122     // differently than the real call_VM
5123 
5124     __ enter(); // required for proper stackwalking of RuntimeStub frame
5125 
5126     assert(is_even(framesize/2), "sp not 16-byte aligned");
5127 
5128     // return address and rbp are already in place
5129     __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
5130 
5131     int frame_complete = __ pc() - start;
5132 
5133     // Set up last_Java_sp and last_Java_fp
5134     address the_pc = __ pc();
5135     __ set_last_Java_frame(rsp, rbp, the_pc);
5136     __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
5137 
5138     // Call runtime
5139     if (arg1 != noreg) {
5140       assert(arg2 != c_rarg1, "clobbered");
5141       __ movptr(c_rarg1, arg1);
5142     }
5143     if (arg2 != noreg) {
5144       __ movptr(c_rarg2, arg2);
5145     }
5146     __ movptr(c_rarg0, r15_thread);
5147     BLOCK_COMMENT("call runtime_entry");
5148     __ call(RuntimeAddress(runtime_entry));
5149 
5150     // Generate oop map
5151     OopMap* map = new OopMap(framesize, 0);
5152 
5153     oop_maps->add_gc_map(the_pc - start, map);
5154 
5155     __ reset_last_Java_frame(true);
5156 
5157     __ leave(); // required for proper stackwalking of RuntimeStub frame
5158 
5159     // check for pending exceptions
5160 #ifdef ASSERT
5161     Label L;
5162     __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()),
5163             (int32_t) NULL_WORD);
5164     __ jcc(Assembler::notEqual, L);
5165     __ should_not_reach_here();
5166     __ bind(L);
5167 #endif // ASSERT
5168     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
5169 
5170 
5171     // codeBlob framesize is in words (not VMRegImpl::slot_size)
5172     RuntimeStub* stub =
5173       RuntimeStub::new_runtime_stub(name,
5174                                     &code,
5175                                     frame_complete,
5176                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
5177                                     oop_maps, false);
5178     return stub->entry_point();
5179   }
5180 
5181   void create_control_words() {
5182     // Round to nearest, 53-bit mode, exceptions masked
5183     StubRoutines::_fpu_cntrl_wrd_std   = 0x027F;
5184     // Round to zero, 53-bit mode, exceptions masked
5185     StubRoutines::_fpu_cntrl_wrd_trunc = 0x0D7F;
5186     // Round to nearest, 24-bit mode, exceptions masked
5187     StubRoutines::_fpu_cntrl_wrd_24    = 0x007F;
5188     // Round to nearest, 64-bit mode, exceptions masked
5189     StubRoutines::_fpu_cntrl_wrd_64    = 0x037F;
5190     // Round to nearest, all exceptions masked (MXCSR default)
5191     StubRoutines::_mxcsr_std           = 0x1F80;
5192     // Note: the following two constants are 80-bit values
5193     //       layout is critical for correct loading by FPU.
5194     // Bias for strict fp multiply/divide
5195     StubRoutines::_fpu_subnormal_bias1[0]= 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000
5196     StubRoutines::_fpu_subnormal_bias1[1]= 0x80000000;
5197     StubRoutines::_fpu_subnormal_bias1[2]= 0x03ff;
5198     // Un-Bias for strict fp multiply/divide
5199     StubRoutines::_fpu_subnormal_bias2[0]= 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000
5200     StubRoutines::_fpu_subnormal_bias2[1]= 0x80000000;
5201     StubRoutines::_fpu_subnormal_bias2[2]= 0x7bff;
5202   }
5203 
5204   // Initialization
5205   void generate_initial() {
5206     // Generates all stubs and initializes the entry points
5207 
5208     // These platform-specific settings are needed by generate_call_stub()
5209     create_control_words();
5210 
5211     // Entry points that exist in all platforms. Note: This is code
5212     // that could be shared among different platforms - however the
5213     // benefit seems to be smaller than the disadvantage of having a
5214     // much more complicated generator structure. See also comment in
5215     // stubRoutines.hpp.
5216 
5217     StubRoutines::_forward_exception_entry = generate_forward_exception();
5218 
5219     StubRoutines::_call_stub_entry =
5220       generate_call_stub(StubRoutines::_call_stub_return_address);
5221 
5222     // is referenced by megamorphic call
5223     StubRoutines::_catch_exception_entry = generate_catch_exception();
5224 
5225     // atomic calls
5226     StubRoutines::_atomic_xchg_entry          = generate_atomic_xchg();
5227     StubRoutines::_atomic_xchg_long_entry     = generate_atomic_xchg_long();
5228     StubRoutines::_atomic_cmpxchg_entry       = generate_atomic_cmpxchg();
5229     StubRoutines::_atomic_cmpxchg_byte_entry  = generate_atomic_cmpxchg_byte();
5230     StubRoutines::_atomic_cmpxchg_long_entry  = generate_atomic_cmpxchg_long();
5231     StubRoutines::_atomic_add_entry           = generate_atomic_add();
5232     StubRoutines::_atomic_add_long_entry      = generate_atomic_add_long();
5233     StubRoutines::_fence_entry                = generate_orderaccess_fence();
5234 
5235     // platform dependent
5236     StubRoutines::x86::_get_previous_fp_entry = generate_get_previous_fp();
5237     StubRoutines::x86::_get_previous_sp_entry = generate_get_previous_sp();
5238 
5239     StubRoutines::x86::_verify_mxcsr_entry    = generate_verify_mxcsr();
5240 
5241     // Build this early so it's available for the interpreter.
5242     StubRoutines::_throw_StackOverflowError_entry =
5243       generate_throw_exception("StackOverflowError throw_exception",
5244                                CAST_FROM_FN_PTR(address,
5245                                                 SharedRuntime::
5246                                                 throw_StackOverflowError));
5247     StubRoutines::_throw_delayed_StackOverflowError_entry =
5248       generate_throw_exception("delayed StackOverflowError throw_exception",
5249                                CAST_FROM_FN_PTR(address,
5250                                                 SharedRuntime::
5251                                                 throw_delayed_StackOverflowError));
5252     if (UseCRC32Intrinsics) {
5253       // Set the table address before generating the stubs that use it
5254       StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
5255       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5256     }
5257 
5258     if (UseCRC32CIntrinsics) {
5259       bool supports_clmul = VM_Version::supports_clmul();
5260       StubRoutines::x86::generate_CRC32C_table(supports_clmul);
5261       StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
5262       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
5263     }
5264     if (VM_Version::supports_sse2() && UseLibmIntrinsic && InlineIntrinsics) {
5265       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
5266           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
5267           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
5268         StubRoutines::x86::_ONEHALF_adr = (address)StubRoutines::x86::_ONEHALF;
5269         StubRoutines::x86::_P_2_adr = (address)StubRoutines::x86::_P_2;
5270         StubRoutines::x86::_SC_4_adr = (address)StubRoutines::x86::_SC_4;
5271         StubRoutines::x86::_Ctable_adr = (address)StubRoutines::x86::_Ctable;
5272         StubRoutines::x86::_SC_2_adr = (address)StubRoutines::x86::_SC_2;
5273         StubRoutines::x86::_SC_3_adr = (address)StubRoutines::x86::_SC_3;
5274         StubRoutines::x86::_SC_1_adr = (address)StubRoutines::x86::_SC_1;
5275         StubRoutines::x86::_PI_INV_TABLE_adr = (address)StubRoutines::x86::_PI_INV_TABLE;
5276         StubRoutines::x86::_PI_4_adr = (address)StubRoutines::x86::_PI_4;
5277         StubRoutines::x86::_PI32INV_adr = (address)StubRoutines::x86::_PI32INV;
5278         StubRoutines::x86::_SIGN_MASK_adr = (address)StubRoutines::x86::_SIGN_MASK;
5279         StubRoutines::x86::_P_1_adr = (address)StubRoutines::x86::_P_1;
5280         StubRoutines::x86::_P_3_adr = (address)StubRoutines::x86::_P_3;
5281         StubRoutines::x86::_NEG_ZERO_adr = (address)StubRoutines::x86::_NEG_ZERO;
5282       }
5283       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
5284         StubRoutines::_dexp = generate_libmExp();
5285       }
5286       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5287         StubRoutines::_dlog = generate_libmLog();
5288       }
5289       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
5290         StubRoutines::_dlog10 = generate_libmLog10();
5291       }
5292       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
5293         StubRoutines::_dpow = generate_libmPow();
5294       }
5295       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5296         StubRoutines::_dsin = generate_libmSin();
5297       }
5298       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5299         StubRoutines::_dcos = generate_libmCos();
5300       }
5301       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
5302         StubRoutines::_dtan = generate_libmTan();
5303       }
5304     }
5305   }
5306 
5307   void generate_all() {
5308     // Generates all stubs and initializes the entry points
5309 
5310     // These entry points require SharedInfo::stack0 to be set up in
5311     // non-core builds and need to be relocatable, so they each
5312     // fabricate a RuntimeStub internally.
5313     StubRoutines::_throw_AbstractMethodError_entry =
5314       generate_throw_exception("AbstractMethodError throw_exception",
5315                                CAST_FROM_FN_PTR(address,
5316                                                 SharedRuntime::
5317                                                 throw_AbstractMethodError));
5318 
5319     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5320       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5321                                CAST_FROM_FN_PTR(address,
5322                                                 SharedRuntime::
5323                                                 throw_IncompatibleClassChangeError));
5324 
5325     StubRoutines::_throw_NullPointerException_at_call_entry =
5326       generate_throw_exception("NullPointerException at call throw_exception",
5327                                CAST_FROM_FN_PTR(address,
5328                                                 SharedRuntime::
5329                                                 throw_NullPointerException_at_call));
5330 
5331     // entry points that are platform specific
5332     StubRoutines::x86::_f2i_fixup = generate_f2i_fixup();
5333     StubRoutines::x86::_f2l_fixup = generate_f2l_fixup();
5334     StubRoutines::x86::_d2i_fixup = generate_d2i_fixup();
5335     StubRoutines::x86::_d2l_fixup = generate_d2l_fixup();
5336 
5337     StubRoutines::x86::_float_sign_mask  = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
5338     StubRoutines::x86::_float_sign_flip  = generate_fp_mask("float_sign_flip",  0x8000000080000000);
5339     StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
5340     StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
5341 
5342     // support for verify_oop (must happen after universe_init)
5343     StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
5344 
5345     // arraycopy stubs used by compilers
5346     generate_arraycopy_stubs();
5347 
5348     // don't bother generating these AES intrinsic stubs unless global flag is set
5349     if (UseAESIntrinsics) {
5350       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
5351       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5352       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5353       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5354       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
5355     }
5356     if (UseAESCTRIntrinsics){
5357       StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
5358       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
5359     }
5360 
    if (UseSHA1Intrinsics) {
      StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
      StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
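      // Expand the 16 x 16-byte _k256 table into the 16 x 32-byte _k256_W
      // table by copying each group of four round constants into both
      // 128-bit halves of a 32-byte slot, so a 256-bit load sees the same
      // constants in each lane.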
      char* dst = (char*)StubRoutines::x86::_k256_W;
      char* src = (char*)StubRoutines::x86::_k256;
      for (int ii = 0; ii < 16; ++ii) {
        memcpy(dst + 32 * ii,      src + 16 * ii, 16);
        memcpy(dst + 32 * ii + 16, src + 16 * ii, 16);
      }
      StubRoutines::x86::_k256_W_adr = (address)StubRoutines::x86::_k256_W;
      StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::x86::_k512_W_addr = (address)StubRoutines::x86::_k512_W;
      StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = generate_pshuffle_byte_flip_mask_sha512();
      StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
    }

    // Generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
      StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    // Safefetch stubs.
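    // Each stub loads a word from a possibly unmapped address; if the load
    // faults, the signal handler redirects execution from the recorded fault
    // pc to the continuation pc, which returns the caller-supplied default
    // value instead of crashing the VM.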
    generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
                                                       &StubRoutines::_safefetch32_fault_pc,
                                                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                                                       &StubRoutines::_safefetchN_fault_pc,
                                                       &StubRoutines::_safefetchN_continuation_pc);
#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }
    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }
    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }
#ifndef _WINDOWS
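    // Unlike the generated stubs above, the Montgomery multiply/square
    // entries point directly at the C++ implementations in SharedRuntime.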
    if (UseMontgomeryMultiplyIntrinsic) {
      StubRoutines::_montgomeryMultiply
        = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
    }
    if (UseMontgomerySquareIntrinsic) {
      StubRoutines::_montgomerySquare
        = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
    }
#endif // !_WINDOWS
#endif // COMPILER2

    if (UseVectorizedMismatchIntrinsic) {
      StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
    }
  }

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

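// Entry point used by the StubRoutines initialization code: it is called
// twice during VM startup, first with all == false to generate the initial
// stubs and later (after universe initialization) with all == true for the
// remaining stubs.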
void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}