1 /*
   2  * Copyright (c) 1999, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/macroAssembler.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "gc/shared/barrierSetNMethod.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "memory/universe.hpp"
  33 #include "nativeInst_x86.hpp"
  34 #include "oops/instanceOop.hpp"
  35 #include "oops/method.hpp"
  36 #include "oops/objArrayKlass.hpp"
  37 #include "oops/oop.inline.hpp"
  38 #include "prims/methodHandles.hpp"
  39 #include "runtime/frame.inline.hpp"
  40 #include "runtime/handles.inline.hpp"
  41 #include "runtime/sharedRuntime.hpp"
  42 #include "runtime/stubCodeGenerator.hpp"
  43 #include "runtime/stubRoutines.hpp"
  44 #include "runtime/thread.inline.hpp"
  45 #ifdef COMPILER2
  46 #include "opto/runtime.hpp"
  47 #endif
  48 
  49 // Declaration and definition of StubGenerator (no .hpp file).
  50 // For a more detailed description of the stub routine structure
  51 // see the comment in stubRoutines.hpp
  52 
  53 #define __ _masm->
  54 #define a__ ((Assembler*)_masm)->
  55 
  56 #ifdef PRODUCT
  57 #define BLOCK_COMMENT(str) /* nothing */
  58 #else
  59 #define BLOCK_COMMENT(str) __ block_comment(str)
  60 #endif
  61 
  62 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  63 
  64 const int MXCSR_MASK  = 0xFFC0;  // Mask out any pending exceptions
  65 const int FPU_CNTRL_WRD_MASK = 0xFFFF;
  66 
  67 // -------------------------------------------------------------------------------------------------------------------------
  68 // Stub Code definitions
  69 
  70 class StubGenerator: public StubCodeGenerator {
  71  private:
  72 
  73 #ifdef PRODUCT
  74 #define inc_counter_np(counter) ((void)0)
  75 #else
  76   void inc_counter_np_(int& counter) {
  77     __ incrementl(ExternalAddress((address)&counter));
  78   }
  79 #define inc_counter_np(counter) \
  80   BLOCK_COMMENT("inc_counter " #counter); \
  81   inc_counter_np_(counter);
  82 #endif //PRODUCT
  83 
  84   void inc_copy_counter_np(BasicType t) {
  85 #ifndef PRODUCT
  86     switch (t) {
  87     case T_BYTE:    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); return;
  88     case T_SHORT:   inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); return;
  89     case T_INT:     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); return;
  90     case T_LONG:    inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); return;
  91     case T_OBJECT:  inc_counter_np(SharedRuntime::_oop_array_copy_ctr); return;
  92     default:        ShouldNotReachHere();
  93     }
  94 #endif //PRODUCT
  95   }
  96 
  97   //------------------------------------------------------------------------------------------------------------------------
  98   // Call stubs are used to call Java from C
  99   //
 100   //    [ return_from_Java     ] <--- rsp
 101   //    [ argument word n      ]
 102   //      ...
 103   // -N [ argument word 1      ]
 104   // -7 [ Possible padding for stack alignment ]
 105   // -6 [ Possible padding for stack alignment ]
 106   // -5 [ Possible padding for stack alignment ]
 107   // -4 [ mxcsr save           ] <--- rsp_after_call
 108   // -3 [ saved rbx,            ]
 109   // -2 [ saved rsi            ]
 110   // -1 [ saved rdi            ]
 111   //  0 [ saved rbp,            ] <--- rbp,
 112   //  1 [ return address       ]
 113   //  2 [ ptr. to call wrapper ]
 114   //  3 [ result               ]
 115   //  4 [ result_type          ]
 116   //  5 [ method               ]
 117   //  6 [ entry_point          ]
 118   //  7 [ parameters           ]
 119   //  8 [ parameter_size       ]
 120   //  9 [ thread               ]
 121 
 122 
 123   address generate_call_stub(address& return_address) {
 124     StubCodeMark mark(this, "StubRoutines", "call_stub");
 125     address start = __ pc();
 126 
 127     // stub code parameters / addresses
 128     assert(frame::entry_frame_call_wrapper_offset == 2, "adjust this code");
 129     bool  sse_save = false;
 130     const Address rsp_after_call(rbp, -4 * wordSize); // same as in generate_catch_exception()!
 131     const int     locals_count_in_bytes  (4*wordSize);
 132     const Address mxcsr_save    (rbp, -4 * wordSize);
 133     const Address saved_rbx     (rbp, -3 * wordSize);
 134     const Address saved_rsi     (rbp, -2 * wordSize);
 135     const Address saved_rdi     (rbp, -1 * wordSize);
 136     const Address result        (rbp,  3 * wordSize);
 137     const Address result_type   (rbp,  4 * wordSize);
 138     const Address method        (rbp,  5 * wordSize);
 139     const Address entry_point   (rbp,  6 * wordSize);
 140     const Address parameters    (rbp,  7 * wordSize);
 141     const Address parameter_size(rbp,  8 * wordSize);
 142     const Address thread        (rbp,  9 * wordSize); // same as in generate_catch_exception()!
 143     sse_save =  UseSSE > 0;
 144 
 145     // stub code
 146     __ enter();
 147     __ movptr(rcx, parameter_size);              // parameter counter
 148     __ shlptr(rcx, Interpreter::logStackElementSize); // convert parameter count to bytes
 149     __ addptr(rcx, locals_count_in_bytes);       // reserve space for register saves
 150     __ subptr(rsp, rcx);
 151     __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
 152 
 153     // save rdi, rsi, & rbx, according to C calling conventions
 154     __ movptr(saved_rdi, rdi);
 155     __ movptr(saved_rsi, rsi);
 156     __ movptr(saved_rbx, rbx);
 157 
 158     // save and initialize %mxcsr
 159     if (sse_save) {
 160       Label skip_ldmx;
 161       __ stmxcsr(mxcsr_save);
 162       __ movl(rax, mxcsr_save);
 163       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 164       ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
 165       __ cmp32(rax, mxcsr_std);
 166       __ jcc(Assembler::equal, skip_ldmx);
 167       __ ldmxcsr(mxcsr_std);
 168       __ bind(skip_ldmx);
 169     }
 170 
 171     // make sure the control word is correct.
 172     __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
 173 
 174 #ifdef ASSERT
 175     // make sure we have no pending exceptions
 176     { Label L;
 177       __ movptr(rcx, thread);
 178       __ cmpptr(Address(rcx, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 179       __ jcc(Assembler::equal, L);
 180       __ stop("StubRoutines::call_stub: entered with pending exception");
 181       __ bind(L);
 182     }
 183 #endif
 184 
 185     // pass parameters if any
 186     BLOCK_COMMENT("pass parameters if any");
 187     Label parameters_done;
 188     __ movl(rcx, parameter_size);  // parameter counter
 189     __ testl(rcx, rcx);
 190     __ jcc(Assembler::zero, parameters_done);
 191 
 192     // parameter passing loop
 193 
 194     Label loop;
 195     // Copy Java parameters in reverse order (receiver last)
 196     // Note that the argument order is inverted in the process
 197     // source is rdx[rcx: N-1..0]
 198     // dest   is rsp[rbx: 0..N-1]
 199 
 200     __ movptr(rdx, parameters);          // parameter pointer
 201     __ xorptr(rbx, rbx);
 202 
 203     __ BIND(loop);
 204 
 205     // get parameter
 206     __ movptr(rax, Address(rdx, rcx, Interpreter::stackElementScale(), -wordSize));
 207     __ movptr(Address(rsp, rbx, Interpreter::stackElementScale(),
 208                     Interpreter::expr_offset_in_bytes(0)), rax);          // store parameter
 209     __ increment(rbx);
 210     __ decrement(rcx);
 211     __ jcc(Assembler::notZero, loop);
 212 
 213     // call Java function
 214     __ BIND(parameters_done);
 215     __ movptr(rbx, method);           // get Method*
 216     __ movptr(rax, entry_point);      // get entry_point
 217     __ mov(rsi, rsp);                 // set sender sp
 218     BLOCK_COMMENT("call Java function");
 219     __ call(rax);
 220 
 221     BLOCK_COMMENT("call_stub_return_address:");
 222     return_address = __ pc();
 223 
 224 #ifdef COMPILER2
 225     {
 226       Label L_skip;
 227       if (UseSSE >= 2) {
 228         __ verify_FPU(0, "call_stub_return");
 229       } else {
 230         for (int i = 1; i < 8; i++) {
 231           __ ffree(i);
 232         }
 233 
 234         // UseSSE <= 1 so double result should be left on TOS
 235         __ movl(rsi, result_type);
 236         __ cmpl(rsi, T_DOUBLE);
 237         __ jcc(Assembler::equal, L_skip);
 238         if (UseSSE == 0) {
 239           // UseSSE == 0 so float result should be left on TOS
 240           __ cmpl(rsi, T_FLOAT);
 241           __ jcc(Assembler::equal, L_skip);
 242         }
 243         __ ffree(0);
 244       }
 245       __ BIND(L_skip);
 246     }
 247 #endif // COMPILER2
 248 
 249     // store result depending on type
 250     // (everything that is not T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 251     __ movptr(rdi, result);
 252     Label is_long, is_float, is_double, exit;
 253     __ movl(rsi, result_type);
 254     __ cmpl(rsi, T_LONG);
 255     __ jcc(Assembler::equal, is_long);
 256     __ cmpl(rsi, T_FLOAT);
 257     __ jcc(Assembler::equal, is_float);
 258     __ cmpl(rsi, T_DOUBLE);
 259     __ jcc(Assembler::equal, is_double);
 260 
 261     // handle T_INT case
 262     __ movl(Address(rdi, 0), rax);
 263     __ BIND(exit);
 264 
 265     // check that FPU stack is empty
 266     __ verify_FPU(0, "generate_call_stub");
 267 
 268     // pop parameters
 269     __ lea(rsp, rsp_after_call);
 270 
 271     // restore %mxcsr
 272     if (sse_save) {
 273       __ ldmxcsr(mxcsr_save);
 274     }
 275 
 276     // restore rdi, rsi and rbx,
 277     __ movptr(rbx, saved_rbx);
 278     __ movptr(rsi, saved_rsi);
 279     __ movptr(rdi, saved_rdi);
 280     __ addptr(rsp, 4*wordSize);
 281 
 282     // return
 283     __ pop(rbp);
 284     __ ret(0);
 285 
 286     // handle return types different from T_INT
 287     __ BIND(is_long);
 288     __ movl(Address(rdi, 0 * wordSize), rax);
 289     __ movl(Address(rdi, 1 * wordSize), rdx);
 290     __ jmp(exit);
 291 
 292     __ BIND(is_float);
 293     // interpreter uses xmm0 for return values
 294     if (UseSSE >= 1) {
 295       __ movflt(Address(rdi, 0), xmm0);
 296     } else {
 297       __ fstp_s(Address(rdi, 0));
 298     }
 299     __ jmp(exit);
 300 
 301     __ BIND(is_double);
 302     // interpreter uses xmm0 for return values
 303     if (UseSSE >= 2) {
 304       __ movdbl(Address(rdi, 0), xmm0);
 305     } else {
 306       __ fstp_d(Address(rdi, 0));
 307     }
 308     __ jmp(exit);
 309 
 310     return start;
 311   }
 312 
 313 
 314   //------------------------------------------------------------------------------------------------------------------------
 315   // Return point for a Java call if there's an exception thrown in Java code.
 316   // The exception is caught and transformed into a pending exception stored in
 317   // JavaThread that can be tested from within the VM.
 318   //
 319   // Note: Usually the parameters are removed by the callee. In case of an exception
 320   //       crossing an activation frame boundary, that is not the case if the callee
 321   //       is compiled code => need to setup the rsp.
 322   //
 323   // rax,: exception oop
 324 
 325   address generate_catch_exception() {
 326     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 327     const Address rsp_after_call(rbp, -4 * wordSize); // same as in generate_call_stub()!
 328     const Address thread        (rbp,  9 * wordSize); // same as in generate_call_stub()!
 329     address start = __ pc();
 330 
 331     // get thread directly
 332     __ movptr(rcx, thread);
 333 #ifdef ASSERT
 334     // verify that threads correspond
 335     { Label L;
 336       __ get_thread(rbx);
 337       __ cmpptr(rbx, rcx);
 338       __ jcc(Assembler::equal, L);
 339       __ stop("StubRoutines::catch_exception: threads must correspond");
 340       __ bind(L);
 341     }
 342 #endif
 343     // set pending exception
 344     __ verify_oop(rax);
 345     __ movptr(Address(rcx, Thread::pending_exception_offset()), rax          );
 346     __ lea(Address(rcx, Thread::exception_file_offset   ()),
 347            ExternalAddress((address)__FILE__));
 348     __ movl(Address(rcx, Thread::exception_line_offset   ()), __LINE__ );
 349     // complete return to VM
 350     assert(StubRoutines::_call_stub_return_address != NULL, "_call_stub_return_address must have been generated before");
 351     __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
 352 
 353     return start;
 354   }
 355 
 356 
 357   //------------------------------------------------------------------------------------------------------------------------
 358   // Continuation point for runtime calls returning with a pending exception.
 359   // The pending exception check happened in the runtime or native call stub.
 360   // The pending exception in Thread is converted into a Java-level exception.
 361   //
 362   // Contract with Java-level exception handlers:
 363   // rax: exception
 364   // rdx: throwing pc
 365   //
 366   // NOTE: At entry of this stub, exception-pc must be on stack !!
 367 
 368   address generate_forward_exception() {
 369     StubCodeMark mark(this, "StubRoutines", "forward exception");
 370     address start = __ pc();
 371     const Register thread = rcx;
 372 
 373     // other registers used in this stub
 374     const Register exception_oop = rax;
 375     const Register handler_addr  = rbx;
 376     const Register exception_pc  = rdx;
 377 
 378     // Upon entry, the sp points to the return address returning into Java
 379     // (interpreted or compiled) code; i.e., the return address becomes the
 380     // throwing pc.
 381     //
 382     // Arguments pushed before the runtime call are still on the stack but
 383     // the exception handler will reset the stack pointer -> ignore them.
 384     // A potential result in registers can be ignored as well.
 385 
 386 #ifdef ASSERT
 387     // make sure this code is only executed if there is a pending exception
 388     { Label L;
 389       __ get_thread(thread);
 390       __ cmpptr(Address(thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 391       __ jcc(Assembler::notEqual, L);
 392       __ stop("StubRoutines::forward exception: no pending exception (1)");
 393       __ bind(L);
 394     }
 395 #endif
 396 
 397     // compute exception handler into rbx,
 398     __ get_thread(thread);
 399     __ movptr(exception_pc, Address(rsp, 0));
 400     BLOCK_COMMENT("call exception_handler_for_return_address");
 401     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), thread, exception_pc);
 402     __ mov(handler_addr, rax);
 403 
 404     // setup rax & rdx, remove return address & clear pending exception
 405     __ get_thread(thread);
 406     __ pop(exception_pc);
 407     __ movptr(exception_oop, Address(thread, Thread::pending_exception_offset()));
 408     __ movptr(Address(thread, Thread::pending_exception_offset()), NULL_WORD);
 409 
 410 #ifdef ASSERT
 411     // make sure exception is set
 412     { Label L;
 413       __ testptr(exception_oop, exception_oop);
 414       __ jcc(Assembler::notEqual, L);
 415       __ stop("StubRoutines::forward exception: no pending exception (2)");
 416       __ bind(L);
 417     }
 418 #endif
 419 
 420     // Verify that there is really a valid exception in RAX.
 421     __ verify_oop(exception_oop);
 422 
 423     // continue at exception handler (return address removed)
 424     // rax: exception
 425     // rbx: exception handler
 426     // rdx: throwing pc
 427     __ jmp(handler_addr);
 428 
 429     return start;
 430   }
 431 
 432 
 433   //----------------------------------------------------------------------------------------------------
 434   // Implementation of int32_t atomic_xchg(int32_t exchange_value, volatile int32_t* dest)
 435   // used by Atomic::xchg(volatile int32_t* dest, int32_t exchange_value)
 436   //
 437   // xchg exists as far back as 8086, lock needed for MP only
 438   // Stack layout immediately after call:
 439   //
 440   // 0 [ret addr ] <--- rsp
 441   // 1 [  ex     ]
 442   // 2 [  dest   ]
 443   //
 444   // Result:   *dest <- ex, return (old *dest)
 445   //
 446   // Note: win32 does not currently use this code
 447 
 448   address generate_atomic_xchg() {
 449     StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
 450     address start = __ pc();
 451 
 452     __ push(rdx);
 453     Address exchange(rsp, 2 * wordSize);
 454     Address dest_addr(rsp, 3 * wordSize);
 455     __ movl(rax, exchange);
 456     __ movptr(rdx, dest_addr);
 457     __ xchgl(rax, Address(rdx, 0));
 458     __ pop(rdx);
 459     __ ret(0);
 460 
 461     return start;
 462   }
 463 
 464   //----------------------------------------------------------------------------------------------------
 465   // Support for void verify_mxcsr()
 466   //
 467   // This routine is used with -Xcheck:jni to verify that native
 468   // JNI code does not return to Java code without restoring the
 469   // MXCSR register to our expected state.
 470 
 471 
 472   address generate_verify_mxcsr() {
 473     StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
 474     address start = __ pc();
 475 
 476     const Address mxcsr_save(rsp, 0);
 477 
 478     if (CheckJNICalls && UseSSE > 0 ) {
 479       Label ok_ret;
 480       ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
 481       __ push(rax);
 482       __ subptr(rsp, wordSize);      // allocate a temp location
 483       __ stmxcsr(mxcsr_save);
 484       __ movl(rax, mxcsr_save);
 485       __ andl(rax, MXCSR_MASK);
 486       __ cmp32(rax, mxcsr_std);
 487       __ jcc(Assembler::equal, ok_ret);
 488 
 489       __ warn("MXCSR changed by native JNI code.");
 490 
 491       __ ldmxcsr(mxcsr_std);
 492 
 493       __ bind(ok_ret);
 494       __ addptr(rsp, wordSize);
 495       __ pop(rax);
 496     }
 497 
 498     __ ret(0);
 499 
 500     return start;
 501   }
 502 
 503 
 504   //---------------------------------------------------------------------------
 505   // Support for void verify_fpu_cntrl_wrd()
 506   //
 507   // This routine is used with -Xcheck:jni to verify that native
 508   // JNI code does not return to Java code without restoring the
 509   // FP control word to our expected state.
 510 
 511   address generate_verify_fpu_cntrl_wrd() {
 512     StubCodeMark mark(this, "StubRoutines", "verify_spcw");
 513     address start = __ pc();
 514 
 515     const Address fpu_cntrl_wrd_save(rsp, 0);
 516 
 517     if (CheckJNICalls) {
 518       Label ok_ret;
 519       __ push(rax);
 520       __ subptr(rsp, wordSize);      // allocate a temp location
 521       __ fnstcw(fpu_cntrl_wrd_save);
 522       __ movl(rax, fpu_cntrl_wrd_save);
 523       __ andl(rax, FPU_CNTRL_WRD_MASK);
 524       ExternalAddress fpu_std(StubRoutines::addr_fpu_cntrl_wrd_std());
 525       __ cmp32(rax, fpu_std);
 526       __ jcc(Assembler::equal, ok_ret);
 527 
 528       __ warn("Floating point control word changed by native JNI code.");
 529 
 530       __ fldcw(fpu_std);
 531 
 532       __ bind(ok_ret);
 533       __ addptr(rsp, wordSize);
 534       __ pop(rax);
 535     }
 536 
 537     __ ret(0);
 538 
 539     return start;
 540   }
 541 
 542   //---------------------------------------------------------------------------
 543   // Wrapper for slow-case handling of double-to-integer conversion
 544   // d2i or f2i fast case failed either because it is nan or because
 545   // of under/overflow.
 546   // Input:  FPU TOS: float value
 547   // Output: rax, (rdx): integer (long) result
 548 
 549   address generate_d2i_wrapper(BasicType t, address fcn) {
 550     StubCodeMark mark(this, "StubRoutines", "d2i_wrapper");
 551     address start = __ pc();
 552 
 553   // Capture info about frame layout
 554   enum layout { FPUState_off         = 0,
 555                 rbp_off              = FPUStateSizeInWords,
 556                 rdi_off,
 557                 rsi_off,
 558                 rcx_off,
 559                 rbx_off,
 560                 saved_argument_off,
 561                 saved_argument_off2, // 2nd half of double
 562                 framesize
 563   };
 564 
 565   assert(FPUStateSizeInWords == 27, "update stack layout");
 566 
 567     // Save outgoing argument to stack across push_FPU_state()
 568     __ subptr(rsp, wordSize * 2);
 569     __ fstp_d(Address(rsp, 0));
 570 
 571     // Save CPU & FPU state
 572     __ push(rbx);
 573     __ push(rcx);
 574     __ push(rsi);
 575     __ push(rdi);
 576     __ push(rbp);
 577     __ push_FPU_state();
 578 
 579     // push_FPU_state() resets the FP top of stack
 580     // Load original double into FP top of stack
 581     __ fld_d(Address(rsp, saved_argument_off * wordSize));
 582     // Store double into stack as outgoing argument
 583     __ subptr(rsp, wordSize*2);
 584     __ fst_d(Address(rsp, 0));
 585 
 586     // Prepare FPU for doing math in C-land
 587     __ empty_FPU_stack();
 588     // Call the C code to massage the double.  Result in EAX
 589     if (t == T_INT)
 590       { BLOCK_COMMENT("SharedRuntime::d2i"); }
 591     else if (t == T_LONG)
 592       { BLOCK_COMMENT("SharedRuntime::d2l"); }
 593     __ call_VM_leaf( fcn, 2 );
 594 
 595     // Restore CPU & FPU state
 596     __ pop_FPU_state();
 597     __ pop(rbp);
 598     __ pop(rdi);
 599     __ pop(rsi);
 600     __ pop(rcx);
 601     __ pop(rbx);
 602     __ addptr(rsp, wordSize * 2);
 603 
 604     __ ret(0);
 605 
 606     return start;
 607   }
 608   //---------------------------------------------------------------------------------------------------
 609 
 610   address generate_vector_mask(const char *stub_name, int32_t mask) {
 611     __ align(CodeEntryAlignment);
 612     StubCodeMark mark(this, "StubRoutines", stub_name);
 613     address start = __ pc();
 614 
 615     for (int i = 0; i < 16; i++) {
 616       __ emit_data(mask, relocInfo::none, 0);
 617     }
 618 
 619     return start;
 620   }
 621 
 622   address generate_iota_indices(const char *stub_name) {
 623     __ align(CodeEntryAlignment);
 624     StubCodeMark mark(this, "StubRoutines", stub_name);
 625     address start = __ pc();
 626     __ emit_data(0x03020100, relocInfo::none, 0);
 627     __ emit_data(0x07060504, relocInfo::none, 0);
 628     __ emit_data(0x0B0A0908, relocInfo::none, 0);
 629     __ emit_data(0x0F0E0D0C, relocInfo::none, 0);
 630     __ emit_data(0x13121110, relocInfo::none, 0);
 631     __ emit_data(0x17161514, relocInfo::none, 0);
 632     __ emit_data(0x1B1A1918, relocInfo::none, 0);
 633     __ emit_data(0x1F1E1D1C, relocInfo::none, 0);
 634     __ emit_data(0x23222120, relocInfo::none, 0);
 635     __ emit_data(0x27262524, relocInfo::none, 0);
 636     __ emit_data(0x2B2A2928, relocInfo::none, 0);
 637     __ emit_data(0x2F2E2D2C, relocInfo::none, 0);
 638     __ emit_data(0x33323130, relocInfo::none, 0);
 639     __ emit_data(0x37363534, relocInfo::none, 0);
 640     __ emit_data(0x3B3A3938, relocInfo::none, 0);
 641     __ emit_data(0x3F3E3D3C, relocInfo::none, 0);
 642     return start;
 643   }
 644 
 645   address generate_vector_mask_long_double(const char *stub_name, int32_t maskhi, int32_t masklo) {
 646     __ align(CodeEntryAlignment);
 647     StubCodeMark mark(this, "StubRoutines", stub_name);
 648     address start = __ pc();
 649 
 650     for (int i = 0; i < 8; i++) {
 651       __ emit_data(masklo, relocInfo::none, 0);
 652       __ emit_data(maskhi, relocInfo::none, 0);
 653     }
 654 
 655     return start;
 656   }
 657 
 658   //----------------------------------------------------------------------------------------------------
 659 
 660   address generate_vector_byte_perm_mask(const char *stub_name) {
 661     __ align(CodeEntryAlignment);
 662     StubCodeMark mark(this, "StubRoutines", stub_name);
 663     address start = __ pc();
 664 
 665     __ emit_data(0x00000001, relocInfo::none, 0);
 666     __ emit_data(0x00000000, relocInfo::none, 0);
 667     __ emit_data(0x00000003, relocInfo::none, 0);
 668     __ emit_data(0x00000000, relocInfo::none, 0);
 669     __ emit_data(0x00000005, relocInfo::none, 0);
 670     __ emit_data(0x00000000, relocInfo::none, 0);
 671     __ emit_data(0x00000007, relocInfo::none, 0);
 672     __ emit_data(0x00000000, relocInfo::none, 0);
 673     __ emit_data(0x00000000, relocInfo::none, 0);
 674     __ emit_data(0x00000000, relocInfo::none, 0);
 675     __ emit_data(0x00000002, relocInfo::none, 0);
 676     __ emit_data(0x00000000, relocInfo::none, 0);
 677     __ emit_data(0x00000004, relocInfo::none, 0);
 678     __ emit_data(0x00000000, relocInfo::none, 0);
 679     __ emit_data(0x00000006, relocInfo::none, 0);
 680     __ emit_data(0x00000000, relocInfo::none, 0);
 681 
 682     return start;
 683   }
 684 
 685   address generate_vector_custom_i32(const char *stub_name, Assembler::AvxVectorLen len,
 686                                      int32_t val0, int32_t val1, int32_t val2, int32_t val3,
 687                                      int32_t val4 = 0, int32_t val5 = 0, int32_t val6 = 0, int32_t val7 = 0,
 688                                      int32_t val8 = 0, int32_t val9 = 0, int32_t val10 = 0, int32_t val11 = 0,
 689                                      int32_t val12 = 0, int32_t val13 = 0, int32_t val14 = 0, int32_t val15 = 0) {
 690     __ align(CodeEntryAlignment);
 691     StubCodeMark mark(this, "StubRoutines", stub_name);
 692     address start = __ pc();
 693 
 694     assert(len != Assembler::AVX_NoVec, "vector len must be specified");
 695     __ emit_data(val0, relocInfo::none, 0);
 696     __ emit_data(val1, relocInfo::none, 0);
 697     __ emit_data(val2, relocInfo::none, 0);
 698     __ emit_data(val3, relocInfo::none, 0);
 699     if (len >= Assembler::AVX_256bit) {
 700       __ emit_data(val4, relocInfo::none, 0);
 701       __ emit_data(val5, relocInfo::none, 0);
 702       __ emit_data(val6, relocInfo::none, 0);
 703       __ emit_data(val7, relocInfo::none, 0);
 704       if (len >= Assembler::AVX_512bit) {
 705         __ emit_data(val8, relocInfo::none, 0);
 706         __ emit_data(val9, relocInfo::none, 0);
 707         __ emit_data(val10, relocInfo::none, 0);
 708         __ emit_data(val11, relocInfo::none, 0);
 709         __ emit_data(val12, relocInfo::none, 0);
 710         __ emit_data(val13, relocInfo::none, 0);
 711         __ emit_data(val14, relocInfo::none, 0);
 712         __ emit_data(val15, relocInfo::none, 0);
 713       }
 714     }
 715 
 716     return start;
 717   }
 718 
 719   //----------------------------------------------------------------------------------------------------
 720   // Non-destructive plausibility checks for oops
 721 
 722   address generate_verify_oop() {
 723     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 724     address start = __ pc();
 725 
 726     // Incoming arguments on stack after saving rax,:
 727     //
 728     // [tos    ]: saved rdx
 729     // [tos + 1]: saved EFLAGS
 730     // [tos + 2]: return address
 731     // [tos + 3]: char* error message
 732     // [tos + 4]: oop   object to verify
 733     // [tos + 5]: saved rax, - saved by caller and bashed
 734 
 735     Label exit, error;
 736     __ pushf();
 737     __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 738     __ push(rdx);                                // save rdx
 739     // make sure object is 'reasonable'
 740     __ movptr(rax, Address(rsp, 4 * wordSize));    // get object
 741     __ testptr(rax, rax);
 742     __ jcc(Assembler::zero, exit);               // if obj is NULL it is ok
 743 
 744     // Check if the oop is in the right area of memory
 745     const int oop_mask = Universe::verify_oop_mask();
 746     const int oop_bits = Universe::verify_oop_bits();
 747     __ mov(rdx, rax);
 748     __ andptr(rdx, oop_mask);
 749     __ cmpptr(rdx, oop_bits);
 750     __ jcc(Assembler::notZero, error);
 751 
 752     // make sure klass is 'reasonable', which is not zero.
 753     __ movptr(rax, Address(rax, oopDesc::klass_offset_in_bytes())); // get klass
 754     __ testptr(rax, rax);
 755     __ jcc(Assembler::zero, error);              // if klass is NULL it is broken
 756 
 757     // return if everything seems ok
 758     __ bind(exit);
 759     __ movptr(rax, Address(rsp, 5 * wordSize));  // get saved rax, back
 760     __ pop(rdx);                                 // restore rdx
 761     __ popf();                                   // restore EFLAGS
 762     __ ret(3 * wordSize);                        // pop arguments
 763 
 764     // handle errors
 765     __ bind(error);
 766     __ movptr(rax, Address(rsp, 5 * wordSize));  // get saved rax, back
 767     __ pop(rdx);                                 // get saved rdx back
 768     __ popf();                                   // get saved EFLAGS off stack -- will be ignored
 769     __ pusha();                                  // push registers (eip = return address & msg are already pushed)
 770     BLOCK_COMMENT("call MacroAssembler::debug");
 771     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
 772     __ hlt();
 773     return start;
 774   }
 775 
 776 
 777   // Copy 64 bytes chunks
 778   //
 779   // Inputs:
 780   //   from        - source array address
 781   //   to_from     - destination array address - from
 782   //   qword_count - 8-bytes element count, negative
 783   //
 784   void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
 785     assert( UseSSE >= 2, "supported cpu only" );
 786     Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
 787 
 788     // Copy 64-byte chunks
 789     __ jmpb(L_copy_64_bytes);
 790     __ align(OptoLoopAlignment);
 791   __ BIND(L_copy_64_bytes_loop);
 792 
 793     if (UseUnalignedLoadStores) {
 794       if (UseAVX > 2) {
 795         __ evmovdqul(xmm0, Address(from, 0), Assembler::AVX_512bit);
 796         __ evmovdqul(Address(from, to_from, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
 797       } else if (UseAVX == 2) {
 798         __ vmovdqu(xmm0, Address(from,  0));
 799         __ vmovdqu(Address(from, to_from, Address::times_1,  0), xmm0);
 800         __ vmovdqu(xmm1, Address(from, 32));
 801         __ vmovdqu(Address(from, to_from, Address::times_1, 32), xmm1);
 802       } else {
 803         __ movdqu(xmm0, Address(from, 0));
 804         __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
 805         __ movdqu(xmm1, Address(from, 16));
 806         __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
 807         __ movdqu(xmm2, Address(from, 32));
 808         __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
 809         __ movdqu(xmm3, Address(from, 48));
 810         __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
 811       }
 812     } else {
 813       __ movq(xmm0, Address(from, 0));
 814       __ movq(Address(from, to_from, Address::times_1, 0), xmm0);
 815       __ movq(xmm1, Address(from, 8));
 816       __ movq(Address(from, to_from, Address::times_1, 8), xmm1);
 817       __ movq(xmm2, Address(from, 16));
 818       __ movq(Address(from, to_from, Address::times_1, 16), xmm2);
 819       __ movq(xmm3, Address(from, 24));
 820       __ movq(Address(from, to_from, Address::times_1, 24), xmm3);
 821       __ movq(xmm4, Address(from, 32));
 822       __ movq(Address(from, to_from, Address::times_1, 32), xmm4);
 823       __ movq(xmm5, Address(from, 40));
 824       __ movq(Address(from, to_from, Address::times_1, 40), xmm5);
 825       __ movq(xmm6, Address(from, 48));
 826       __ movq(Address(from, to_from, Address::times_1, 48), xmm6);
 827       __ movq(xmm7, Address(from, 56));
 828       __ movq(Address(from, to_from, Address::times_1, 56), xmm7);
 829     }
 830 
 831     __ addl(from, 64);
 832   __ BIND(L_copy_64_bytes);
 833     __ subl(qword_count, 8);
 834     __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
 835 
 836     if (UseUnalignedLoadStores && (UseAVX == 2)) {
 837       // clean upper bits of YMM registers
 838       __ vpxor(xmm0, xmm0);
 839       __ vpxor(xmm1, xmm1);
 840     }
 841     __ addl(qword_count, 8);
 842     __ jccb(Assembler::zero, L_exit);
 843     //
 844     // length is too short, just copy qwords
 845     //
 846   __ BIND(L_copy_8_bytes);
 847     __ movq(xmm0, Address(from, 0));
 848     __ movq(Address(from, to_from, Address::times_1), xmm0);
 849     __ addl(from, 8);
 850     __ decrement(qword_count);
 851     __ jcc(Assembler::greater, L_copy_8_bytes);
 852   __ BIND(L_exit);
 853   }
 854 
 855   address generate_disjoint_copy(BasicType t, bool aligned,
 856                                  Address::ScaleFactor sf,
 857                                  address* entry, const char *name,
 858                                  bool dest_uninitialized = false) {
 859     __ align(CodeEntryAlignment);
 860     StubCodeMark mark(this, "StubRoutines", name);
 861     address start = __ pc();
 862 
 863     Label L_0_count, L_exit, L_skip_align1, L_skip_align2, L_copy_byte;
 864     Label L_copy_2_bytes, L_copy_4_bytes, L_copy_64_bytes;
 865 
 866     int shift = Address::times_ptr - sf;
 867 
 868     const Register from     = rsi;  // source array address
 869     const Register to       = rdi;  // destination array address
 870     const Register count    = rcx;  // elements count
 871     const Register to_from  = to;   // (to - from)
 872     const Register saved_to = rdx;  // saved destination array address
 873 
 874     __ enter(); // required for proper stackwalking of RuntimeStub frame
 875     __ push(rsi);
 876     __ push(rdi);
 877     __ movptr(from , Address(rsp, 12+ 4));
 878     __ movptr(to   , Address(rsp, 12+ 8));
 879     __ movl(count, Address(rsp, 12+ 12));
 880 
 881     if (entry != NULL) {
 882       *entry = __ pc(); // Entry point from conjoint arraycopy stub.
 883       BLOCK_COMMENT("Entry:");
 884     }
 885 
 886     if (t == T_OBJECT) {
 887       __ testl(count, count);
 888       __ jcc(Assembler::zero, L_0_count);
 889     }
 890 
 891     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 892     if (dest_uninitialized) {
 893       decorators |= IS_DEST_UNINITIALIZED;
 894     }
 895     if (aligned) {
 896       decorators |= ARRAYCOPY_ALIGNED;
 897     }
 898 
 899     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 900     bs->arraycopy_prologue(_masm, decorators, t, from, to, count);
 901     {
 902       bool add_entry = (t != T_OBJECT && (!aligned || t == T_INT));
 903       // UnsafeCopyMemory page error: continue after ucm
 904       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
 905       __ subptr(to, from); // to --> to_from
 906       __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
 907       __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
 908       if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
 909         // align source address at 4 bytes address boundary
 910         if (t == T_BYTE) {
 911           // One byte misalignment happens only for byte arrays
 912           __ testl(from, 1);
 913           __ jccb(Assembler::zero, L_skip_align1);
 914           __ movb(rax, Address(from, 0));
 915           __ movb(Address(from, to_from, Address::times_1, 0), rax);
 916           __ increment(from);
 917           __ decrement(count);
 918         __ BIND(L_skip_align1);
 919         }
 920         // Two bytes misalignment happens only for byte and short (char) arrays
 921         __ testl(from, 2);
 922         __ jccb(Assembler::zero, L_skip_align2);
 923         __ movw(rax, Address(from, 0));
 924         __ movw(Address(from, to_from, Address::times_1, 0), rax);
 925         __ addptr(from, 2);
 926         __ subl(count, 1<<(shift-1));
 927       __ BIND(L_skip_align2);
 928       }
 929       if (!UseXMMForArrayCopy) {
 930         __ mov(rax, count);      // save 'count'
 931         __ shrl(count, shift); // bytes count
 932         __ addptr(to_from, from);// restore 'to'
 933         __ rep_mov();
 934         __ subptr(to_from, from);// restore 'to_from'
 935         __ mov(count, rax);      // restore 'count'
 936         __ jmpb(L_copy_2_bytes); // all dwords were copied
 937       } else {
 938         if (!UseUnalignedLoadStores) {
 939           // align to 8 bytes, we know we are 4 byte aligned to start
 940           __ testptr(from, 4);
 941           __ jccb(Assembler::zero, L_copy_64_bytes);
 942           __ movl(rax, Address(from, 0));
 943           __ movl(Address(from, to_from, Address::times_1, 0), rax);
 944           __ addptr(from, 4);
 945           __ subl(count, 1<<shift);
 946         }
 947       __ BIND(L_copy_64_bytes);
 948         __ mov(rax, count);
 949         __ shrl(rax, shift+1);  // 8 bytes chunk count
 950         //
 951         // Copy 8-byte chunks through XMM registers, 8 per iteration of the loop
 952         //
 953         xmm_copy_forward(from, to_from, rax);
 954       }
 955       // copy tailing dword
 956     __ BIND(L_copy_4_bytes);
 957       __ testl(count, 1<<shift);
 958       __ jccb(Assembler::zero, L_copy_2_bytes);
 959       __ movl(rax, Address(from, 0));
 960       __ movl(Address(from, to_from, Address::times_1, 0), rax);
 961       if (t == T_BYTE || t == T_SHORT) {
 962         __ addptr(from, 4);
 963       __ BIND(L_copy_2_bytes);
 964         // copy tailing word
 965         __ testl(count, 1<<(shift-1));
 966         __ jccb(Assembler::zero, L_copy_byte);
 967         __ movw(rax, Address(from, 0));
 968         __ movw(Address(from, to_from, Address::times_1, 0), rax);
 969         if (t == T_BYTE) {
 970           __ addptr(from, 2);
 971         __ BIND(L_copy_byte);
 972           // copy tailing byte
 973           __ testl(count, 1);
 974           __ jccb(Assembler::zero, L_exit);
 975           __ movb(rax, Address(from, 0));
 976           __ movb(Address(from, to_from, Address::times_1, 0), rax);
 977         __ BIND(L_exit);
 978         } else {
 979         __ BIND(L_copy_byte);
 980         }
 981       } else {
 982       __ BIND(L_copy_2_bytes);
 983       }
 984     }
 985 
 986     __ movl(count, Address(rsp, 12+12)); // reread 'count'
 987     bs->arraycopy_epilogue(_masm, decorators, t, from, to, count);
 988 
 989     if (t == T_OBJECT) {
 990     __ BIND(L_0_count);
 991     }
 992     inc_copy_counter_np(t);
 993     __ pop(rdi);
 994     __ pop(rsi);
 995     __ leave(); // required for proper stackwalking of RuntimeStub frame
 996     __ vzeroupper();
 997     __ xorptr(rax, rax); // return 0
 998     __ ret(0);
 999     return start;
1000   }
1001 
1002 
1003   address generate_fill(BasicType t, bool aligned, const char *name) {
1004     __ align(CodeEntryAlignment);
1005     StubCodeMark mark(this, "StubRoutines", name);
1006     address start = __ pc();
1007 
1008     BLOCK_COMMENT("Entry:");
1009 
1010     const Register to       = rdi;  // source array address
1011     const Register value    = rdx;  // value
1012     const Register count    = rsi;  // elements count
1013 
1014     __ enter(); // required for proper stackwalking of RuntimeStub frame
1015     __ push(rsi);
1016     __ push(rdi);
1017     __ movptr(to   , Address(rsp, 12+ 4));
1018     __ movl(value, Address(rsp, 12+ 8));
1019     __ movl(count, Address(rsp, 12+ 12));
1020 
1021     __ generate_fill(t, aligned, to, value, count, rax, xmm0);
1022 
1023     __ pop(rdi);
1024     __ pop(rsi);
1025     __ leave(); // required for proper stackwalking of RuntimeStub frame
1026     __ ret(0);
1027     return start;
1028   }
1029 
1030   address generate_conjoint_copy(BasicType t, bool aligned,
1031                                  Address::ScaleFactor sf,
1032                                  address nooverlap_target,
1033                                  address* entry, const char *name,
1034                                  bool dest_uninitialized = false) {
1035     __ align(CodeEntryAlignment);
1036     StubCodeMark mark(this, "StubRoutines", name);
1037     address start = __ pc();
1038 
1039     Label L_0_count, L_exit, L_skip_align1, L_skip_align2, L_copy_byte;
1040     Label L_copy_2_bytes, L_copy_4_bytes, L_copy_8_bytes, L_copy_8_bytes_loop;
1041 
1042     int shift = Address::times_ptr - sf;
1043 
1044     const Register src   = rax;  // source array address
1045     const Register dst   = rdx;  // destination array address
1046     const Register from  = rsi;  // source array address
1047     const Register to    = rdi;  // destination array address
1048     const Register count = rcx;  // elements count
1049     const Register end   = rax;  // array end address
1050 
1051     __ enter(); // required for proper stackwalking of RuntimeStub frame
1052     __ push(rsi);
1053     __ push(rdi);
1054     __ movptr(src  , Address(rsp, 12+ 4));   // from
1055     __ movptr(dst  , Address(rsp, 12+ 8));   // to
1056     __ movl2ptr(count, Address(rsp, 12+12)); // count
1057 
1058     if (entry != NULL) {
1059       *entry = __ pc(); // Entry point from generic arraycopy stub.
1060       BLOCK_COMMENT("Entry:");
1061     }
1062 
1063     // nooverlap_target expects arguments in rsi and rdi.
1064     __ mov(from, src);
1065     __ mov(to  , dst);
1066 
1067     // arrays overlap test: dispatch to disjoint stub if necessary.
1068     RuntimeAddress nooverlap(nooverlap_target);
1069     __ cmpptr(dst, src);
1070     __ lea(end, Address(src, count, sf, 0)); // src + count * elem_size
1071     __ jump_cc(Assembler::belowEqual, nooverlap);
1072     __ cmpptr(dst, end);
1073     __ jump_cc(Assembler::aboveEqual, nooverlap);
1074 
1075     if (t == T_OBJECT) {
1076       __ testl(count, count);
1077       __ jcc(Assembler::zero, L_0_count);
1078     }
1079 
1080     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1081     if (dest_uninitialized) {
1082       decorators |= IS_DEST_UNINITIALIZED;
1083     }
1084     if (aligned) {
1085       decorators |= ARRAYCOPY_ALIGNED;
1086     }
1087 
1088     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1089     bs->arraycopy_prologue(_masm, decorators, t, from, to, count);
1090 
1091     {
1092       bool add_entry = (t != T_OBJECT && (!aligned || t == T_INT));
1093       // UnsafeCopyMemory page error: continue after ucm
1094       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1095       // copy from high to low
1096       __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
1097       __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
1098       if (t == T_BYTE || t == T_SHORT) {
1099         // Align the end of destination array at 4 bytes address boundary
1100         __ lea(end, Address(dst, count, sf, 0));
1101         if (t == T_BYTE) {
1102           // One byte misalignment happens only for byte arrays
1103           __ testl(end, 1);
1104           __ jccb(Assembler::zero, L_skip_align1);
1105           __ decrement(count);
1106           __ movb(rdx, Address(from, count, sf, 0));
1107           __ movb(Address(to, count, sf, 0), rdx);
1108         __ BIND(L_skip_align1);
1109         }
1110         // Two bytes misalignment happens only for byte and short (char) arrays
1111         __ testl(end, 2);
1112         __ jccb(Assembler::zero, L_skip_align2);
1113         __ subptr(count, 1<<(shift-1));
1114         __ movw(rdx, Address(from, count, sf, 0));
1115         __ movw(Address(to, count, sf, 0), rdx);
1116       __ BIND(L_skip_align2);
1117         __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
1118         __ jcc(Assembler::below, L_copy_4_bytes);
1119       }
1120 
1121       if (!UseXMMForArrayCopy) {
1122         __ std();
1123         __ mov(rax, count); // Save 'count'
1124         __ mov(rdx, to);    // Save 'to'
1125         __ lea(rsi, Address(from, count, sf, -4));
1126         __ lea(rdi, Address(to  , count, sf, -4));
1127         __ shrptr(count, shift); // bytes count
1128         __ rep_mov();
1129         __ cld();
1130         __ mov(count, rax); // restore 'count'
1131         __ andl(count, (1<<shift)-1);      // mask the number of rest elements
1132         __ movptr(from, Address(rsp, 12+4)); // reread 'from'
1133         __ mov(to, rdx);   // restore 'to'
1134         __ jmpb(L_copy_2_bytes); // all dword were copied
1135       } else {
1136         // Align to 8 bytes the end of array. It is aligned to 4 bytes already.
1137         __ testptr(end, 4);
1138         __ jccb(Assembler::zero, L_copy_8_bytes);
1139         __ subl(count, 1<<shift);
1140         __ movl(rdx, Address(from, count, sf, 0));
1141         __ movl(Address(to, count, sf, 0), rdx);
1142         __ jmpb(L_copy_8_bytes);
1143 
1144         __ align(OptoLoopAlignment);
1145         // Move 8 bytes
1146       __ BIND(L_copy_8_bytes_loop);
1147         __ movq(xmm0, Address(from, count, sf, 0));
1148         __ movq(Address(to, count, sf, 0), xmm0);
1149       __ BIND(L_copy_8_bytes);
1150         __ subl(count, 2<<shift);
1151         __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
1152         __ addl(count, 2<<shift);
1153       }
1154     __ BIND(L_copy_4_bytes);
1155       // copy prefix qword
1156       __ testl(count, 1<<shift);
1157       __ jccb(Assembler::zero, L_copy_2_bytes);
1158       __ movl(rdx, Address(from, count, sf, -4));
1159       __ movl(Address(to, count, sf, -4), rdx);
1160 
1161       if (t == T_BYTE || t == T_SHORT) {
1162           __ subl(count, (1<<shift));
1163         __ BIND(L_copy_2_bytes);
1164           // copy prefix dword
1165           __ testl(count, 1<<(shift-1));
1166           __ jccb(Assembler::zero, L_copy_byte);
1167           __ movw(rdx, Address(from, count, sf, -2));
1168           __ movw(Address(to, count, sf, -2), rdx);
1169           if (t == T_BYTE) {
1170             __ subl(count, 1<<(shift-1));
1171           __ BIND(L_copy_byte);
1172             // copy prefix byte
1173             __ testl(count, 1);
1174             __ jccb(Assembler::zero, L_exit);
1175             __ movb(rdx, Address(from, 0));
1176             __ movb(Address(to, 0), rdx);
1177           __ BIND(L_exit);
1178           } else {
1179           __ BIND(L_copy_byte);
1180           }
1181       } else {
1182       __ BIND(L_copy_2_bytes);
1183       }
1184     }
1185 
1186     __ movl2ptr(count, Address(rsp, 12+12)); // reread count
1187     bs->arraycopy_epilogue(_masm, decorators, t, from, to, count);
1188 
1189     if (t == T_OBJECT) {
1190     __ BIND(L_0_count);
1191     }
1192     inc_copy_counter_np(t);
1193     __ pop(rdi);
1194     __ pop(rsi);
1195     __ leave(); // required for proper stackwalking of RuntimeStub frame
1196     __ xorptr(rax, rax); // return 0
1197     __ ret(0);
1198     return start;
1199   }
1200 
1201 
1202   address generate_disjoint_long_copy(address* entry, const char *name) {
1203     __ align(CodeEntryAlignment);
1204     StubCodeMark mark(this, "StubRoutines", name);
1205     address start = __ pc();
1206 
1207     Label L_copy_8_bytes, L_copy_8_bytes_loop;
1208     const Register from       = rax;  // source array address
1209     const Register to         = rdx;  // destination array address
1210     const Register count      = rcx;  // elements count
1211     const Register to_from    = rdx;  // (to - from)
1212 
1213     __ enter(); // required for proper stackwalking of RuntimeStub frame
1214     __ movptr(from , Address(rsp, 8+0));       // from
1215     __ movptr(to   , Address(rsp, 8+4));       // to
1216     __ movl2ptr(count, Address(rsp, 8+8));     // count
1217 
1218     *entry = __ pc(); // Entry point from conjoint arraycopy stub.
1219     BLOCK_COMMENT("Entry:");
1220 
1221     {
1222       // UnsafeCopyMemory page error: continue after ucm
1223       UnsafeCopyMemoryMark ucmm(this, true, true);
1224       __ subptr(to, from); // to --> to_from
1225       if (UseXMMForArrayCopy) {
1226         xmm_copy_forward(from, to_from, count);
1227       } else {
1228         __ jmpb(L_copy_8_bytes);
1229         __ align(OptoLoopAlignment);
1230       __ BIND(L_copy_8_bytes_loop);
1231         __ fild_d(Address(from, 0));
1232         __ fistp_d(Address(from, to_from, Address::times_1));
1233         __ addptr(from, 8);
1234       __ BIND(L_copy_8_bytes);
1235         __ decrement(count);
1236         __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
1237       }
1238     }
1239     inc_copy_counter_np(T_LONG);
1240     __ leave(); // required for proper stackwalking of RuntimeStub frame
1241     __ vzeroupper();
1242     __ xorptr(rax, rax); // return 0
1243     __ ret(0);
1244     return start;
1245   }
1246 
1247   address generate_conjoint_long_copy(address nooverlap_target,
1248                                       address* entry, const char *name) {
1249     __ align(CodeEntryAlignment);
1250     StubCodeMark mark(this, "StubRoutines", name);
1251     address start = __ pc();
1252 
1253     Label L_copy_8_bytes, L_copy_8_bytes_loop;
1254     const Register from       = rax;  // source array address
1255     const Register to         = rdx;  // destination array address
1256     const Register count      = rcx;  // elements count
1257     const Register end_from   = rax;  // source array end address
1258 
1259     __ enter(); // required for proper stackwalking of RuntimeStub frame
1260     __ movptr(from , Address(rsp, 8+0));       // from
1261     __ movptr(to   , Address(rsp, 8+4));       // to
1262     __ movl2ptr(count, Address(rsp, 8+8));     // count
1263 
1264     *entry = __ pc(); // Entry point from generic arraycopy stub.
1265     BLOCK_COMMENT("Entry:");
1266 
1267     // arrays overlap test
1268     __ cmpptr(to, from);
1269     RuntimeAddress nooverlap(nooverlap_target);
1270     __ jump_cc(Assembler::belowEqual, nooverlap);
1271     __ lea(end_from, Address(from, count, Address::times_8, 0));
1272     __ cmpptr(to, end_from);
1273     __ movptr(from, Address(rsp, 8));  // from
1274     __ jump_cc(Assembler::aboveEqual, nooverlap);
1275 
1276     {
1277       // UnsafeCopyMemory page error: continue after ucm
1278       UnsafeCopyMemoryMark ucmm(this, true, true);
1279 
1280       __ jmpb(L_copy_8_bytes);
1281 
1282       __ align(OptoLoopAlignment);
1283     __ BIND(L_copy_8_bytes_loop);
1284       if (UseXMMForArrayCopy) {
1285         __ movq(xmm0, Address(from, count, Address::times_8));
1286         __ movq(Address(to, count, Address::times_8), xmm0);
1287       } else {
1288         __ fild_d(Address(from, count, Address::times_8));
1289         __ fistp_d(Address(to, count, Address::times_8));
1290       }
1291     __ BIND(L_copy_8_bytes);
1292       __ decrement(count);
1293       __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
1294 
1295     }
1296     inc_copy_counter_np(T_LONG);
1297     __ leave(); // required for proper stackwalking of RuntimeStub frame
1298     __ xorptr(rax, rax); // return 0
1299     __ ret(0);
1300     return start;
1301   }
1302 
1303 
1304   // Helper for generating a dynamic type check.
1305   // The sub_klass must be one of {rbx, rdx, rsi}.
1306   // The temp is killed.
1307   void generate_type_check(Register sub_klass,
1308                            Address& super_check_offset_addr,
1309                            Address& super_klass_addr,
1310                            Register temp,
1311                            Label* L_success, Label* L_failure) {
1312     BLOCK_COMMENT("type_check:");
1313 
1314     Label L_fallthrough;
1315 #define LOCAL_JCC(assembler_con, label_ptr)                             \
1316     if (label_ptr != NULL)  __ jcc(assembler_con, *(label_ptr));        \
1317     else                    __ jcc(assembler_con, L_fallthrough) /*omit semi*/
1318 
1319     // The following is a strange variation of the fast path which requires
1320     // one less register, because needed values are on the argument stack.
1321     // __ check_klass_subtype_fast_path(sub_klass, *super_klass*, temp,
1322     //                                  L_success, L_failure, NULL);
1323     assert_different_registers(sub_klass, temp);
1324 
1325     int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1326 
1327     // if the pointers are equal, we are done (e.g., String[] elements)
1328     __ cmpptr(sub_klass, super_klass_addr);
1329     LOCAL_JCC(Assembler::equal, L_success);
1330 
1331     // check the supertype display:
1332     __ movl2ptr(temp, super_check_offset_addr);
1333     Address super_check_addr(sub_klass, temp, Address::times_1, 0);
1334     __ movptr(temp, super_check_addr); // load displayed supertype
1335     __ cmpptr(temp, super_klass_addr); // test the super type
1336     LOCAL_JCC(Assembler::equal, L_success);
1337 
1338     // if it was a primary super, we can just fail immediately
1339     __ cmpl(super_check_offset_addr, sc_offset);
1340     LOCAL_JCC(Assembler::notEqual, L_failure);
1341 
1342     // The repne_scan instruction uses fixed registers, which will get spilled.
1343     // We happen to know this works best when super_klass is in rax.
1344     Register super_klass = temp;
1345     __ movptr(super_klass, super_klass_addr);
1346     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg,
1347                                      L_success, L_failure);
1348 
1349     __ bind(L_fallthrough);
1350 
1351     if (L_success == NULL) { BLOCK_COMMENT("L_success:"); }
1352     if (L_failure == NULL) { BLOCK_COMMENT("L_failure:"); }
1353 
1354 #undef LOCAL_JCC
1355   }
1356 
1357   //
1358   //  Generate checkcasting array copy stub
1359   //
1360   //  Input:
1361   //    4(rsp)   - source array address
1362   //    8(rsp)   - destination array address
1363   //   12(rsp)   - element count, can be zero
1364   //   16(rsp)   - size_t ckoff (super_check_offset)
1365   //   20(rsp)   - oop ckval (super_klass)
1366   //
1367   //  Output:
1368   //    rax, ==  0  -  success
1369   //    rax, == -1^K - failure, where K is partial transfer count
1370   //
1371   address generate_checkcast_copy(const char *name, address* entry, bool dest_uninitialized = false) {
1372     __ align(CodeEntryAlignment);
1373     StubCodeMark mark(this, "StubRoutines", name);
1374     address start = __ pc();
1375 
1376     Label L_load_element, L_store_element, L_do_card_marks, L_done;
1377 
1378     // register use:
1379     //  rax, rdx, rcx -- loop control (end_from, end_to, count)
1380     //  rdi, rsi      -- element access (oop, klass)
1381     //  rbx,           -- temp
1382     const Register from       = rax;    // source array address
1383     const Register to         = rdx;    // destination array address
1384     const Register length     = rcx;    // elements count
1385     const Register elem       = rdi;    // each oop copied
1386     const Register elem_klass = rsi;    // each elem._klass (sub_klass)
1387     const Register temp       = rbx;    // lone remaining temp
1388 
1389     __ enter(); // required for proper stackwalking of RuntimeStub frame
1390 
1391     __ push(rsi);
1392     __ push(rdi);
1393     __ push(rbx);
1394 
1395     Address   from_arg(rsp, 16+ 4);     // from
1396     Address     to_arg(rsp, 16+ 8);     // to
1397     Address length_arg(rsp, 16+12);     // elements count
1398     Address  ckoff_arg(rsp, 16+16);     // super_check_offset
1399     Address  ckval_arg(rsp, 16+20);     // super_klass
1400 
1401     // Load up:
1402     __ movptr(from,     from_arg);
1403     __ movptr(to,         to_arg);
1404     __ movl2ptr(length, length_arg);
1405 
1406     if (entry != NULL) {
1407       *entry = __ pc(); // Entry point from generic arraycopy stub.
1408       BLOCK_COMMENT("Entry:");
1409     }
1410 
1411     //---------------------------------------------------------------
1412     // Assembler stub will be used for this call to arraycopy
1413     // if the two arrays are subtypes of Object[] but the
1414     // destination array type is not equal to or a supertype
1415     // of the source type.  Each element must be separately
1416     // checked.
1417 
1418     // Loop-invariant addresses.  They are exclusive end pointers.
1419     Address end_from_addr(from, length, Address::times_ptr, 0);
1420     Address   end_to_addr(to,   length, Address::times_ptr, 0);
1421 
1422     Register end_from = from;           // re-use
1423     Register end_to   = to;             // re-use
1424     Register count    = length;         // re-use
1425 
1426     // Loop-variant addresses.  They assume post-incremented count < 0.
1427     Address from_element_addr(end_from, count, Address::times_ptr, 0);
1428     Address   to_element_addr(end_to,   count, Address::times_ptr, 0);
1429     Address elem_klass_addr(elem, oopDesc::klass_offset_in_bytes());
1430 
1431     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
1432     if (dest_uninitialized) {
1433       decorators |= IS_DEST_UNINITIALIZED;
1434     }
1435 
1436     BasicType type = T_OBJECT;
1437     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1438     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1439 
1440     // Copy from low to high addresses, indexed from the end of each array.
1441     __ lea(end_from, end_from_addr);
1442     __ lea(end_to,   end_to_addr);
1443     assert(length == count, "");        // else fix next line:
1444     __ negptr(count);                   // negate and test the length
1445     __ jccb(Assembler::notZero, L_load_element);
1446 
1447     // Empty array:  Nothing to do.
1448     __ xorptr(rax, rax);                  // return 0 on (trivial) success
1449     __ jmp(L_done);
1450 
1451     // ======== begin loop ========
1452     // (Loop is rotated; its entry is L_load_element.)
1453     // Loop control:
1454     //   for (count = -count; count != 0; count++)
1455     // Base pointers src, dst are biased by 8*count,to last element.
1456     __ align(OptoLoopAlignment);
1457 
1458     __ BIND(L_store_element);
1459     __ movptr(to_element_addr, elem);     // store the oop
1460     __ increment(count);                // increment the count toward zero
1461     __ jccb(Assembler::zero, L_do_card_marks);
1462 
1463     // ======== loop entry is here ========
1464     __ BIND(L_load_element);
1465     __ movptr(elem, from_element_addr);   // load the oop
1466     __ testptr(elem, elem);
1467     __ jccb(Assembler::zero, L_store_element);
1468 
1469     // (Could do a trick here:  Remember last successful non-null
1470     // element stored and make a quick oop equality check on it.)
1471 
1472     __ movptr(elem_klass, elem_klass_addr); // query the object klass
1473     generate_type_check(elem_klass, ckoff_arg, ckval_arg, temp,
1474                         &L_store_element, NULL);
1475     // (On fall-through, we have failed the element type check.)
1476     // ======== end loop ========
1477 
1478     // It was a real error; we must depend on the caller to finish the job.
1479     // Register "count" = -1 * number of *remaining* oops, length_arg = *total* oops.
1480     // Emit GC store barriers for the oops we have copied (length_arg + count),
1481     // and report their number to the caller.
1482     assert_different_registers(to, count, rax);
1483     Label L_post_barrier;
1484     __ addl(count, length_arg);         // transfers = (length - remaining)
1485     __ movl2ptr(rax, count);            // save the value
1486     __ notptr(rax);                     // report (-1^K) to caller (does not affect flags)
1487     __ jccb(Assembler::notZero, L_post_barrier);
1488     __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
1489 
1490     // Come here on success only.
1491     __ BIND(L_do_card_marks);
1492     __ xorptr(rax, rax);                // return 0 on success
1493     __ movl2ptr(count, length_arg);
1494 
1495     __ BIND(L_post_barrier);
1496     __ movptr(to, to_arg);              // reload
1497     bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
1498 
1499     // Common exit point (success or failure).
1500     __ BIND(L_done);
1501     __ pop(rbx);
1502     __ pop(rdi);
1503     __ pop(rsi);
1504     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1505     __ leave(); // required for proper stackwalking of RuntimeStub frame
1506     __ ret(0);
1507 
1508     return start;
1509   }
1510 
1511   //
1512   //  Generate 'unsafe' array copy stub
1513   //  Though just as safe as the other stubs, it takes an unscaled
1514   //  size_t argument instead of an element count.
1515   //
1516   //  Input:
1517   //    4(rsp)   - source array address
1518   //    8(rsp)   - destination array address
1519   //   12(rsp)   - byte count, can be zero
1520   //
1521   //  Output:
1522   //    rax, ==  0  -  success
1523   //    rax, == -1  -  need to call System.arraycopy
1524   //
1525   // Examines the alignment of the operands and dispatches
1526   // to a long, int, short, or byte copy loop.
1527   //
1528   address generate_unsafe_copy(const char *name,
1529                                address byte_copy_entry,
1530                                address short_copy_entry,
1531                                address int_copy_entry,
1532                                address long_copy_entry) {
1533 
1534     Label L_long_aligned, L_int_aligned, L_short_aligned;
1535 
1536     __ align(CodeEntryAlignment);
1537     StubCodeMark mark(this, "StubRoutines", name);
1538     address start = __ pc();
1539 
1540     const Register from       = rax;  // source array address
1541     const Register to         = rdx;  // destination array address
1542     const Register count      = rcx;  // elements count
1543 
1544     __ enter(); // required for proper stackwalking of RuntimeStub frame
1545     __ push(rsi);
1546     __ push(rdi);
1547     Address  from_arg(rsp, 12+ 4);      // from
1548     Address    to_arg(rsp, 12+ 8);      // to
1549     Address count_arg(rsp, 12+12);      // byte count
1550 
1551     // Load up:
1552     __ movptr(from ,  from_arg);
1553     __ movptr(to   ,    to_arg);
1554     __ movl2ptr(count, count_arg);
1555 
1556     // bump this on entry, not on exit:
1557     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1558 
1559     const Register bits = rsi;
1560     __ mov(bits, from);
1561     __ orptr(bits, to);
1562     __ orptr(bits, count);
1563 
1564     __ testl(bits, BytesPerLong-1);
1565     __ jccb(Assembler::zero, L_long_aligned);
1566 
1567     __ testl(bits, BytesPerInt-1);
1568     __ jccb(Assembler::zero, L_int_aligned);
1569 
1570     __ testl(bits, BytesPerShort-1);
1571     __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
1572 
1573     __ BIND(L_short_aligned);
1574     __ shrptr(count, LogBytesPerShort); // size => short_count
1575     __ movl(count_arg, count);          // update 'count'
1576     __ jump(RuntimeAddress(short_copy_entry));
1577 
1578     __ BIND(L_int_aligned);
1579     __ shrptr(count, LogBytesPerInt); // size => int_count
1580     __ movl(count_arg, count);          // update 'count'
1581     __ jump(RuntimeAddress(int_copy_entry));
1582 
1583     __ BIND(L_long_aligned);
1584     __ shrptr(count, LogBytesPerLong); // size => qword_count
1585     __ movl(count_arg, count);          // update 'count'
1586     __ pop(rdi); // Do pops here since jlong_arraycopy stub does not do it.
1587     __ pop(rsi);
1588     __ jump(RuntimeAddress(long_copy_entry));
1589 
1590     return start;
1591   }
1592 
1593 
1594   // Perform range checks on the proposed arraycopy.
1595   // Smashes src_pos and dst_pos.  (Uses them up for temps.)
1596   void arraycopy_range_checks(Register src,
1597                               Register src_pos,
1598                               Register dst,
1599                               Register dst_pos,
1600                               Address& length,
1601                               Label& L_failed) {
1602     BLOCK_COMMENT("arraycopy_range_checks:");
1603     const Register src_end = src_pos;   // source array end position
1604     const Register dst_end = dst_pos;   // destination array end position
1605     __ addl(src_end, length); // src_pos + length
1606     __ addl(dst_end, length); // dst_pos + length
1607 
1608     //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
1609     __ cmpl(src_end, Address(src, arrayOopDesc::length_offset_in_bytes()));
1610     __ jcc(Assembler::above, L_failed);
1611 
1612     //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
1613     __ cmpl(dst_end, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1614     __ jcc(Assembler::above, L_failed);
1615 
1616     BLOCK_COMMENT("arraycopy_range_checks done");
1617   }
1618 
1619 
1620   //
1621   //  Generate generic array copy stubs
1622   //
1623   //  Input:
1624   //     4(rsp)    -  src oop
1625   //     8(rsp)    -  src_pos
1626   //    12(rsp)    -  dst oop
1627   //    16(rsp)    -  dst_pos
1628   //    20(rsp)    -  element count
1629   //
1630   //  Output:
1631   //    rax, ==  0  -  success
1632   //    rax, == -1^K - failure, where K is partial transfer count
1633   //
1634   address generate_generic_copy(const char *name,
1635                                 address entry_jbyte_arraycopy,
1636                                 address entry_jshort_arraycopy,
1637                                 address entry_jint_arraycopy,
1638                                 address entry_oop_arraycopy,
1639                                 address entry_jlong_arraycopy,
1640                                 address entry_checkcast_arraycopy) {
1641     Label L_failed, L_failed_0, L_objArray;
1642 
1643     { int modulus = CodeEntryAlignment;
1644       int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
1645       int advance = target - (__ offset() % modulus);
1646       if (advance < 0)  advance += modulus;
1647       if (advance > 0)  __ nop(advance);
1648     }
1649     StubCodeMark mark(this, "StubRoutines", name);
1650 
1651     // Short-hop target to L_failed.  Makes for denser prologue code.
1652     __ BIND(L_failed_0);
1653     __ jmp(L_failed);
1654     assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
1655 
1656     __ align(CodeEntryAlignment);
1657     address start = __ pc();
1658 
1659     __ enter(); // required for proper stackwalking of RuntimeStub frame
1660     __ push(rsi);
1661     __ push(rdi);
1662 
1663     // bump this on entry, not on exit:
1664     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1665 
1666     // Input values
1667     Address SRC     (rsp, 12+ 4);
1668     Address SRC_POS (rsp, 12+ 8);
1669     Address DST     (rsp, 12+12);
1670     Address DST_POS (rsp, 12+16);
1671     Address LENGTH  (rsp, 12+20);
1672 
1673     //-----------------------------------------------------------------------
1674     // Assembler stub will be used for this call to arraycopy
1675     // if the following conditions are met:
1676     //
1677     // (1) src and dst must not be null.
1678     // (2) src_pos must not be negative.
1679     // (3) dst_pos must not be negative.
1680     // (4) length  must not be negative.
1681     // (5) src klass and dst klass should be the same and not NULL.
1682     // (6) src and dst should be arrays.
1683     // (7) src_pos + length must not exceed length of src.
1684     // (8) dst_pos + length must not exceed length of dst.
1685     //
1686 
1687     const Register src     = rax;       // source array oop
1688     const Register src_pos = rsi;
1689     const Register dst     = rdx;       // destination array oop
1690     const Register dst_pos = rdi;
1691     const Register length  = rcx;       // transfer count
1692 
1693     //  if (src == NULL) return -1;
1694     __ movptr(src, SRC);      // src oop
1695     __ testptr(src, src);
1696     __ jccb(Assembler::zero, L_failed_0);
1697 
1698     //  if (src_pos < 0) return -1;
1699     __ movl2ptr(src_pos, SRC_POS);  // src_pos
1700     __ testl(src_pos, src_pos);
1701     __ jccb(Assembler::negative, L_failed_0);
1702 
1703     //  if (dst == NULL) return -1;
1704     __ movptr(dst, DST);      // dst oop
1705     __ testptr(dst, dst);
1706     __ jccb(Assembler::zero, L_failed_0);
1707 
1708     //  if (dst_pos < 0) return -1;
1709     __ movl2ptr(dst_pos, DST_POS);  // dst_pos
1710     __ testl(dst_pos, dst_pos);
1711     __ jccb(Assembler::negative, L_failed_0);
1712 
1713     //  if (length < 0) return -1;
1714     __ movl2ptr(length, LENGTH);   // length
1715     __ testl(length, length);
1716     __ jccb(Assembler::negative, L_failed_0);
1717 
1718     //  if (src->klass() == NULL) return -1;
1719     Address src_klass_addr(src, oopDesc::klass_offset_in_bytes());
1720     Address dst_klass_addr(dst, oopDesc::klass_offset_in_bytes());
1721     const Register rcx_src_klass = rcx;    // array klass
1722     __ movptr(rcx_src_klass, Address(src, oopDesc::klass_offset_in_bytes()));
1723 
1724 #ifdef ASSERT
1725     //  assert(src->klass() != NULL);
1726     BLOCK_COMMENT("assert klasses not null");
1727     { Label L1, L2;
1728       __ testptr(rcx_src_klass, rcx_src_klass);
1729       __ jccb(Assembler::notZero, L2);   // it is broken if klass is NULL
1730       __ bind(L1);
1731       __ stop("broken null klass");
1732       __ bind(L2);
1733       __ cmpptr(dst_klass_addr, (int32_t)NULL_WORD);
1734       __ jccb(Assembler::equal, L1);      // this would be broken also
1735       BLOCK_COMMENT("assert done");
1736     }
1737 #endif //ASSERT
1738 
1739     // Load layout helper (32-bits)
1740     //
1741     //  |array_tag|     | header_size | element_type |     |log2_element_size|
1742     // 32        30    24            16              8     2                 0
1743     //
1744     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
1745     //
1746 
1747     int lh_offset = in_bytes(Klass::layout_helper_offset());
1748     Address src_klass_lh_addr(rcx_src_klass, lh_offset);
1749 
1750     // Handle objArrays completely differently...
1751     jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1752     __ cmpl(src_klass_lh_addr, objArray_lh);
1753     __ jcc(Assembler::equal, L_objArray);
1754 
1755     //  if (src->klass() != dst->klass()) return -1;
1756     __ cmpptr(rcx_src_klass, dst_klass_addr);
1757     __ jccb(Assembler::notEqual, L_failed_0);
1758 
1759     const Register rcx_lh = rcx;  // layout helper
1760     assert(rcx_lh == rcx_src_klass, "known alias");
1761     __ movl(rcx_lh, src_klass_lh_addr);
1762 
1763     //  if (!src->is_Array()) return -1;
1764     __ cmpl(rcx_lh, Klass::_lh_neutral_value);
1765     __ jcc(Assembler::greaterEqual, L_failed_0); // signed cmp
1766 
1767     // At this point, it is known to be a typeArray (array_tag 0x3).
1768 #ifdef ASSERT
1769     { Label L;
1770       __ cmpl(rcx_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
1771       __ jcc(Assembler::greaterEqual, L); // signed cmp
1772       __ stop("must be a primitive array");
1773       __ bind(L);
1774     }
1775 #endif
1776 
1777     assert_different_registers(src, src_pos, dst, dst_pos, rcx_lh);
1778     arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
1779 
1780     // TypeArrayKlass
1781     //
1782     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
1783     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
1784     //
1785     const Register rsi_offset = rsi; // array offset
1786     const Register src_array  = src; // src array offset
1787     const Register dst_array  = dst; // dst array offset
1788     const Register rdi_elsize = rdi; // log2 element size
1789 
1790     __ mov(rsi_offset, rcx_lh);
1791     __ shrptr(rsi_offset, Klass::_lh_header_size_shift);
1792     __ andptr(rsi_offset, Klass::_lh_header_size_mask);   // array_offset
1793     __ addptr(src_array, rsi_offset);  // src array offset
1794     __ addptr(dst_array, rsi_offset);  // dst array offset
1795     __ andptr(rcx_lh, Klass::_lh_log2_element_size_mask); // log2 elsize
1796 
1797     // next registers should be set before the jump to corresponding stub
1798     const Register from       = src; // source array address
1799     const Register to         = dst; // destination array address
1800     const Register count      = rcx; // elements count
1801     // some of them should be duplicated on stack
1802 #define FROM   Address(rsp, 12+ 4)
1803 #define TO     Address(rsp, 12+ 8)   // Not used now
1804 #define COUNT  Address(rsp, 12+12)   // Only for oop arraycopy
1805 
1806     BLOCK_COMMENT("scale indexes to element size");
1807     __ movl2ptr(rsi, SRC_POS);  // src_pos
1808     __ shlptr(rsi);             // src_pos << rcx (log2 elsize)
1809     assert(src_array == from, "");
1810     __ addptr(from, rsi);       // from = src_array + SRC_POS << log2 elsize
1811     __ movl2ptr(rdi, DST_POS);  // dst_pos
1812     __ shlptr(rdi);             // dst_pos << rcx (log2 elsize)
1813     assert(dst_array == to, "");
1814     __ addptr(to,  rdi);        // to   = dst_array + DST_POS << log2 elsize
1815     __ movptr(FROM, from);      // src_addr
1816     __ mov(rdi_elsize, rcx_lh); // log2 elsize
1817     __ movl2ptr(count, LENGTH); // elements count
1818 
1819     BLOCK_COMMENT("choose copy loop based on element size");
1820     __ cmpl(rdi_elsize, 0);
1821 
1822     __ jump_cc(Assembler::equal, RuntimeAddress(entry_jbyte_arraycopy));
1823     __ cmpl(rdi_elsize, LogBytesPerShort);
1824     __ jump_cc(Assembler::equal, RuntimeAddress(entry_jshort_arraycopy));
1825     __ cmpl(rdi_elsize, LogBytesPerInt);
1826     __ jump_cc(Assembler::equal, RuntimeAddress(entry_jint_arraycopy));
1827 #ifdef ASSERT
1828     __ cmpl(rdi_elsize, LogBytesPerLong);
1829     __ jccb(Assembler::notEqual, L_failed);
1830 #endif
1831     __ pop(rdi); // Do pops here since jlong_arraycopy stub does not do it.
1832     __ pop(rsi);
1833     __ jump(RuntimeAddress(entry_jlong_arraycopy));
1834 
1835   __ BIND(L_failed);
1836     __ xorptr(rax, rax);
1837     __ notptr(rax); // return -1
1838     __ pop(rdi);
1839     __ pop(rsi);
1840     __ leave(); // required for proper stackwalking of RuntimeStub frame
1841     __ ret(0);
1842 
1843     // ObjArrayKlass
1844   __ BIND(L_objArray);
1845     // live at this point:  rcx_src_klass, src[_pos], dst[_pos]
1846 
1847     Label L_plain_copy, L_checkcast_copy;
1848     //  test array classes for subtyping
1849     __ cmpptr(rcx_src_klass, dst_klass_addr); // usual case is exact equality
1850     __ jccb(Assembler::notEqual, L_checkcast_copy);
1851 
1852     // Identically typed arrays can be copied without element-wise checks.
1853     assert_different_registers(src, src_pos, dst, dst_pos, rcx_src_klass);
1854     arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
1855 
1856   __ BIND(L_plain_copy);
1857     __ movl2ptr(count, LENGTH); // elements count
1858     __ movl2ptr(src_pos, SRC_POS);  // reload src_pos
1859     __ lea(from, Address(src, src_pos, Address::times_ptr,
1860                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
1861     __ movl2ptr(dst_pos, DST_POS);  // reload dst_pos
1862     __ lea(to,   Address(dst, dst_pos, Address::times_ptr,
1863                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
1864     __ movptr(FROM,  from);   // src_addr
1865     __ movptr(TO,    to);     // dst_addr
1866     __ movl(COUNT, count);  // count
1867     __ jump(RuntimeAddress(entry_oop_arraycopy));
1868 
1869   __ BIND(L_checkcast_copy);
1870     // live at this point:  rcx_src_klass, dst[_pos], src[_pos]
1871     {
1872       // Handy offsets:
1873       int  ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
1874       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1875 
1876       Register rsi_dst_klass = rsi;
1877       Register rdi_temp      = rdi;
1878       assert(rsi_dst_klass == src_pos, "expected alias w/ src_pos");
1879       assert(rdi_temp      == dst_pos, "expected alias w/ dst_pos");
1880       Address dst_klass_lh_addr(rsi_dst_klass, lh_offset);
1881 
1882       // Before looking at dst.length, make sure dst is also an objArray.
1883       __ movptr(rsi_dst_klass, dst_klass_addr);
1884       __ cmpl(dst_klass_lh_addr, objArray_lh);
1885       __ jccb(Assembler::notEqual, L_failed);
1886 
1887       // It is safe to examine both src.length and dst.length.
1888       __ movl2ptr(src_pos, SRC_POS);        // reload rsi
1889       arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
1890       // (Now src_pos and dst_pos are killed, but not src and dst.)
1891 
1892       // We'll need this temp (don't forget to pop it after the type check).
1893       __ push(rbx);
1894       Register rbx_src_klass = rbx;
1895 
1896       __ mov(rbx_src_klass, rcx_src_klass); // spill away from rcx
1897       __ movptr(rsi_dst_klass, dst_klass_addr);
1898       Address super_check_offset_addr(rsi_dst_klass, sco_offset);
1899       Label L_fail_array_check;
1900       generate_type_check(rbx_src_klass,
1901                           super_check_offset_addr, dst_klass_addr,
1902                           rdi_temp, NULL, &L_fail_array_check);
1903       // (On fall-through, we have passed the array type check.)
1904       __ pop(rbx);
1905       __ jmp(L_plain_copy);
1906 
1907       __ BIND(L_fail_array_check);
1908       // Reshuffle arguments so we can call checkcast_arraycopy:
1909 
1910       // match initial saves for checkcast_arraycopy
1911       // push(rsi);    // already done; see above
1912       // push(rdi);    // already done; see above
1913       // push(rbx);    // already done; see above
1914 
1915       // Marshal outgoing arguments now, freeing registers.
1916       Address   from_arg(rsp, 16+ 4);   // from
1917       Address     to_arg(rsp, 16+ 8);   // to
1918       Address length_arg(rsp, 16+12);   // elements count
1919       Address  ckoff_arg(rsp, 16+16);   // super_check_offset
1920       Address  ckval_arg(rsp, 16+20);   // super_klass
1921 
1922       Address SRC_POS_arg(rsp, 16+ 8);
1923       Address DST_POS_arg(rsp, 16+16);
1924       Address  LENGTH_arg(rsp, 16+20);
1925       // push rbx, changed the incoming offsets (why not just use rbp,??)
1926       // assert(SRC_POS_arg.disp() == SRC_POS.disp() + 4, "");
1927 
1928       __ movptr(rbx, Address(rsi_dst_klass, ek_offset));
1929       __ movl2ptr(length, LENGTH_arg);    // reload elements count
1930       __ movl2ptr(src_pos, SRC_POS_arg);  // reload src_pos
1931       __ movl2ptr(dst_pos, DST_POS_arg);  // reload dst_pos
1932 
1933       __ movptr(ckval_arg, rbx);          // destination element type
1934       __ movl(rbx, Address(rbx, sco_offset));
1935       __ movl(ckoff_arg, rbx);          // corresponding class check offset
1936 
1937       __ movl(length_arg, length);      // outgoing length argument
1938 
1939       __ lea(from, Address(src, src_pos, Address::times_ptr,
1940                             arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
1941       __ movptr(from_arg, from);
1942 
1943       __ lea(to, Address(dst, dst_pos, Address::times_ptr,
1944                           arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
1945       __ movptr(to_arg, to);
1946       __ jump(RuntimeAddress(entry_checkcast_arraycopy));
1947     }
1948 
1949     return start;
1950   }
1951 
1952   void generate_arraycopy_stubs() {
1953     address entry;
1954     address entry_jbyte_arraycopy;
1955     address entry_jshort_arraycopy;
1956     address entry_jint_arraycopy;
1957     address entry_oop_arraycopy;
1958     address entry_jlong_arraycopy;
1959     address entry_checkcast_arraycopy;
1960 
1961     StubRoutines::_arrayof_jbyte_disjoint_arraycopy =
1962         generate_disjoint_copy(T_BYTE,  true, Address::times_1, &entry,
1963                                "arrayof_jbyte_disjoint_arraycopy");
1964     StubRoutines::_arrayof_jbyte_arraycopy =
1965         generate_conjoint_copy(T_BYTE,  true, Address::times_1,  entry,
1966                                NULL, "arrayof_jbyte_arraycopy");
1967     StubRoutines::_jbyte_disjoint_arraycopy =
1968         generate_disjoint_copy(T_BYTE, false, Address::times_1, &entry,
1969                                "jbyte_disjoint_arraycopy");
1970     StubRoutines::_jbyte_arraycopy =
1971         generate_conjoint_copy(T_BYTE, false, Address::times_1,  entry,
1972                                &entry_jbyte_arraycopy, "jbyte_arraycopy");
1973 
1974     StubRoutines::_arrayof_jshort_disjoint_arraycopy =
1975         generate_disjoint_copy(T_SHORT,  true, Address::times_2, &entry,
1976                                "arrayof_jshort_disjoint_arraycopy");
1977     StubRoutines::_arrayof_jshort_arraycopy =
1978         generate_conjoint_copy(T_SHORT,  true, Address::times_2,  entry,
1979                                NULL, "arrayof_jshort_arraycopy");
1980     StubRoutines::_jshort_disjoint_arraycopy =
1981         generate_disjoint_copy(T_SHORT, false, Address::times_2, &entry,
1982                                "jshort_disjoint_arraycopy");
1983     StubRoutines::_jshort_arraycopy =
1984         generate_conjoint_copy(T_SHORT, false, Address::times_2,  entry,
1985                                &entry_jshort_arraycopy, "jshort_arraycopy");
1986 
1987     // Next arrays are always aligned on 4 bytes at least.
1988     StubRoutines::_jint_disjoint_arraycopy =
1989         generate_disjoint_copy(T_INT, true, Address::times_4, &entry,
1990                                "jint_disjoint_arraycopy");
1991     StubRoutines::_jint_arraycopy =
1992         generate_conjoint_copy(T_INT, true, Address::times_4,  entry,
1993                                &entry_jint_arraycopy, "jint_arraycopy");
1994 
1995     StubRoutines::_oop_disjoint_arraycopy =
1996         generate_disjoint_copy(T_OBJECT, true, Address::times_ptr, &entry,
1997                                "oop_disjoint_arraycopy");
1998     StubRoutines::_oop_arraycopy =
1999         generate_conjoint_copy(T_OBJECT, true, Address::times_ptr,  entry,
2000                                &entry_oop_arraycopy, "oop_arraycopy");
2001 
2002     StubRoutines::_oop_disjoint_arraycopy_uninit =
2003         generate_disjoint_copy(T_OBJECT, true, Address::times_ptr, &entry,
2004                                "oop_disjoint_arraycopy_uninit",
2005                                /*dest_uninitialized*/true);
2006     StubRoutines::_oop_arraycopy_uninit =
2007         generate_conjoint_copy(T_OBJECT, true, Address::times_ptr,  entry,
2008                                NULL, "oop_arraycopy_uninit",
2009                                /*dest_uninitialized*/true);
2010 
2011     StubRoutines::_jlong_disjoint_arraycopy =
2012         generate_disjoint_long_copy(&entry, "jlong_disjoint_arraycopy");
2013     StubRoutines::_jlong_arraycopy =
2014         generate_conjoint_long_copy(entry, &entry_jlong_arraycopy,
2015                                     "jlong_arraycopy");
2016 
2017     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2018     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2019     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2020     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2021     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2022     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2023 
2024     StubRoutines::_arrayof_jint_disjoint_arraycopy       = StubRoutines::_jint_disjoint_arraycopy;
2025     StubRoutines::_arrayof_oop_disjoint_arraycopy        = StubRoutines::_oop_disjoint_arraycopy;
2026     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit;
2027     StubRoutines::_arrayof_jlong_disjoint_arraycopy      = StubRoutines::_jlong_disjoint_arraycopy;
2028 
2029     StubRoutines::_arrayof_jint_arraycopy       = StubRoutines::_jint_arraycopy;
2030     StubRoutines::_arrayof_oop_arraycopy        = StubRoutines::_oop_arraycopy;
2031     StubRoutines::_arrayof_oop_arraycopy_uninit = StubRoutines::_oop_arraycopy_uninit;
2032     StubRoutines::_arrayof_jlong_arraycopy      = StubRoutines::_jlong_arraycopy;
2033 
2034     StubRoutines::_checkcast_arraycopy =
2035         generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2036     StubRoutines::_checkcast_arraycopy_uninit =
2037         generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, /*dest_uninitialized*/true);
2038 
2039     StubRoutines::_unsafe_arraycopy =
2040         generate_unsafe_copy("unsafe_arraycopy",
2041                                entry_jbyte_arraycopy,
2042                                entry_jshort_arraycopy,
2043                                entry_jint_arraycopy,
2044                                entry_jlong_arraycopy);
2045 
2046     StubRoutines::_generic_arraycopy =
2047         generate_generic_copy("generic_arraycopy",
2048                                entry_jbyte_arraycopy,
2049                                entry_jshort_arraycopy,
2050                                entry_jint_arraycopy,
2051                                entry_oop_arraycopy,
2052                                entry_jlong_arraycopy,
2053                                entry_checkcast_arraycopy);
2054   }
2055 
2056   // AES intrinsic stubs
2057   enum {AESBlockSize = 16};
2058 
2059   address generate_key_shuffle_mask() {
2060     __ align(16);
2061     StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
2062     address start = __ pc();
2063     __ emit_data(0x00010203, relocInfo::none, 0 );
2064     __ emit_data(0x04050607, relocInfo::none, 0 );
2065     __ emit_data(0x08090a0b, relocInfo::none, 0 );
2066     __ emit_data(0x0c0d0e0f, relocInfo::none, 0 );
2067     return start;
2068   }
2069 
2070   address generate_counter_shuffle_mask() {
2071     __ align(16);
2072     StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
2073     address start = __ pc();
2074     __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
2075     __ emit_data(0x08090a0b, relocInfo::none, 0);
2076     __ emit_data(0x04050607, relocInfo::none, 0);
2077     __ emit_data(0x00010203, relocInfo::none, 0);
2078     return start;
2079   }
2080 
2081   // Utility routine for loading a 128-bit key word in little endian format
2082   // can optionally specify that the shuffle mask is already in an xmmregister
2083   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2084     __ movdqu(xmmdst, Address(key, offset));
2085     if (xmm_shuf_mask != NULL) {
2086       __ pshufb(xmmdst, xmm_shuf_mask);
2087     } else {
2088       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2089     }
2090   }
2091 
2092   // aesenc using specified key+offset
2093   // can optionally specify that the shuffle mask is already in an xmmregister
2094   void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2095     load_key(xmmtmp, key, offset, xmm_shuf_mask);
2096     __ aesenc(xmmdst, xmmtmp);
2097   }
2098 
2099   // aesdec using specified key+offset
2100   // can optionally specify that the shuffle mask is already in an xmmregister
2101   void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2102     load_key(xmmtmp, key, offset, xmm_shuf_mask);
2103     __ aesdec(xmmdst, xmmtmp);
2104   }
2105 
2106   // Utility routine for increase 128bit counter (iv in CTR mode)
2107   //  XMM_128bit,  D3, D2, D1, D0
2108   void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
2109     __ pextrd(reg, xmmdst, 0x0);
2110     __ addl(reg, inc_delta);
2111     __ pinsrd(xmmdst, reg, 0x0);
2112     __ jcc(Assembler::carryClear, next_block); // jump if no carry
2113 
2114     __ pextrd(reg, xmmdst, 0x01); // Carry-> D1
2115     __ addl(reg, 0x01);
2116     __ pinsrd(xmmdst, reg, 0x01);
2117     __ jcc(Assembler::carryClear, next_block); // jump if no carry
2118 
2119     __ pextrd(reg, xmmdst, 0x02); // Carry-> D2
2120     __ addl(reg, 0x01);
2121     __ pinsrd(xmmdst, reg, 0x02);
2122     __ jcc(Assembler::carryClear, next_block); // jump if no carry
2123 
2124     __ pextrd(reg, xmmdst, 0x03); // Carry -> D3
2125     __ addl(reg, 0x01);
2126     __ pinsrd(xmmdst, reg, 0x03);
2127 
2128     __ BIND(next_block);          // next instruction
2129   }
2130 
2131 
2132   // Arguments:
2133   //
2134   // Inputs:
2135   //   c_rarg0   - source byte array address
2136   //   c_rarg1   - destination byte array address
2137   //   c_rarg2   - K (key) in little endian int array
2138   //
2139   address generate_aescrypt_encryptBlock() {
2140     assert(UseAES, "need AES instructions and misaligned SSE support");
2141     __ align(CodeEntryAlignment);
2142     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2143     Label L_doLast;
2144     address start = __ pc();
2145 
2146     const Register from        = rdx;      // source array address
2147     const Register to          = rdx;      // destination array address
2148     const Register key         = rcx;      // key array address
2149     const Register keylen      = rax;
2150     const Address  from_param(rbp, 8+0);
2151     const Address  to_param  (rbp, 8+4);
2152     const Address  key_param (rbp, 8+8);
2153 
2154     const XMMRegister xmm_result = xmm0;
2155     const XMMRegister xmm_key_shuf_mask = xmm1;
2156     const XMMRegister xmm_temp1  = xmm2;
2157     const XMMRegister xmm_temp2  = xmm3;
2158     const XMMRegister xmm_temp3  = xmm4;
2159     const XMMRegister xmm_temp4  = xmm5;
2160 
2161     __ enter();   // required for proper stackwalking of RuntimeStub frame
2162 
2163     __ movptr(from, from_param);
2164     __ movptr(key, key_param);
2165 
2166     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
2167     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2168 
2169     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2170     __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
2171     __ movptr(to, to_param);
2172 
2173     // For encryption, the java expanded key ordering is just what we need
2174 
2175     load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
2176     __ pxor(xmm_result, xmm_temp1);
2177 
2178     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
2179     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
2180     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
2181     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
2182 
2183     __ aesenc(xmm_result, xmm_temp1);
2184     __ aesenc(xmm_result, xmm_temp2);
2185     __ aesenc(xmm_result, xmm_temp3);
2186     __ aesenc(xmm_result, xmm_temp4);
2187 
2188     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
2189     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
2190     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
2191     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
2192 
2193     __ aesenc(xmm_result, xmm_temp1);
2194     __ aesenc(xmm_result, xmm_temp2);
2195     __ aesenc(xmm_result, xmm_temp3);
2196     __ aesenc(xmm_result, xmm_temp4);
2197 
2198     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
2199     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
2200 
2201     __ cmpl(keylen, 44);
2202     __ jccb(Assembler::equal, L_doLast);
2203 
2204     __ aesenc(xmm_result, xmm_temp1);
2205     __ aesenc(xmm_result, xmm_temp2);
2206 
2207     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
2208     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
2209 
2210     __ cmpl(keylen, 52);
2211     __ jccb(Assembler::equal, L_doLast);
2212 
2213     __ aesenc(xmm_result, xmm_temp1);
2214     __ aesenc(xmm_result, xmm_temp2);
2215 
2216     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
2217     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
2218 
2219     __ BIND(L_doLast);
2220     __ aesenc(xmm_result, xmm_temp1);
2221     __ aesenclast(xmm_result, xmm_temp2);
2222     __ movdqu(Address(to, 0), xmm_result);        // store the result
2223     __ xorptr(rax, rax); // return 0
2224     __ leave(); // required for proper stackwalking of RuntimeStub frame
2225     __ ret(0);
2226 
2227     return start;
2228   }
2229 
2230 
2231   // Arguments:
2232   //
2233   // Inputs:
2234   //   c_rarg0   - source byte array address
2235   //   c_rarg1   - destination byte array address
2236   //   c_rarg2   - K (key) in little endian int array
2237   //
2238   address generate_aescrypt_decryptBlock() {
2239     assert(UseAES, "need AES instructions and misaligned SSE support");
2240     __ align(CodeEntryAlignment);
2241     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2242     Label L_doLast;
2243     address start = __ pc();
2244 
2245     const Register from        = rdx;      // source array address
2246     const Register to          = rdx;      // destination array address
2247     const Register key         = rcx;      // key array address
2248     const Register keylen      = rax;
2249     const Address  from_param(rbp, 8+0);
2250     const Address  to_param  (rbp, 8+4);
2251     const Address  key_param (rbp, 8+8);
2252 
2253     const XMMRegister xmm_result = xmm0;
2254     const XMMRegister xmm_key_shuf_mask = xmm1;
2255     const XMMRegister xmm_temp1  = xmm2;
2256     const XMMRegister xmm_temp2  = xmm3;
2257     const XMMRegister xmm_temp3  = xmm4;
2258     const XMMRegister xmm_temp4  = xmm5;
2259 
2260     __ enter(); // required for proper stackwalking of RuntimeStub frame
2261 
2262     __ movptr(from, from_param);
2263     __ movptr(key, key_param);
2264 
2265     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
2266     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2267 
2268     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2269     __ movdqu(xmm_result, Address(from, 0));
2270     __ movptr(to, to_param);
2271 
2272     // for decryption java expanded key ordering is rotated one position from what we want
2273     // so we start from 0x10 here and hit 0x00 last
2274     // we don't know if the key is aligned, hence not using load-execute form
2275     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
2276     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
2277     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
2278     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
2279 
2280     __ pxor  (xmm_result, xmm_temp1);
2281     __ aesdec(xmm_result, xmm_temp2);
2282     __ aesdec(xmm_result, xmm_temp3);
2283     __ aesdec(xmm_result, xmm_temp4);
2284 
2285     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
2286     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
2287     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
2288     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
2289 
2290     __ aesdec(xmm_result, xmm_temp1);
2291     __ aesdec(xmm_result, xmm_temp2);
2292     __ aesdec(xmm_result, xmm_temp3);
2293     __ aesdec(xmm_result, xmm_temp4);
2294 
2295     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
2296     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
2297     load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
2298 
2299     __ cmpl(keylen, 44);
2300     __ jccb(Assembler::equal, L_doLast);
2301 
2302     __ aesdec(xmm_result, xmm_temp1);
2303     __ aesdec(xmm_result, xmm_temp2);
2304 
2305     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
2306     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
2307 
2308     __ cmpl(keylen, 52);
2309     __ jccb(Assembler::equal, L_doLast);
2310 
2311     __ aesdec(xmm_result, xmm_temp1);
2312     __ aesdec(xmm_result, xmm_temp2);
2313 
2314     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
2315     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
2316 
2317     __ BIND(L_doLast);
2318     __ aesdec(xmm_result, xmm_temp1);
2319     __ aesdec(xmm_result, xmm_temp2);
2320 
2321     // for decryption the aesdeclast operation is always on key+0x00
2322     __ aesdeclast(xmm_result, xmm_temp3);
2323     __ movdqu(Address(to, 0), xmm_result);  // store the result
2324     __ xorptr(rax, rax); // return 0
2325     __ leave(); // required for proper stackwalking of RuntimeStub frame
2326     __ ret(0);
2327 
2328     return start;
2329   }
2330 
2331   void handleSOERegisters(bool saving) {
2332     const int saveFrameSizeInBytes = 4 * wordSize;
2333     const Address saved_rbx     (rbp, -3 * wordSize);
2334     const Address saved_rsi     (rbp, -2 * wordSize);
2335     const Address saved_rdi     (rbp, -1 * wordSize);
2336 
2337     if (saving) {
2338       __ subptr(rsp, saveFrameSizeInBytes);
2339       __ movptr(saved_rsi, rsi);
2340       __ movptr(saved_rdi, rdi);
2341       __ movptr(saved_rbx, rbx);
2342     } else {
2343       // restoring
2344       __ movptr(rsi, saved_rsi);
2345       __ movptr(rdi, saved_rdi);
2346       __ movptr(rbx, saved_rbx);
2347     }
2348   }
2349 
2350   // Arguments:
2351   //
2352   // Inputs:
2353   //   c_rarg0   - source byte array address
2354   //   c_rarg1   - destination byte array address
2355   //   c_rarg2   - K (key) in little endian int array
2356   //   c_rarg3   - r vector byte array address
2357   //   c_rarg4   - input length
2358   //
2359   // Output:
2360   //   rax       - input length
2361   //
2362   address generate_cipherBlockChaining_encryptAESCrypt() {
2363     assert(UseAES, "need AES instructions and misaligned SSE support");
2364     __ align(CodeEntryAlignment);
2365     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2366     address start = __ pc();
2367 
2368     Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
2369     const Register from        = rsi;      // source array address
2370     const Register to          = rdx;      // destination array address
2371     const Register key         = rcx;      // key array address
2372     const Register rvec        = rdi;      // r byte array initialized from initvector array address
2373                                            // and left with the results of the last encryption block
2374     const Register len_reg     = rbx;      // src len (must be multiple of blocksize 16)
2375     const Register pos         = rax;
2376 
2377     // xmm register assignments for the loops below
2378     const XMMRegister xmm_result = xmm0;
2379     const XMMRegister xmm_temp   = xmm1;
2380     // first 6 keys preloaded into xmm2-xmm7
2381     const int XMM_REG_NUM_KEY_FIRST = 2;
2382     const int XMM_REG_NUM_KEY_LAST  = 7;
2383     const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
2384 
2385     __ enter(); // required for proper stackwalking of RuntimeStub frame
2386     handleSOERegisters(true /*saving*/);
2387 
2388     // load registers from incoming parameters
2389     const Address  from_param(rbp, 8+0);
2390     const Address  to_param  (rbp, 8+4);
2391     const Address  key_param (rbp, 8+8);
2392     const Address  rvec_param (rbp, 8+12);
2393     const Address  len_param  (rbp, 8+16);
2394     __ movptr(from , from_param);
2395     __ movptr(to   , to_param);
2396     __ movptr(key  , key_param);
2397     __ movptr(rvec , rvec_param);
2398     __ movptr(len_reg , len_param);
2399 
2400     const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
2401     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2402     // load up xmm regs 2 thru 7 with keys 0-5
2403     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
2404       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
2405       offset += 0x10;
2406     }
2407 
2408     __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
2409 
2410     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
2411     __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2412     __ cmpl(rax, 44);
2413     __ jcc(Assembler::notEqual, L_key_192_256);
2414 
2415     // 128 bit code follows here
2416     __ movl(pos, 0);
2417     __ align(OptoLoopAlignment);
2418     __ BIND(L_loopTop_128);
2419     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
2420     __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
2421 
2422     __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
2423     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
2424       __ aesenc(xmm_result, as_XMMRegister(rnum));
2425     }
2426     for (int key_offset = 0x60; key_offset <= 0x90; key_offset += 0x10) {
2427       aes_enc_key(xmm_result, xmm_temp, key, key_offset);
2428     }
2429     load_key(xmm_temp, key, 0xa0);
2430     __ aesenclast(xmm_result, xmm_temp);
2431 
2432     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
2433     // no need to store r to memory until we exit
2434     __ addptr(pos, AESBlockSize);
2435     __ subptr(len_reg, AESBlockSize);
2436     __ jcc(Assembler::notEqual, L_loopTop_128);
2437 
2438     __ BIND(L_exit);
2439     __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
2440 
2441     handleSOERegisters(false /*restoring*/);
2442     __ movptr(rax, len_param); // return length
2443     __ leave();                                  // required for proper stackwalking of RuntimeStub frame
2444     __ ret(0);
2445 
2446     __ BIND(L_key_192_256);
2447     // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
2448     __ cmpl(rax, 52);
2449     __ jcc(Assembler::notEqual, L_key_256);
2450 
2451     // 192-bit code follows here (could be changed to use more xmm registers)
2452     __ movl(pos, 0);
2453     __ align(OptoLoopAlignment);
2454     __ BIND(L_loopTop_192);
2455     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
2456     __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
2457 
2458     __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
2459     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
2460       __ aesenc(xmm_result, as_XMMRegister(rnum));
2461     }
2462     for (int key_offset = 0x60; key_offset <= 0xb0; key_offset += 0x10) {
2463       aes_enc_key(xmm_result, xmm_temp, key, key_offset);
2464     }
2465     load_key(xmm_temp, key, 0xc0);
2466     __ aesenclast(xmm_result, xmm_temp);
2467 
2468     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
2469     // no need to store r to memory until we exit
2470     __ addptr(pos, AESBlockSize);
2471     __ subptr(len_reg, AESBlockSize);
2472     __ jcc(Assembler::notEqual, L_loopTop_192);
2473     __ jmp(L_exit);
2474 
2475     __ BIND(L_key_256);
2476     // 256-bit code follows here (could be changed to use more xmm registers)
2477     __ movl(pos, 0);
2478     __ align(OptoLoopAlignment);
2479     __ BIND(L_loopTop_256);
2480     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
2481     __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
2482 
2483     __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
2484     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
2485       __ aesenc(xmm_result, as_XMMRegister(rnum));
2486     }
2487     for (int key_offset = 0x60; key_offset <= 0xd0; key_offset += 0x10) {
2488       aes_enc_key(xmm_result, xmm_temp, key, key_offset);
2489     }
2490     load_key(xmm_temp, key, 0xe0);
2491     __ aesenclast(xmm_result, xmm_temp);
2492 
2493     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
2494     // no need to store r to memory until we exit
2495     __ addptr(pos, AESBlockSize);
2496     __ subptr(len_reg, AESBlockSize);
2497     __ jcc(Assembler::notEqual, L_loopTop_256);
2498     __ jmp(L_exit);
2499 
2500     return start;
2501   }
2502 
2503 
2504   // CBC AES Decryption.
2505   // In 32-bit stub, because of lack of registers we do not try to parallelize 4 blocks at a time.
2506   //
2507   // Arguments:
2508   //
2509   // Inputs:
2510   //   c_rarg0   - source byte array address
2511   //   c_rarg1   - destination byte array address
2512   //   c_rarg2   - K (key) in little endian int array
2513   //   c_rarg3   - r vector byte array address
2514   //   c_rarg4   - input length
2515   //
2516   // Output:
2517   //   rax       - input length
2518   //
2519 
2520   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
2521     assert(UseAES, "need AES instructions and misaligned SSE support");
2522     __ align(CodeEntryAlignment);
2523     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2524     address start = __ pc();
2525 
2526     const Register from        = rsi;      // source array address
2527     const Register to          = rdx;      // destination array address
2528     const Register key         = rcx;      // key array address
2529     const Register rvec        = rdi;      // r byte array initialized from initvector array address
2530                                            // and left with the results of the last encryption block
2531     const Register len_reg     = rbx;      // src len (must be multiple of blocksize 16)
2532     const Register pos         = rax;
2533 
2534     const int PARALLEL_FACTOR = 4;
2535     const int ROUNDS[3] = { 10, 12, 14 }; //aes rounds for key128, key192, key256
2536 
2537     Label L_exit;
2538     Label L_singleBlock_loopTop[3]; //128, 192, 256
2539     Label L_multiBlock_loopTop[3]; //128, 192, 256
2540 
2541     const XMMRegister xmm_prev_block_cipher = xmm0; // holds cipher of previous block
2542     const XMMRegister xmm_key_shuf_mask = xmm1;
2543 
2544     const XMMRegister xmm_key_tmp0 = xmm2;
2545     const XMMRegister xmm_key_tmp1 = xmm3;
2546 
2547     // registers holding the six results in the parallelized loop
2548     const XMMRegister xmm_result0 = xmm4;
2549     const XMMRegister xmm_result1 = xmm5;
2550     const XMMRegister xmm_result2 = xmm6;
2551     const XMMRegister xmm_result3 = xmm7;
2552 
2553     __ enter(); // required for proper stackwalking of RuntimeStub frame
2554     handleSOERegisters(true /*saving*/);
2555 
2556     // load registers from incoming parameters
2557     const Address  from_param(rbp, 8+0);
2558     const Address  to_param  (rbp, 8+4);
2559     const Address  key_param (rbp, 8+8);
2560     const Address  rvec_param (rbp, 8+12);
2561     const Address  len_param  (rbp, 8+16);
2562 
2563     __ movptr(from , from_param);
2564     __ movptr(to   , to_param);
2565     __ movptr(key  , key_param);
2566     __ movptr(rvec , rvec_param);
2567     __ movptr(len_reg , len_param);
2568 
2569     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2570     __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec
2571 
2572     __ xorptr(pos, pos);
2573 
2574     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
2575     // rvec is reused
2576     __ movl(rvec, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2577     __ cmpl(rvec, 52);
2578     __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
2579     __ cmpl(rvec, 60);
2580     __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
2581 
2582 #define DoFour(opc, src_reg)           \
2583   __ opc(xmm_result0, src_reg);         \
2584   __ opc(xmm_result1, src_reg);         \
2585   __ opc(xmm_result2, src_reg);         \
2586   __ opc(xmm_result3, src_reg);         \
2587 
2588     for (int k = 0; k < 3; ++k) {
2589       __ align(OptoLoopAlignment);
2590       __ BIND(L_multiBlock_loopTop[k]);
2591       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
2592       __ jcc(Assembler::less, L_singleBlock_loopTop[k]);
2593 
2594       __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
2595       __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
2596       __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
2597       __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
2598 
2599       // the java expanded key ordering is rotated one position from what we want
2600       // so we start from 0x10 here and hit 0x00 last
2601       load_key(xmm_key_tmp0, key, 0x10, xmm_key_shuf_mask);
2602       DoFour(pxor, xmm_key_tmp0); //xor with first key
2603       // do the aes dec rounds
2604       for (int rnum = 1; rnum <= ROUNDS[k];) {
2605         //load two keys at a time
2606         //k1->0x20, ..., k9->0xa0, k10->0x00
2607         load_key(xmm_key_tmp1, key, (rnum + 1) * 0x10, xmm_key_shuf_mask);
2608         load_key(xmm_key_tmp0, key, ((rnum + 2) % (ROUNDS[k] + 1)) * 0x10, xmm_key_shuf_mask); // hit 0x00 last!
2609         DoFour(aesdec, xmm_key_tmp1);
2610         rnum++;
2611         if (rnum != ROUNDS[k]) {
2612           DoFour(aesdec, xmm_key_tmp0);
2613         }
2614         else {
2615           DoFour(aesdeclast, xmm_key_tmp0);
2616         }
2617         rnum++;
2618       }
2619 
2620       // for each result, xor with the r vector of previous cipher block
2621       __ pxor(xmm_result0, xmm_prev_block_cipher);
2622       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
2623       __ pxor(xmm_result1, xmm_prev_block_cipher);
2624       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
2625       __ pxor(xmm_result2, xmm_prev_block_cipher);
2626       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
2627       __ pxor(xmm_result3, xmm_prev_block_cipher);
2628       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize)); // this will carry over to next set of blocks
2629 
2630             // store 4 results into the next 64 bytes of output
2631        __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
2632        __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
2633        __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
2634        __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
2635 
2636        __ addptr(pos, 4 * AESBlockSize);
2637        __ subptr(len_reg, 4 * AESBlockSize);
2638        __ jmp(L_multiBlock_loopTop[k]);
2639 
2640        //singleBlock starts here
2641        __ align(OptoLoopAlignment);
2642        __ BIND(L_singleBlock_loopTop[k]);
2643        __ cmpptr(len_reg, 0); // any blocks left?
2644        __ jcc(Assembler::equal, L_exit);
2645        __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
2646        __ movdqa(xmm_result1, xmm_result0);
2647 
2648        load_key(xmm_key_tmp0, key, 0x10, xmm_key_shuf_mask);
2649        __ pxor(xmm_result0, xmm_key_tmp0);
2650        // do the aes dec rounds
2651        for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
2652          // the java expanded key ordering is rotated one position from what we want
2653          load_key(xmm_key_tmp0, key, (rnum + 1) * 0x10, xmm_key_shuf_mask);
2654          __ aesdec(xmm_result0, xmm_key_tmp0);
2655        }
2656        load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
2657        __ aesdeclast(xmm_result0, xmm_key_tmp0);
2658        __ pxor(xmm_result0, xmm_prev_block_cipher); // xor with the current r vector
2659        __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result0); // store into the next 16 bytes of output
2660        // no need to store r to memory until we exit
2661        __ movdqa(xmm_prev_block_cipher, xmm_result1); // set up next r vector with cipher input from this block
2662 
2663        __ addptr(pos, AESBlockSize);
2664        __ subptr(len_reg, AESBlockSize);
2665        __ jmp(L_singleBlock_loopTop[k]);
2666     }//for 128/192/256
2667 
2668     __ BIND(L_exit);
2669     __ movptr(rvec, rvec_param);                        // restore this since reused earlier
2670     __ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object
2671     handleSOERegisters(false /*restoring*/);
2672     __ movptr(rax, len_param);                          // return length
2673     __ leave();                                         // required for proper stackwalking of RuntimeStub frame
2674     __ ret(0);
2675 
2676     return start;
2677   }
2678 
2679   // CTR AES crypt.
2680   // In 32-bit stub, parallelize 4 blocks at a time
2681   // Arguments:
2682   //
2683   // Inputs:
2684   //   c_rarg0   - source byte array address
2685   //   c_rarg1   - destination byte array address
2686   //   c_rarg2   - K (key) in little endian int array
2687   //   c_rarg3   - counter vector byte array address
2688   //   c_rarg4   - input length
2689   //
2690   // Output:
2691   //   rax       - input length
2692   //
2693   address generate_counterMode_AESCrypt_Parallel() {
2694     assert(UseAES, "need AES instructions and misaligned SSE support");
2695     __ align(CodeEntryAlignment);
2696     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
2697     address start = __ pc();
2698     const Register from        = rsi;      // source array address
2699     const Register to          = rdx;      // destination array address
2700     const Register key         = rcx;      // key array address
2701     const Register counter     = rdi;      // counter byte array initialized from initvector array address
2702                                            // and updated with the incremented counter in the end
2703     const Register len_reg     = rbx;
2704     const Register pos         = rax;
2705 
2706     __ enter(); // required for proper stackwalking of RuntimeStub frame
2707     handleSOERegisters(true /*saving*/); // save rbx, rsi, rdi
2708 
2709     // load registers from incoming parameters
2710     const Address  from_param(rbp, 8+0);
2711     const Address  to_param  (rbp, 8+4);
2712     const Address  key_param (rbp, 8+8);
2713     const Address  rvec_param (rbp, 8+12);
2714     const Address  len_param  (rbp, 8+16);
2715     const Address  saved_counter_param(rbp, 8 + 20);
2716     const Address  used_addr_param(rbp, 8 + 24);
2717 
2718     __ movptr(from , from_param);
2719     __ movptr(to   , to_param);
2720     __ movptr(len_reg , len_param);
2721 
2722     // Use the partially used encrpyted counter from last invocation
2723     Label L_exit_preLoop, L_preLoop_start;
2724 
2725     // Use the registers 'counter' and 'key' here in this preloop
2726     // to hold of last 2 params 'used' and 'saved_encCounter_start'
2727     Register used = counter;
2728     Register saved_encCounter_start = key;
2729     Register used_addr = saved_encCounter_start;
2730 
2731     __ movptr(used_addr, used_addr_param);
2732     __ movptr(used, Address(used_addr, 0));
2733     __ movptr(saved_encCounter_start, saved_counter_param);
2734 
2735     __ BIND(L_preLoop_start);
2736     __ cmpptr(used, 16);
2737     __ jcc(Assembler::aboveEqual, L_exit_preLoop);
2738     __ cmpptr(len_reg, 0);
2739     __ jcc(Assembler::lessEqual, L_exit_preLoop);
2740     __ movb(rax, Address(saved_encCounter_start, used));
2741     __ xorb(rax, Address(from, 0));
2742     __ movb(Address(to, 0), rax);
2743     __ addptr(from, 1);
2744     __ addptr(to, 1);
2745     __ addptr(used, 1);
2746     __ subptr(len_reg, 1);
2747 
2748     __ jmp(L_preLoop_start);
2749 
2750     __ BIND(L_exit_preLoop);
2751     __ movptr(used_addr, used_addr_param);
2752     __ movptr(used_addr, used_addr_param);
2753     __ movl(Address(used_addr, 0), used);
2754 
2755     // load the parameters 'key' and 'counter'
2756     __ movptr(key, key_param);
2757     __ movptr(counter, rvec_param);
2758 
2759     // xmm register assignments for the loops below
2760     const XMMRegister xmm_curr_counter      = xmm0;
2761     const XMMRegister xmm_counter_shuf_mask = xmm1;  // need to be reloaded
2762     const XMMRegister xmm_key_shuf_mask     = xmm2;  // need to be reloaded
2763     const XMMRegister xmm_key               = xmm3;
2764     const XMMRegister xmm_result0           = xmm4;
2765     const XMMRegister xmm_result1           = xmm5;
2766     const XMMRegister xmm_result2           = xmm6;
2767     const XMMRegister xmm_result3           = xmm7;
2768     const XMMRegister xmm_from0             = xmm1;   //reuse XMM register
2769     const XMMRegister xmm_from1             = xmm2;
2770     const XMMRegister xmm_from2             = xmm3;
2771     const XMMRegister xmm_from3             = xmm4;
2772 
2773     //for key_128, key_192, key_256
2774     const int rounds[3] = {10, 12, 14};
2775     Label L_singleBlockLoopTop[3];
2776     Label L_multiBlock_loopTop[3];
2777     Label L_key192_top, L_key256_top;
2778     Label L_incCounter[3][4]; // 3: different key length,  4: 4 blocks at a time
2779     Label L_incCounter_single[3]; //for single block, key128, key192, key256
2780     Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
2781     Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
2782 
2783     Label L_exit;
2784     const int PARALLEL_FACTOR = 4;  //because of the limited register number
2785 
2786     // initialize counter with initial counter
2787     __ movdqu(xmm_curr_counter, Address(counter, 0x00));
2788     __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
2789     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled for increase
2790 
2791     // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
2792     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2793     __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2794     __ cmpl(rax, 52);
2795     __ jcc(Assembler::equal, L_key192_top);
2796     __ cmpl(rax, 60);
2797     __ jcc(Assembler::equal, L_key256_top);
2798 
2799     //key128 begins here
2800     __ movptr(pos, 0); // init pos before L_multiBlock_loopTop
2801 
2802 #define CTR_DoFour(opc, src_reg)               \
2803     __ opc(xmm_result0, src_reg);              \
2804     __ opc(xmm_result1, src_reg);              \
2805     __ opc(xmm_result2, src_reg);              \
2806     __ opc(xmm_result3, src_reg);
2807 
2808     // k == 0 :  generate code for key_128
2809     // k == 1 :  generate code for key_192
2810     // k == 2 :  generate code for key_256
2811     for (int k = 0; k < 3; ++k) {
2812       //multi blocks starts here
2813       __ align(OptoLoopAlignment);
2814       __ BIND(L_multiBlock_loopTop[k]);
2815       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
2816       __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
2817 
2818       __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2819       __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
2820 
2821       //load, then increase counters
2822       CTR_DoFour(movdqa, xmm_curr_counter);
2823       __ push(rbx);
2824       inc_counter(rbx, xmm_result1, 0x01, L_incCounter[k][0]);
2825       inc_counter(rbx, xmm_result2, 0x02, L_incCounter[k][1]);
2826       inc_counter(rbx, xmm_result3, 0x03, L_incCounter[k][2]);
2827       inc_counter(rbx, xmm_curr_counter, 0x04, L_incCounter[k][3]);
2828       __ pop (rbx);
2829 
2830       load_key(xmm_key, key, 0x00, xmm_key_shuf_mask); // load Round 0 key. interleaving for better performance
2831 
2832       CTR_DoFour(pshufb, xmm_counter_shuf_mask); // after increased, shuffled counters back for PXOR
2833       CTR_DoFour(pxor, xmm_key);   //PXOR with Round 0 key
2834 
2835       for (int i = 1; i < rounds[k]; ++i) {
2836         load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask);
2837         CTR_DoFour(aesenc, xmm_key);
2838       }
2839       load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask);
2840       CTR_DoFour(aesenclast, xmm_key);
2841 
2842       // get next PARALLEL_FACTOR blocks into xmm_from registers
2843       __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
2844       __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
2845       __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
2846 
2847       // PXOR with input text
2848       __ pxor(xmm_result0, xmm_from0); //result0 is xmm4
2849       __ pxor(xmm_result1, xmm_from1);
2850       __ pxor(xmm_result2, xmm_from2);
2851 
2852       // store PARALLEL_FACTOR results into the next 64 bytes of output
2853       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
2854       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
2855       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
2856 
2857       // do it here after xmm_result0 is saved, because xmm_from3 reuse the same register of xmm_result0.
2858       __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
2859       __ pxor(xmm_result3, xmm_from3);
2860       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
2861 
2862       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // increase the length of crypt text
2863       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
2864       __ jmp(L_multiBlock_loopTop[k]);
2865 
2866       // singleBlock starts here
2867       __ align(OptoLoopAlignment);
2868       __ BIND(L_singleBlockLoopTop[k]);
2869       __ cmpptr(len_reg, 0);
2870       __ jcc(Assembler::equal, L_exit);
2871       __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2872       __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
2873       __ movdqa(xmm_result0, xmm_curr_counter);
2874       load_key(xmm_key, key, 0x00, xmm_key_shuf_mask);
2875       __ push(rbx);//rbx is used for increasing counter
2876       inc_counter(rbx, xmm_curr_counter, 0x01, L_incCounter_single[k]);
2877       __ pop (rbx);
2878       __ pshufb(xmm_result0, xmm_counter_shuf_mask);
2879       __ pxor(xmm_result0, xmm_key);
2880       for (int i = 1; i < rounds[k]; i++) {
2881         load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask);
2882         __ aesenc(xmm_result0, xmm_key);
2883       }
2884       load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask);
2885       __ aesenclast(xmm_result0, xmm_key);
2886       __ cmpptr(len_reg, AESBlockSize);
2887       __ jcc(Assembler::less, L_processTail_insr[k]);
2888         __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
2889         __ pxor(xmm_result0, xmm_from0);
2890         __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
2891         __ addptr(pos, AESBlockSize);
2892         __ subptr(len_reg, AESBlockSize);
2893         __ jmp(L_singleBlockLoopTop[k]);
2894 
2895       __ BIND(L_processTail_insr[k]);                                               // Process the tail part of the input array
2896         __ addptr(pos, len_reg);                                                    // 1. Insert bytes from src array into xmm_from0 register
2897         __ testptr(len_reg, 8);
2898         __ jcc(Assembler::zero, L_processTail_4_insr[k]);
2899           __ subptr(pos,8);
2900           __ pinsrd(xmm_from0, Address(from, pos), 0);
2901           __ pinsrd(xmm_from0, Address(from, pos, Address::times_1, 4), 1);
2902         __ BIND(L_processTail_4_insr[k]);
2903         __ testptr(len_reg, 4);
2904         __ jcc(Assembler::zero, L_processTail_2_insr[k]);
2905           __ subptr(pos,4);
2906           __ pslldq(xmm_from0, 4);
2907           __ pinsrd(xmm_from0, Address(from, pos), 0);
2908         __ BIND(L_processTail_2_insr[k]);
2909         __ testptr(len_reg, 2);
2910         __ jcc(Assembler::zero, L_processTail_1_insr[k]);
2911           __ subptr(pos, 2);
2912           __ pslldq(xmm_from0, 2);
2913           __ pinsrw(xmm_from0, Address(from, pos), 0);
2914         __ BIND(L_processTail_1_insr[k]);
2915         __ testptr(len_reg, 1);
2916         __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
2917           __ subptr(pos, 1);
2918           __ pslldq(xmm_from0, 1);
2919           __ pinsrb(xmm_from0, Address(from, pos), 0);
2920         __ BIND(L_processTail_exit_insr[k]);
2921 
2922         __ movptr(saved_encCounter_start, saved_counter_param);
2923         __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);               // 2. Perform pxor of the encrypted counter and plaintext Bytes.
2924         __ pxor(xmm_result0, xmm_from0);                                          //    Also the encrypted counter is saved for next invocation.
2925 
2926         __ testptr(len_reg, 8);
2927         __ jcc(Assembler::zero, L_processTail_4_extr[k]);                        // 3. Extract bytes from xmm_result0 into the dest. array
2928           __ pextrd(Address(to, pos), xmm_result0, 0);
2929           __ pextrd(Address(to, pos, Address::times_1, 4), xmm_result0, 1);
2930           __ psrldq(xmm_result0, 8);
2931           __ addptr(pos, 8);
2932         __ BIND(L_processTail_4_extr[k]);
2933         __ testptr(len_reg, 4);
2934         __ jcc(Assembler::zero, L_processTail_2_extr[k]);
2935           __ pextrd(Address(to, pos), xmm_result0, 0);
2936           __ psrldq(xmm_result0, 4);
2937           __ addptr(pos, 4);
2938         __ BIND(L_processTail_2_extr[k]);
2939         __ testptr(len_reg, 2);
2940         __ jcc(Assembler::zero, L_processTail_1_extr[k]);
2941           __ pextrb(Address(to, pos), xmm_result0, 0);
2942           __ pextrb(Address(to, pos, Address::times_1, 1), xmm_result0, 1);
2943           __ psrldq(xmm_result0, 2);
2944           __ addptr(pos, 2);
2945         __ BIND(L_processTail_1_extr[k]);
2946         __ testptr(len_reg, 1);
2947         __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
2948           __ pextrb(Address(to, pos), xmm_result0, 0);
2949 
2950         __ BIND(L_processTail_exit_extr[k]);
2951         __ movptr(used_addr, used_addr_param);
2952         __ movl(Address(used_addr, 0), len_reg);
2953         __ jmp(L_exit);
2954     }
2955 
2956     __ BIND(L_exit);
2957     __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
2958     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
2959     __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
2960     handleSOERegisters(false /*restoring*/);
2961     __ movptr(rax, len_param); // return length
2962     __ leave();                // required for proper stackwalking of RuntimeStub frame
2963     __ ret(0);
2964 
2965     __ BIND (L_key192_top);
2966     __ movptr(pos, 0); // init pos before L_multiBlock_loopTop
2967     __ jmp(L_multiBlock_loopTop[1]); //key192
2968 
2969     __ BIND (L_key256_top);
2970     __ movptr(pos, 0); // init pos before L_multiBlock_loopTop
2971     __ jmp(L_multiBlock_loopTop[2]); //key192
2972 
2973     return start;
2974   }
2975 
2976   address generate_upper_word_mask() {
2977     __ align(64);
2978     StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
2979     address start = __ pc();
2980     __ emit_data(0x00000000, relocInfo::none, 0);
2981     __ emit_data(0x00000000, relocInfo::none, 0);
2982     __ emit_data(0x00000000, relocInfo::none, 0);
2983     __ emit_data(0xFFFFFFFF, relocInfo::none, 0);
2984     return start;
2985   }
2986 
2987   address generate_shuffle_byte_flip_mask() {
2988     __ align(64);
2989     StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
2990     address start = __ pc();
2991     __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
2992     __ emit_data(0x08090a0b, relocInfo::none, 0);
2993     __ emit_data(0x04050607, relocInfo::none, 0);
2994     __ emit_data(0x00010203, relocInfo::none, 0);
2995     return start;
2996   }
2997 
2998   // ofs and limit are use for multi-block byte array.
2999   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
3000   address generate_sha1_implCompress(bool multi_block, const char *name) {
3001     __ align(CodeEntryAlignment);
3002     StubCodeMark mark(this, "StubRoutines", name);
3003     address start = __ pc();
3004 
3005     Register buf   = rax;
3006     Register state = rdx;
3007     Register ofs   = rcx;
3008     Register limit = rdi;
3009 
3010     const Address  buf_param(rbp, 8 + 0);
3011     const Address  state_param(rbp, 8 + 4);
3012     const Address  ofs_param(rbp, 8 + 8);
3013     const Address  limit_param(rbp, 8 + 12);
3014 
3015     const XMMRegister abcd = xmm0;
3016     const XMMRegister e0 = xmm1;
3017     const XMMRegister e1 = xmm2;
3018     const XMMRegister msg0 = xmm3;
3019 
3020     const XMMRegister msg1 = xmm4;
3021     const XMMRegister msg2 = xmm5;
3022     const XMMRegister msg3 = xmm6;
3023     const XMMRegister shuf_mask = xmm7;
3024 
3025     __ enter();
3026     __ subptr(rsp, 8 * wordSize);
3027     handleSOERegisters(true /*saving*/);
3028 
3029     __ movptr(buf, buf_param);
3030     __ movptr(state, state_param);
3031     if (multi_block) {
3032       __ movptr(ofs, ofs_param);
3033       __ movptr(limit, limit_param);
3034     }
3035 
3036     __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
3037       buf, state, ofs, limit, rsp, multi_block);
3038 
3039     handleSOERegisters(false /*restoring*/);
3040     __ addptr(rsp, 8 * wordSize);
3041     __ leave();
3042     __ ret(0);
3043     return start;
3044   }
3045 
3046   address generate_pshuffle_byte_flip_mask() {
3047     __ align(64);
3048     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
3049     address start = __ pc();
3050     __ emit_data(0x00010203, relocInfo::none, 0);
3051     __ emit_data(0x04050607, relocInfo::none, 0);
3052     __ emit_data(0x08090a0b, relocInfo::none, 0);
3053     __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
3054     return start;
3055   }
3056 
3057   // ofs and limit are use for multi-block byte array.
3058   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
3059  address generate_sha256_implCompress(bool multi_block, const char *name) {
3060     __ align(CodeEntryAlignment);
3061     StubCodeMark mark(this, "StubRoutines", name);
3062     address start = __ pc();
3063 
3064     Register buf = rbx;
3065     Register state = rsi;
3066     Register ofs = rdx;
3067     Register limit = rcx;
3068 
3069     const Address  buf_param(rbp, 8 + 0);
3070     const Address  state_param(rbp, 8 + 4);
3071     const Address  ofs_param(rbp, 8 + 8);
3072     const Address  limit_param(rbp, 8 + 12);
3073 
3074     const XMMRegister msg = xmm0;
3075     const XMMRegister state0 = xmm1;
3076     const XMMRegister state1 = xmm2;
3077     const XMMRegister msgtmp0 = xmm3;
3078 
3079     const XMMRegister msgtmp1 = xmm4;
3080     const XMMRegister msgtmp2 = xmm5;
3081     const XMMRegister msgtmp3 = xmm6;
3082     const XMMRegister msgtmp4 = xmm7;
3083 
3084     __ enter();
3085     __ subptr(rsp, 8 * wordSize);
3086     handleSOERegisters(true /*saving*/);
3087     __ movptr(buf, buf_param);
3088     __ movptr(state, state_param);
3089     if (multi_block) {
3090      __ movptr(ofs, ofs_param);
3091      __ movptr(limit, limit_param);
3092     }
3093 
3094     __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3095       buf, state, ofs, limit, rsp, multi_block);
3096 
3097     handleSOERegisters(false);
3098     __ addptr(rsp, 8 * wordSize);
3099     __ leave();
3100     __ ret(0);
3101     return start;
3102   }
3103 
3104   // byte swap x86 long
3105   address generate_ghash_long_swap_mask() {
3106     __ align(CodeEntryAlignment);
3107     StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
3108     address start = __ pc();
3109     __ emit_data(0x0b0a0908, relocInfo::none, 0);
3110     __ emit_data(0x0f0e0d0c, relocInfo::none, 0);
3111     __ emit_data(0x03020100, relocInfo::none, 0);
3112     __ emit_data(0x07060504, relocInfo::none, 0);
3113 
3114   return start;
3115   }
3116 
3117   // byte swap x86 byte array
3118   address generate_ghash_byte_swap_mask() {
3119     __ align(CodeEntryAlignment);
3120     StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
3121     address start = __ pc();
3122     __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
3123     __ emit_data(0x08090a0b, relocInfo::none, 0);
3124     __ emit_data(0x04050607, relocInfo::none, 0);
3125     __ emit_data(0x00010203, relocInfo::none, 0);
3126   return start;
3127   }
3128 
3129   /* Single and multi-block ghash operations */
3130   address generate_ghash_processBlocks() {
3131     assert(UseGHASHIntrinsics, "need GHASH intrinsics and CLMUL support");
3132     __ align(CodeEntryAlignment);
3133     Label L_ghash_loop, L_exit;
3134     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3135     address start = __ pc();
3136 
3137     const Register state        = rdi;
3138     const Register subkeyH      = rsi;
3139     const Register data         = rdx;
3140     const Register blocks       = rcx;
3141 
3142     const Address  state_param(rbp, 8+0);
3143     const Address  subkeyH_param(rbp, 8+4);
3144     const Address  data_param(rbp, 8+8);
3145     const Address  blocks_param(rbp, 8+12);
3146 
3147     const XMMRegister xmm_temp0 = xmm0;
3148     const XMMRegister xmm_temp1 = xmm1;
3149     const XMMRegister xmm_temp2 = xmm2;
3150     const XMMRegister xmm_temp3 = xmm3;
3151     const XMMRegister xmm_temp4 = xmm4;
3152     const XMMRegister xmm_temp5 = xmm5;
3153     const XMMRegister xmm_temp6 = xmm6;
3154     const XMMRegister xmm_temp7 = xmm7;
3155 
3156     __ enter();
3157     handleSOERegisters(true);  // Save registers
3158 
3159     __ movptr(state, state_param);
3160     __ movptr(subkeyH, subkeyH_param);
3161     __ movptr(data, data_param);
3162     __ movptr(blocks, blocks_param);
3163 
3164     __ movdqu(xmm_temp0, Address(state, 0));
3165     __ pshufb(xmm_temp0, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
3166 
3167     __ movdqu(xmm_temp1, Address(subkeyH, 0));
3168     __ pshufb(xmm_temp1, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
3169 
3170     __ BIND(L_ghash_loop);
3171     __ movdqu(xmm_temp2, Address(data, 0));
3172     __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
3173 
3174     __ pxor(xmm_temp0, xmm_temp2);
3175 
3176     //
3177     // Multiply with the hash key
3178     //
3179     __ movdqu(xmm_temp3, xmm_temp0);
3180     __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
3181     __ movdqu(xmm_temp4, xmm_temp0);
3182     __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
3183 
3184     __ movdqu(xmm_temp5, xmm_temp0);
3185     __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
3186     __ movdqu(xmm_temp6, xmm_temp0);
3187     __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
3188 
3189     __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
3190 
3191     __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
3192     __ psrldq(xmm_temp4, 8);    // shift by xmm4 64 bits to the right
3193     __ pslldq(xmm_temp5, 8);    // shift by xmm5 64 bits to the left
3194     __ pxor(xmm_temp3, xmm_temp5);
3195     __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
3196                                         // of the carry-less multiplication of
3197                                         // xmm0 by xmm1.
3198 
3199     // We shift the result of the multiplication by one bit position
3200     // to the left to cope for the fact that the bits are reversed.
3201     __ movdqu(xmm_temp7, xmm_temp3);
3202     __ movdqu(xmm_temp4, xmm_temp6);
3203     __ pslld (xmm_temp3, 1);
3204     __ pslld(xmm_temp6, 1);
3205     __ psrld(xmm_temp7, 31);
3206     __ psrld(xmm_temp4, 31);
3207     __ movdqu(xmm_temp5, xmm_temp7);
3208     __ pslldq(xmm_temp4, 4);
3209     __ pslldq(xmm_temp7, 4);
3210     __ psrldq(xmm_temp5, 12);
3211     __ por(xmm_temp3, xmm_temp7);
3212     __ por(xmm_temp6, xmm_temp4);
3213     __ por(xmm_temp6, xmm_temp5);
3214 
3215     //
3216     // First phase of the reduction
3217     //
3218     // Move xmm3 into xmm4, xmm5, xmm7 in order to perform the shifts
3219     // independently.
3220     __ movdqu(xmm_temp7, xmm_temp3);
3221     __ movdqu(xmm_temp4, xmm_temp3);
3222     __ movdqu(xmm_temp5, xmm_temp3);
3223     __ pslld(xmm_temp7, 31);    // packed right shift shifting << 31
3224     __ pslld(xmm_temp4, 30);    // packed right shift shifting << 30
3225     __ pslld(xmm_temp5, 25);    // packed right shift shifting << 25
3226     __ pxor(xmm_temp7, xmm_temp4);      // xor the shifted versions
3227     __ pxor(xmm_temp7, xmm_temp5);
3228     __ movdqu(xmm_temp4, xmm_temp7);
3229     __ pslldq(xmm_temp7, 12);
3230     __ psrldq(xmm_temp4, 4);
3231     __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
3232 
3233     //
3234     // Second phase of the reduction
3235     //
3236     // Make 3 copies of xmm3 in xmm2, xmm5, xmm7 for doing these
3237     // shift operations.
3238     __ movdqu(xmm_temp2, xmm_temp3);
3239     __ movdqu(xmm_temp7, xmm_temp3);
3240     __ movdqu(xmm_temp5, xmm_temp3);
3241     __ psrld(xmm_temp2, 1);     // packed left shifting >> 1
3242     __ psrld(xmm_temp7, 2);     // packed left shifting >> 2
3243     __ psrld(xmm_temp5, 7);     // packed left shifting >> 7
3244     __ pxor(xmm_temp2, xmm_temp7);      // xor the shifted versions
3245     __ pxor(xmm_temp2, xmm_temp5);
3246     __ pxor(xmm_temp2, xmm_temp4);
3247     __ pxor(xmm_temp3, xmm_temp2);
3248     __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
3249 
3250     __ decrement(blocks);
3251     __ jcc(Assembler::zero, L_exit);
3252     __ movdqu(xmm_temp0, xmm_temp6);
3253     __ addptr(data, 16);
3254     __ jmp(L_ghash_loop);
3255 
3256     __ BIND(L_exit);
3257        // Byte swap 16-byte result
3258     __ pshufb(xmm_temp6, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
3259     __ movdqu(Address(state, 0), xmm_temp6);   // store the result
3260 
3261     handleSOERegisters(false);  // restore registers
3262     __ leave();
3263     __ ret(0);
3264     return start;
3265   }
3266 
3267   /**
3268    *  Arguments:
3269    *
3270    * Inputs:
3271    *   rsp(4)   - int crc
3272    *   rsp(8)   - byte* buf
3273    *   rsp(12)  - int length
3274    *
3275    * Ouput:
3276    *       rax   - int crc result
3277    */
3278   address generate_updateBytesCRC32() {
3279     assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
3280 
3281     __ align(CodeEntryAlignment);
3282     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3283 
3284     address start = __ pc();
3285 
3286     const Register crc   = rdx;  // crc
3287     const Register buf   = rsi;  // source java byte array address
3288     const Register len   = rcx;  // length
3289     const Register table = rdi;  // crc_table address (reuse register)
3290     const Register tmp   = rbx;
3291     assert_different_registers(crc, buf, len, table, tmp, rax);
3292 
3293     BLOCK_COMMENT("Entry:");
3294     __ enter(); // required for proper stackwalking of RuntimeStub frame
3295     __ push(rsi);
3296     __ push(rdi);
3297     __ push(rbx);
3298 
3299     Address crc_arg(rbp, 8 + 0);
3300     Address buf_arg(rbp, 8 + 4);
3301     Address len_arg(rbp, 8 + 8);
3302 
3303     // Load up:
3304     __ movl(crc,   crc_arg);
3305     __ movptr(buf, buf_arg);
3306     __ movl(len,   len_arg);
3307 
3308     __ kernel_crc32(crc, buf, len, table, tmp);
3309 
3310     __ movl(rax, crc);
3311     __ pop(rbx);
3312     __ pop(rdi);
3313     __ pop(rsi);
3314     __ vzeroupper();
3315     __ leave(); // required for proper stackwalking of RuntimeStub frame
3316     __ ret(0);
3317 
3318     return start;
3319   }
3320 
3321   /**
3322   *  Arguments:
3323   *
3324   * Inputs:
3325   *   rsp(4)   - int crc
3326   *   rsp(8)   - byte* buf
3327   *   rsp(12)  - int length
3328   *   rsp(16)  - table_start - optional (present only when doing a library_calll,
3329   *              not used by x86 algorithm)
3330   *
3331   * Ouput:
3332   *       rax  - int crc result
3333   */
3334   address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
3335     assert(UseCRC32CIntrinsics, "need SSE4_2");
3336     __ align(CodeEntryAlignment);
3337     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3338     address start = __ pc();
3339     const Register crc = rax;  // crc
3340     const Register buf = rcx;  // source java byte array address
3341     const Register len = rdx;  // length
3342     const Register d = rbx;
3343     const Register g = rsi;
3344     const Register h = rdi;
3345     const Register empty = 0; // will never be used, in order not
3346                               // to change a signature for crc32c_IPL_Alg2_Alt2
3347                               // between 64/32 I'm just keeping it here
3348     assert_different_registers(crc, buf, len, d, g, h);
3349 
3350     BLOCK_COMMENT("Entry:");
3351     __ enter(); // required for proper stackwalking of RuntimeStub frame
3352     Address crc_arg(rsp, 4 + 4 + 0); // ESP+4 +
3353                                      // we need to add additional 4 because __ enter
3354                                      // have just pushed ebp on a stack
3355     Address buf_arg(rsp, 4 + 4 + 4);
3356     Address len_arg(rsp, 4 + 4 + 8);
3357       // Load up:
3358       __ movl(crc, crc_arg);
3359       __ movl(buf, buf_arg);
3360       __ movl(len, len_arg);
3361       __ push(d);
3362       __ push(g);
3363       __ push(h);
3364       __ crc32c_ipl_alg2_alt2(crc, buf, len,
3365                               d, g, h,
3366                               empty, empty, empty,
3367                               xmm0, xmm1, xmm2,
3368                               is_pclmulqdq_supported);
3369       __ pop(h);
3370       __ pop(g);
3371       __ pop(d);
3372     __ vzeroupper();
3373     __ leave(); // required for proper stackwalking of RuntimeStub frame
3374     __ ret(0);
3375 
3376     return start;
3377   }
3378 
3379  address generate_libmExp() {
3380     StubCodeMark mark(this, "StubRoutines", "libmExp");
3381 
3382     address start = __ pc();
3383 
3384     const XMMRegister x0  = xmm0;
3385     const XMMRegister x1  = xmm1;
3386     const XMMRegister x2  = xmm2;
3387     const XMMRegister x3  = xmm3;
3388 
3389     const XMMRegister x4  = xmm4;
3390     const XMMRegister x5  = xmm5;
3391     const XMMRegister x6  = xmm6;
3392     const XMMRegister x7  = xmm7;
3393 
3394     const Register tmp   = rbx;
3395 
3396     BLOCK_COMMENT("Entry:");
3397     __ enter(); // required for proper stackwalking of RuntimeStub frame
3398     __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3399     __ leave(); // required for proper stackwalking of RuntimeStub frame
3400     __ ret(0);
3401 
3402     return start;
3403 
3404   }
3405 
3406  address generate_libmLog() {
3407    StubCodeMark mark(this, "StubRoutines", "libmLog");
3408 
3409    address start = __ pc();
3410 
3411    const XMMRegister x0 = xmm0;
3412    const XMMRegister x1 = xmm1;
3413    const XMMRegister x2 = xmm2;
3414    const XMMRegister x3 = xmm3;
3415 
3416    const XMMRegister x4 = xmm4;
3417    const XMMRegister x5 = xmm5;
3418    const XMMRegister x6 = xmm6;
3419    const XMMRegister x7 = xmm7;
3420 
3421    const Register tmp = rbx;
3422 
3423    BLOCK_COMMENT("Entry:");
3424    __ enter(); // required for proper stackwalking of RuntimeStub frame
3425    __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3426    __ leave(); // required for proper stackwalking of RuntimeStub frame
3427    __ ret(0);
3428 
3429    return start;
3430 
3431  }
3432 
3433  address generate_libmLog10() {
3434    StubCodeMark mark(this, "StubRoutines", "libmLog10");
3435 
3436    address start = __ pc();
3437 
3438    const XMMRegister x0 = xmm0;
3439    const XMMRegister x1 = xmm1;
3440    const XMMRegister x2 = xmm2;
3441    const XMMRegister x3 = xmm3;
3442 
3443    const XMMRegister x4 = xmm4;
3444    const XMMRegister x5 = xmm5;
3445    const XMMRegister x6 = xmm6;
3446    const XMMRegister x7 = xmm7;
3447 
3448    const Register tmp = rbx;
3449 
3450    BLOCK_COMMENT("Entry:");
3451    __ enter(); // required for proper stackwalking of RuntimeStub frame
3452    __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3453    __ leave(); // required for proper stackwalking of RuntimeStub frame
3454    __ ret(0);
3455 
3456    return start;
3457 
3458  }
3459 
3460  address generate_libmPow() {
3461    StubCodeMark mark(this, "StubRoutines", "libmPow");
3462 
3463    address start = __ pc();
3464 
3465    const XMMRegister x0 = xmm0;
3466    const XMMRegister x1 = xmm1;
3467    const XMMRegister x2 = xmm2;
3468    const XMMRegister x3 = xmm3;
3469 
3470    const XMMRegister x4 = xmm4;
3471    const XMMRegister x5 = xmm5;
3472    const XMMRegister x6 = xmm6;
3473    const XMMRegister x7 = xmm7;
3474 
3475    const Register tmp = rbx;
3476 
3477    BLOCK_COMMENT("Entry:");
3478    __ enter(); // required for proper stackwalking of RuntimeStub frame
3479    __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3480    __ leave(); // required for proper stackwalking of RuntimeStub frame
3481    __ ret(0);
3482 
3483    return start;
3484 
3485  }
3486 
3487  address generate_libm_reduce_pi04l() {
3488    StubCodeMark mark(this, "StubRoutines", "libm_reduce_pi04l");
3489 
3490    address start = __ pc();
3491 
3492    BLOCK_COMMENT("Entry:");
3493    __ libm_reduce_pi04l(rax, rcx, rdx, rbx, rsi, rdi, rbp, rsp);
3494 
3495    return start;
3496 
3497  }
3498 
3499  address generate_libm_sin_cos_huge() {
3500    StubCodeMark mark(this, "StubRoutines", "libm_sin_cos_huge");
3501 
3502    address start = __ pc();
3503 
3504    const XMMRegister x0 = xmm0;
3505    const XMMRegister x1 = xmm1;
3506 
3507    BLOCK_COMMENT("Entry:");
3508    __ libm_sincos_huge(x0, x1, rax, rcx, rdx, rbx, rsi, rdi, rbp, rsp);
3509 
3510    return start;
3511 
3512  }
3513 
3514  address generate_libmSin() {
3515    StubCodeMark mark(this, "StubRoutines", "libmSin");
3516 
3517    address start = __ pc();
3518 
3519    const XMMRegister x0 = xmm0;
3520    const XMMRegister x1 = xmm1;
3521    const XMMRegister x2 = xmm2;
3522    const XMMRegister x3 = xmm3;
3523 
3524    const XMMRegister x4 = xmm4;
3525    const XMMRegister x5 = xmm5;
3526    const XMMRegister x6 = xmm6;
3527    const XMMRegister x7 = xmm7;
3528 
3529    BLOCK_COMMENT("Entry:");
3530    __ enter(); // required for proper stackwalking of RuntimeStub frame
3531    __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rdx);
3532    __ leave(); // required for proper stackwalking of RuntimeStub frame
3533    __ ret(0);
3534 
3535    return start;
3536 
3537  }
3538 
3539  address generate_libmCos() {
3540    StubCodeMark mark(this, "StubRoutines", "libmCos");
3541 
3542    address start = __ pc();
3543 
3544    const XMMRegister x0 = xmm0;
3545    const XMMRegister x1 = xmm1;
3546    const XMMRegister x2 = xmm2;
3547    const XMMRegister x3 = xmm3;
3548 
3549    const XMMRegister x4 = xmm4;
3550    const XMMRegister x5 = xmm5;
3551    const XMMRegister x6 = xmm6;
3552    const XMMRegister x7 = xmm7;
3553 
3554    const Register tmp = rbx;
3555 
3556    BLOCK_COMMENT("Entry:");
3557    __ enter(); // required for proper stackwalking of RuntimeStub frame
3558    __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3559    __ leave(); // required for proper stackwalking of RuntimeStub frame
3560    __ ret(0);
3561 
3562    return start;
3563 
3564  }
3565 
3566  address generate_libm_tan_cot_huge() {
3567    StubCodeMark mark(this, "StubRoutines", "libm_tan_cot_huge");
3568 
3569    address start = __ pc();
3570 
3571    const XMMRegister x0 = xmm0;
3572    const XMMRegister x1 = xmm1;
3573 
3574    BLOCK_COMMENT("Entry:");
3575    __ libm_tancot_huge(x0, x1, rax, rcx, rdx, rbx, rsi, rdi, rbp, rsp);
3576 
3577    return start;
3578 
3579  }
3580 
3581  address generate_libmTan() {
3582    StubCodeMark mark(this, "StubRoutines", "libmTan");
3583 
3584    address start = __ pc();
3585 
3586    const XMMRegister x0 = xmm0;
3587    const XMMRegister x1 = xmm1;
3588    const XMMRegister x2 = xmm2;
3589    const XMMRegister x3 = xmm3;
3590 
3591    const XMMRegister x4 = xmm4;
3592    const XMMRegister x5 = xmm5;
3593    const XMMRegister x6 = xmm6;
3594    const XMMRegister x7 = xmm7;
3595 
3596    const Register tmp = rbx;
3597 
3598    BLOCK_COMMENT("Entry:");
3599    __ enter(); // required for proper stackwalking of RuntimeStub frame
3600    __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3601    __ leave(); // required for proper stackwalking of RuntimeStub frame
3602    __ ret(0);
3603 
3604    return start;
3605 
3606  }
3607 
3608   // Safefetch stubs.
3609   void generate_safefetch(const char* name, int size, address* entry,
3610                           address* fault_pc, address* continuation_pc) {
3611     // safefetch signatures:
3612     //   int      SafeFetch32(int*      adr, int      errValue);
3613     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3614 
3615     StubCodeMark mark(this, "StubRoutines", name);
3616 
3617     // Entry point, pc or function descriptor.
3618     *entry = __ pc();
3619 
3620     __ movl(rax, Address(rsp, 0x8));
3621     __ movl(rcx, Address(rsp, 0x4));
3622     // Load *adr into eax, may fault.
3623     *fault_pc = __ pc();
3624     switch (size) {
3625       case 4:
3626         // int32_t
3627         __ movl(rax, Address(rcx, 0));
3628         break;
3629       case 8:
3630         // int64_t
3631         Unimplemented();
3632         break;
3633       default:
3634         ShouldNotReachHere();
3635     }
3636 
3637     // Return errValue or *adr.
3638     *continuation_pc = __ pc();
3639     __ ret(0);
3640   }
3641 
3642   address generate_method_entry_barrier() {
3643     __ align(CodeEntryAlignment);
3644     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
3645 
3646     Label deoptimize_label;
3647 
3648     address start = __ pc();
3649 
3650     __ push(-1); // cookie, this is used for writing the new rsp when deoptimizing
3651 
3652     BLOCK_COMMENT("Entry:");
3653     __ enter(); // save rbp
3654 
3655     // save rbx, because we want to use that value.
3656     // We could do without it but then we depend on the number of slots used by pusha
3657     __ push(rbx);
3658 
3659     __ lea(rbx, Address(rsp, wordSize * 3)); // 1 for cookie, 1 for rbp, 1 for rbx - this should be the return address
3660 
3661     __ pusha();
3662 
3663     // xmm0 and xmm1 may be used for passing float/double arguments
3664     const int xmm_size = wordSize * 2;
3665     const int xmm_spill_size = xmm_size * 2;
3666     __ subptr(rsp, xmm_spill_size);
3667     __ movdqu(Address(rsp, xmm_size * 1), xmm1);
3668     __ movdqu(Address(rsp, xmm_size * 0), xmm0);
3669 
3670     __ call_VM_leaf(CAST_FROM_FN_PTR(address, static_cast<int (*)(address*)>(BarrierSetNMethod::nmethod_stub_entry_barrier)), rbx);
3671 
3672     __ movdqu(xmm0, Address(rsp, xmm_size * 0));
3673     __ movdqu(xmm1, Address(rsp, xmm_size * 1));
3674     __ addptr(rsp, xmm_spill_size);
3675 
3676     __ cmpl(rax, 1); // 1 means deoptimize
3677     __ jcc(Assembler::equal, deoptimize_label);
3678 
3679     __ popa();
3680     __ pop(rbx);
3681 
3682     __ leave();
3683 
3684     __ addptr(rsp, 1 * wordSize); // cookie
3685     __ ret(0);
3686 
3687     __ BIND(deoptimize_label);
3688 
3689     __ popa();
3690     __ pop(rbx);
3691 
3692     __ leave();
3693 
3694     // this can be taken out, but is good for verification purposes. getting a SIGSEGV
3695     // here while still having a correct stack is valuable
3696     __ testptr(rsp, Address(rsp, 0));
3697 
3698     __ movptr(rsp, Address(rsp, 0)); // new rsp was written in the barrier
3699     __ jmp(Address(rsp, -1 * wordSize)); // jmp target should be callers verified_entry_point
3700 
3701     return start;
3702   }
3703 
3704  public:
3705   // Information about frame layout at time of blocking runtime call.
3706   // Note that we only have to preserve callee-saved registers since
3707   // the compilers are responsible for supplying a continuation point
3708   // if they expect all registers to be preserved.
3709   enum layout {
3710     thread_off,    // last_java_sp
3711     arg1_off,
3712     arg2_off,
3713     rbp_off,       // callee saved register
3714     ret_pc,
3715     framesize
3716   };
3717 
3718  private:
3719 
3720 #undef  __
3721 #define __ masm->
3722 
3723   //------------------------------------------------------------------------------------------------------------------------
3724   // Continuation point for throwing of implicit exceptions that are not handled in
3725   // the current activation. Fabricates an exception oop and initiates normal
3726   // exception dispatching in this frame.
3727   //
3728   // Previously the compiler (c2) allowed for callee save registers on Java calls.
3729   // This is no longer true after adapter frames were removed but could possibly
3730   // be brought back in the future if the interpreter code was reworked and it
3731   // was deemed worthwhile. The comment below was left to describe what must
3732   // happen here if callee saves were resurrected. As it stands now this stub
3733   // could actually be a vanilla BufferBlob and have now oopMap at all.
3734   // Since it doesn't make much difference we've chosen to leave it the
3735   // way it was in the callee save days and keep the comment.
3736 
3737   // If we need to preserve callee-saved values we need a callee-saved oop map and
3738   // therefore have to make these stubs into RuntimeStubs rather than BufferBlobs.
3739   // If the compiler needs all registers to be preserved between the fault
3740   // point and the exception handler then it must assume responsibility for that in
3741   // AbstractCompiler::continuation_for_implicit_null_exception or
3742   // continuation_for_implicit_division_by_zero_exception. All other implicit
3743   // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
3744   // either at call sites or otherwise assume that stack unwinding will be initiated,
3745   // so caller saved registers were assumed volatile in the compiler.
3746   address generate_throw_exception(const char* name, address runtime_entry,
3747                                    Register arg1 = noreg, Register arg2 = noreg) {
3748 
3749     int insts_size = 256;
3750     int locs_size  = 32;
3751 
3752     CodeBuffer code(name, insts_size, locs_size);
3753     OopMapSet* oop_maps  = new OopMapSet();
3754     MacroAssembler* masm = new MacroAssembler(&code);
3755 
3756     address start = __ pc();
3757 
3758     // This is an inlined and slightly modified version of call_VM
3759     // which has the ability to fetch the return PC out of
3760     // thread-local storage and also sets up last_Java_sp slightly
3761     // differently than the real call_VM
3762     Register java_thread = rbx;
3763     __ get_thread(java_thread);
3764 
3765     __ enter(); // required for proper stackwalking of RuntimeStub frame
3766 
3767     // pc and rbp, already pushed
3768     __ subptr(rsp, (framesize-2) * wordSize); // prolog
3769 
3770     // Frame is now completed as far as size and linkage.
3771 
3772     int frame_complete = __ pc() - start;
3773 
3774     // push java thread (becomes first argument of C function)
3775     __ movptr(Address(rsp, thread_off * wordSize), java_thread);
3776     if (arg1 != noreg) {
3777       __ movptr(Address(rsp, arg1_off * wordSize), arg1);
3778     }
3779     if (arg2 != noreg) {
3780       assert(arg1 != noreg, "missing reg arg");
3781       __ movptr(Address(rsp, arg2_off * wordSize), arg2);
3782     }
3783 
3784     // Set up last_Java_sp and last_Java_fp
3785     __ set_last_Java_frame(java_thread, rsp, rbp, NULL);
3786 
3787     // Call runtime
3788     BLOCK_COMMENT("call runtime_entry");
3789     __ call(RuntimeAddress(runtime_entry));
3790     // Generate oop map
3791     OopMap* map =  new OopMap(framesize, 0);
3792     oop_maps->add_gc_map(__ pc() - start, map);
3793 
3794     // restore the thread (cannot use the pushed argument since arguments
3795     // may be overwritten by C code generated by an optimizing compiler);
3796     // however can use the register value directly if it is callee saved.
3797     __ get_thread(java_thread);
3798 
3799     __ reset_last_Java_frame(java_thread, true);
3800 
3801     __ leave(); // required for proper stackwalking of RuntimeStub frame
3802 
3803     // check for pending exceptions
3804 #ifdef ASSERT
3805     Label L;
3806     __ cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3807     __ jcc(Assembler::notEqual, L);
3808     __ should_not_reach_here();
3809     __ bind(L);
3810 #endif /* ASSERT */
3811     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3812 
3813 
3814     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, framesize, oop_maps, false);
3815     return stub->entry_point();
3816   }
3817 
3818 
3819   void create_control_words() {
3820     // Round to nearest, 53-bit mode, exceptions masked
3821     StubRoutines::_fpu_cntrl_wrd_std   = 0x027F;
3822     // Round to zero, 53-bit mode, exception mased
3823     StubRoutines::_fpu_cntrl_wrd_trunc = 0x0D7F;
3824     // Round to nearest, 24-bit mode, exceptions masked
3825     StubRoutines::_fpu_cntrl_wrd_24    = 0x007F;
3826     // Round to nearest, 64-bit mode, exceptions masked
3827     StubRoutines::_mxcsr_std           = 0x1F80;
3828     // Note: the following two constants are 80-bit values
3829     //       layout is critical for correct loading by FPU.
3830     // Bias for strict fp multiply/divide
3831     StubRoutines::_fpu_subnormal_bias1[0]= 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000
3832     StubRoutines::_fpu_subnormal_bias1[1]= 0x80000000;
3833     StubRoutines::_fpu_subnormal_bias1[2]= 0x03ff;
3834     // Un-Bias for strict fp multiply/divide
3835     StubRoutines::_fpu_subnormal_bias2[0]= 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000
3836     StubRoutines::_fpu_subnormal_bias2[1]= 0x80000000;
3837     StubRoutines::_fpu_subnormal_bias2[2]= 0x7bff;
3838   }
3839 
3840   //---------------------------------------------------------------------------
3841   // Initialization
3842 
3843   void generate_initial() {
3844     // Generates all stubs and initializes the entry points
3845 
3846     //------------------------------------------------------------------------------------------------------------------------
3847     // entry points that exist in all platforms
3848     // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
3849     //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
3850     StubRoutines::_forward_exception_entry      = generate_forward_exception();
3851 
3852     StubRoutines::_call_stub_entry              =
3853       generate_call_stub(StubRoutines::_call_stub_return_address);
3854     // is referenced by megamorphic call
3855     StubRoutines::_catch_exception_entry        = generate_catch_exception();
3856 
3857     // These are currently used by Solaris/Intel
3858     StubRoutines::_atomic_xchg_entry            = generate_atomic_xchg();
3859 
3860     // platform dependent
3861     create_control_words();
3862 
3863     StubRoutines::x86::_verify_mxcsr_entry                 = generate_verify_mxcsr();
3864     StubRoutines::x86::_verify_fpu_cntrl_wrd_entry         = generate_verify_fpu_cntrl_wrd();
3865     StubRoutines::_d2i_wrapper                              = generate_d2i_wrapper(T_INT,
3866                                                                                    CAST_FROM_FN_PTR(address, SharedRuntime::d2i));
3867     StubRoutines::_d2l_wrapper                              = generate_d2i_wrapper(T_LONG,
3868                                                                                    CAST_FROM_FN_PTR(address, SharedRuntime::d2l));
3869 
3870     // Build this early so it's available for the interpreter
3871     StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",
3872                                                                                       CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
3873     StubRoutines::_throw_delayed_StackOverflowError_entry  = generate_throw_exception("delayed StackOverflowError throw_exception",
3874                                                                                       CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError));
3875 
3876     if (UseCRC32Intrinsics) {
3877       // set table address before stub generation which use it
3878       StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
3879       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
3880     }
3881 
3882     if (UseCRC32CIntrinsics) {
3883       bool supports_clmul = VM_Version::supports_clmul();
3884       StubRoutines::x86::generate_CRC32C_table(supports_clmul);
3885       StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
3886       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
3887     }
3888     if (VM_Version::supports_sse2() && UseLibmIntrinsic && InlineIntrinsics) {
3889       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
3890           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
3891           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
3892         StubRoutines::x86::_L_2il0floatpacket_0_adr = (address)StubRoutines::x86::_L_2il0floatpacket_0;
3893         StubRoutines::x86::_Pi4Inv_adr = (address)StubRoutines::x86::_Pi4Inv;
3894         StubRoutines::x86::_Pi4x3_adr = (address)StubRoutines::x86::_Pi4x3;
3895         StubRoutines::x86::_Pi4x4_adr = (address)StubRoutines::x86::_Pi4x4;
3896         StubRoutines::x86::_ones_adr = (address)StubRoutines::x86::_ones;
3897       }
3898       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
3899         StubRoutines::_dexp = generate_libmExp();
3900       }
3901       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
3902         StubRoutines::_dlog = generate_libmLog();
3903       }
3904       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
3905         StubRoutines::_dlog10 = generate_libmLog10();
3906       }
3907       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
3908         StubRoutines::_dpow = generate_libmPow();
3909       }
3910       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
3911         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
3912         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
3913         StubRoutines::_dlibm_reduce_pi04l = generate_libm_reduce_pi04l();
3914       }
3915       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
3916         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
3917         StubRoutines::_dlibm_sin_cos_huge = generate_libm_sin_cos_huge();
3918       }
3919       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
3920         StubRoutines::_dsin = generate_libmSin();
3921       }
3922       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
3923         StubRoutines::_dcos = generate_libmCos();
3924       }
3925       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
3926         StubRoutines::_dlibm_tan_cot_huge = generate_libm_tan_cot_huge();
3927         StubRoutines::_dtan = generate_libmTan();
3928       }
3929     }
3930 
3931     // Safefetch stubs.
3932     generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
3933                                                    &StubRoutines::_safefetch32_fault_pc,
3934                                                    &StubRoutines::_safefetch32_continuation_pc);
3935     StubRoutines::_safefetchN_entry           = StubRoutines::_safefetch32_entry;
3936     StubRoutines::_safefetchN_fault_pc        = StubRoutines::_safefetch32_fault_pc;
3937     StubRoutines::_safefetchN_continuation_pc = StubRoutines::_safefetch32_continuation_pc;
3938   }
3939 
3940   void generate_all() {
3941     // Generates all stubs and initializes the entry points
3942 
3943     // These entry points require SharedInfo::stack0 to be set up in non-core builds
3944     // and need to be relocatable, so they each fabricate a RuntimeStub internally.
3945     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
3946     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
3947     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
3948 
3949     //------------------------------------------------------------------------------------------------------------------------
3950     // entry points that are platform specific
3951 
3952     StubRoutines::x86::_vector_float_sign_mask = generate_vector_mask("vector_float_sign_mask", 0x7FFFFFFF);
3953     StubRoutines::x86::_vector_float_sign_flip = generate_vector_mask("vector_float_sign_flip", 0x80000000);
3954     StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask_long_double("vector_double_sign_mask", 0x7FFFFFFF, 0xFFFFFFFF);
3955     StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask_long_double("vector_double_sign_flip", 0x80000000, 0x00000000);
3956     StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask", 0x00ff00ff);
3957     StubRoutines::x86::_vector_int_to_byte_mask = generate_vector_mask("vector_int_to_byte_mask", 0x000000ff);
3958     StubRoutines::x86::_vector_int_to_short_mask = generate_vector_mask("vector_int_to_short_mask", 0x0000ffff);
3959     StubRoutines::x86::_vector_32_bit_mask = generate_vector_custom_i32("vector_32_bit_mask", Assembler::AVX_512bit,
3960                                                                         0xFFFFFFFF, 0, 0, 0);
3961     StubRoutines::x86::_vector_64_bit_mask = generate_vector_custom_i32("vector_64_bit_mask", Assembler::AVX_512bit,
3962                                                                         0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
3963     StubRoutines::x86::_vector_int_shuffle_mask = generate_vector_mask("vector_int_shuffle_mask", 0x03020100);
3964     StubRoutines::x86::_vector_short_shuffle_mask = generate_vector_mask("vector_short_shuffle_mask", 0x01000100);
3965     StubRoutines::x86::_vector_long_shuffle_mask = generate_vector_mask_long_double("vector_long_shuffle_mask", 0x00000001, 0x0);
3966     StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask");
3967     StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask_long_double("vector_long_sign_mask", 0x80000000, 0x00000000);
3968     StubRoutines::x86::_vector_all_bits_set = generate_vector_mask("vector_all_bits_set", 0xFFFFFFFF);
3969     StubRoutines::x86::_vector_iota_indices = generate_iota_indices("iota_indices");
3970 
3971     // support for verify_oop (must happen after universe_init)
3972     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
3973 
3974     // arraycopy stubs used by compilers
3975     generate_arraycopy_stubs();
3976 
3977     // don't bother generating these AES intrinsic stubs unless global flag is set
3978     if (UseAESIntrinsics) {
3979       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // might be needed by the others
3980 
3981       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
3982       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
3983       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
3984       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
3985     }
3986 
3987     if (UseAESCTRIntrinsics) {
3988       StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
3989       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
3990     }
3991 
3992     if (UseSHA1Intrinsics) {
3993       StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
3994       StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
3995       StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
3996       StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
3997     }
3998     if (UseSHA256Intrinsics) {
3999       StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
4000       StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
4001       StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
4002       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
4003     }
4004 
4005     // Generate GHASH intrinsics code
4006     if (UseGHASHIntrinsics) {
4007       StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
4008       StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
4009       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
4010     }
4011 
4012     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
4013     if (bs_nm != NULL) {
4014       StubRoutines::x86::_method_entry_barrier = generate_method_entry_barrier();
4015     }
4016   }
4017 
4018 
4019  public:
4020   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
4021     if (all) {
4022       generate_all();
4023     } else {
4024       generate_initial();
4025     }
4026   }
4027 }; // end class declaration
4028 
4029 #define UCM_TABLE_MAX_ENTRIES 8
4030 void StubGenerator_generate(CodeBuffer* code, bool all) {
4031   if (UnsafeCopyMemory::_table == NULL) {
4032     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
4033   }
4034   StubGenerator g(code, all);
4035 }