1 /*
   2  * Copyright (c) 2003, 2011, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "assembler_x86.inline.hpp"
  28 #include "interpreter/interpreter.hpp"
  29 #include "nativeInst_x86.hpp"
  30 #include "oops/instanceOop.hpp"
  31 #include "oops/methodOop.hpp"
  32 #include "oops/objArrayKlass.hpp"
  33 #include "oops/oop.inline.hpp"
  34 #include "prims/methodHandles.hpp"
  35 #include "runtime/frame.inline.hpp"
  36 #include "runtime/handles.inline.hpp"
  37 #include "runtime/sharedRuntime.hpp"
  38 #include "runtime/stubCodeGenerator.hpp"
  39 #include "runtime/stubRoutines.hpp"
  40 #include "utilities/top.hpp"
  41 #ifdef TARGET_OS_FAMILY_linux
  42 # include "thread_linux.inline.hpp"
  43 #endif
  44 #ifdef TARGET_OS_FAMILY_solaris
  45 # include "thread_solaris.inline.hpp"
  46 #endif
  47 #ifdef TARGET_OS_FAMILY_windows
  48 # include "thread_windows.inline.hpp"
  49 #endif
  50 #ifdef COMPILER2
  51 #include "opto/runtime.hpp"
  52 #endif
  53 
  54 // Declaration and definition of StubGenerator (no .hpp file).
  55 // For a more detailed description of the stub routine structure
  56 // see the comment in stubRoutines.hpp
  57 
  58 #define __ _masm->
  59 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
  60 #define a__ ((Assembler*)_masm)->
  61 
  62 #ifdef PRODUCT
  63 #define BLOCK_COMMENT(str) /* nothing */
  64 #else
  65 #define BLOCK_COMMENT(str) __ block_comment(str)
  66 #endif
  67 
  68 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  69 const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
  70 
  71 // Stub Code definitions
  72 
  73 static address handle_unsafe_access() {
  74   JavaThread* thread = JavaThread::current();
  75   address pc = thread->saved_exception_pc();
  76   // pc is the instruction which we must emulate
  77   // doing a no-op is fine:  return garbage from the load
  78   // therefore, compute npc
  79   address npc = Assembler::locate_next_instruction(pc);
  80 
  81   // request an async exception
  82   thread->set_pending_unsafe_access_error();
  83 
  84   // return address of next instruction to execute
  85   return npc;
  86 }
  87 
  88 class StubGenerator: public StubCodeGenerator {
  89  private:
  90 
  91 #ifdef PRODUCT
  92 #define inc_counter_np(counter) (0)
  93 #else
  94   void inc_counter_np_(int& counter) {
  95     __ incrementl(ExternalAddress((address)&counter));
  96   }
  97 #define inc_counter_np(counter) \
  98   BLOCK_COMMENT("inc_counter " #counter); \
  99   inc_counter_np_(counter);
 100 #endif
 101 
 102   // Call stubs are used to call Java from C
 103   //
 104   // Linux Arguments:
 105   //    c_rarg0:   call wrapper address                   address
 106   //    c_rarg1:   result                                 address
 107   //    c_rarg2:   result type                            BasicType
 108   //    c_rarg3:   method                                 methodOop
 109   //    c_rarg4:   (interpreter) entry point              address
 110   //    c_rarg5:   parameters                             intptr_t*
 111   //    16(rbp): parameter size (in words)              int
 112   //    24(rbp): thread                                 Thread*
 113   //
 114   //     [ return_from_Java     ] <--- rsp
 115   //     [ argument word n      ]
 116   //      ...
 117   // -12 [ argument word 1      ]
 118   // -11 [ saved r15            ] <--- rsp_after_call
 119   // -10 [ saved r14            ]
 120   //  -9 [ saved r13            ]
 121   //  -8 [ saved r12            ]
 122   //  -7 [ saved rbx            ]
 123   //  -6 [ call wrapper         ]
 124   //  -5 [ result               ]
 125   //  -4 [ result type          ]
 126   //  -3 [ method               ]
 127   //  -2 [ entry point          ]
 128   //  -1 [ parameters           ]
 129   //   0 [ saved rbp            ] <--- rbp
 130   //   1 [ return address       ]
 131   //   2 [ parameter size       ]
 132   //   3 [ thread               ]
 133   //
 134   // Windows Arguments:
 135   //    c_rarg0:   call wrapper address                   address
 136   //    c_rarg1:   result                                 address
 137   //    c_rarg2:   result type                            BasicType
 138   //    c_rarg3:   method                                 methodOop
 139   //    48(rbp): (interpreter) entry point              address
 140   //    56(rbp): parameters                             intptr_t*
 141   //    64(rbp): parameter size (in words)              int
 142   //    72(rbp): thread                                 Thread*
 143   //
 144   //     [ return_from_Java     ] <--- rsp
 145   //     [ argument word n      ]
 146   //      ...
 147   // -28 [ argument word 1      ]
 148   // -27 [ saved xmm15          ] <--- rsp_after_call 
 149   //     [ saved xmm7-xmm14     ] 
 150   //  -9 [ saved xmm6           ] (each xmm register takes 2 slots)
 151   //  -7 [ saved r15            ] 
 152   //  -6 [ saved r14            ]
 153   //  -5 [ saved r13            ]
 154   //  -4 [ saved r12            ]
 155   //  -3 [ saved rdi            ]
 156   //  -2 [ saved rsi            ]
 157   //  -1 [ saved rbx            ]
 158   //   0 [ saved rbp            ] <--- rbp
 159   //   1 [ return address       ]
 160   //   2 [ call wrapper         ]
 161   //   3 [ result               ]
 162   //   4 [ result type          ]
 163   //   5 [ method               ]
 164   //   6 [ entry point          ]
 165   //   7 [ parameters           ]
 166   //   8 [ parameter size       ]
 167   //   9 [ thread               ]
 168   //
 169   //    Windows reserves the callers stack space for arguments 1-4.
 170   //    We spill c_rarg0-c_rarg3 to this space.
 171 
 172   // Call stub stack layout word offsets from rbp
 173   enum call_stub_layout {
 174 #ifdef _WIN64
 175     xmm_save_first     = 6,  // save from xmm6
 176     xmm_save_last      = 15, // to xmm15
 177     xmm_save_base      = -9,
 178     rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -27
 179     r15_off            = -7,
 180     r14_off            = -6,
 181     r13_off            = -5,
 182     r12_off            = -4,
 183     rdi_off            = -3,
 184     rsi_off            = -2,
 185     rbx_off            = -1,
 186     rbp_off            =  0,
 187     retaddr_off        =  1,
 188     call_wrapper_off   =  2,
 189     result_off         =  3,
 190     result_type_off    =  4,
 191     method_off         =  5,
 192     entry_point_off    =  6,
 193     parameters_off     =  7,
 194     parameter_size_off =  8,
 195     thread_off         =  9
 196 #else
 197     rsp_after_call_off = -12,
 198     mxcsr_off          = rsp_after_call_off,
 199     r15_off            = -11,
 200     r14_off            = -10,
 201     r13_off            = -9,
 202     r12_off            = -8,
 203     rbx_off            = -7,
 204     call_wrapper_off   = -6,
 205     result_off         = -5,
 206     result_type_off    = -4,
 207     method_off         = -3,
 208     entry_point_off    = -2,
 209     parameters_off     = -1,
 210     rbp_off            =  0,
 211     retaddr_off        =  1,
 212     parameter_size_off =  2,
 213     thread_off         =  3
 214 #endif
 215   };
 216 
 217 #ifdef _WIN64
 218   Address xmm_save(int reg) {
 219     assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range");
 220     return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize);
 221   }
 222 #endif
 223 
 224   address generate_call_stub(address& return_address) {
 225     assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 &&
 226            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 227            "adjust this code");
 228     StubCodeMark mark(this, "StubRoutines", "call_stub");
 229     address start = __ pc();
 230 
 231     // same as in generate_catch_exception()!
 232     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 233 
 234     const Address call_wrapper  (rbp, call_wrapper_off   * wordSize);
 235     const Address result        (rbp, result_off         * wordSize);
 236     const Address result_type   (rbp, result_type_off    * wordSize);
 237     const Address method        (rbp, method_off         * wordSize);
 238     const Address entry_point   (rbp, entry_point_off    * wordSize);
 239     const Address parameters    (rbp, parameters_off     * wordSize);
 240     const Address parameter_size(rbp, parameter_size_off * wordSize);
 241 
 242     // same as in generate_catch_exception()!
 243     const Address thread        (rbp, thread_off         * wordSize);
 244 
 245     const Address r15_save(rbp, r15_off * wordSize);
 246     const Address r14_save(rbp, r14_off * wordSize);
 247     const Address r13_save(rbp, r13_off * wordSize);
 248     const Address r12_save(rbp, r12_off * wordSize);
 249     const Address rbx_save(rbp, rbx_off * wordSize);
 250 
 251     // stub code
 252     __ enter();
 253     __ subptr(rsp, -rsp_after_call_off * wordSize);
 254 
 255     // save register parameters
 256 #ifndef _WIN64
 257     __ movptr(parameters,   c_rarg5); // parameters
 258     __ movptr(entry_point,  c_rarg4); // entry_point
 259 #endif
 260 
 261     __ movptr(method,       c_rarg3); // method
 262     __ movl(result_type,  c_rarg2);   // result type
 263     __ movptr(result,       c_rarg1); // result
 264     __ movptr(call_wrapper, c_rarg0); // call wrapper
 265 
 266     // save regs belonging to calling function
 267     __ movptr(rbx_save, rbx);
 268     __ movptr(r12_save, r12);
 269     __ movptr(r13_save, r13);
 270     __ movptr(r14_save, r14);
 271     __ movptr(r15_save, r15);
 272 #ifdef _WIN64
 273     for (int i = 6; i <= 15; i++) {
 274       __ movdqu(xmm_save(i), as_XMMRegister(i));
 275     }
 276 
 277     const Address rdi_save(rbp, rdi_off * wordSize);
 278     const Address rsi_save(rbp, rsi_off * wordSize);
 279 
 280     __ movptr(rsi_save, rsi);
 281     __ movptr(rdi_save, rdi);
 282 #else
 283     const Address mxcsr_save(rbp, mxcsr_off * wordSize);
 284     {
 285       Label skip_ldmx;
 286       __ stmxcsr(mxcsr_save);
 287       __ movl(rax, mxcsr_save);
 288       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 289       ExternalAddress mxcsr_std(StubRoutines::x86::mxcsr_std());
 290       __ cmp32(rax, mxcsr_std);
 291       __ jcc(Assembler::equal, skip_ldmx);
 292       __ ldmxcsr(mxcsr_std);
 293       __ bind(skip_ldmx);
 294     }
 295 #endif
 296 
 297     // Load up thread register
 298     __ movptr(r15_thread, thread);
 299     __ reinit_heapbase();
 300 
 301 #ifdef ASSERT
 302     // make sure we have no pending exceptions
 303     {
 304       Label L;
 305       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 306       __ jcc(Assembler::equal, L);
 307       __ stop("StubRoutines::call_stub: entered with pending exception");
 308       __ bind(L);
 309     }
 310 #endif
 311 
 312     // pass parameters if any
 313     BLOCK_COMMENT("pass parameters if any");
 314     Label parameters_done;
 315     __ movl(c_rarg3, parameter_size);
 316     __ testl(c_rarg3, c_rarg3);
 317     __ jcc(Assembler::zero, parameters_done);
 318 
 319     Label loop;
 320     __ movptr(c_rarg2, parameters);       // parameter pointer
 321     __ movl(c_rarg1, c_rarg3);            // parameter counter is in c_rarg1
 322     __ BIND(loop);
 323     __ movptr(rax, Address(c_rarg2, 0));// get parameter
 324     __ addptr(c_rarg2, wordSize);       // advance to next parameter
 325     __ decrementl(c_rarg1);             // decrement counter
 326     __ push(rax);                       // pass parameter
 327     __ jcc(Assembler::notZero, loop);
 328 
 329     // call Java function
 330     __ BIND(parameters_done);
 331     __ movptr(rbx, method);             // get methodOop
 332     __ movptr(c_rarg1, entry_point);    // get entry_point
 333     __ mov(r13, rsp);                   // set sender sp
 334     BLOCK_COMMENT("call Java function");
 335     __ call(c_rarg1);
 336 
 337     BLOCK_COMMENT("call_stub_return_address:");
 338     return_address = __ pc();
 339 
 340     // store result depending on type (everything that is not
 341     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 342     __ movptr(c_rarg0, result);
 343     Label is_long, is_float, is_double, exit;
 344     __ movl(c_rarg1, result_type);
 345     __ cmpl(c_rarg1, T_OBJECT);
 346     __ jcc(Assembler::equal, is_long);
 347     __ cmpl(c_rarg1, T_LONG);
 348     __ jcc(Assembler::equal, is_long);
 349     __ cmpl(c_rarg1, T_FLOAT);
 350     __ jcc(Assembler::equal, is_float);
 351     __ cmpl(c_rarg1, T_DOUBLE);
 352     __ jcc(Assembler::equal, is_double);
 353 
 354     // handle T_INT case
 355     __ movl(Address(c_rarg0, 0), rax);
 356 
 357     __ BIND(exit);
 358 
 359     // pop parameters
 360     __ lea(rsp, rsp_after_call);
 361 
 362 #ifdef ASSERT
 363     // verify that threads correspond
 364     {
 365       Label L, S;
 366       __ cmpptr(r15_thread, thread);
 367       __ jcc(Assembler::notEqual, S);
 368       __ get_thread(rbx);
 369       __ cmpptr(r15_thread, rbx);
 370       __ jcc(Assembler::equal, L);
 371       __ bind(S);
 372       __ jcc(Assembler::equal, L);
 373       __ stop("StubRoutines::call_stub: threads must correspond");
 374       __ bind(L);
 375     }
 376 #endif
 377 
 378     // restore regs belonging to calling function
 379 #ifdef _WIN64
 380     for (int i = 15; i >= 6; i--) {
 381       __ movdqu(as_XMMRegister(i), xmm_save(i));
 382     }
 383 #endif
 384     __ movptr(r15, r15_save);
 385     __ movptr(r14, r14_save);
 386     __ movptr(r13, r13_save);
 387     __ movptr(r12, r12_save);
 388     __ movptr(rbx, rbx_save);
 389 
 390 #ifdef _WIN64
 391     __ movptr(rdi, rdi_save);
 392     __ movptr(rsi, rsi_save);
 393 #else
 394     __ ldmxcsr(mxcsr_save);
 395 #endif
 396 
 397     // restore rsp
 398     __ addptr(rsp, -rsp_after_call_off * wordSize);
 399 
 400     // return
 401     __ pop(rbp);
 402     __ ret(0);
 403 
 404     // handle return types different from T_INT
 405     __ BIND(is_long);
 406     __ movq(Address(c_rarg0, 0), rax);
 407     __ jmp(exit);
 408 
 409     __ BIND(is_float);
 410     __ movflt(Address(c_rarg0, 0), xmm0);
 411     __ jmp(exit);
 412 
 413     __ BIND(is_double);
 414     __ movdbl(Address(c_rarg0, 0), xmm0);
 415     __ jmp(exit);
 416 
 417     return start;
 418   }
 419 
 420   // Return point for a Java call if there's an exception thrown in
 421   // Java code.  The exception is caught and transformed into a
 422   // pending exception stored in JavaThread that can be tested from
 423   // within the VM.
 424   //
 425   // Note: Usually the parameters are removed by the callee. In case
 426   // of an exception crossing an activation frame boundary, that is
 427   // not the case if the callee is compiled code => need to setup the
 428   // rsp.
 429   //
 430   // rax: exception oop
 431 
 432   address generate_catch_exception() {
 433     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 434     address start = __ pc();
 435 
 436     // same as in generate_call_stub():
 437     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 438     const Address thread        (rbp, thread_off         * wordSize);
 439 
 440 #ifdef ASSERT
 441     // verify that threads correspond
 442     {
 443       Label L, S;
 444       __ cmpptr(r15_thread, thread);
 445       __ jcc(Assembler::notEqual, S);
 446       __ get_thread(rbx);
 447       __ cmpptr(r15_thread, rbx);
 448       __ jcc(Assembler::equal, L);
 449       __ bind(S);
 450       __ stop("StubRoutines::catch_exception: threads must correspond");
 451       __ bind(L);
 452     }
 453 #endif
 454 
 455     // set pending exception
 456     __ verify_oop(rax);
 457 
 458     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax);
 459     __ lea(rscratch1, ExternalAddress((address)__FILE__));
 460     __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
 461     __ movl(Address(r15_thread, Thread::exception_line_offset()), (int)  __LINE__);
 462 
 463     // complete return to VM
 464     assert(StubRoutines::_call_stub_return_address != NULL,
 465            "_call_stub_return_address must have been generated before");
 466     __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
 467 
 468     return start;
 469   }
 470 
 471   // Continuation point for runtime calls returning with a pending
 472   // exception.  The pending exception check happened in the runtime
 473   // or native call stub.  The pending exception in Thread is
 474   // converted into a Java-level exception.
 475   //
 476   // Contract with Java-level exception handlers:
 477   // rax: exception
 478   // rdx: throwing pc
 479   //
 480   // NOTE: At entry of this stub, exception-pc must be on stack !!
 481 
 482   address generate_forward_exception() {
 483     StubCodeMark mark(this, "StubRoutines", "forward exception");
 484     address start = __ pc();
 485 
 486     // Upon entry, the sp points to the return address returning into
 487     // Java (interpreted or compiled) code; i.e., the return address
 488     // becomes the throwing pc.
 489     //
 490     // Arguments pushed before the runtime call are still on the stack
 491     // but the exception handler will reset the stack pointer ->
 492     // ignore them.  A potential result in registers can be ignored as
 493     // well.
 494 
 495 #ifdef ASSERT
 496     // make sure this code is only executed if there is a pending exception
 497     {
 498       Label L;
 499       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t) NULL);
 500       __ jcc(Assembler::notEqual, L);
 501       __ stop("StubRoutines::forward exception: no pending exception (1)");
 502       __ bind(L);
 503     }
 504 #endif
 505 
 506     // compute exception handler into rbx
 507     __ movptr(c_rarg0, Address(rsp, 0));
 508     BLOCK_COMMENT("call exception_handler_for_return_address");
 509     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 510                          SharedRuntime::exception_handler_for_return_address),
 511                     r15_thread, c_rarg0);
 512     __ mov(rbx, rax);
 513 
 514     // setup rax & rdx, remove return address & clear pending exception
 515     __ pop(rdx);
 516     __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 517     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 518 
 519 #ifdef ASSERT
 520     // make sure exception is set
 521     {
 522       Label L;
 523       __ testptr(rax, rax);
 524       __ jcc(Assembler::notEqual, L);
 525       __ stop("StubRoutines::forward exception: no pending exception (2)");
 526       __ bind(L);
 527     }
 528 #endif
 529 
 530     // continue at exception handler (return address removed)
 531     // rax: exception
 532     // rbx: exception handler
 533     // rdx: throwing pc
 534     __ verify_oop(rax);
 535     __ jmp(rbx);
 536 
 537     return start;
 538   }
 539 
 540   // Support for jint atomic::xchg(jint exchange_value, volatile jint* dest)
 541   //
 542   // Arguments :
 543   //    c_rarg0: exchange_value
 544   //    c_rarg0: dest
 545   //
 546   // Result:
 547   //    *dest <- ex, return (orig *dest)
 548   address generate_atomic_xchg() {
 549     StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
 550     address start = __ pc();
 551 
 552     __ movl(rax, c_rarg0); // Copy to eax we need a return value anyhow
 553     __ xchgl(rax, Address(c_rarg1, 0)); // automatic LOCK
 554     __ ret(0);
 555 
 556     return start;
 557   }
 558 
 559   // Support for intptr_t atomic::xchg_ptr(intptr_t exchange_value, volatile intptr_t* dest)
 560   //
 561   // Arguments :
 562   //    c_rarg0: exchange_value
 563   //    c_rarg1: dest
 564   //
 565   // Result:
 566   //    *dest <- ex, return (orig *dest)
 567   address generate_atomic_xchg_ptr() {
 568     StubCodeMark mark(this, "StubRoutines", "atomic_xchg_ptr");
 569     address start = __ pc();
 570 
 571     __ movptr(rax, c_rarg0); // Copy to eax we need a return value anyhow
 572     __ xchgptr(rax, Address(c_rarg1, 0)); // automatic LOCK
 573     __ ret(0);
 574 
 575     return start;
 576   }
 577 
 578   // Support for jint atomic::atomic_cmpxchg(jint exchange_value, volatile jint* dest,
 579   //                                         jint compare_value)
 580   //
 581   // Arguments :
 582   //    c_rarg0: exchange_value
 583   //    c_rarg1: dest
 584   //    c_rarg2: compare_value
 585   //
 586   // Result:
 587   //    if ( compare_value == *dest ) {
 588   //       *dest = exchange_value
 589   //       return compare_value;
 590   //    else
 591   //       return *dest;
 592   address generate_atomic_cmpxchg() {
 593     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
 594     address start = __ pc();
 595 
 596     __ movl(rax, c_rarg2);
 597    if ( os::is_MP() ) __ lock();
 598     __ cmpxchgl(c_rarg0, Address(c_rarg1, 0));
 599     __ ret(0);
 600 
 601     return start;
 602   }
 603 
 604   // Support for jint atomic::atomic_cmpxchg_long(jlong exchange_value,
 605   //                                             volatile jlong* dest,
 606   //                                             jlong compare_value)
 607   // Arguments :
 608   //    c_rarg0: exchange_value
 609   //    c_rarg1: dest
 610   //    c_rarg2: compare_value
 611   //
 612   // Result:
 613   //    if ( compare_value == *dest ) {
 614   //       *dest = exchange_value
 615   //       return compare_value;
 616   //    else
 617   //       return *dest;
 618   address generate_atomic_cmpxchg_long() {
 619     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
 620     address start = __ pc();
 621 
 622     __ movq(rax, c_rarg2);
 623    if ( os::is_MP() ) __ lock();
 624     __ cmpxchgq(c_rarg0, Address(c_rarg1, 0));
 625     __ ret(0);
 626 
 627     return start;
 628   }
 629 
 630   // Support for jint atomic::add(jint add_value, volatile jint* dest)
 631   //
 632   // Arguments :
 633   //    c_rarg0: add_value
 634   //    c_rarg1: dest
 635   //
 636   // Result:
 637   //    *dest += add_value
 638   //    return *dest;
 639   address generate_atomic_add() {
 640     StubCodeMark mark(this, "StubRoutines", "atomic_add");
 641     address start = __ pc();
 642 
 643     __ movl(rax, c_rarg0);
 644    if ( os::is_MP() ) __ lock();
 645     __ xaddl(Address(c_rarg1, 0), c_rarg0);
 646     __ addl(rax, c_rarg0);
 647     __ ret(0);
 648 
 649     return start;
 650   }
 651 
 652   // Support for intptr_t atomic::add_ptr(intptr_t add_value, volatile intptr_t* dest)
 653   //
 654   // Arguments :
 655   //    c_rarg0: add_value
 656   //    c_rarg1: dest
 657   //
 658   // Result:
 659   //    *dest += add_value
 660   //    return *dest;
 661   address generate_atomic_add_ptr() {
 662     StubCodeMark mark(this, "StubRoutines", "atomic_add_ptr");
 663     address start = __ pc();
 664 
 665     __ movptr(rax, c_rarg0); // Copy to eax we need a return value anyhow
 666    if ( os::is_MP() ) __ lock();
 667     __ xaddptr(Address(c_rarg1, 0), c_rarg0);
 668     __ addptr(rax, c_rarg0);
 669     __ ret(0);
 670 
 671     return start;
 672   }
 673 
 674   // Support for intptr_t OrderAccess::fence()
 675   //
 676   // Arguments :
 677   //
 678   // Result:
 679   address generate_orderaccess_fence() {
 680     StubCodeMark mark(this, "StubRoutines", "orderaccess_fence");
 681     address start = __ pc();
 682     __ membar(Assembler::StoreLoad);
 683     __ ret(0);
 684 
 685     return start;
 686   }
 687 
 688   // Support for intptr_t get_previous_fp()
 689   //
 690   // This routine is used to find the previous frame pointer for the
 691   // caller (current_frame_guess). This is used as part of debugging
 692   // ps() is seemingly lost trying to find frames.
 693   // This code assumes that caller current_frame_guess) has a frame.
 694   address generate_get_previous_fp() {
 695     StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
 696     const Address old_fp(rbp, 0);
 697     const Address older_fp(rax, 0);
 698     address start = __ pc();
 699 
 700     __ enter();
 701     __ movptr(rax, old_fp); // callers fp
 702     __ movptr(rax, older_fp); // the frame for ps()
 703     __ pop(rbp);
 704     __ ret(0);
 705 
 706     return start;
 707   }
 708 
 709   //----------------------------------------------------------------------------------------------------
 710   // Support for void verify_mxcsr()
 711   //
 712   // This routine is used with -Xcheck:jni to verify that native
 713   // JNI code does not return to Java code without restoring the
 714   // MXCSR register to our expected state.
 715 
 716   address generate_verify_mxcsr() {
 717     StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
 718     address start = __ pc();
 719 
 720     const Address mxcsr_save(rsp, 0);
 721 
 722     if (CheckJNICalls) {
 723       Label ok_ret;
 724       __ push(rax);
 725       __ subptr(rsp, wordSize);      // allocate a temp location
 726       __ stmxcsr(mxcsr_save);
 727       __ movl(rax, mxcsr_save);
 728       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 729       __ cmpl(rax, *(int *)(StubRoutines::x86::mxcsr_std()));
 730       __ jcc(Assembler::equal, ok_ret);
 731 
 732       __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall");
 733 
 734       __ ldmxcsr(ExternalAddress(StubRoutines::x86::mxcsr_std()));
 735 
 736       __ bind(ok_ret);
 737       __ addptr(rsp, wordSize);
 738       __ pop(rax);
 739     }
 740 
 741     __ ret(0);
 742 
 743     return start;
 744   }
 745 
 746   address generate_f2i_fixup() {
 747     StubCodeMark mark(this, "StubRoutines", "f2i_fixup");
 748     Address inout(rsp, 5 * wordSize); // return address + 4 saves
 749 
 750     address start = __ pc();
 751 
 752     Label L;
 753 
 754     __ push(rax);
 755     __ push(c_rarg3);
 756     __ push(c_rarg2);
 757     __ push(c_rarg1);
 758 
 759     __ movl(rax, 0x7f800000);
 760     __ xorl(c_rarg3, c_rarg3);
 761     __ movl(c_rarg2, inout);
 762     __ movl(c_rarg1, c_rarg2);
 763     __ andl(c_rarg1, 0x7fffffff);
 764     __ cmpl(rax, c_rarg1); // NaN? -> 0
 765     __ jcc(Assembler::negative, L);
 766     __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint
 767     __ movl(c_rarg3, 0x80000000);
 768     __ movl(rax, 0x7fffffff);
 769     __ cmovl(Assembler::positive, c_rarg3, rax);
 770 
 771     __ bind(L);
 772     __ movptr(inout, c_rarg3);
 773 
 774     __ pop(c_rarg1);
 775     __ pop(c_rarg2);
 776     __ pop(c_rarg3);
 777     __ pop(rax);
 778 
 779     __ ret(0);
 780 
 781     return start;
 782   }
 783 
 784   address generate_f2l_fixup() {
 785     StubCodeMark mark(this, "StubRoutines", "f2l_fixup");
 786     Address inout(rsp, 5 * wordSize); // return address + 4 saves
 787     address start = __ pc();
 788 
 789     Label L;
 790 
 791     __ push(rax);
 792     __ push(c_rarg3);
 793     __ push(c_rarg2);
 794     __ push(c_rarg1);
 795 
 796     __ movl(rax, 0x7f800000);
 797     __ xorl(c_rarg3, c_rarg3);
 798     __ movl(c_rarg2, inout);
 799     __ movl(c_rarg1, c_rarg2);
 800     __ andl(c_rarg1, 0x7fffffff);
 801     __ cmpl(rax, c_rarg1); // NaN? -> 0
 802     __ jcc(Assembler::negative, L);
 803     __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong
 804     __ mov64(c_rarg3, 0x8000000000000000);
 805     __ mov64(rax, 0x7fffffffffffffff);
 806     __ cmov(Assembler::positive, c_rarg3, rax);
 807 
 808     __ bind(L);
 809     __ movptr(inout, c_rarg3);
 810 
 811     __ pop(c_rarg1);
 812     __ pop(c_rarg2);
 813     __ pop(c_rarg3);
 814     __ pop(rax);
 815 
 816     __ ret(0);
 817 
 818     return start;
 819   }
 820 
 821   address generate_d2i_fixup() {
 822     StubCodeMark mark(this, "StubRoutines", "d2i_fixup");
 823     Address inout(rsp, 6 * wordSize); // return address + 5 saves
 824 
 825     address start = __ pc();
 826 
 827     Label L;
 828 
 829     __ push(rax);
 830     __ push(c_rarg3);
 831     __ push(c_rarg2);
 832     __ push(c_rarg1);
 833     __ push(c_rarg0);
 834 
 835     __ movl(rax, 0x7ff00000);
 836     __ movq(c_rarg2, inout);
 837     __ movl(c_rarg3, c_rarg2);
 838     __ mov(c_rarg1, c_rarg2);
 839     __ mov(c_rarg0, c_rarg2);
 840     __ negl(c_rarg3);
 841     __ shrptr(c_rarg1, 0x20);
 842     __ orl(c_rarg3, c_rarg2);
 843     __ andl(c_rarg1, 0x7fffffff);
 844     __ xorl(c_rarg2, c_rarg2);
 845     __ shrl(c_rarg3, 0x1f);
 846     __ orl(c_rarg1, c_rarg3);
 847     __ cmpl(rax, c_rarg1);
 848     __ jcc(Assembler::negative, L); // NaN -> 0
 849     __ testptr(c_rarg0, c_rarg0); // signed ? min_jint : max_jint
 850     __ movl(c_rarg2, 0x80000000);
 851     __ movl(rax, 0x7fffffff);
 852     __ cmov(Assembler::positive, c_rarg2, rax);
 853 
 854     __ bind(L);
 855     __ movptr(inout, c_rarg2);
 856 
 857     __ pop(c_rarg0);
 858     __ pop(c_rarg1);
 859     __ pop(c_rarg2);
 860     __ pop(c_rarg3);
 861     __ pop(rax);
 862 
 863     __ ret(0);
 864 
 865     return start;
 866   }
 867 
 868   address generate_d2l_fixup() {
 869     StubCodeMark mark(this, "StubRoutines", "d2l_fixup");
 870     Address inout(rsp, 6 * wordSize); // return address + 5 saves
 871 
 872     address start = __ pc();
 873 
 874     Label L;
 875 
 876     __ push(rax);
 877     __ push(c_rarg3);
 878     __ push(c_rarg2);
 879     __ push(c_rarg1);
 880     __ push(c_rarg0);
 881 
 882     __ movl(rax, 0x7ff00000);
 883     __ movq(c_rarg2, inout);
 884     __ movl(c_rarg3, c_rarg2);
 885     __ mov(c_rarg1, c_rarg2);
 886     __ mov(c_rarg0, c_rarg2);
 887     __ negl(c_rarg3);
 888     __ shrptr(c_rarg1, 0x20);
 889     __ orl(c_rarg3, c_rarg2);
 890     __ andl(c_rarg1, 0x7fffffff);
 891     __ xorl(c_rarg2, c_rarg2);
 892     __ shrl(c_rarg3, 0x1f);
 893     __ orl(c_rarg1, c_rarg3);
 894     __ cmpl(rax, c_rarg1);
 895     __ jcc(Assembler::negative, L); // NaN -> 0
 896     __ testq(c_rarg0, c_rarg0); // signed ? min_jlong : max_jlong
 897     __ mov64(c_rarg2, 0x8000000000000000);
 898     __ mov64(rax, 0x7fffffffffffffff);
 899     __ cmovq(Assembler::positive, c_rarg2, rax);
 900 
 901     __ bind(L);
 902     __ movq(inout, c_rarg2);
 903 
 904     __ pop(c_rarg0);
 905     __ pop(c_rarg1);
 906     __ pop(c_rarg2);
 907     __ pop(c_rarg3);
 908     __ pop(rax);
 909 
 910     __ ret(0);
 911 
 912     return start;
 913   }
 914 
 915   address generate_fp_mask(const char *stub_name, int64_t mask) {
 916     __ align(CodeEntryAlignment);
 917     StubCodeMark mark(this, "StubRoutines", stub_name);
 918     address start = __ pc();
 919 
 920     __ emit_data64( mask, relocInfo::none );
 921     __ emit_data64( mask, relocInfo::none );
 922 
 923     return start;
 924   }
 925 
 926   // The following routine generates a subroutine to throw an
 927   // asynchronous UnknownError when an unsafe access gets a fault that
 928   // could not be reasonably prevented by the programmer.  (Example:
 929   // SIGBUS/OBJERR.)
 930   address generate_handler_for_unsafe_access() {
 931     StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
 932     address start = __ pc();
 933 
 934     __ push(0);                       // hole for return address-to-be
 935     __ pusha();                       // push registers
 936     Address next_pc(rsp, RegisterImpl::number_of_registers * BytesPerWord);
 937 
 938     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 939     BLOCK_COMMENT("call handle_unsafe_access");
 940     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, handle_unsafe_access)));
 941     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 942 
 943     __ movptr(next_pc, rax);          // stuff next address
 944     __ popa();
 945     __ ret(0);                        // jump to next address
 946 
 947     return start;
 948   }
 949 
 950   // Non-destructive plausibility checks for oops
 951   //
 952   // Arguments:
 953   //    all args on stack!
 954   //
 955   // Stack after saving c_rarg3:
 956   //    [tos + 0]: saved c_rarg3
 957   //    [tos + 1]: saved c_rarg2
 958   //    [tos + 2]: saved r12 (several TemplateTable methods use it)
 959   //    [tos + 3]: saved flags
 960   //    [tos + 4]: return address
 961   //  * [tos + 5]: error message (char*)
 962   //  * [tos + 6]: object to verify (oop)
 963   //  * [tos + 7]: saved rax - saved by caller and bashed
 964   //  * [tos + 8]: saved r10 (rscratch1) - saved by caller
 965   //  * = popped on exit
 966   address generate_verify_oop() {
 967     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 968     address start = __ pc();
 969 
 970     Label exit, error;
 971 
 972     __ pushf();
 973     __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 974 
 975     __ push(r12);
 976 
 977     // save c_rarg2 and c_rarg3
 978     __ push(c_rarg2);
 979     __ push(c_rarg3);
 980 
 981     enum {
 982            // After previous pushes.
 983            oop_to_verify = 6 * wordSize,
 984            saved_rax     = 7 * wordSize,
 985            saved_r10     = 8 * wordSize,
 986 
 987            // Before the call to MacroAssembler::debug(), see below.
 988            return_addr   = 16 * wordSize,
 989            error_msg     = 17 * wordSize
 990     };
 991 
 992     // get object
 993     __ movptr(rax, Address(rsp, oop_to_verify));
 994 
 995     // make sure object is 'reasonable'
 996     __ testptr(rax, rax);
 997     __ jcc(Assembler::zero, exit); // if obj is NULL it is OK
 998     // Check if the oop is in the right area of memory
 999     __ movptr(c_rarg2, rax);
1000     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
1001     __ andptr(c_rarg2, c_rarg3);
1002     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
1003     __ cmpptr(c_rarg2, c_rarg3);
1004     __ jcc(Assembler::notZero, error);
1005 
1006     // set r12 to heapbase for load_klass()
1007     __ reinit_heapbase();
1008 
1009     // make sure klass is 'reasonable'
1010     __ load_klass(rax, rax);  // get klass
1011     __ testptr(rax, rax);
1012     __ jcc(Assembler::zero, error); // if klass is NULL it is broken
1013     // Check if the klass is in the right area of memory
1014     __ mov(c_rarg2, rax);
1015     __ movptr(c_rarg3, (intptr_t) Universe::verify_klass_mask());
1016     __ andptr(c_rarg2, c_rarg3);
1017     __ movptr(c_rarg3, (intptr_t) Universe::verify_klass_bits());
1018     __ cmpptr(c_rarg2, c_rarg3);
1019     __ jcc(Assembler::notZero, error);
1020 
1021     // make sure klass' klass is 'reasonable'
1022     __ load_klass(rax, rax);
1023     __ testptr(rax, rax);
1024     __ jcc(Assembler::zero, error); // if klass' klass is NULL it is broken
1025     // Check if the klass' klass is in the right area of memory
1026     __ movptr(c_rarg3, (intptr_t) Universe::verify_klass_mask());
1027     __ andptr(rax, c_rarg3);
1028     __ movptr(c_rarg3, (intptr_t) Universe::verify_klass_bits());
1029     __ cmpptr(rax, c_rarg3);
1030     __ jcc(Assembler::notZero, error);
1031 
1032     // return if everything seems ok
1033     __ bind(exit);
1034     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1035     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1036     __ pop(c_rarg3);                             // restore c_rarg3
1037     __ pop(c_rarg2);                             // restore c_rarg2
1038     __ pop(r12);                                 // restore r12
1039     __ popf();                                   // restore flags
1040     __ ret(4 * wordSize);                        // pop caller saved stuff
1041 
1042     // handle errors
1043     __ bind(error);
1044     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1045     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1046     __ pop(c_rarg3);                             // get saved c_rarg3 back
1047     __ pop(c_rarg2);                             // get saved c_rarg2 back
1048     __ pop(r12);                                 // get saved r12 back
1049     __ popf();                                   // get saved flags off stack --
1050                                                  // will be ignored
1051 
1052     __ pusha();                                  // push registers
1053                                                  // (rip is already
1054                                                  // already pushed)
1055     // debug(char* msg, int64_t pc, int64_t regs[])
1056     // We've popped the registers we'd saved (c_rarg3, c_rarg2 and flags), and
1057     // pushed all the registers, so now the stack looks like:
1058     //     [tos +  0] 16 saved registers
1059     //     [tos + 16] return address
1060     //   * [tos + 17] error message (char*)
1061     //   * [tos + 18] object to verify (oop)
1062     //   * [tos + 19] saved rax - saved by caller and bashed
1063     //   * [tos + 20] saved r10 (rscratch1) - saved by caller
1064     //   * = popped on exit
1065 
1066     __ movptr(c_rarg0, Address(rsp, error_msg));    // pass address of error message
1067     __ movptr(c_rarg1, Address(rsp, return_addr));  // pass return address
1068     __ movq(c_rarg2, rsp);                          // pass address of regs on stack
1069     __ mov(r12, rsp);                               // remember rsp
1070     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
1071     __ andptr(rsp, -16);                            // align stack as required by ABI
1072     BLOCK_COMMENT("call MacroAssembler::debug");
1073     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
1074     __ mov(rsp, r12);                               // restore rsp
1075     __ popa();                                      // pop registers (includes r12)
1076     __ ret(4 * wordSize);                           // pop caller saved stuff
1077 
1078     return start;
1079   }
1080 
1081   //
1082   // Verify that a register contains clean 32-bits positive value
1083   // (high 32-bits are 0) so it could be used in 64-bits shifts.
1084   //
1085   //  Input:
1086   //    Rint  -  32-bits value
1087   //    Rtmp  -  scratch
1088   //
1089   void assert_clean_int(Register Rint, Register Rtmp) {
1090 #ifdef ASSERT
1091     Label L;
1092     assert_different_registers(Rtmp, Rint);
1093     __ movslq(Rtmp, Rint);
1094     __ cmpq(Rtmp, Rint);
1095     __ jcc(Assembler::equal, L);
1096     __ stop("high 32-bits of int value are not 0");
1097     __ bind(L);
1098 #endif
1099   }
1100 
1101   //  Generate overlap test for array copy stubs
1102   //
1103   //  Input:
1104   //     c_rarg0 - from
1105   //     c_rarg1 - to
1106   //     c_rarg2 - element count
1107   //
1108   //  Output:
1109   //     rax   - &from[element count - 1]
1110   //
1111   void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
1112     assert(no_overlap_target != NULL, "must be generated");
1113     array_overlap_test(no_overlap_target, NULL, sf);
1114   }
1115   void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
1116     array_overlap_test(NULL, &L_no_overlap, sf);
1117   }
1118   void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
1119     const Register from     = c_rarg0;
1120     const Register to       = c_rarg1;
1121     const Register count    = c_rarg2;
1122     const Register end_from = rax;
1123 
1124     __ cmpptr(to, from);
1125     __ lea(end_from, Address(from, count, sf, 0));
1126     if (NOLp == NULL) {
1127       ExternalAddress no_overlap(no_overlap_target);
1128       __ jump_cc(Assembler::belowEqual, no_overlap);
1129       __ cmpptr(to, end_from);
1130       __ jump_cc(Assembler::aboveEqual, no_overlap);
1131     } else {
1132       __ jcc(Assembler::belowEqual, (*NOLp));
1133       __ cmpptr(to, end_from);
1134       __ jcc(Assembler::aboveEqual, (*NOLp));
1135     }
1136   }
1137 
1138   // Shuffle first three arg regs on Windows into Linux/Solaris locations.
1139   //
1140   // Outputs:
1141   //    rdi - rcx
1142   //    rsi - rdx
1143   //    rdx - r8
1144   //    rcx - r9
1145   //
1146   // Registers r9 and r10 are used to save rdi and rsi on Windows, which latter
1147   // are non-volatile.  r9 and r10 should not be used by the caller.
1148   //
1149   void setup_arg_regs(int nargs = 3) {
1150     const Register saved_rdi = r9;
1151     const Register saved_rsi = r10;
1152     assert(nargs == 3 || nargs == 4, "else fix");
1153 #ifdef _WIN64
1154     assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1155            "unexpected argument registers");
1156     if (nargs >= 4)
1157       __ mov(rax, r9);  // r9 is also saved_rdi
1158     __ movptr(saved_rdi, rdi);
1159     __ movptr(saved_rsi, rsi);
1160     __ mov(rdi, rcx); // c_rarg0
1161     __ mov(rsi, rdx); // c_rarg1
1162     __ mov(rdx, r8);  // c_rarg2
1163     if (nargs >= 4)
1164       __ mov(rcx, rax); // c_rarg3 (via rax)
1165 #else
1166     assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1167            "unexpected argument registers");
1168 #endif
1169   }
1170 
1171   void restore_arg_regs() {
1172     const Register saved_rdi = r9;
1173     const Register saved_rsi = r10;
1174 #ifdef _WIN64
1175     __ movptr(rdi, saved_rdi);
1176     __ movptr(rsi, saved_rsi);
1177 #endif
1178   }
1179 
1180   // Generate code for an array write pre barrier
1181   //
1182   //     addr    -  starting address
1183   //     count   -  element count
1184   //     tmp     - scratch register
1185   //
1186   //     Destroy no registers!
1187   //
1188   void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
1189     BarrierSet* bs = Universe::heap()->barrier_set();
1190     switch (bs->kind()) {
1191       case BarrierSet::G1SATBCT:
1192       case BarrierSet::G1SATBCTLogging:
1193         // With G1, don't generate the call if we statically know that the target in uninitialized
1194         if (!dest_uninitialized) {
1195            __ pusha();                      // push registers
1196            if (count == c_rarg0) {
1197              if (addr == c_rarg1) {
1198                // exactly backwards!!
1199                __ xchgptr(c_rarg1, c_rarg0);
1200              } else {
1201                __ movptr(c_rarg1, count);
1202                __ movptr(c_rarg0, addr);
1203              }
1204            } else {
1205              __ movptr(c_rarg0, addr);
1206              __ movptr(c_rarg1, count);
1207            }
1208            __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
1209            __ popa();
1210         }
1211          break;
1212       case BarrierSet::CardTableModRef:
1213       case BarrierSet::CardTableExtension:
1214       case BarrierSet::ModRef:
1215         break;
1216       default:
1217         ShouldNotReachHere();
1218 
1219     }
1220   }
1221 
1222   //
1223   // Generate code for an array write post barrier
1224   //
1225   //  Input:
1226   //     start    - register containing starting address of destination array
1227   //     end      - register containing ending address of destination array
1228   //     scratch  - scratch register
1229   //
1230   //  The input registers are overwritten.
1231   //  The ending address is inclusive.
1232   void  gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
1233     assert_different_registers(start, end, scratch);
1234     BarrierSet* bs = Universe::heap()->barrier_set();
1235     switch (bs->kind()) {
1236       case BarrierSet::G1SATBCT:
1237       case BarrierSet::G1SATBCTLogging:
1238 
1239         {
1240           __ pusha();                      // push registers (overkill)
1241           // must compute element count unless barrier set interface is changed (other platforms supply count)
1242           assert_different_registers(start, end, scratch);
1243           __ lea(scratch, Address(end, BytesPerHeapOop));
1244           __ subptr(scratch, start);               // subtract start to get #bytes
1245           __ shrptr(scratch, LogBytesPerHeapOop);  // convert to element count
1246           __ mov(c_rarg0, start);
1247           __ mov(c_rarg1, scratch);
1248           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
1249           __ popa();
1250         }
1251         break;
1252       case BarrierSet::CardTableModRef:
1253       case BarrierSet::CardTableExtension:
1254         {
1255           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
1256           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
1257 
1258           Label L_loop;
1259 
1260            __ shrptr(start, CardTableModRefBS::card_shift);
1261            __ addptr(end, BytesPerHeapOop);
1262            __ shrptr(end, CardTableModRefBS::card_shift);
1263            __ subptr(end, start); // number of bytes to copy
1264 
1265           intptr_t disp = (intptr_t) ct->byte_map_base;
1266           if (__ is_simm32(disp)) {
1267             Address cardtable(noreg, noreg, Address::no_scale, disp);
1268             __ lea(scratch, cardtable);
1269           } else {
1270             ExternalAddress cardtable((address)disp);
1271             __ lea(scratch, cardtable);
1272           }
1273 
1274           const Register count = end; // 'end' register contains bytes count now
1275           __ addptr(start, scratch);
1276         __ BIND(L_loop);
1277           __ movb(Address(start, count, Address::times_1), 0);
1278           __ decrement(count);
1279           __ jcc(Assembler::greaterEqual, L_loop);
1280         }
1281         break;
1282       default:
1283         ShouldNotReachHere();
1284 
1285     }
1286   }
1287 
1288 
1289   // Copy big chunks forward
1290   //
1291   // Inputs:
1292   //   end_from     - source arrays end address
1293   //   end_to       - destination array end address
1294   //   qword_count  - 64-bits element count, negative
1295   //   to           - scratch
1296   //   L_copy_32_bytes - entry label
1297   //   L_copy_8_bytes  - exit  label
1298   //
1299   void copy_32_bytes_forward(Register end_from, Register end_to,
1300                              Register qword_count, Register to,
1301                              Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
1302     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1303     Label L_loop;
1304     __ align(OptoLoopAlignment);
1305   __ BIND(L_loop);
1306     if(UseUnalignedLoadStores) {
1307       __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1308       __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1309       __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
1310       __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
1311 
1312     } else {
1313       __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
1314       __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
1315       __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
1316       __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
1317       __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
1318       __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
1319       __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
1320       __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
1321     }
1322   __ BIND(L_copy_32_bytes);
1323     __ addptr(qword_count, 4);
1324     __ jcc(Assembler::lessEqual, L_loop);
1325     __ subptr(qword_count, 4);
1326     __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
1327   }
1328 
1329 
1330   // Copy big chunks backward
1331   //
1332   // Inputs:
1333   //   from         - source arrays address
1334   //   dest         - destination array address
1335   //   qword_count  - 64-bits element count
1336   //   to           - scratch
1337   //   L_copy_32_bytes - entry label
1338   //   L_copy_8_bytes  - exit  label
1339   //
1340   void copy_32_bytes_backward(Register from, Register dest,
1341                               Register qword_count, Register to,
1342                               Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
1343     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1344     Label L_loop;
1345     __ align(OptoLoopAlignment);
1346   __ BIND(L_loop);
1347     if(UseUnalignedLoadStores) {
1348       __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
1349       __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
1350       __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1351       __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1352 
1353     } else {
1354       __ movq(to, Address(from, qword_count, Address::times_8, 24));
1355       __ movq(Address(dest, qword_count, Address::times_8, 24), to);
1356       __ movq(to, Address(from, qword_count, Address::times_8, 16));
1357       __ movq(Address(dest, qword_count, Address::times_8, 16), to);
1358       __ movq(to, Address(from, qword_count, Address::times_8,  8));
1359       __ movq(Address(dest, qword_count, Address::times_8,  8), to);
1360       __ movq(to, Address(from, qword_count, Address::times_8,  0));
1361       __ movq(Address(dest, qword_count, Address::times_8,  0), to);
1362     }
1363   __ BIND(L_copy_32_bytes);
1364     __ subptr(qword_count, 4);
1365     __ jcc(Assembler::greaterEqual, L_loop);
1366     __ addptr(qword_count, 4);
1367     __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
1368   }
1369 
1370 
1371   // Arguments:
1372   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1373   //             ignored
1374   //   name    - stub name string
1375   //
1376   // Inputs:
1377   //   c_rarg0   - source array address
1378   //   c_rarg1   - destination array address
1379   //   c_rarg2   - element count, treated as ssize_t, can be zero
1380   //
1381   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1382   // we let the hardware handle it.  The one to eight bytes within words,
1383   // dwords or qwords that span cache line boundaries will still be loaded
1384   // and stored atomically.
1385   //
1386   // Side Effects:
1387   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1388   //   used by generate_conjoint_byte_copy().
1389   //
1390   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1391     __ align(CodeEntryAlignment);
1392     StubCodeMark mark(this, "StubRoutines", name);
1393     address start = __ pc();
1394 
1395     Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1396     Label L_copy_byte, L_exit;
1397     const Register from        = rdi;  // source array address
1398     const Register to          = rsi;  // destination array address
1399     const Register count       = rdx;  // elements count
1400     const Register byte_count  = rcx;
1401     const Register qword_count = count;
1402     const Register end_from    = from; // source array end address
1403     const Register end_to      = to;   // destination array end address
1404     // End pointers are inclusive, and if count is not zero they point
1405     // to the last unit copied:  end_to[0] := end_from[0]
1406 
1407     __ enter(); // required for proper stackwalking of RuntimeStub frame
1408     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1409 
1410     if (entry != NULL) {
1411       *entry = __ pc();
1412        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1413       BLOCK_COMMENT("Entry:");
1414     }
1415 
1416     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1417                       // r9 and r10 may be used to save non-volatile registers
1418 
1419     // 'from', 'to' and 'count' are now valid
1420     __ movptr(byte_count, count);
1421     __ shrptr(count, 3); // count => qword_count
1422 
1423     // Copy from low to high addresses.  Use 'to' as scratch.
1424     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1425     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1426     __ negptr(qword_count); // make the count negative
1427     __ jmp(L_copy_32_bytes);
1428 
1429     // Copy trailing qwords
1430   __ BIND(L_copy_8_bytes);
1431     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1432     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1433     __ increment(qword_count);
1434     __ jcc(Assembler::notZero, L_copy_8_bytes);
1435 
1436     // Check for and copy trailing dword
1437   __ BIND(L_copy_4_bytes);
1438     __ testl(byte_count, 4);
1439     __ jccb(Assembler::zero, L_copy_2_bytes);
1440     __ movl(rax, Address(end_from, 8));
1441     __ movl(Address(end_to, 8), rax);
1442 
1443     __ addptr(end_from, 4);
1444     __ addptr(end_to, 4);
1445 
1446     // Check for and copy trailing word
1447   __ BIND(L_copy_2_bytes);
1448     __ testl(byte_count, 2);
1449     __ jccb(Assembler::zero, L_copy_byte);
1450     __ movw(rax, Address(end_from, 8));
1451     __ movw(Address(end_to, 8), rax);
1452 
1453     __ addptr(end_from, 2);
1454     __ addptr(end_to, 2);
1455 
1456     // Check for and copy trailing byte
1457   __ BIND(L_copy_byte);
1458     __ testl(byte_count, 1);
1459     __ jccb(Assembler::zero, L_exit);
1460     __ movb(rax, Address(end_from, 8));
1461     __ movb(Address(end_to, 8), rax);
1462 
1463   __ BIND(L_exit);
1464     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr);
1465     restore_arg_regs();
1466     __ xorptr(rax, rax); // return 0
1467     __ leave(); // required for proper stackwalking of RuntimeStub frame
1468     __ ret(0);
1469 
1470     // Copy in 32-bytes chunks
1471     copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
1472     __ jmp(L_copy_4_bytes);
1473 
1474     return start;
1475   }
1476 
1477   // Arguments:
1478   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1479   //             ignored
1480   //   name    - stub name string
1481   //
1482   // Inputs:
1483   //   c_rarg0   - source array address
1484   //   c_rarg1   - destination array address
1485   //   c_rarg2   - element count, treated as ssize_t, can be zero
1486   //
1487   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1488   // we let the hardware handle it.  The one to eight bytes within words,
1489   // dwords or qwords that span cache line boundaries will still be loaded
1490   // and stored atomically.
1491   //
1492   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1493                                       address* entry, const char *name) {
1494     __ align(CodeEntryAlignment);
1495     StubCodeMark mark(this, "StubRoutines", name);
1496     address start = __ pc();
1497 
1498     Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1499     const Register from        = rdi;  // source array address
1500     const Register to          = rsi;  // destination array address
1501     const Register count       = rdx;  // elements count
1502     const Register byte_count  = rcx;
1503     const Register qword_count = count;
1504 
1505     __ enter(); // required for proper stackwalking of RuntimeStub frame
1506     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1507 
1508     if (entry != NULL) {
1509       *entry = __ pc();
1510       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1511       BLOCK_COMMENT("Entry:");
1512     }
1513 
1514     array_overlap_test(nooverlap_target, Address::times_1);
1515     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1516                       // r9 and r10 may be used to save non-volatile registers
1517 
1518     // 'from', 'to' and 'count' are now valid
1519     __ movptr(byte_count, count);
1520     __ shrptr(count, 3);   // count => qword_count
1521 
1522     // Copy from high to low addresses.
1523 
1524     // Check for and copy trailing byte
1525     __ testl(byte_count, 1);
1526     __ jcc(Assembler::zero, L_copy_2_bytes);
1527     __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1528     __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1529     __ decrement(byte_count); // Adjust for possible trailing word
1530 
1531     // Check for and copy trailing word
1532   __ BIND(L_copy_2_bytes);
1533     __ testl(byte_count, 2);
1534     __ jcc(Assembler::zero, L_copy_4_bytes);
1535     __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1536     __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1537 
1538     // Check for and copy trailing dword
1539   __ BIND(L_copy_4_bytes);
1540     __ testl(byte_count, 4);
1541     __ jcc(Assembler::zero, L_copy_32_bytes);
1542     __ movl(rax, Address(from, qword_count, Address::times_8));
1543     __ movl(Address(to, qword_count, Address::times_8), rax);
1544     __ jmp(L_copy_32_bytes);
1545 
1546     // Copy trailing qwords
1547   __ BIND(L_copy_8_bytes);
1548     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1549     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1550     __ decrement(qword_count);
1551     __ jcc(Assembler::notZero, L_copy_8_bytes);
1552 
1553     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr);
1554     restore_arg_regs();
1555     __ xorptr(rax, rax); // return 0
1556     __ leave(); // required for proper stackwalking of RuntimeStub frame
1557     __ ret(0);
1558 
1559     // Copy in 32-bytes chunks
1560     copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
1561 
1562     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr);
1563     restore_arg_regs();
1564     __ xorptr(rax, rax); // return 0
1565     __ leave(); // required for proper stackwalking of RuntimeStub frame
1566     __ ret(0);
1567 
1568     return start;
1569   }
1570 
1571   // Arguments:
1572   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1573   //             ignored
1574   //   name    - stub name string
1575   //
1576   // Inputs:
1577   //   c_rarg0   - source array address
1578   //   c_rarg1   - destination array address
1579   //   c_rarg2   - element count, treated as ssize_t, can be zero
1580   //
1581   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1582   // let the hardware handle it.  The two or four words within dwords
1583   // or qwords that span cache line boundaries will still be loaded
1584   // and stored atomically.
1585   //
1586   // Side Effects:
1587   //   disjoint_short_copy_entry is set to the no-overlap entry point
1588   //   used by generate_conjoint_short_copy().
1589   //
1590   address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
1591     __ align(CodeEntryAlignment);
1592     StubCodeMark mark(this, "StubRoutines", name);
1593     address start = __ pc();
1594 
1595     Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit;
1596     const Register from        = rdi;  // source array address
1597     const Register to          = rsi;  // destination array address
1598     const Register count       = rdx;  // elements count
1599     const Register word_count  = rcx;
1600     const Register qword_count = count;
1601     const Register end_from    = from; // source array end address
1602     const Register end_to      = to;   // destination array end address
1603     // End pointers are inclusive, and if count is not zero they point
1604     // to the last unit copied:  end_to[0] := end_from[0]
1605 
1606     __ enter(); // required for proper stackwalking of RuntimeStub frame
1607     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1608 
1609     if (entry != NULL) {
1610       *entry = __ pc();
1611       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1612       BLOCK_COMMENT("Entry:");
1613     }
1614 
1615     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1616                       // r9 and r10 may be used to save non-volatile registers
1617 
1618     // 'from', 'to' and 'count' are now valid
1619     __ movptr(word_count, count);
1620     __ shrptr(count, 2); // count => qword_count
1621 
1622     // Copy from low to high addresses.  Use 'to' as scratch.
1623     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1624     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1625     __ negptr(qword_count);
1626     __ jmp(L_copy_32_bytes);
1627 
1628     // Copy trailing qwords
1629   __ BIND(L_copy_8_bytes);
1630     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1631     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1632     __ increment(qword_count);
1633     __ jcc(Assembler::notZero, L_copy_8_bytes);
1634 
1635     // Original 'dest' is trashed, so we can't use it as a
1636     // base register for a possible trailing word copy
1637 
1638     // Check for and copy trailing dword
1639   __ BIND(L_copy_4_bytes);
1640     __ testl(word_count, 2);
1641     __ jccb(Assembler::zero, L_copy_2_bytes);
1642     __ movl(rax, Address(end_from, 8));
1643     __ movl(Address(end_to, 8), rax);
1644 
1645     __ addptr(end_from, 4);
1646     __ addptr(end_to, 4);
1647 
1648     // Check for and copy trailing word
1649   __ BIND(L_copy_2_bytes);
1650     __ testl(word_count, 1);
1651     __ jccb(Assembler::zero, L_exit);
1652     __ movw(rax, Address(end_from, 8));
1653     __ movw(Address(end_to, 8), rax);
1654 
1655   __ BIND(L_exit);
1656     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr);
1657     restore_arg_regs();
1658     __ xorptr(rax, rax); // return 0
1659     __ leave(); // required for proper stackwalking of RuntimeStub frame
1660     __ ret(0);
1661 
1662     // Copy in 32-bytes chunks
1663     copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
1664     __ jmp(L_copy_4_bytes);
1665 
1666     return start;
1667   }
1668 
1669   address generate_fill(BasicType t, bool aligned, const char *name) {
1670     __ align(CodeEntryAlignment);
1671     StubCodeMark mark(this, "StubRoutines", name);
1672     address start = __ pc();
1673 
1674     BLOCK_COMMENT("Entry:");
1675 
1676     const Register to       = c_rarg0;  // source array address
1677     const Register value    = c_rarg1;  // value
1678     const Register count    = c_rarg2;  // elements count
1679 
1680     __ enter(); // required for proper stackwalking of RuntimeStub frame
1681 
1682     __ generate_fill(t, aligned, to, value, count, rax, xmm0);
1683 
1684     __ leave(); // required for proper stackwalking of RuntimeStub frame
1685     __ ret(0);
1686     return start;
1687   }
1688 
1689   // Arguments:
1690   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1691   //             ignored
1692   //   name    - stub name string
1693   //
1694   // Inputs:
1695   //   c_rarg0   - source array address
1696   //   c_rarg1   - destination array address
1697   //   c_rarg2   - element count, treated as ssize_t, can be zero
1698   //
1699   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1700   // let the hardware handle it.  The two or four words within dwords
1701   // or qwords that span cache line boundaries will still be loaded
1702   // and stored atomically.
1703   //
1704   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1705                                        address *entry, const char *name) {
1706     __ align(CodeEntryAlignment);
1707     StubCodeMark mark(this, "StubRoutines", name);
1708     address start = __ pc();
1709 
1710     Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes;
1711     const Register from        = rdi;  // source array address
1712     const Register to          = rsi;  // destination array address
1713     const Register count       = rdx;  // elements count
1714     const Register word_count  = rcx;
1715     const Register qword_count = count;
1716 
1717     __ enter(); // required for proper stackwalking of RuntimeStub frame
1718     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1719 
1720     if (entry != NULL) {
1721       *entry = __ pc();
1722       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1723       BLOCK_COMMENT("Entry:");
1724     }
1725 
1726     array_overlap_test(nooverlap_target, Address::times_2);
1727     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1728                       // r9 and r10 may be used to save non-volatile registers
1729 
1730     // 'from', 'to' and 'count' are now valid
1731     __ movptr(word_count, count);
1732     __ shrptr(count, 2); // count => qword_count
1733 
1734     // Copy from high to low addresses.  Use 'to' as scratch.
1735 
1736     // Check for and copy trailing word
1737     __ testl(word_count, 1);
1738     __ jccb(Assembler::zero, L_copy_4_bytes);
1739     __ movw(rax, Address(from, word_count, Address::times_2, -2));
1740     __ movw(Address(to, word_count, Address::times_2, -2), rax);
1741 
1742     // Check for and copy trailing dword
1743   __ BIND(L_copy_4_bytes);
1744     __ testl(word_count, 2);
1745     __ jcc(Assembler::zero, L_copy_32_bytes);
1746     __ movl(rax, Address(from, qword_count, Address::times_8));
1747     __ movl(Address(to, qword_count, Address::times_8), rax);
1748     __ jmp(L_copy_32_bytes);
1749 
1750     // Copy trailing qwords
1751   __ BIND(L_copy_8_bytes);
1752     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1753     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1754     __ decrement(qword_count);
1755     __ jcc(Assembler::notZero, L_copy_8_bytes);
1756 
1757     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr);
1758     restore_arg_regs();
1759     __ xorptr(rax, rax); // return 0
1760     __ leave(); // required for proper stackwalking of RuntimeStub frame
1761     __ ret(0);
1762 
1763     // Copy in 32-bytes chunks
1764     copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
1765 
1766     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr);
1767     restore_arg_regs();
1768     __ xorptr(rax, rax); // return 0
1769     __ leave(); // required for proper stackwalking of RuntimeStub frame
1770     __ ret(0);
1771 
1772     return start;
1773   }
1774 
1775   // Arguments:
1776   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1777   //             ignored
1778   //   is_oop  - true => oop array, so generate store check code
1779   //   name    - stub name string
1780   //
1781   // Inputs:
1782   //   c_rarg0   - source array address
1783   //   c_rarg1   - destination array address
1784   //   c_rarg2   - element count, treated as ssize_t, can be zero
1785   //
1786   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1787   // the hardware handle it.  The two dwords within qwords that span
1788   // cache line boundaries will still be loaded and stored atomicly.
1789   //
1790   // Side Effects:
1791   //   disjoint_int_copy_entry is set to the no-overlap entry point
1792   //   used by generate_conjoint_int_oop_copy().
1793   //
1794   address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
1795                                          const char *name, bool dest_uninitialized = false) {
1796     __ align(CodeEntryAlignment);
1797     StubCodeMark mark(this, "StubRoutines", name);
1798     address start = __ pc();
1799 
1800     Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
1801     const Register from        = rdi;  // source array address
1802     const Register to          = rsi;  // destination array address
1803     const Register count       = rdx;  // elements count
1804     const Register dword_count = rcx;
1805     const Register qword_count = count;
1806     const Register end_from    = from; // source array end address
1807     const Register end_to      = to;   // destination array end address
1808     const Register saved_to    = r11;  // saved destination array address
1809     // End pointers are inclusive, and if count is not zero they point
1810     // to the last unit copied:  end_to[0] := end_from[0]
1811 
1812     __ enter(); // required for proper stackwalking of RuntimeStub frame
1813     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1814 
1815     if (entry != NULL) {
1816       *entry = __ pc();
1817       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1818       BLOCK_COMMENT("Entry:");
1819     }
1820 
1821     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1822                       // r9 and r10 may be used to save non-volatile registers
1823     if (is_oop) {
1824       __ movq(saved_to, to);
1825       gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1826     }
1827 
1828     // 'from', 'to' and 'count' are now valid
1829     __ movptr(dword_count, count);
1830     __ shrptr(count, 1); // count => qword_count
1831 
1832     // Copy from low to high addresses.  Use 'to' as scratch.
1833     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1834     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1835     __ negptr(qword_count);
1836     __ jmp(L_copy_32_bytes);
1837 
1838     // Copy trailing qwords
1839   __ BIND(L_copy_8_bytes);
1840     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1841     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1842     __ increment(qword_count);
1843     __ jcc(Assembler::notZero, L_copy_8_bytes);
1844 
1845     // Check for and copy trailing dword
1846   __ BIND(L_copy_4_bytes);
1847     __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
1848     __ jccb(Assembler::zero, L_exit);
1849     __ movl(rax, Address(end_from, 8));
1850     __ movl(Address(end_to, 8), rax);
1851 
1852   __ BIND(L_exit);
1853     if (is_oop) {
1854       __ leaq(end_to, Address(saved_to, dword_count, Address::times_4, -4));
1855       gen_write_ref_array_post_barrier(saved_to, end_to, rax);
1856     }
1857     inc_counter_np(SharedRuntime::_jint_array_copy_ctr);
1858     restore_arg_regs();
1859     __ xorptr(rax, rax); // return 0
1860     __ leave(); // required for proper stackwalking of RuntimeStub frame
1861     __ ret(0);
1862 
1863     // Copy 32-bytes chunks
1864     copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
1865     __ jmp(L_copy_4_bytes);
1866 
1867     return start;
1868   }
1869 
1870   // Arguments:
1871   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1872   //             ignored
1873   //   is_oop  - true => oop array, so generate store check code
1874   //   name    - stub name string
1875   //
1876   // Inputs:
1877   //   c_rarg0   - source array address
1878   //   c_rarg1   - destination array address
1879   //   c_rarg2   - element count, treated as ssize_t, can be zero
1880   //
1881   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1882   // the hardware handle it.  The two dwords within qwords that span
1883   // cache line boundaries will still be loaded and stored atomicly.
1884   //
1885   address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
1886                                          address *entry, const char *name,
1887                                          bool dest_uninitialized = false) {
1888     __ align(CodeEntryAlignment);
1889     StubCodeMark mark(this, "StubRoutines", name);
1890     address start = __ pc();
1891 
1892     Label L_copy_32_bytes, L_copy_8_bytes, L_copy_2_bytes, L_exit;
1893     const Register from        = rdi;  // source array address
1894     const Register to          = rsi;  // destination array address
1895     const Register count       = rdx;  // elements count
1896     const Register dword_count = rcx;
1897     const Register qword_count = count;
1898 
1899     __ enter(); // required for proper stackwalking of RuntimeStub frame
1900     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1901 
1902     if (entry != NULL) {
1903       *entry = __ pc();
1904        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1905       BLOCK_COMMENT("Entry:");
1906     }
1907 
1908     array_overlap_test(nooverlap_target, Address::times_4);
1909     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1910                       // r9 and r10 may be used to save non-volatile registers
1911 
1912     if (is_oop) {
1913       // no registers are destroyed by this call
1914       gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1915     }
1916 
1917     assert_clean_int(count, rax); // Make sure 'count' is clean int.
1918     // 'from', 'to' and 'count' are now valid
1919     __ movptr(dword_count, count);
1920     __ shrptr(count, 1); // count => qword_count
1921 
1922     // Copy from high to low addresses.  Use 'to' as scratch.
1923 
1924     // Check for and copy trailing dword
1925     __ testl(dword_count, 1);
1926     __ jcc(Assembler::zero, L_copy_32_bytes);
1927     __ movl(rax, Address(from, dword_count, Address::times_4, -4));
1928     __ movl(Address(to, dword_count, Address::times_4, -4), rax);
1929     __ jmp(L_copy_32_bytes);
1930 
1931     // Copy trailing qwords
1932   __ BIND(L_copy_8_bytes);
1933     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1934     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1935     __ decrement(qword_count);
1936     __ jcc(Assembler::notZero, L_copy_8_bytes);
1937 
1938     inc_counter_np(SharedRuntime::_jint_array_copy_ctr);
1939     if (is_oop) {
1940       __ jmp(L_exit);
1941     }
1942     restore_arg_regs();
1943     __ xorptr(rax, rax); // return 0
1944     __ leave(); // required for proper stackwalking of RuntimeStub frame
1945     __ ret(0);
1946 
1947     // Copy in 32-bytes chunks
1948     copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
1949 
1950    inc_counter_np(SharedRuntime::_jint_array_copy_ctr);
1951    __ bind(L_exit);
1952      if (is_oop) {
1953        Register end_to = rdx;
1954        __ leaq(end_to, Address(to, dword_count, Address::times_4, -4));
1955        gen_write_ref_array_post_barrier(to, end_to, rax);
1956      }
1957     restore_arg_regs();
1958     __ xorptr(rax, rax); // return 0
1959     __ leave(); // required for proper stackwalking of RuntimeStub frame
1960     __ ret(0);
1961 
1962     return start;
1963   }
1964 
1965   // Arguments:
1966   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1967   //             ignored
1968   //   is_oop  - true => oop array, so generate store check code
1969   //   name    - stub name string
1970   //
1971   // Inputs:
1972   //   c_rarg0   - source array address
1973   //   c_rarg1   - destination array address
1974   //   c_rarg2   - element count, treated as ssize_t, can be zero
1975   //
1976  // Side Effects:
1977   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1978   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1979   //
1980   address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
1981                                           const char *name, bool dest_uninitialized = false) {
1982     __ align(CodeEntryAlignment);
1983     StubCodeMark mark(this, "StubRoutines", name);
1984     address start = __ pc();
1985 
1986     Label L_copy_32_bytes, L_copy_8_bytes, L_exit;
1987     const Register from        = rdi;  // source array address
1988     const Register to          = rsi;  // destination array address
1989     const Register qword_count = rdx;  // elements count
1990     const Register end_from    = from; // source array end address
1991     const Register end_to      = rcx;  // destination array end address
1992     const Register saved_to    = to;
1993     // End pointers are inclusive, and if count is not zero they point
1994     // to the last unit copied:  end_to[0] := end_from[0]
1995 
1996     __ enter(); // required for proper stackwalking of RuntimeStub frame
1997     // Save no-overlap entry point for generate_conjoint_long_oop_copy()
1998     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1999 
2000     if (entry != NULL) {
2001       *entry = __ pc();
2002       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2003       BLOCK_COMMENT("Entry:");
2004     }
2005 
2006     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2007                       // r9 and r10 may be used to save non-volatile registers
2008     // 'from', 'to' and 'qword_count' are now valid
2009     if (is_oop) {
2010       // no registers are destroyed by this call
2011       gen_write_ref_array_pre_barrier(to, qword_count, dest_uninitialized);
2012     }
2013 
2014     // Copy from low to high addresses.  Use 'to' as scratch.
2015     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2016     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2017     __ negptr(qword_count);
2018     __ jmp(L_copy_32_bytes);
2019 
2020     // Copy trailing qwords
2021   __ BIND(L_copy_8_bytes);
2022     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2023     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2024     __ increment(qword_count);
2025     __ jcc(Assembler::notZero, L_copy_8_bytes);
2026 
2027     if (is_oop) {
2028       __ jmp(L_exit);
2029     } else {
2030       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr);
2031       restore_arg_regs();
2032       __ xorptr(rax, rax); // return 0
2033       __ leave(); // required for proper stackwalking of RuntimeStub frame
2034       __ ret(0);
2035     }
2036 
2037     // Copy 64-byte chunks
2038     copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
2039 
2040     if (is_oop) {
2041     __ BIND(L_exit);
2042       gen_write_ref_array_post_barrier(saved_to, end_to, rax);
2043       inc_counter_np(SharedRuntime::_oop_array_copy_ctr);
2044     } else {
2045       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr);
2046     }
2047     restore_arg_regs();
2048     __ xorptr(rax, rax); // return 0
2049     __ leave(); // required for proper stackwalking of RuntimeStub frame
2050     __ ret(0);
2051 
2052     return start;
2053   }
2054 
2055   // Arguments:
2056   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2057   //             ignored
2058   //   is_oop  - true => oop array, so generate store check code
2059   //   name    - stub name string
2060   //
2061   // Inputs:
2062   //   c_rarg0   - source array address
2063   //   c_rarg1   - destination array address
2064   //   c_rarg2   - element count, treated as ssize_t, can be zero
2065   //
2066   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
2067                                           address nooverlap_target, address *entry,
2068                                           const char *name, bool dest_uninitialized = false) {
2069     __ align(CodeEntryAlignment);
2070     StubCodeMark mark(this, "StubRoutines", name);
2071     address start = __ pc();
2072 
2073     Label L_copy_32_bytes, L_copy_8_bytes, L_exit;
2074     const Register from        = rdi;  // source array address
2075     const Register to          = rsi;  // destination array address
2076     const Register qword_count = rdx;  // elements count
2077     const Register saved_count = rcx;
2078 
2079     __ enter(); // required for proper stackwalking of RuntimeStub frame
2080     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2081 
2082     if (entry != NULL) {
2083       *entry = __ pc();
2084       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2085       BLOCK_COMMENT("Entry:");
2086     }
2087 
2088     array_overlap_test(nooverlap_target, Address::times_8);
2089     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2090                       // r9 and r10 may be used to save non-volatile registers
2091     // 'from', 'to' and 'qword_count' are now valid
2092     if (is_oop) {
2093       // Save to and count for store barrier
2094       __ movptr(saved_count, qword_count);
2095       // No registers are destroyed by this call
2096       gen_write_ref_array_pre_barrier(to, saved_count, dest_uninitialized);
2097     }
2098 
2099     __ jmp(L_copy_32_bytes);
2100 
2101     // Copy trailing qwords
2102   __ BIND(L_copy_8_bytes);
2103     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2104     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2105     __ decrement(qword_count);
2106     __ jcc(Assembler::notZero, L_copy_8_bytes);
2107 
2108     if (is_oop) {
2109       __ jmp(L_exit);
2110     } else {
2111       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr);
2112       restore_arg_regs();
2113       __ xorptr(rax, rax); // return 0
2114       __ leave(); // required for proper stackwalking of RuntimeStub frame
2115       __ ret(0);
2116     }
2117 
2118     // Copy in 32-bytes chunks
2119     copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
2120 
2121     if (is_oop) {
2122     __ BIND(L_exit);
2123       __ lea(rcx, Address(to, saved_count, Address::times_8, -8));
2124       gen_write_ref_array_post_barrier(to, rcx, rax);
2125       inc_counter_np(SharedRuntime::_oop_array_copy_ctr);
2126     } else {
2127       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr);
2128     }
2129     restore_arg_regs();
2130     __ xorptr(rax, rax); // return 0
2131     __ leave(); // required for proper stackwalking of RuntimeStub frame
2132     __ ret(0);
2133 
2134     return start;
2135   }
2136 
2137 
2138   // Helper for generating a dynamic type check.
2139   // Smashes no registers.
2140   void generate_type_check(Register sub_klass,
2141                            Register super_check_offset,
2142                            Register super_klass,
2143                            Label& L_success) {
2144     assert_different_registers(sub_klass, super_check_offset, super_klass);
2145 
2146     BLOCK_COMMENT("type_check:");
2147 
2148     Label L_miss;
2149 
2150     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
2151                                      super_check_offset);
2152     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
2153 
2154     // Fall through on failure!
2155     __ BIND(L_miss);
2156   }
2157 
2158   //
2159   //  Generate checkcasting array copy stub
2160   //
2161   //  Input:
2162   //    c_rarg0   - source array address
2163   //    c_rarg1   - destination array address
2164   //    c_rarg2   - element count, treated as ssize_t, can be zero
2165   //    c_rarg3   - size_t ckoff (super_check_offset)
2166   // not Win64
2167   //    c_rarg4   - oop ckval (super_klass)
2168   // Win64
2169   //    rsp+40    - oop ckval (super_klass)
2170   //
2171   //  Output:
2172   //    rax ==  0  -  success
2173   //    rax == -1^K - failure, where K is partial transfer count
2174   //
2175   address generate_checkcast_copy(const char *name, address *entry,
2176                                   bool dest_uninitialized = false) {
2177 
2178     Label L_load_element, L_store_element, L_do_card_marks, L_done;
2179 
2180     // Input registers (after setup_arg_regs)
2181     const Register from        = rdi;   // source array address
2182     const Register to          = rsi;   // destination array address
2183     const Register length      = rdx;   // elements count
2184     const Register ckoff       = rcx;   // super_check_offset
2185     const Register ckval       = r8;    // super_klass
2186 
2187     // Registers used as temps (r13, r14 are save-on-entry)
2188     const Register end_from    = from;  // source array end address
2189     const Register end_to      = r13;   // destination array end address
2190     const Register count       = rdx;   // -(count_remaining)
2191     const Register r14_length  = r14;   // saved copy of length
2192     // End pointers are inclusive, and if length is not zero they point
2193     // to the last unit copied:  end_to[0] := end_from[0]
2194 
2195     const Register rax_oop    = rax;    // actual oop copied
2196     const Register r11_klass  = r11;    // oop._klass
2197 
2198     //---------------------------------------------------------------
2199     // Assembler stub will be used for this call to arraycopy
2200     // if the two arrays are subtypes of Object[] but the
2201     // destination array type is not equal to or a supertype
2202     // of the source type.  Each element must be separately
2203     // checked.
2204 
2205     __ align(CodeEntryAlignment);
2206     StubCodeMark mark(this, "StubRoutines", name);
2207     address start = __ pc();
2208 
2209     __ enter(); // required for proper stackwalking of RuntimeStub frame
2210 
2211 #ifdef ASSERT
2212     // caller guarantees that the arrays really are different
2213     // otherwise, we would have to make conjoint checks
2214     { Label L;
2215       array_overlap_test(L, TIMES_OOP);
2216       __ stop("checkcast_copy within a single array");
2217       __ bind(L);
2218     }
2219 #endif //ASSERT
2220 
2221     setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
2222                        // ckoff => rcx, ckval => r8
2223                        // r9 and r10 may be used to save non-volatile registers
2224 #ifdef _WIN64
2225     // last argument (#4) is on stack on Win64
2226     __ movptr(ckval, Address(rsp, 6 * wordSize));
2227 #endif
2228 
2229     // Caller of this entry point must set up the argument registers.
2230     if (entry != NULL) {
2231       *entry = __ pc();
2232       BLOCK_COMMENT("Entry:");
2233     }
2234 
2235     // allocate spill slots for r13, r14
2236     enum {
2237       saved_r13_offset,
2238       saved_r14_offset,
2239       saved_rbp_offset
2240     };
2241     __ subptr(rsp, saved_rbp_offset * wordSize);
2242     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2243     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2244 
2245     // check that int operands are properly extended to size_t
2246     assert_clean_int(length, rax);
2247     assert_clean_int(ckoff, rax);
2248 
2249 #ifdef ASSERT
2250     BLOCK_COMMENT("assert consistent ckoff/ckval");
2251     // The ckoff and ckval must be mutually consistent,
2252     // even though caller generates both.
2253     { Label L;
2254       int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
2255                         Klass::super_check_offset_offset_in_bytes());
2256       __ cmpl(ckoff, Address(ckval, sco_offset));
2257       __ jcc(Assembler::equal, L);
2258       __ stop("super_check_offset inconsistent");
2259       __ bind(L);
2260     }
2261 #endif //ASSERT
2262 
2263     // Loop-invariant addresses.  They are exclusive end pointers.
2264     Address end_from_addr(from, length, TIMES_OOP, 0);
2265     Address   end_to_addr(to,   length, TIMES_OOP, 0);
2266     // Loop-variant addresses.  They assume post-incremented count < 0.
2267     Address from_element_addr(end_from, count, TIMES_OOP, 0);
2268     Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
2269 
2270     gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
2271 
2272     // Copy from low to high addresses, indexed from the end of each array.
2273     __ lea(end_from, end_from_addr);
2274     __ lea(end_to,   end_to_addr);
2275     __ movptr(r14_length, length);        // save a copy of the length
2276     assert(length == count, "");          // else fix next line:
2277     __ negptr(count);                     // negate and test the length
2278     __ jcc(Assembler::notZero, L_load_element);
2279 
2280     // Empty array:  Nothing to do.
2281     __ xorptr(rax, rax);                  // return 0 on (trivial) success
2282     __ jmp(L_done);
2283 
2284     // ======== begin loop ========
2285     // (Loop is rotated; its entry is L_load_element.)
2286     // Loop control:
2287     //   for (count = -count; count != 0; count++)
2288     // Base pointers src, dst are biased by 8*(count-1),to last element.
2289     __ align(OptoLoopAlignment);
2290 
2291     __ BIND(L_store_element);
2292     __ store_heap_oop(to_element_addr, rax_oop);  // store the oop
2293     __ increment(count);               // increment the count toward zero
2294     __ jcc(Assembler::zero, L_do_card_marks);
2295 
2296     // ======== loop entry is here ========
2297     __ BIND(L_load_element);
2298     __ load_heap_oop(rax_oop, from_element_addr); // load the oop
2299     __ testptr(rax_oop, rax_oop);
2300     __ jcc(Assembler::zero, L_store_element);
2301 
2302     __ load_klass(r11_klass, rax_oop);// query the object klass
2303     generate_type_check(r11_klass, ckoff, ckval, L_store_element);
2304     // ======== end loop ========
2305 
2306     // It was a real error; we must depend on the caller to finish the job.
2307     // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
2308     // Emit GC store barriers for the oops we have copied (r14 + rdx),
2309     // and report their number to the caller.
2310     assert_different_registers(rax, r14_length, count, to, end_to, rcx);
2311     __ lea(end_to, to_element_addr);
2312     __ addptr(end_to, -heapOopSize);      // make an inclusive end pointer
2313     gen_write_ref_array_post_barrier(to, end_to, rscratch1);
2314     __ movptr(rax, r14_length);           // original oops
2315     __ addptr(rax, count);                // K = (original - remaining) oops
2316     __ notptr(rax);                       // report (-1^K) to caller
2317     __ jmp(L_done);
2318 
2319     // Come here on success only.
2320     __ BIND(L_do_card_marks);
2321     __ addptr(end_to, -heapOopSize);         // make an inclusive end pointer
2322     gen_write_ref_array_post_barrier(to, end_to, rscratch1);
2323     __ xorptr(rax, rax);                  // return 0 on success
2324 
2325     // Common exit point (success or failure).
2326     __ BIND(L_done);
2327     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
2328     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
2329     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2330     restore_arg_regs();
2331     __ leave(); // required for proper stackwalking of RuntimeStub frame
2332     __ ret(0);
2333 
2334     return start;
2335   }
2336 
2337   //
2338   //  Generate 'unsafe' array copy stub
2339   //  Though just as safe as the other stubs, it takes an unscaled
2340   //  size_t argument instead of an element count.
2341   //
2342   //  Input:
2343   //    c_rarg0   - source array address
2344   //    c_rarg1   - destination array address
2345   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2346   //
2347   // Examines the alignment of the operands and dispatches
2348   // to a long, int, short, or byte copy loop.
2349   //
2350   address generate_unsafe_copy(const char *name,
2351                                address byte_copy_entry, address short_copy_entry,
2352                                address int_copy_entry, address long_copy_entry) {
2353 
2354     Label L_long_aligned, L_int_aligned, L_short_aligned;
2355 
2356     // Input registers (before setup_arg_regs)
2357     const Register from        = c_rarg0;  // source array address
2358     const Register to          = c_rarg1;  // destination array address
2359     const Register size        = c_rarg2;  // byte count (size_t)
2360 
2361     // Register used as a temp
2362     const Register bits        = rax;      // test copy of low bits
2363 
2364     __ align(CodeEntryAlignment);
2365     StubCodeMark mark(this, "StubRoutines", name);
2366     address start = __ pc();
2367 
2368     __ enter(); // required for proper stackwalking of RuntimeStub frame
2369 
2370     // bump this on entry, not on exit:
2371     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2372 
2373     __ mov(bits, from);
2374     __ orptr(bits, to);
2375     __ orptr(bits, size);
2376 
2377     __ testb(bits, BytesPerLong-1);
2378     __ jccb(Assembler::zero, L_long_aligned);
2379 
2380     __ testb(bits, BytesPerInt-1);
2381     __ jccb(Assembler::zero, L_int_aligned);
2382 
2383     __ testb(bits, BytesPerShort-1);
2384     __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
2385 
2386     __ BIND(L_short_aligned);
2387     __ shrptr(size, LogBytesPerShort); // size => short_count
2388     __ jump(RuntimeAddress(short_copy_entry));
2389 
2390     __ BIND(L_int_aligned);
2391     __ shrptr(size, LogBytesPerInt); // size => int_count
2392     __ jump(RuntimeAddress(int_copy_entry));
2393 
2394     __ BIND(L_long_aligned);
2395     __ shrptr(size, LogBytesPerLong); // size => qword_count
2396     __ jump(RuntimeAddress(long_copy_entry));
2397 
2398     return start;
2399   }
2400 
2401   // Perform range checks on the proposed arraycopy.
2402   // Kills temp, but nothing else.
2403   // Also, clean the sign bits of src_pos and dst_pos.
2404   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2405                               Register src_pos, // source position (c_rarg1)
2406                               Register dst,     // destination array oo (c_rarg2)
2407                               Register dst_pos, // destination position (c_rarg3)
2408                               Register length,
2409                               Register temp,
2410                               Label& L_failed) {
2411     BLOCK_COMMENT("arraycopy_range_checks:");
2412 
2413     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2414     __ movl(temp, length);
2415     __ addl(temp, src_pos);             // src_pos + length
2416     __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2417     __ jcc(Assembler::above, L_failed);
2418 
2419     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2420     __ movl(temp, length);
2421     __ addl(temp, dst_pos);             // dst_pos + length
2422     __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2423     __ jcc(Assembler::above, L_failed);
2424 
2425     // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2426     // Move with sign extension can be used since they are positive.
2427     __ movslq(src_pos, src_pos);
2428     __ movslq(dst_pos, dst_pos);
2429 
2430     BLOCK_COMMENT("arraycopy_range_checks done");
2431   }
2432 
2433   //
2434   //  Generate generic array copy stubs
2435   //
2436   //  Input:
2437   //    c_rarg0    -  src oop
2438   //    c_rarg1    -  src_pos (32-bits)
2439   //    c_rarg2    -  dst oop
2440   //    c_rarg3    -  dst_pos (32-bits)
2441   // not Win64
2442   //    c_rarg4    -  element count (32-bits)
2443   // Win64
2444   //    rsp+40     -  element count (32-bits)
2445   //
2446   //  Output:
2447   //    rax ==  0  -  success
2448   //    rax == -1^K - failure, where K is partial transfer count
2449   //
2450   address generate_generic_copy(const char *name,
2451                                 address byte_copy_entry, address short_copy_entry,
2452                                 address int_copy_entry, address long_copy_entry,
2453                                 address oop_copy_entry, address checkcast_copy_entry) {
2454 
2455     Label L_failed, L_failed_0, L_objArray;
2456     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2457 
2458     // Input registers
2459     const Register src        = c_rarg0;  // source array oop
2460     const Register src_pos    = c_rarg1;  // source position
2461     const Register dst        = c_rarg2;  // destination array oop
2462     const Register dst_pos    = c_rarg3;  // destination position
2463 #ifndef _WIN64
2464     const Register length     = c_rarg4;
2465 #else
2466     const Address  length(rsp, 6 * wordSize);  // elements count is on stack on Win64
2467 #endif
2468 
2469     { int modulus = CodeEntryAlignment;
2470       int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
2471       int advance = target - (__ offset() % modulus);
2472       if (advance < 0)  advance += modulus;
2473       if (advance > 0)  __ nop(advance);
2474     }
2475     StubCodeMark mark(this, "StubRoutines", name);
2476 
2477     // Short-hop target to L_failed.  Makes for denser prologue code.
2478     __ BIND(L_failed_0);
2479     __ jmp(L_failed);
2480     assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
2481 
2482     __ align(CodeEntryAlignment);
2483     address start = __ pc();
2484 
2485     __ enter(); // required for proper stackwalking of RuntimeStub frame
2486 
2487     // bump this on entry, not on exit:
2488     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2489 
2490     //-----------------------------------------------------------------------
2491     // Assembler stub will be used for this call to arraycopy
2492     // if the following conditions are met:
2493     //
2494     // (1) src and dst must not be null.
2495     // (2) src_pos must not be negative.
2496     // (3) dst_pos must not be negative.
2497     // (4) length  must not be negative.
2498     // (5) src klass and dst klass should be the same and not NULL.
2499     // (6) src and dst should be arrays.
2500     // (7) src_pos + length must not exceed length of src.
2501     // (8) dst_pos + length must not exceed length of dst.
2502     //
2503 
2504     //  if (src == NULL) return -1;
2505     __ testptr(src, src);         // src oop
2506     size_t j1off = __ offset();
2507     __ jccb(Assembler::zero, L_failed_0);
2508 
2509     //  if (src_pos < 0) return -1;
2510     __ testl(src_pos, src_pos); // src_pos (32-bits)
2511     __ jccb(Assembler::negative, L_failed_0);
2512 
2513     //  if (dst == NULL) return -1;
2514     __ testptr(dst, dst);         // dst oop
2515     __ jccb(Assembler::zero, L_failed_0);
2516 
2517     //  if (dst_pos < 0) return -1;
2518     __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
2519     size_t j4off = __ offset();
2520     __ jccb(Assembler::negative, L_failed_0);
2521 
2522     // The first four tests are very dense code,
2523     // but not quite dense enough to put four
2524     // jumps in a 16-byte instruction fetch buffer.
2525     // That's good, because some branch predicters
2526     // do not like jumps so close together.
2527     // Make sure of this.
2528     guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
2529 
2530     // registers used as temp
2531     const Register r11_length    = r11; // elements count to copy
2532     const Register r10_src_klass = r10; // array klass
2533 
2534     //  if (length < 0) return -1;
2535     __ movl(r11_length, length);        // length (elements count, 32-bits value)
2536     __ testl(r11_length, r11_length);
2537     __ jccb(Assembler::negative, L_failed_0);
2538 
2539     __ load_klass(r10_src_klass, src);
2540 #ifdef ASSERT
2541     //  assert(src->klass() != NULL);
2542     {
2543       BLOCK_COMMENT("assert klasses not null {");
2544       Label L1, L2;
2545       __ testptr(r10_src_klass, r10_src_klass);
2546       __ jcc(Assembler::notZero, L2);   // it is broken if klass is NULL
2547       __ bind(L1);
2548       __ stop("broken null klass");
2549       __ bind(L2);
2550       __ load_klass(rax, dst);
2551       __ cmpq(rax, 0);
2552       __ jcc(Assembler::equal, L1);     // this would be broken also
2553       BLOCK_COMMENT("} assert klasses not null done");
2554     }
2555 #endif
2556 
2557     // Load layout helper (32-bits)
2558     //
2559     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2560     // 32        30    24            16              8     2                 0
2561     //
2562     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2563     //
2564 
2565     const int lh_offset = klassOopDesc::header_size() * HeapWordSize +
2566                           Klass::layout_helper_offset_in_bytes();
2567 
2568     // Handle objArrays completely differently...
2569     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2570     __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
2571     __ jcc(Assembler::equal, L_objArray);
2572 
2573     //  if (src->klass() != dst->klass()) return -1;
2574     __ load_klass(rax, dst);
2575     __ cmpq(r10_src_klass, rax);
2576     __ jcc(Assembler::notEqual, L_failed);
2577 
2578     const Register rax_lh = rax;  // layout helper
2579     __ movl(rax_lh, Address(r10_src_klass, lh_offset));
2580 
2581     //  if (!src->is_Array()) return -1;
2582     __ cmpl(rax_lh, Klass::_lh_neutral_value);
2583     __ jcc(Assembler::greaterEqual, L_failed);
2584 
2585     // At this point, it is known to be a typeArray (array_tag 0x3).
2586 #ifdef ASSERT
2587     {
2588       BLOCK_COMMENT("assert primitive array {");
2589       Label L;
2590       __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
2591       __ jcc(Assembler::greaterEqual, L);
2592       __ stop("must be a primitive array");
2593       __ bind(L);
2594       BLOCK_COMMENT("} assert primitive array done");
2595     }
2596 #endif
2597 
2598     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2599                            r10, L_failed);
2600 
2601     // typeArrayKlass
2602     //
2603     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2604     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2605     //
2606 
2607     const Register r10_offset = r10;    // array offset
2608     const Register rax_elsize = rax_lh; // element size
2609 
2610     __ movl(r10_offset, rax_lh);
2611     __ shrl(r10_offset, Klass::_lh_header_size_shift);
2612     __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
2613     __ addptr(src, r10_offset);           // src array offset
2614     __ addptr(dst, r10_offset);           // dst array offset
2615     BLOCK_COMMENT("choose copy loop based on element size");
2616     __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
2617 
2618     // next registers should be set before the jump to corresponding stub
2619     const Register from     = c_rarg0;  // source array address
2620     const Register to       = c_rarg1;  // destination array address
2621     const Register count    = c_rarg2;  // elements count
2622 
2623     // 'from', 'to', 'count' registers should be set in such order
2624     // since they are the same as 'src', 'src_pos', 'dst'.
2625 
2626   __ BIND(L_copy_bytes);
2627     __ cmpl(rax_elsize, 0);
2628     __ jccb(Assembler::notEqual, L_copy_shorts);
2629     __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
2630     __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
2631     __ movl2ptr(count, r11_length); // length
2632     __ jump(RuntimeAddress(byte_copy_entry));
2633 
2634   __ BIND(L_copy_shorts);
2635     __ cmpl(rax_elsize, LogBytesPerShort);
2636     __ jccb(Assembler::notEqual, L_copy_ints);
2637     __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
2638     __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
2639     __ movl2ptr(count, r11_length); // length
2640     __ jump(RuntimeAddress(short_copy_entry));
2641 
2642   __ BIND(L_copy_ints);
2643     __ cmpl(rax_elsize, LogBytesPerInt);
2644     __ jccb(Assembler::notEqual, L_copy_longs);
2645     __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
2646     __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
2647     __ movl2ptr(count, r11_length); // length
2648     __ jump(RuntimeAddress(int_copy_entry));
2649 
2650   __ BIND(L_copy_longs);
2651 #ifdef ASSERT
2652     {
2653       BLOCK_COMMENT("assert long copy {");
2654       Label L;
2655       __ cmpl(rax_elsize, LogBytesPerLong);
2656       __ jcc(Assembler::equal, L);
2657       __ stop("must be long copy, but elsize is wrong");
2658       __ bind(L);
2659       BLOCK_COMMENT("} assert long copy done");
2660     }
2661 #endif
2662     __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
2663     __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
2664     __ movl2ptr(count, r11_length); // length
2665     __ jump(RuntimeAddress(long_copy_entry));
2666 
2667     // objArrayKlass
2668   __ BIND(L_objArray);
2669     // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]
2670 
2671     Label L_plain_copy, L_checkcast_copy;
2672     //  test array classes for subtyping
2673     __ load_klass(rax, dst);
2674     __ cmpq(r10_src_klass, rax); // usual case is exact equality
2675     __ jcc(Assembler::notEqual, L_checkcast_copy);
2676 
2677     // Identically typed arrays can be copied without element-wise checks.
2678     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2679                            r10, L_failed);
2680 
2681     __ lea(from, Address(src, src_pos, TIMES_OOP,
2682                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
2683     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
2684                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
2685     __ movl2ptr(count, r11_length); // length
2686   __ BIND(L_plain_copy);
2687     __ jump(RuntimeAddress(oop_copy_entry));
2688 
2689   __ BIND(L_checkcast_copy);
2690     // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
2691     {
2692       // Before looking at dst.length, make sure dst is also an objArray.
2693       __ cmpl(Address(rax, lh_offset), objArray_lh);
2694       __ jcc(Assembler::notEqual, L_failed);
2695 
2696       // It is safe to examine both src.length and dst.length.
2697       arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2698                              rax, L_failed);
2699 
2700       const Register r11_dst_klass = r11;
2701       __ load_klass(r11_dst_klass, dst); // reload
2702 
2703       // Marshal the base address arguments now, freeing registers.
2704       __ lea(from, Address(src, src_pos, TIMES_OOP,
2705                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
2706       __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
2707                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
2708       __ movl(count, length);           // length (reloaded)
2709       Register sco_temp = c_rarg3;      // this register is free now
2710       assert_different_registers(from, to, count, sco_temp,
2711                                  r11_dst_klass, r10_src_klass);
2712       assert_clean_int(count, sco_temp);
2713 
2714       // Generate the type check.
2715       const int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
2716                               Klass::super_check_offset_offset_in_bytes());
2717       __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
2718       assert_clean_int(sco_temp, rax);
2719       generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
2720 
2721       // Fetch destination element klass from the objArrayKlass header.
2722       int ek_offset = (klassOopDesc::header_size() * HeapWordSize +
2723                        objArrayKlass::element_klass_offset_in_bytes());
2724       __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
2725       __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
2726       assert_clean_int(sco_temp, rax);
2727 
2728       // the checkcast_copy loop needs two extra arguments:
2729       assert(c_rarg3 == sco_temp, "#3 already in place");
2730       // Set up arguments for checkcast_copy_entry.
2731       setup_arg_regs(4);
2732       __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
2733       __ jump(RuntimeAddress(checkcast_copy_entry));
2734     }
2735 
2736   __ BIND(L_failed);
2737     __ xorptr(rax, rax);
2738     __ notptr(rax); // return -1
2739     __ leave();   // required for proper stackwalking of RuntimeStub frame
2740     __ ret(0);
2741 
2742     return start;
2743   }
2744 
2745   void generate_arraycopy_stubs() {
2746     address entry;
2747     address entry_jbyte_arraycopy;
2748     address entry_jshort_arraycopy;
2749     address entry_jint_arraycopy;
2750     address entry_oop_arraycopy;
2751     address entry_jlong_arraycopy;
2752     address entry_checkcast_arraycopy;
2753 
2754     StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, &entry,
2755                                                                            "jbyte_disjoint_arraycopy");
2756     StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
2757                                                                            "jbyte_arraycopy");
2758 
2759     StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
2760                                                                             "jshort_disjoint_arraycopy");
2761     StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
2762                                                                             "jshort_arraycopy");
2763 
2764     StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, &entry,
2765                                                                               "jint_disjoint_arraycopy");
2766     StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(false, false, entry,
2767                                                                               &entry_jint_arraycopy, "jint_arraycopy");
2768 
2769     StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, false, &entry,
2770                                                                                "jlong_disjoint_arraycopy");
2771     StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(false, false, entry,
2772                                                                                &entry_jlong_arraycopy, "jlong_arraycopy");
2773 
2774 
2775     if (UseCompressedOops) {
2776       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(false, true, &entry,
2777                                                                               "oop_disjoint_arraycopy");
2778       StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(false, true, entry,
2779                                                                               &entry_oop_arraycopy, "oop_arraycopy");
2780       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(false, true, &entry,
2781                                                                                      "oop_disjoint_arraycopy_uninit",
2782                                                                                      /*dest_uninitialized*/true);
2783       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(false, true, entry,
2784                                                                                      NULL, "oop_arraycopy_uninit",
2785                                                                                      /*dest_uninitialized*/true);
2786     } else {
2787       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, true, &entry,
2788                                                                                "oop_disjoint_arraycopy");
2789       StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(false, true, entry,
2790                                                                                &entry_oop_arraycopy, "oop_arraycopy");
2791       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(false, true, &entry,
2792                                                                                       "oop_disjoint_arraycopy_uninit",
2793                                                                                       /*dest_uninitialized*/true);
2794       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(false, true, entry,
2795                                                                                       NULL, "oop_arraycopy_uninit",
2796                                                                                       /*dest_uninitialized*/true);
2797     }
2798 
2799     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2800     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2801                                                                         /*dest_uninitialized*/true);
2802 
2803     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2804                                                               entry_jbyte_arraycopy,
2805                                                               entry_jshort_arraycopy,
2806                                                               entry_jint_arraycopy,
2807                                                               entry_jlong_arraycopy);
2808     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2809                                                                entry_jbyte_arraycopy,
2810                                                                entry_jshort_arraycopy,
2811                                                                entry_jint_arraycopy,
2812                                                                entry_oop_arraycopy,
2813                                                                entry_jlong_arraycopy,
2814                                                                entry_checkcast_arraycopy);
2815 
2816     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2817     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2818     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2819     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2820     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2821     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2822 
2823     // We don't generate specialized code for HeapWord-aligned source
2824     // arrays, so just use the code we've already generated
2825     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
2826     StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
2827 
2828     StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
2829     StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
2830 
2831     StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
2832     StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
2833 
2834     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
2835     StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
2836 
2837     StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
2838     StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
2839 
2840     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
2841     StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
2842   }
2843 
2844   void generate_math_stubs() {
2845     {
2846       StubCodeMark mark(this, "StubRoutines", "log");
2847       StubRoutines::_intrinsic_log = (double (*)(double)) __ pc();
2848 
2849       __ subq(rsp, 8);
2850       __ movdbl(Address(rsp, 0), xmm0);
2851       __ fld_d(Address(rsp, 0));
2852       __ flog();
2853       __ fstp_d(Address(rsp, 0));
2854       __ movdbl(xmm0, Address(rsp, 0));
2855       __ addq(rsp, 8);
2856       __ ret(0);
2857     }
2858     {
2859       StubCodeMark mark(this, "StubRoutines", "log10");
2860       StubRoutines::_intrinsic_log10 = (double (*)(double)) __ pc();
2861 
2862       __ subq(rsp, 8);
2863       __ movdbl(Address(rsp, 0), xmm0);
2864       __ fld_d(Address(rsp, 0));
2865       __ flog10();
2866       __ fstp_d(Address(rsp, 0));
2867       __ movdbl(xmm0, Address(rsp, 0));
2868       __ addq(rsp, 8);
2869       __ ret(0);
2870     }
2871     {
2872       StubCodeMark mark(this, "StubRoutines", "sin");
2873       StubRoutines::_intrinsic_sin = (double (*)(double)) __ pc();
2874 
2875       __ subq(rsp, 8);
2876       __ movdbl(Address(rsp, 0), xmm0);
2877       __ fld_d(Address(rsp, 0));
2878       __ trigfunc('s');
2879       __ fstp_d(Address(rsp, 0));
2880       __ movdbl(xmm0, Address(rsp, 0));
2881       __ addq(rsp, 8);
2882       __ ret(0);
2883     }
2884     {
2885       StubCodeMark mark(this, "StubRoutines", "cos");
2886       StubRoutines::_intrinsic_cos = (double (*)(double)) __ pc();
2887 
2888       __ subq(rsp, 8);
2889       __ movdbl(Address(rsp, 0), xmm0);
2890       __ fld_d(Address(rsp, 0));
2891       __ trigfunc('c');
2892       __ fstp_d(Address(rsp, 0));
2893       __ movdbl(xmm0, Address(rsp, 0));
2894       __ addq(rsp, 8);
2895       __ ret(0);
2896     }
2897     {
2898       StubCodeMark mark(this, "StubRoutines", "tan");
2899       StubRoutines::_intrinsic_tan = (double (*)(double)) __ pc();
2900 
2901       __ subq(rsp, 8);
2902       __ movdbl(Address(rsp, 0), xmm0);
2903       __ fld_d(Address(rsp, 0));
2904       __ trigfunc('t');
2905       __ fstp_d(Address(rsp, 0));
2906       __ movdbl(xmm0, Address(rsp, 0));
2907       __ addq(rsp, 8);
2908       __ ret(0);
2909     }
2910 
2911     // The intrinsic version of these seem to return the same value as
2912     // the strict version.
2913     StubRoutines::_intrinsic_exp = SharedRuntime::dexp;
2914     StubRoutines::_intrinsic_pow = SharedRuntime::dpow;
2915   }
2916 
2917 #undef __
2918 #define __ masm->
2919 
2920   // Continuation point for throwing of implicit exceptions that are
2921   // not handled in the current activation. Fabricates an exception
2922   // oop and initiates normal exception dispatching in this
2923   // frame. Since we need to preserve callee-saved values (currently
2924   // only for C2, but done for C1 as well) we need a callee-saved oop
2925   // map and therefore have to make these stubs into RuntimeStubs
2926   // rather than BufferBlobs.  If the compiler needs all registers to
2927   // be preserved between the fault point and the exception handler
2928   // then it must assume responsibility for that in
2929   // AbstractCompiler::continuation_for_implicit_null_exception or
2930   // continuation_for_implicit_division_by_zero_exception. All other
2931   // implicit exceptions (e.g., NullPointerException or
2932   // AbstractMethodError on entry) are either at call sites or
2933   // otherwise assume that stack unwinding will be initiated, so
2934   // caller saved registers were assumed volatile in the compiler.
2935   address generate_throw_exception(const char* name,
2936                                    address runtime_entry,
2937                                    bool restore_saved_exception_pc) {
2938     // Information about frame layout at time of blocking runtime call.
2939     // Note that we only have to preserve callee-saved registers since
2940     // the compilers are responsible for supplying a continuation point
2941     // if they expect all registers to be preserved.
2942     enum layout {
2943       rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
2944       rbp_off2,
2945       return_off,
2946       return_off2,
2947       framesize // inclusive of return address
2948     };
2949 
2950     int insts_size = 512;
2951     int locs_size  = 64;
2952 
2953     CodeBuffer code(name, insts_size, locs_size);
2954     OopMapSet* oop_maps  = new OopMapSet();
2955     MacroAssembler* masm = new MacroAssembler(&code);
2956 
2957     address start = __ pc();
2958 
2959     // This is an inlined and slightly modified version of call_VM
2960     // which has the ability to fetch the return PC out of
2961     // thread-local storage and also sets up last_Java_sp slightly
2962     // differently than the real call_VM
2963     if (restore_saved_exception_pc) {
2964       __ movptr(rax,
2965                 Address(r15_thread,
2966                         in_bytes(JavaThread::saved_exception_pc_offset())));
2967       __ push(rax);
2968     }
2969 
2970     __ enter(); // required for proper stackwalking of RuntimeStub frame
2971 
2972     assert(is_even(framesize/2), "sp not 16-byte aligned");
2973 
2974     // return address and rbp are already in place
2975     __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
2976 
2977     int frame_complete = __ pc() - start;
2978 
2979     // Set up last_Java_sp and last_Java_fp
2980     __ set_last_Java_frame(rsp, rbp, NULL);
2981 
2982     // Call runtime
2983     __ movptr(c_rarg0, r15_thread);
2984     BLOCK_COMMENT("call runtime_entry");
2985     __ call(RuntimeAddress(runtime_entry));
2986 
2987     // Generate oop map
2988     OopMap* map = new OopMap(framesize, 0);
2989 
2990     oop_maps->add_gc_map(__ pc() - start, map);
2991 
2992     __ reset_last_Java_frame(true, false);
2993 
2994     __ leave(); // required for proper stackwalking of RuntimeStub frame
2995 
2996     // check for pending exceptions
2997 #ifdef ASSERT
2998     Label L;
2999     __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()),
3000             (int32_t) NULL_WORD);
3001     __ jcc(Assembler::notEqual, L);
3002     __ should_not_reach_here();
3003     __ bind(L);
3004 #endif // ASSERT
3005     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3006 
3007 
3008     // codeBlob framesize is in words (not VMRegImpl::slot_size)
3009     RuntimeStub* stub =
3010       RuntimeStub::new_runtime_stub(name,
3011                                     &code,
3012                                     frame_complete,
3013                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3014                                     oop_maps, false);
3015     return stub->entry_point();
3016   }
3017 
3018   // Initialization
3019   void generate_initial() {
3020     // Generates all stubs and initializes the entry points
3021 
3022     // This platform-specific stub is needed by generate_call_stub()
3023     StubRoutines::x86::_mxcsr_std        = generate_fp_mask("mxcsr_std",        0x0000000000001F80);
3024 
3025     // entry points that exist in all platforms Note: This is code
3026     // that could be shared among different platforms - however the
3027     // benefit seems to be smaller than the disadvantage of having a
3028     // much more complicated generator structure. See also comment in
3029     // stubRoutines.hpp.
3030 
3031     StubRoutines::_forward_exception_entry = generate_forward_exception();
3032 
3033     StubRoutines::_call_stub_entry =
3034       generate_call_stub(StubRoutines::_call_stub_return_address);
3035 
3036     // is referenced by megamorphic call
3037     StubRoutines::_catch_exception_entry = generate_catch_exception();
3038 
3039     // atomic calls
3040     StubRoutines::_atomic_xchg_entry         = generate_atomic_xchg();
3041     StubRoutines::_atomic_xchg_ptr_entry     = generate_atomic_xchg_ptr();
3042     StubRoutines::_atomic_cmpxchg_entry      = generate_atomic_cmpxchg();
3043     StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
3044     StubRoutines::_atomic_add_entry          = generate_atomic_add();
3045     StubRoutines::_atomic_add_ptr_entry      = generate_atomic_add_ptr();
3046     StubRoutines::_fence_entry               = generate_orderaccess_fence();
3047 
3048     StubRoutines::_handler_for_unsafe_access_entry =
3049       generate_handler_for_unsafe_access();
3050 
3051     // platform dependent
3052     StubRoutines::x86::_get_previous_fp_entry = generate_get_previous_fp();
3053 
3054     StubRoutines::x86::_verify_mxcsr_entry    = generate_verify_mxcsr();
3055   }
3056 
3057   void generate_all() {
3058     // Generates all stubs and initializes the entry points
3059 
3060     // These entry points require SharedInfo::stack0 to be set up in
3061     // non-core builds and need to be relocatable, so they each
3062     // fabricate a RuntimeStub internally.
3063     StubRoutines::_throw_AbstractMethodError_entry =
3064       generate_throw_exception("AbstractMethodError throw_exception",
3065                                CAST_FROM_FN_PTR(address,
3066                                                 SharedRuntime::
3067                                                 throw_AbstractMethodError),
3068                                false);
3069 
3070     StubRoutines::_throw_IncompatibleClassChangeError_entry =
3071       generate_throw_exception("IncompatibleClassChangeError throw_exception",
3072                                CAST_FROM_FN_PTR(address,
3073                                                 SharedRuntime::
3074                                                 throw_IncompatibleClassChangeError),
3075                                false);
3076 
3077     StubRoutines::_throw_ArithmeticException_entry =
3078       generate_throw_exception("ArithmeticException throw_exception",
3079                                CAST_FROM_FN_PTR(address,
3080                                                 SharedRuntime::
3081                                                 throw_ArithmeticException),
3082                                true);
3083 
3084     StubRoutines::_throw_NullPointerException_entry =
3085       generate_throw_exception("NullPointerException throw_exception",
3086                                CAST_FROM_FN_PTR(address,
3087                                                 SharedRuntime::
3088                                                 throw_NullPointerException),
3089                                true);
3090 
3091     StubRoutines::_throw_NullPointerException_at_call_entry =
3092       generate_throw_exception("NullPointerException at call throw_exception",
3093                                CAST_FROM_FN_PTR(address,
3094                                                 SharedRuntime::
3095                                                 throw_NullPointerException_at_call),
3096                                false);
3097 
3098     StubRoutines::_throw_StackOverflowError_entry =
3099       generate_throw_exception("StackOverflowError throw_exception",
3100                                CAST_FROM_FN_PTR(address,
3101                                                 SharedRuntime::
3102                                                 throw_StackOverflowError),
3103                                false);
3104 
3105     // entry points that are platform specific
3106     StubRoutines::x86::_f2i_fixup = generate_f2i_fixup();
3107     StubRoutines::x86::_f2l_fixup = generate_f2l_fixup();
3108     StubRoutines::x86::_d2i_fixup = generate_d2i_fixup();
3109     StubRoutines::x86::_d2l_fixup = generate_d2l_fixup();
3110 
3111     StubRoutines::x86::_float_sign_mask  = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
3112     StubRoutines::x86::_float_sign_flip  = generate_fp_mask("float_sign_flip",  0x8000000080000000);
3113     StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
3114     StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
3115 
3116     // support for verify_oop (must happen after universe_init)
3117     StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
3118 
3119     // arraycopy stubs used by compilers
3120     generate_arraycopy_stubs();
3121 
3122     generate_math_stubs();
3123   }
3124 
3125  public:
3126   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
3127     if (all) {
3128       generate_all();
3129     } else {
3130       generate_initial();
3131     }
3132   }
3133 }; // end class declaration
3134 
3135 void StubGenerator_generate(CodeBuffer* code, bool all) {
3136   StubGenerator g(code, all);
3137 }