1 /*
   2  * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "interpreter/interpreter.hpp"
  30 #include "nativeInst_aarch64.hpp"
  31 #include "oops/instanceOop.hpp"
  32 #include "oops/method.hpp"
  33 #include "oops/objArrayKlass.hpp"
  34 #include "oops/oop.inline.hpp"
  35 #include "prims/methodHandles.hpp"
  36 #include "runtime/frame.inline.hpp"
  37 #include "runtime/handles.inline.hpp"
  38 #include "runtime/sharedRuntime.hpp"
  39 #include "runtime/stubCodeGenerator.hpp"
  40 #include "runtime/stubRoutines.hpp"
  41 #include "runtime/thread.inline.hpp"
  42 #include "utilities/top.hpp"
  43 #ifdef COMPILER2
  44 #include "opto/runtime.hpp"
  45 #endif
  46 
  47 #ifdef BUILTIN_SIM
  48 #include "../../../../../../simulator/simulator.hpp"
  49 #endif
  50 
  51 // Declaration and definition of StubGenerator (no .hpp file).
  52 // For a more detailed description of the stub routine structure
  53 // see the comment in stubRoutines.hpp
  54 
  55 #undef __
  56 #define __ _masm->
  57 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
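// i.e. scale an index by the size of an in-heap oop: 4 bytes when compressed
// oops are in use, 8 bytes otherwise.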
  58 
  59 #ifdef PRODUCT
  60 #define BLOCK_COMMENT(str) /* nothing */
  61 #else
  62 #define BLOCK_COMMENT(str) __ block_comment(str)
  63 #endif
  64 
  65 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  66 
  67 // Stub Code definitions
  68 
  69 class StubGenerator: public StubCodeGenerator {
  70  private:
  71 
  72 #ifdef PRODUCT
  73 #define inc_counter_np(counter) ((void)0)
  74 #else
  75   void inc_counter_np_(int& counter) {
  76     __ lea(rscratch2, ExternalAddress((address)&counter));
  77     __ ldrw(rscratch1, Address(rscratch2));
  78     __ addw(rscratch1, rscratch1, 1);
  79     __ strw(rscratch1, Address(rscratch2));
  80   }
  81 #define inc_counter_np(counter) \
  82   BLOCK_COMMENT("inc_counter " #counter); \
  83   inc_counter_np_(counter);
  84 #endif
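
  // For reference, the counter bump emitted above is just the non-atomic
  // equivalent of the C statement below; it is only used for statistics in
  // non-product builds, so an occasional lost update is acceptable:
  //
  //   counter = counter + 1;   // ldrw / addw / strw, no synchronization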
  85 
  86   // Call stubs are used to call Java from C
  87   //
  88   // Arguments:
  89   //    c_rarg0:   call wrapper address                   address
  90   //    c_rarg1:   result                                 address
  91   //    c_rarg2:   result type                            BasicType
  92   //    c_rarg3:   method                                 Method*
  93   //    c_rarg4:   (interpreter) entry point              address
  94   //    c_rarg5:   parameters                             intptr_t*
  95   //    c_rarg6:   parameter size (in words)              int
  96   //    c_rarg7:   thread                                 Thread*
  97   //
  98   // There is no return from the stub itself as any Java result
  99   // is written to result
 100   //
  // we save r30 (lr) as the return PC at the base of the frame and
  // save r29 (fp) just below it, then install sp (r31) into fp so that
  // fp becomes the frame pointer for this frame.
 104   //
 105   // we save r0-r7, which accounts for all the c arguments.
 106   //
 107   // TODO: strictly do we need to save them all? they are treated as
 108   // volatile by C so could we omit saving the ones we are going to
 109   // place in global registers (thread? method?) or those we only use
 110   // during setup of the Java call?
 111   //
 112   // we don't need to save r8 which C uses as an indirect result location
 113   // return register.
 114   //
 115   // we don't need to save r9-r15 which both C and Java treat as
 116   // volatile
 117   //
 118   // we don't need to save r16-18 because Java does not use them
 119   //
 120   // we save r19-r28 which Java uses as scratch registers and C
 121   // expects to be callee-save
 122   //
 123   // we don't save any FP registers since only v8-v15 are callee-save
 124   // (strictly only the f and d components) and Java uses them as
 125   // callee-save. v0-v7 are arg registers and C treats v16-v31 as
 126   // volatile (as does Java?)
 127   //
 128   // so the stub frame looks like this when we enter Java code
 129   //
 130   //     [ return_from_Java     ] <--- sp
 131   //     [ argument word n      ]
 132   //      ...
 133   // -27 [ argument word 1      ]
 134   // -26 [ saved d15            ] <--- sp_after_call
 135   // -25 [ saved d14            ]
 136   // -24 [ saved d13            ]
 137   // -23 [ saved d12            ]
 138   // -22 [ saved d11            ]
 139   // -21 [ saved d10            ]
 140   // -20 [ saved d9             ]
 141   // -19 [ saved d8             ]
 142   // -18 [ saved r28            ]
 143   // -17 [ saved r27            ]
 144   // -16 [ saved r26            ]
 145   // -15 [ saved r25            ]
 146   // -14 [ saved r24            ]
 147   // -13 [ saved r23            ]
 148   // -12 [ saved r22            ]
 149   // -11 [ saved r21            ]
 150   // -10 [ saved r20            ]
 151   //  -9 [ saved r19            ]
 152   //  -8 [ call wrapper    (r0) ]
 153   //  -7 [ result          (r1) ]
 154   //  -6 [ result type     (r2) ]
 155   //  -5 [ method          (r3) ]
 156   //  -4 [ entry point     (r4) ]
 157   //  -3 [ parameters      (r5) ]
 158   //  -2 [ parameter size  (r6) ]
 159   //  -1 [ thread (r7)          ]
 160   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 161   //   1 [ saved lr       (r30) ]
 162 
 163   // Call stub stack layout word offsets from fp
 164   enum call_stub_layout {
 165     sp_after_call_off = -26,
 166 
 167     d15_off            = -26,
 168     d14_off            = -25,
 169     d13_off            = -24,
 170     d12_off            = -23,
 171     d11_off            = -22,
 172     d10_off            = -21,
 173     d9_off             = -20,
 174     d8_off             = -19,
 175 
 176     r28_off            = -18,
 177     r27_off            = -17,
 178     r26_off            = -16,
 179     r25_off            = -15,
 180     r24_off            = -14,
 181     r23_off            = -13,
 182     r22_off            = -12,
 183     r21_off            = -11,
 184     r20_off            = -10,
 185     r19_off            =  -9,
 186     call_wrapper_off   =  -8,
 187     result_off         =  -7,
 188     result_type_off    =  -6,
 189     method_off         =  -5,
 190     entry_point_off    =  -4,
 191     parameters_off     =  -3,
 192     parameter_size_off =  -2,
 193     thread_off         =  -1,
 194     fp_f               =   0,
 195     retaddr_off        =   1,
 196   };
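
  // For example, with wordSize == 8, call_wrapper_off == -8 places the call
  // wrapper argument at [fp - 64]; the Address constants built in
  // generate_call_stub() below encode exactly these fp-relative slots.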
 197 
 198   address generate_call_stub(address& return_address) {
 199     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 200            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 201            "adjust this code");
 202 
 203     StubCodeMark mark(this, "StubRoutines", "call_stub");
 204     address start = __ pc();
 205 
 206     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 207 
 208     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 209     const Address result        (rfp, result_off         * wordSize);
 210     const Address result_type   (rfp, result_type_off    * wordSize);
 211     const Address method        (rfp, method_off         * wordSize);
 212     const Address entry_point   (rfp, entry_point_off    * wordSize);
 213     const Address parameters    (rfp, parameters_off     * wordSize);
 214     const Address parameter_size(rfp, parameter_size_off * wordSize);
 215 
 216     const Address thread        (rfp, thread_off         * wordSize);
 217 
 218     const Address d15_save      (rfp, d15_off * wordSize);
 219     const Address d14_save      (rfp, d14_off * wordSize);
 220     const Address d13_save      (rfp, d13_off * wordSize);
 221     const Address d12_save      (rfp, d12_off * wordSize);
 222     const Address d11_save      (rfp, d11_off * wordSize);
 223     const Address d10_save      (rfp, d10_off * wordSize);
 224     const Address d9_save       (rfp, d9_off * wordSize);
 225     const Address d8_save       (rfp, d8_off * wordSize);
 226 
 227     const Address r28_save      (rfp, r28_off * wordSize);
 228     const Address r27_save      (rfp, r27_off * wordSize);
 229     const Address r26_save      (rfp, r26_off * wordSize);
 230     const Address r25_save      (rfp, r25_off * wordSize);
 231     const Address r24_save      (rfp, r24_off * wordSize);
 232     const Address r23_save      (rfp, r23_off * wordSize);
 233     const Address r22_save      (rfp, r22_off * wordSize);
 234     const Address r21_save      (rfp, r21_off * wordSize);
 235     const Address r20_save      (rfp, r20_off * wordSize);
 236     const Address r19_save      (rfp, r19_off * wordSize);
 237 
 238     // stub code
 239 
 240     // we need a C prolog to bootstrap the x86 caller into the sim
 241     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 242 
 243     address aarch64_entry = __ pc();
 244 
 245 #ifdef BUILTIN_SIM
 246     // Save sender's SP for stack traces.
 247     __ mov(rscratch1, sp);
 248     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 249 #endif
 250     // set up frame and move sp to end of save area
 251     __ enter();
 252     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 253 
 254     // save register parameters and Java scratch/global registers
 255     // n.b. we save thread even though it gets installed in
 256     // rthread because we want to sanity check rthread later
 257     __ str(c_rarg7,  thread);
 258     __ strw(c_rarg6, parameter_size);
 259     __ str(c_rarg5,  parameters);
 260     __ str(c_rarg4,  entry_point);
 261     __ str(c_rarg3,  method);
 262     __ str(c_rarg2,  result_type);
 263     __ str(c_rarg1,  result);
 264     __ str(c_rarg0,  call_wrapper);
 265     __ str(r19,      r19_save);
 266     __ str(r20,      r20_save);
 267     __ str(r21,      r21_save);
 268     __ str(r22,      r22_save);
 269     __ str(r23,      r23_save);
 270     __ str(r24,      r24_save);
 271     __ str(r25,      r25_save);
 272     __ str(r26,      r26_save);
 273     __ str(r27,      r27_save);
 274     __ str(r28,      r28_save);
 275 
 276     __ strd(v8,      d8_save);
 277     __ strd(v9,      d9_save);
 278     __ strd(v10,     d10_save);
 279     __ strd(v11,     d11_save);
 280     __ strd(v12,     d12_save);
 281     __ strd(v13,     d13_save);
 282     __ strd(v14,     d14_save);
 283     __ strd(v15,     d15_save);
 284 
 285     // install Java thread in global register now we have saved
 286     // whatever value it held
 287     __ mov(rthread, c_rarg7);
 288     // And method
 289     __ mov(rmethod, c_rarg3);
 290 
 291     // set up the heapbase register
 292     __ reinit_heapbase();
 293 
 294 #ifdef ASSERT
 295     // make sure we have no pending exceptions
 296     {
 297       Label L;
 298       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 299       __ cmp(rscratch1, (unsigned)NULL_WORD);
 300       __ br(Assembler::EQ, L);
 301       __ stop("StubRoutines::call_stub: entered with pending exception");
 302       __ BIND(L);
 303     }
 304 #endif
 305     // pass parameters if any
 306     __ mov(esp, sp);
 307     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 308     __ andr(sp, rscratch1, -2 * wordSize);
 309 
 310     BLOCK_COMMENT("pass parameters if any");
 311     Label parameters_done;
 312     // parameter count is still in c_rarg6
 313     // and parameter pointer identifying param 1 is in c_rarg5
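    //
    // In effect the loop below does the following (illustrative C sketch,
    // not the generated code):
    //
    //   while (parameter_count-- > 0)
    //     push(*parameters++);         // copy the arguments onto the Java stack
    //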
 314     __ cbzw(c_rarg6, parameters_done);
 315 
 316     address loop = __ pc();
 317     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 318     __ subsw(c_rarg6, c_rarg6, 1);
 319     __ push(rscratch1);
 320     __ br(Assembler::GT, loop);
 321 
 322     __ BIND(parameters_done);
 323 
    // call Java entry -- passing Method* and current sp
 325     //      rmethod: Method*
 326     //      r13: sender sp
 327     BLOCK_COMMENT("call Java function");
 328     __ mov(r13, sp);
 329     __ blr(c_rarg4);
 330 
 331     // tell the simulator we have returned to the stub
 332 
 333     // we do this here because the notify will already have been done
 334     // if we get to the next instruction via an exception
 335     //
 336     // n.b. adding this instruction here affects the calculation of
 337     // whether or not a routine returns to the call stub (used when
 338     // doing stack walks) since the normal test is to check the return
 339     // pc against the address saved below. so we may need to allow for
 340     // this extra instruction in the check.
 341 
 342     if (NotifySimulator) {
 343       __ notify(Assembler::method_reentry);
 344     }
 345     // save current address for use by exception handling code
 346 
 347     return_address = __ pc();
 348 
 349     // store result depending on type (everything that is not
 350     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 351     // n.b. this assumes Java returns an integral result in r0
 352     // and a floating result in j_farg0
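    //
    // As a C sketch (illustrative only; r0 and v0 stand for the integer and
    // floating-point return registers):
    //
    //   switch (result_type) {
    //   case T_OBJECT:
    //   case T_LONG:   *(jlong*)  result = r0;       break;  // full 64-bit store
    //   case T_FLOAT:  *(jfloat*) result = v0;       break;
    //   case T_DOUBLE: *(jdouble*)result = v0;       break;
    //   default:       *(jint*)   result = (jint)r0; break;  // T_INT and friends
    //   }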
 353     __ ldr(j_rarg2, result);
 354     Label is_long, is_float, is_double, exit;
 355     __ ldr(j_rarg1, result_type);
 356     __ cmp(j_rarg1, T_OBJECT);
 357     __ br(Assembler::EQ, is_long);
 358     __ cmp(j_rarg1, T_LONG);
 359     __ br(Assembler::EQ, is_long);
 360     __ cmp(j_rarg1, T_FLOAT);
 361     __ br(Assembler::EQ, is_float);
 362     __ cmp(j_rarg1, T_DOUBLE);
 363     __ br(Assembler::EQ, is_double);
 364 
 365     // handle T_INT case
 366     __ strw(r0, Address(j_rarg2));
 367 
 368     __ BIND(exit);
 369 
 370     // pop parameters
 371     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 372 
 373 #ifdef ASSERT
 374     // verify that threads correspond
 375     {
 376       Label L, S;
 377       __ ldr(rscratch1, thread);
 378       __ cmp(rthread, rscratch1);
 379       __ br(Assembler::NE, S);
 380       __ get_thread(rscratch1);
 381       __ cmp(rthread, rscratch1);
 382       __ br(Assembler::EQ, L);
 383       __ BIND(S);
 384       __ stop("StubRoutines::call_stub: threads must correspond");
 385       __ BIND(L);
 386     }
 387 #endif
 388 
 389     // restore callee-save registers
 390     __ ldrd(v15,      d15_save);
 391     __ ldrd(v14,      d14_save);
 392     __ ldrd(v13,      d13_save);
 393     __ ldrd(v12,      d12_save);
 394     __ ldrd(v11,      d11_save);
 395     __ ldrd(v10,      d10_save);
 396     __ ldrd(v9,       d9_save);
 397     __ ldrd(v8,       d8_save);
 398 
 399     __ ldr(r28,      r28_save);
 400     __ ldr(r27,      r27_save);
 401     __ ldr(r26,      r26_save);
 402     __ ldr(r25,      r25_save);
 403     __ ldr(r24,      r24_save);
 404     __ ldr(r23,      r23_save);
 405     __ ldr(r22,      r22_save);
 406     __ ldr(r21,      r21_save);
 407     __ ldr(r20,      r20_save);
 408     __ ldr(r19,      r19_save);
 409     __ ldr(c_rarg0,  call_wrapper);
 410     __ ldr(c_rarg1,  result);
 411     __ ldrw(c_rarg2, result_type);
 412     __ ldr(c_rarg3,  method);
 413     __ ldr(c_rarg4,  entry_point);
 414     __ ldr(c_rarg5,  parameters);
 415     __ ldr(c_rarg6,  parameter_size);
 416     __ ldr(c_rarg7,  thread);
 417 
 418 #ifndef PRODUCT
 419     // tell the simulator we are about to end Java execution
 420     if (NotifySimulator) {
 421       __ notify(Assembler::method_exit);
 422     }
 423 #endif
 424     // leave frame and return to caller
 425     __ leave();
 426     __ ret(lr);
 427 
 428     // handle return types different from T_INT
 429 
 430     __ BIND(is_long);
 431     __ str(r0, Address(j_rarg2, 0));
 432     __ br(Assembler::AL, exit);
 433 
 434     __ BIND(is_float);
 435     __ strs(j_farg0, Address(j_rarg2, 0));
 436     __ br(Assembler::AL, exit);
 437 
 438     __ BIND(is_double);
 439     __ strd(j_farg0, Address(j_rarg2, 0));
 440     __ br(Assembler::AL, exit);
 441 
 442     return start;
 443   }
 444 
 445   // Return point for a Java call if there's an exception thrown in
 446   // Java code.  The exception is caught and transformed into a
 447   // pending exception stored in JavaThread that can be tested from
 448   // within the VM.
 449   //
 450   // Note: Usually the parameters are removed by the callee. In case
 451   // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // sp.
 454   //
 455   // r0: exception oop
 456 
 457   // NOTE: this is used as a target from the signal handler so it
 458   // needs an x86 prolog which returns into the current simulator
 459   // executing the generated catch_exception code. so the prolog
 460   // needs to install rax in a sim register and adjust the sim's
 461   // restart pc to enter the generated code at the start position
 462   // then return from native to simulated execution.
 463 
 464   address generate_catch_exception() {
 465     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 466     address start = __ pc();
 467 
 468     // same as in generate_call_stub():
 469     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 470     const Address thread        (rfp, thread_off         * wordSize);
 471 
 472 #ifdef ASSERT
 473     // verify that threads correspond
 474     {
 475       Label L, S;
 476       __ ldr(rscratch1, thread);
 477       __ cmp(rthread, rscratch1);
 478       __ br(Assembler::NE, S);
 479       __ get_thread(rscratch1);
 480       __ cmp(rthread, rscratch1);
 481       __ br(Assembler::EQ, L);
 482       __ bind(S);
 483       __ stop("StubRoutines::catch_exception: threads must correspond");
 484       __ bind(L);
 485     }
 486 #endif
 487 
 488     // set pending exception
 489     __ verify_oop(r0);
 490 
 491     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 492     __ mov(rscratch1, (address)__FILE__);
 493     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 494     __ movw(rscratch1, (int)__LINE__);
 495     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 496 
 497     // complete return to VM
 498     assert(StubRoutines::_call_stub_return_address != NULL,
 499            "_call_stub_return_address must have been generated before");
 500     __ b(StubRoutines::_call_stub_return_address);
 501 
 502     return start;
 503   }
 504 
 505   // Continuation point for runtime calls returning with a pending
 506   // exception.  The pending exception check happened in the runtime
 507   // or native call stub.  The pending exception in Thread is
 508   // converted into a Java-level exception.
 509   //
 510   // Contract with Java-level exception handlers:
 511   // r0: exception
 512   // r3: throwing pc
 513   //
 514   // NOTE: At entry of this stub, exception-pc must be in LR !!
 515 
 516   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog
 518 
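  // In outline the stub does the following (illustrative sketch, not the
  // generated code; see the commented assembly below for details):
  //
  //   address throwing_pc = lr;
  //   address handler =
  //       SharedRuntime::exception_handler_for_return_address(thread, throwing_pc);
  //   r0 = thread->pending_exception();    // exception oop
  //   clear thread's pending exception;
  //   r3 = throwing_pc;
  //   goto handler;
  //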
 519   address generate_forward_exception() {
 520     StubCodeMark mark(this, "StubRoutines", "forward exception");
 521     address start = __ pc();
 522 
 523     // Upon entry, LR points to the return address returning into
 524     // Java (interpreted or compiled) code; i.e., the return address
 525     // becomes the throwing pc.
 526     //
 527     // Arguments pushed before the runtime call are still on the stack
 528     // but the exception handler will reset the stack pointer ->
 529     // ignore them.  A potential result in registers can be ignored as
 530     // well.
 531 
 532 #ifdef ASSERT
 533     // make sure this code is only executed if there is a pending exception
 534     {
 535       Label L;
 536       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 537       __ cbnz(rscratch1, L);
 538       __ stop("StubRoutines::forward exception: no pending exception (1)");
 539       __ bind(L);
 540     }
 541 #endif
 542 
 543     // compute exception handler into r19
 544 
 545     // call the VM to find the handler address associated with the
 546     // caller address. pass thread in r0 and caller pc (ret address)
 547     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 548     // the stack.
 549     __ mov(c_rarg1, lr);
 550     // lr will be trashed by the VM call so we move it to R19
 551     // (callee-saved) because we also need to pass it to the handler
 552     // returned by this call.
 553     __ mov(r19, lr);
 554     BLOCK_COMMENT("call exception_handler_for_return_address");
 555     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 556                          SharedRuntime::exception_handler_for_return_address),
 557                     rthread, c_rarg1);
 558     // we should not really care that lr is no longer the callee
 559     // address. we saved the value the handler needs in r19 so we can
 560     // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
 562     // the PC for the frame above the handler belongs to a compiled
 563     // Java method. So, we restore lr here to satisfy that assert.
 564     __ mov(lr, r19);
 565     // setup r0 & r3 & clear pending exception
 566     __ mov(r3, r19);
 567     __ mov(r19, r0);
 568     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 569     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 570 
 571 #ifdef ASSERT
 572     // make sure exception is set
 573     {
 574       Label L;
 575       __ cbnz(r0, L);
 576       __ stop("StubRoutines::forward exception: no pending exception (2)");
 577       __ bind(L);
 578     }
 579 #endif
 580 
 581     // continue at exception handler
 582     // r0: exception
 583     // r3: throwing pc
 584     // r19: exception handler
 585     __ verify_oop(r0);
 586     __ br(r19);
 587 
 588     return start;
 589   }
 590 
 591   // Non-destructive plausibility checks for oops
 592   //
 593   // Arguments:
 594   //    r0: oop to verify
 595   //    rscratch1: error message
 596   //
 597   // Stack after saving c_rarg3:
 598   //    [tos + 0]: saved c_rarg3
 599   //    [tos + 1]: saved c_rarg2
 600   //    [tos + 2]: saved lr
 601   //    [tos + 3]: saved rscratch2
 602   //    [tos + 4]: saved r0
 603   //    [tos + 5]: saved rscratch1
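  //
  // The plausibility test itself amounts to (illustrative sketch; klass_of()
  // stands for the load_klass() performed below):
  //
  //   if (obj == NULL)                                            return;     // NULL is OK
  //   if (((uintptr_t)obj & Universe::verify_oop_mask()) !=
  //       Universe::verify_oop_bits())                            goto error;
  //   if (klass_of(obj) == NULL)                                  goto error;
  //   return;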
 604   address generate_verify_oop() {
 605 
 606     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 607     address start = __ pc();
 608 
 609     Label exit, error;
 610 
 611     // save c_rarg2 and c_rarg3
 612     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 613 
 614     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 615     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 616     __ ldr(c_rarg3, Address(c_rarg2));
 617     __ add(c_rarg3, c_rarg3, 1);
 618     __ str(c_rarg3, Address(c_rarg2));
 619 
 620     // object is in r0
 621     // make sure object is 'reasonable'
 622     __ cbz(r0, exit); // if obj is NULL it is OK
 623 
 624     // Check if the oop is in the right area of memory
 625     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 626     __ andr(c_rarg2, r0, c_rarg3);
 627     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 628 
 629     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 630     // instruction here because the flags register is live.
 631     __ eor(c_rarg2, c_rarg2, c_rarg3);
 632     __ cbnz(c_rarg2, error);
 633 
    // make sure klass is 'reasonable', i.e. not zero.
 635     __ load_klass(r0, r0);  // get klass
 636     __ cbz(r0, error);      // if klass is NULL it is broken
 637 
 638     // return if everything seems ok
 639     __ bind(exit);
 640 
 641     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 642     __ ret(lr);
 643 
 644     // handle errors
 645     __ bind(error);
 646     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 647 
 648     __ push(RegSet::range(r0, r29), sp);
 649     // debug(char* msg, int64_t pc, int64_t regs[])
 650     __ mov(c_rarg0, rscratch1);      // pass address of error message
 651     __ mov(c_rarg1, lr);             // pass return address
 652     __ mov(c_rarg2, sp);             // pass address of regs on stack
 653 #ifndef PRODUCT
 654     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 655 #endif
 656     BLOCK_COMMENT("call MacroAssembler::debug");
 657     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 658     __ blrt(rscratch1, 3, 0, 1);
 659 
 660     return start;
 661   }
 662 
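  // N.B. at present this just emits an unconditional branch to L_no_overlap,
  // i.e. no actual overlap check is generated here.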
 663   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 664 
 665   // Generate code for an array write pre barrier
 666   //
 667   //     addr    -  starting address
 668   //     count   -  element count
 669   //     tmp     - scratch register
 670   //
 671   //     Destroy no registers!
 672   //
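  //  For the G1 case the generated code is equivalent to the following call,
  //  made with all integer registers saved and restored around it (sketch):
  //
  //    if (!dest_uninitialized)
  //      BarrierSet::static_write_ref_array_pre(addr, count);
  //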
  void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCTLogging:
      // With G1, don't generate the call if we statically know that the target is uninitialized
      if (!dest_uninitialized) {
        __ push(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
        if (count == c_rarg0) {
          if (addr == c_rarg1) {
            // exactly backwards!!
            __ stp(c_rarg0, c_rarg1, __ pre(sp, -2 * wordSize));
            __ ldp(c_rarg1, c_rarg0, __ post(sp, 2 * wordSize));
          } else {
            __ mov(c_rarg1, count);
            __ mov(c_rarg0, addr);
          }
        } else {
          __ mov(c_rarg0, addr);
          __ mov(c_rarg1, count);
        }
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
        __ pop(RegSet::range(r0, r29), sp);          // integer registers except lr & sp
      }
      break;
    case BarrierSet::CardTableModRef:
    case BarrierSet::CardTableExtension:
    case BarrierSet::ModRef:
      break;
    default:
      ShouldNotReachHere();
    }
  }
 706 
 707   //
 708   // Generate code for an array write post barrier
 709   //
 710   //  Input:
 711   //     start    - register containing starting address of destination array
 712   //     end      - register containing ending address of destination array
 713   //     scratch  - scratch register
 714   //
 715   //  The input registers are overwritten.
 716   //  The ending address is inclusive.
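  //
  //  For the card-table kinds the emitted loop is equivalent to the sketch
  //  below (0 is the dirty-card value; byte_map_base is pre-biased so it can
  //  be indexed directly by address >> card_shift):
  //
  //    for (uintptr_t card = (uintptr_t)start >> card_shift;
  //         card <= ((uintptr_t)end >> card_shift); card++)
  //      ct->byte_map_base[card] = 0;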
 717   void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
 718     assert_different_registers(start, end, scratch);
 719     BarrierSet* bs = Universe::heap()->barrier_set();
 720     switch (bs->kind()) {
 721       case BarrierSet::G1SATBCTLogging:
 722 
 723         {
 724           __ push(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
 725           // must compute element count unless barrier set interface is changed (other platforms supply count)
 726           assert_different_registers(start, end, scratch);
 727           __ lea(scratch, Address(end, BytesPerHeapOop));
 728           __ sub(scratch, scratch, start);               // subtract start to get #bytes
 729           __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
 730           __ mov(c_rarg0, start);
 731           __ mov(c_rarg1, scratch);
 732           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
          __ pop(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
 734         }
 735         break;
 736       case BarrierSet::CardTableModRef:
 737       case BarrierSet::CardTableExtension:
 738         {
 739           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
 740           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
 741 
 742           Label L_loop;
 743 
 744            __ lsr(start, start, CardTableModRefBS::card_shift);
 745            __ lsr(end, end, CardTableModRefBS::card_shift);
 746            __ sub(end, end, start); // number of bytes to copy
 747 
 748           const Register count = end; // 'end' register contains bytes count now
 749           __ mov(scratch, (address)ct->byte_map_base);
 750           __ add(start, start, scratch);
 751           __ BIND(L_loop);
 752           __ strb(zr, Address(start, count));
 753           __ subs(count, count, 1);
 754           __ br(Assembler::HS, L_loop);
 755         }
 756         break;
 757       default:
 758         ShouldNotReachHere();
 759 
 760     }
 761   }
 762 
 763   typedef enum {
 764     copy_forwards = 1,
 765     copy_backwards = -1
 766   } copy_direction;
 767 
 768   // Bulk copy of blocks of 8 words.
 769   //
 770   // count is a count of words.
 771   //
 772   // Precondition: count >= 2
 773   //
 774   // Postconditions:
 775   //
 776   // The least significant bit of count contains the remaining count
 777   // of words to copy.  The rest of count is trash.
 778   //
 779   // s and d are adjusted to point to the remaining words to copy
 780   //
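  // The main loop below is software pipelined: each iteration stores the
  // eight words loaded by the previous iteration while loading the next
  // eight.  Roughly, for the forward direction (illustrative sketch only):
  //
  //   load 8 words from s into t0..t7;
  //   do {
  //     store t0..t7 to d;  load the next 8 words from s into t0..t7;
  //     s += 64;  d += 64;
  //   } while (at least 8 more words remain);
  //   store t0..t7 to d;                  // drain the pipeline
  //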
 781   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 782                            copy_direction direction) {
 783     int unit = wordSize * direction;
 784 
 785     int offset;
 786     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 787       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 788 
 789     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 790     assert_different_registers(s, d, count, rscratch1);
 791 
 792     Label again, large, small;
 793     __ align(6);
 794     __ bind(start);
 795     __ cmp(count, 8);
 796     __ br(Assembler::LO, small);
 797     if (direction == copy_forwards) {
 798       __ sub(s, s, 2 * wordSize);
 799       __ sub(d, d, 2 * wordSize);
 800     }
 801     __ subs(count, count, 16);
 802     __ br(Assembler::GE, large);
 803 
 804     // 8 <= count < 16 words.  Copy 8.
 805     __ ldp(t0, t1, Address(s, 2 * unit));
 806     __ ldp(t2, t3, Address(s, 4 * unit));
 807     __ ldp(t4, t5, Address(s, 6 * unit));
 808     __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 809 
 810     __ stp(t0, t1, Address(d, 2 * unit));
 811     __ stp(t2, t3, Address(d, 4 * unit));
 812     __ stp(t4, t5, Address(d, 6 * unit));
 813     __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 814 
 815     if (direction == copy_forwards) {
 816       __ add(s, s, 2 * wordSize);
 817       __ add(d, d, 2 * wordSize);
 818     }
 819 
 820     {
 821       Label L1, L2;
 822       __ bind(small);
 823       __ tbz(count, exact_log2(4), L1);
 824       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 825       __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 826       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 827       __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 828       __ bind(L1);
 829 
 830       __ tbz(count, 1, L2);
 831       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 832       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 833       __ bind(L2);
 834     }
 835 
 836     __ ret(lr);
 837 
 838     __ align(6);
 839     __ bind(large);
 840 
 841     // Fill 8 registers
 842     __ ldp(t0, t1, Address(s, 2 * unit));
 843     __ ldp(t2, t3, Address(s, 4 * unit));
 844     __ ldp(t4, t5, Address(s, 6 * unit));
 845     __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 846 
 847     __ bind(again);
 848 
 849     if (direction == copy_forwards && PrefetchCopyIntervalInBytes > 0)
 850       __ prfm(Address(s, PrefetchCopyIntervalInBytes), PLDL1KEEP);
 851 
 852     __ stp(t0, t1, Address(d, 2 * unit));
 853     __ ldp(t0, t1, Address(s, 2 * unit));
 854     __ stp(t2, t3, Address(d, 4 * unit));
 855     __ ldp(t2, t3, Address(s, 4 * unit));
 856     __ stp(t4, t5, Address(d, 6 * unit));
 857     __ ldp(t4, t5, Address(s, 6 * unit));
 858     __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 859     __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 860 
 861     __ subs(count, count, 8);
 862     __ br(Assembler::HS, again);
 863 
 864     // Drain
 865     __ stp(t0, t1, Address(d, 2 * unit));
 866     __ stp(t2, t3, Address(d, 4 * unit));
 867     __ stp(t4, t5, Address(d, 6 * unit));
 868     __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 869 
 870     if (direction == copy_forwards) {
 871       __ add(s, s, 2 * wordSize);
 872       __ add(d, d, 2 * wordSize);
 873     }
 874 
 875     {
 876       Label L1, L2;
 877       __ tbz(count, exact_log2(4), L1);
 878       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 879       __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 880       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 881       __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 882       __ bind(L1);
 883 
 884       __ tbz(count, 1, L2);
 885       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 886       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 887       __ bind(L2);
 888     }
 889 
 890     __ ret(lr);
 891   }
 892 
 893   // Small copy: less than 16 bytes.
 894   //
 895   // NB: Ignores all of the bits of count which represent more than 15
 896   // bytes, so a caller doesn't have to mask them.
 897 
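  // The residual count is handled by testing its low bits; for a byte copy
  // (granularity == 1) this amounts to (sketch):
  //
  //   if (count & 8) copy 8 bytes;
  //   if (count & 4) copy 4 bytes;
  //   if (count & 2) copy 2 bytes;
  //   if (count & 1) copy 1 byte;
  //
  // For larger granularities only the applicable tests are emitted.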
 898   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
 899     bool is_backwards = step < 0;
 900     size_t granularity = uabs(step);
 901     int direction = is_backwards ? -1 : 1;
 902     int unit = wordSize * direction;
 903 
 904     Label Lpair, Lword, Lint, Lshort, Lbyte;
 905 
 906     assert(granularity
 907            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 908 
 909     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
 910 
 911     // ??? I don't know if this bit-test-and-branch is the right thing
 912     // to do.  It does a lot of jumping, resulting in several
 913     // mispredicted branches.  It might make more sense to do this
 914     // with something like Duff's device with a single computed branch.
 915 
 916     __ tbz(count, 3 - exact_log2(granularity), Lword);
 917     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
 918     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
 919     __ bind(Lword);
 920 
 921     if (granularity <= sizeof (jint)) {
 922       __ tbz(count, 2 - exact_log2(granularity), Lint);
 923       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 924       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 925       __ bind(Lint);
 926     }
 927 
 928     if (granularity <= sizeof (jshort)) {
 929       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 930       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 931       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 932       __ bind(Lshort);
 933     }
 934 
 935     if (granularity <= sizeof (jbyte)) {
 936       __ tbz(count, 0, Lbyte);
 937       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 938       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 939       __ bind(Lbyte);
 940     }
 941   }
 942 
 943   Label copy_f, copy_b;
 944 
 945   // All-singing all-dancing memory copy.
 946   //
 947   // Copy count units of memory from s to d.  The size of a unit is
 948   // step, which can be positive or negative depending on the direction
 949   // of copy.  If is_aligned is false, we align the source address.
 950   //
 951 
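  // The overall strategy is (sketch):
  //
  //   if (count < 16/granularity)  goto tail;               // small copy
  //   copy leading elements until s is 2-word aligned
  //     (d moves by the same amount, so only s is guaranteed aligned);
  //   bulk-copy 8-word blocks via copy_f / copy_b;
  //   tail: copy whatever remains with copy_memory_small.
  //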
 952   void copy_memory(bool is_aligned, Register s, Register d,
 953                    Register count, Register tmp, int step) {
 954     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 955     bool is_backwards = step < 0;
 956     int granularity = uabs(step);
 957     const Register t0 = r3, t1 = r4;
 958 
 959     if (is_backwards) {
 960       __ lea(s, Address(s, count, Address::uxtw(exact_log2(-step))));
 961       __ lea(d, Address(d, count, Address::uxtw(exact_log2(-step))));
 962     }
 963 
 964     Label done, tail;
 965 
 966     __ cmp(count, 16/granularity);
 967     __ br(Assembler::LO, tail);
 968 
 969     // Now we've got the small case out of the way we can align the
 970     // source address on a 2-word boundary.
 971 
 972     Label aligned;
 973 
 974     if (is_aligned) {
 975       // We may have to adjust by 1 word to get s 2-word-aligned.
 976       __ tbz(s, exact_log2(wordSize), aligned);
 977       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
 978       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
 979       __ sub(count, count, wordSize/granularity);
 980     } else {
 981       if (is_backwards) {
 982         __ andr(rscratch2, s, 2 * wordSize - 1);
 983       } else {
 984         __ neg(rscratch2, s);
 985         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
 986       }
 987       // rscratch2 is the byte adjustment needed to align s.
 988       __ cbz(rscratch2, aligned);
 989       __ lsr(rscratch2, rscratch2, exact_log2(granularity));
 990       __ sub(count, count, rscratch2);
 991 
 992 #if 0
 993       // ?? This code is only correct for a disjoint copy.  It may or
 994       // may not make sense to use it in that case.
 995 
 996       // Copy the first pair; s and d may not be aligned.
 997       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 998       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 999 
1000       // Align s and d, adjust count
1001       if (is_backwards) {
1002         __ sub(s, s, rscratch2);
1003         __ sub(d, d, rscratch2);
1004       } else {
1005         __ add(s, s, rscratch2);
1006         __ add(d, d, rscratch2);
1007       }
1008 #else
1009       copy_memory_small(s, d, rscratch2, rscratch1, step);
1010 #endif
1011     }
1012 
1013     __ cmp(count, 16/granularity);
1014     __ br(Assembler::LT, tail);
1015     __ bind(aligned);
1016 
1017     // s is now 2-word-aligned.
1018 
1019     // We have a count of units and some trailing bytes.  Adjust the
1020     // count and do a bulk copy of words.
1021     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1022     if (direction == copy_forwards)
1023       __ bl(copy_f);
1024     else
1025       __ bl(copy_b);
1026 
1027     // And the tail.
1028 
1029     __ bind(tail);
1030     copy_memory_small(s, d, count, tmp, step);
1031   }
1032 
1033 
1034   void clobber_registers() {
1035 #ifdef ASSERT
1036     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1037     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1038     for (Register r = r3; r <= r18; r++)
1039       if (r != rscratch1) __ mov(r, rscratch1);
1040 #endif
1041   }
1042 
1043   // Scan over array at a for count oops, verifying each one.
1044   // Preserves a and count, clobbers rscratch1 and rscratch2.
1045   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1046     Label loop, end;
1047     __ mov(rscratch1, a);
1048     __ mov(rscratch2, zr);
1049     __ bind(loop);
1050     __ cmp(rscratch2, count);
1051     __ br(Assembler::HS, end);
1052     if (size == (size_t)wordSize) {
1053       __ ldr(temp, Address(a, rscratch2, Address::uxtw(exact_log2(size))));
1054       __ verify_oop(temp);
1055     } else {
      __ ldrw(temp, Address(a, rscratch2, Address::uxtw(exact_log2(size))));
1057       __ decode_heap_oop(temp); // calls verify_oop
1058     }
1059     __ add(rscratch2, rscratch2, size);
1060     __ b(loop);
1061     __ bind(end);
1062   }
1063 
1064   // Arguments:
1065   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1066   //             ignored
1067   //   is_oop  - true => oop array, so generate store check code
1068   //   name    - stub name string
1069   //
1070   // Inputs:
1071   //   c_rarg0   - source array address
1072   //   c_rarg1   - destination array address
1073   //   c_rarg2   - element count, treated as ssize_t, can be zero
1074   //
1075   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1076   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1078   //
1079   // Side Effects:
1080   //   disjoint_int_copy_entry is set to the no-overlap entry point
1081   //   used by generate_conjoint_int_oop_copy().
1082   //
1083   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1084                                   const char *name, bool dest_uninitialized = false) {
1085     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1086     __ align(CodeEntryAlignment);
1087     StubCodeMark mark(this, "StubRoutines", name);
1088     address start = __ pc();
1089     if (entry != NULL) {
1090       *entry = __ pc();
1091       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1092       BLOCK_COMMENT("Entry:");
1093     }
1094     __ enter();
1095     if (is_oop) {
1096       __ push(RegSet::of(d, count), sp);
1097       // no registers are destroyed by this call
1098       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1099     }
1100     copy_memory(aligned, s, d, count, rscratch1, size);
1101     if (is_oop) {
1102       __ pop(RegSet::of(d, count), sp);
1103       if (VerifyOops)
1104         verify_oop_array(size, d, count, r16);
1105       __ sub(count, count, 1); // make an inclusive end pointer
1106       __ lea(count, Address(d, count, Address::uxtw(exact_log2(size))));
1107       gen_write_ref_array_post_barrier(d, count, rscratch1);
1108     }
1109     __ leave();
1110     __ ret(lr);
1111 #ifdef BUILTIN_SIM
1112     {
1113       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1114       sim->notifyCompile(const_cast<char*>(name), start);
1115     }
1116 #endif
1117     return start;
1118   }
1119 
1120   // Arguments:
1121   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1122   //             ignored
1123   //   is_oop  - true => oop array, so generate store check code
1124   //   name    - stub name string
1125   //
1126   // Inputs:
1127   //   c_rarg0   - source array address
1128   //   c_rarg1   - destination array address
1129   //   c_rarg2   - element count, treated as ssize_t, can be zero
1130   //
1131   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1132   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1134   //
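  // If the destination does not lie above the source (d <= s) a forward copy
  // is safe even when the arrays overlap, so the code below simply branches
  // to the no-overlap (disjoint) entry in that case and only copies
  // backwards otherwise.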
1135   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1136                                  address *entry, const char *name,
1137                                  bool dest_uninitialized = false) {
1138     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1139 
1140     StubCodeMark mark(this, "StubRoutines", name);
1141     address start = __ pc();
1142 
1143     __ cmp(d, s);
1144     __ br(Assembler::LS, nooverlap_target);
1145 
1146     __ enter();
1147     if (is_oop) {
1148       __ push(RegSet::of(d, count), sp);
1149       // no registers are destroyed by this call
1150       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1151     }
1152     copy_memory(aligned, s, d, count, rscratch1, -size);
1153     if (is_oop) {
1154       __ pop(RegSet::of(d, count), sp);
1155       if (VerifyOops)
1156         verify_oop_array(size, d, count, r16);
1157       __ sub(count, count, 1); // make an inclusive end pointer
1158       __ lea(count, Address(d, count, Address::uxtw(exact_log2(size))));
1159       gen_write_ref_array_post_barrier(d, count, rscratch1);
1160     }
1161     __ leave();
1162     __ ret(lr);
1163 #ifdef BUILTIN_SIM
1164     {
1165       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1166       sim->notifyCompile(const_cast<char*>(name), start);
1167     }
1168 #endif
1169     return start;
  }
1171 
1172   // Arguments:
1173   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1174   //             ignored
1175   //   name    - stub name string
1176   //
1177   // Inputs:
1178   //   c_rarg0   - source array address
1179   //   c_rarg1   - destination array address
1180   //   c_rarg2   - element count, treated as ssize_t, can be zero
1181   //
1182   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1183   // we let the hardware handle it.  The one to eight bytes within words,
1184   // dwords or qwords that span cache line boundaries will still be loaded
1185   // and stored atomically.
1186   //
1194   // Side Effects:
1195   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1196   //   used by generate_conjoint_byte_copy().
1197   //
1198   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1199     const bool not_oop = false;
1200     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1201   }
1202 
1203   // Arguments:
1204   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1205   //             ignored
1206   //   name    - stub name string
1207   //
1208   // Inputs:
1209   //   c_rarg0   - source array address
1210   //   c_rarg1   - destination array address
1211   //   c_rarg2   - element count, treated as ssize_t, can be zero
1212   //
1213   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1214   // we let the hardware handle it.  The one to eight bytes within words,
1215   // dwords or qwords that span cache line boundaries will still be loaded
1216   // and stored atomically.
1217   //
1218   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1219                                       address* entry, const char *name) {
1220     const bool not_oop = false;
1221     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1222   }
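
  // For reference, generate_arraycopy_stubs() (later in this file) wires the
  // disjoint and conjoint variants together roughly as follows (sketch; the
  // exact entry-point bookkeeping differs):
  //
  //   address entry;
  //   StubRoutines::_jbyte_disjoint_arraycopy =
  //       generate_disjoint_byte_copy(false, &entry, "jbyte_disjoint_arraycopy");
  //   // the conjoint stub branches to 'entry' when the arrays don't overlap
  //   StubRoutines::_jbyte_arraycopy =
  //       generate_conjoint_byte_copy(false, entry, NULL, "jbyte_arraycopy");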
1223 
1224   // Arguments:
1225   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1226   //             ignored
1227   //   name    - stub name string
1228   //
1229   // Inputs:
1230   //   c_rarg0   - source array address
1231   //   c_rarg1   - destination array address
1232   //   c_rarg2   - element count, treated as ssize_t, can be zero
1233   //
1234   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1235   // let the hardware handle it.  The two or four words within dwords
1236   // or qwords that span cache line boundaries will still be loaded
1237   // and stored atomically.
1238   //
1239   // Side Effects:
1240   //   disjoint_short_copy_entry is set to the no-overlap entry point
1241   //   used by generate_conjoint_short_copy().
1242   //
1243   address generate_disjoint_short_copy(bool aligned,
1244                                        address* entry, const char *name) {
1245     const bool not_oop = false;
1246     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1247   }
1248 
1249   // Arguments:
1250   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1251   //             ignored
1252   //   name    - stub name string
1253   //
1254   // Inputs:
1255   //   c_rarg0   - source array address
1256   //   c_rarg1   - destination array address
1257   //   c_rarg2   - element count, treated as ssize_t, can be zero
1258   //
1259   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1260   // let the hardware handle it.  The two or four words within dwords
1261   // or qwords that span cache line boundaries will still be loaded
1262   // and stored atomically.
1263   //
1264   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1265                                        address *entry, const char *name) {
1266     const bool not_oop = false;
1267     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
1271   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1272   //             ignored
1273   //   name    - stub name string
1274   //
1275   // Inputs:
1276   //   c_rarg0   - source array address
1277   //   c_rarg1   - destination array address
1278   //   c_rarg2   - element count, treated as ssize_t, can be zero
1279   //
1280   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1281   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1283   //
1284   // Side Effects:
1285   //   disjoint_int_copy_entry is set to the no-overlap entry point
1286   //   used by generate_conjoint_int_oop_copy().
1287   //
1288   address generate_disjoint_int_copy(bool aligned, address *entry,
1289                                          const char *name, bool dest_uninitialized = false) {
1290     const bool not_oop = false;
1291     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1292   }
1293 
1294   // Arguments:
1295   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1296   //             ignored
1297   //   name    - stub name string
1298   //
1299   // Inputs:
1300   //   c_rarg0   - source array address
1301   //   c_rarg1   - destination array address
1302   //   c_rarg2   - element count, treated as ssize_t, can be zero
1303   //
1304   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1305   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1307   //
1308   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1309                                      address *entry, const char *name,
1310                                      bool dest_uninitialized = false) {
1311     const bool not_oop = false;
1312     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1313   }
1314 
1315 
1316   // Arguments:
1317   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1318   //             ignored
1319   //   name    - stub name string
1320   //
1321   // Inputs:
1322   //   c_rarg0   - source array address
1323   //   c_rarg1   - destination array address
1324   //   c_rarg2   - element count, treated as size_t, can be zero
1325   //
1326   // Side Effects:
1327   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1328   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1329   //
1330   address generate_disjoint_long_copy(bool aligned, address *entry,
1331                                           const char *name, bool dest_uninitialized = false) {
1332     const bool not_oop = false;
1333     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1334   }
1335 
1336   // Arguments:
1337   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1338   //             ignored
1339   //   name    - stub name string
1340   //
1341   // Inputs:
1342   //   c_rarg0   - source array address
1343   //   c_rarg1   - destination array address
1344   //   c_rarg2   - element count, treated as size_t, can be zero
1345   //
1346   address generate_conjoint_long_copy(bool aligned,
1347                                       address nooverlap_target, address *entry,
1348                                       const char *name, bool dest_uninitialized = false) {
1349     const bool not_oop = false;
1350     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1351   }
1352 
1353   // Arguments:
1354   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1355   //             ignored
1356   //   name    - stub name string
1357   //
1358   // Inputs:
1359   //   c_rarg0   - source array address
1360   //   c_rarg1   - destination array address
1361   //   c_rarg2   - element count, treated as size_t, can be zero
1362   //
1363   // Side Effects:
1364   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1365   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1366   //
1367   address generate_disjoint_oop_copy(bool aligned, address *entry,
1368                                      const char *name, bool dest_uninitialized = false) {
1369     const bool is_oop = true;
1370     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1371     return generate_disjoint_copy(size, aligned, is_oop, entry, name);
1372   }
1373 
1374   // Arguments:
1375   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1376   //             ignored
1377   //   name    - stub name string
1378   //
1379   // Inputs:
1380   //   c_rarg0   - source array address
1381   //   c_rarg1   - destination array address
1382   //   c_rarg2   - element count, treated as size_t, can be zero
1383   //
1384   address generate_conjoint_oop_copy(bool aligned,
1385                                      address nooverlap_target, address *entry,
1386                                      const char *name, bool dest_uninitialized = false) {
1387     const bool is_oop = true;
1388     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1389     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, name);
1390   }
1391 
1392 
1393   // Helper for generating a dynamic type check.
1394   // Smashes rscratch1.
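  // The fast path loads the word at [sub_klass + super_check_offset] and
  // compares it with super_klass; if that is inconclusive the slow path
  // scans the secondary-supers list (see MacroAssembler::check_klass_subtype_*).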
1395   void generate_type_check(Register sub_klass,
1396                            Register super_check_offset,
1397                            Register super_klass,
1398                            Label& L_success) {
1399     assert_different_registers(sub_klass, super_check_offset, super_klass);
1400 
1401     BLOCK_COMMENT("type_check:");
1402 
1403     Label L_miss;
1404 
1405     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1406                                      super_check_offset);
1407     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1408 
1409     // Fall through on failure!
1410     __ BIND(L_miss);
1411   }
1412 
1413   //
1414   //  Generate checkcasting array copy stub
1415   //
1416   //  Input:
1417   //    c_rarg0   - source array address
1418   //    c_rarg1   - destination array address
1419   //    c_rarg2   - element count, treated as ssize_t, can be zero
1420   //    c_rarg3   - size_t ckoff (super_check_offset)
1421   //    c_rarg4   - oop ckval (super_klass)
1422   //
1423   //  Output:
1424   //    r0 ==  0  -  success
1425   //    r0 == -1^K - failure, where K is partial transfer count
1426   //
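       //  For example (illustrative): a type-check failure after K == 3 copied
       //  elements returns r0 == -1^3 == ~3 == -4; the caller recovers K as ~r0.
       //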
1427   address generate_checkcast_copy(const char *name, address *entry,
1428                                   bool dest_uninitialized = false) {
1429 
1430     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1431 
1432     // Input registers (after setup_arg_regs)
1433     const Register from        = c_rarg0;   // source array address
1434     const Register to          = c_rarg1;   // destination array address
1435     const Register count       = c_rarg2;   // element count
1436     const Register ckoff       = c_rarg3;   // super_check_offset
1437     const Register ckval       = c_rarg4;   // super_klass
1438 
1439     // Registers used as temps (r18, r19, r20 are save-on-entry)
1440     const Register count_save  = r21;       // original element count
1441     const Register start_to    = r20;       // destination array start address
1442     const Register copied_oop  = r18;       // actual oop copied
1443     const Register r19_klass   = r19;       // oop._klass
1444 
1445     //---------------------------------------------------------------
1446     // Assembler stub will be used for this call to arraycopy
1447     // if the two arrays are subtypes of Object[] but the
1448     // destination array type is not equal to or a supertype
1449     // of the source type.  Each element must be separately
1450     // checked.
1451 
1452     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1453                                copied_oop, r19_klass, count_save);
1454 
1455     __ align(CodeEntryAlignment);
1456     StubCodeMark mark(this, "StubRoutines", name);
1457     address start = __ pc();
1458 
1459     __ enter(); // required for proper stackwalking of RuntimeStub frame
1460 
1461 #ifdef ASSERT
1462     // caller guarantees that the arrays really are different
1463     // otherwise, we would have to make conjoint checks
1464     { Label L;
1465       array_overlap_test(L, TIMES_OOP);
1466       __ stop("checkcast_copy within a single array");
1467       __ bind(L);
1468     }
1469 #endif //ASSERT
1470 
1471     // Caller of this entry point must set up the argument registers.
1472     if (entry != NULL) {
1473       *entry = __ pc();
1474       BLOCK_COMMENT("Entry:");
1475     }
1476 
1477      // Empty array:  Nothing to do.
1478     __ cbz(count, L_done);
1479 
1480     __ push(RegSet::of(r18, r19, r20, r21), sp);
1481 
1482 #ifdef ASSERT
1483     BLOCK_COMMENT("assert consistent ckoff/ckval");
1484     // The ckoff and ckval must be mutually consistent,
1485     // even though caller generates both.
1486     { Label L;
1487       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1488       __ ldrw(start_to, Address(ckval, sco_offset));
1489       __ cmpw(ckoff, start_to);
1490       __ br(Assembler::EQ, L);
1491       __ stop("super_check_offset inconsistent");
1492       __ bind(L);
1493     }
1494 #endif //ASSERT
1495 
1496     // save the original count
1497     __ mov(count_save, count);
1498 
1499     // Copy from low to high addresses
1500     __ mov(start_to, to);              // Save destination array start address
1501     __ b(L_load_element);
1502 
1503     // ======== begin loop ========
1504     // (Loop is rotated; its entry is L_load_element.)
1505     // Loop control:
1506     //   for (; count != 0; count--) {
1507     //     copied_oop = load_heap_oop(from++);
1508     //     ... generate_type_check ...;
1509     //     store_heap_oop(to++, copied_oop);
1510     //   }
1511     __ align(OptoLoopAlignment);
1512 
1513     __ BIND(L_store_element);
1514     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1515     __ sub(count, count, 1);
1516     __ cbz(count, L_do_card_marks);
1517 
1518     // ======== loop entry is here ========
1519     __ BIND(L_load_element);
1520     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1521     __ cbz(copied_oop, L_store_element);
1522 
1523     __ load_klass(r19_klass, copied_oop);// query the object klass
1524     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1525     // ======== end loop ========
1526 
1527     // It was a real error; we must depend on the caller to finish the job.
1528     // Register count = remaining oops, count_orig = total oops.
1529     // Emit GC store barriers for the oops we have copied and report
1530     // their number to the caller.
1531 
1532     __ subs(count, count_save, count);     // K = partially copied oop count
1533     __ eon(count, count, zr);                   // report (-1^K) to caller
1534     __ br(Assembler::EQ, L_done_pop);
1535 
1536     __ BIND(L_do_card_marks);
1537     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1538     gen_write_ref_array_post_barrier(start_to, to, rscratch1);
1539 
1540     __ bind(L_done_pop);
1541     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1542     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1543 
1544     __ bind(L_done);
1545     __ mov(r0, count);
1546     __ leave();
1547     __ ret(lr);
1548 
1549     return start;
1550   }
1551 
1552   // Perform range checks on the proposed arraycopy.
1553   // Kills temp, but nothing else.
1554   // Also, clean the sign bits of src_pos and dst_pos.
1555   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1556                               Register src_pos, // source position (c_rarg1)
1557                               Register dst,     // destination array oop (c_rarg2)
1558                               Register dst_pos, // destination position (c_rarg3)
1559                               Register length,
1560                               Register temp,
1561                               Label& L_failed) { Unimplemented(); }
1562 
1563   // These stubs get called from some dumb test routine.
1564   // I'll write them properly when they're called from
1565   // something that's actually doing something.
1566   static void fake_arraycopy_stub(address src, address dst, int count) {
1567     assert(count == 0, "huh?");
1568   }
1569 
1570 
1571   void generate_arraycopy_stubs() {
1572     address entry;
1573     address entry_jbyte_arraycopy;
1574     address entry_jshort_arraycopy;
1575     address entry_jint_arraycopy;
1576     address entry_oop_arraycopy;
1577     address entry_jlong_arraycopy;
1578     address entry_checkcast_arraycopy;
1579 
1580     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
1581     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
1582 
1583     //*** jbyte
1584     // Always need aligned and unaligned versions
1585     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
1586                                                                                   "jbyte_disjoint_arraycopy");
1587     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
1588                                                                                   &entry_jbyte_arraycopy,
1589                                                                                   "jbyte_arraycopy");
1590     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
1591                                                                                   "arrayof_jbyte_disjoint_arraycopy");
1592     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
1593                                                                                   "arrayof_jbyte_arraycopy");
1594 
1595     //*** jshort
1596     // Always need aligned and unaligned versions
1597     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
1598                                                                                     "jshort_disjoint_arraycopy");
1599     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
1600                                                                                     &entry_jshort_arraycopy,
1601                                                                                     "jshort_arraycopy");
1602     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
1603                                                                                     "arrayof_jshort_disjoint_arraycopy");
1604     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
1605                                                                                     "arrayof_jshort_arraycopy");
1606 
1607     //*** jint
1608     // Aligned versions
1609     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
1610                                                                                 "arrayof_jint_disjoint_arraycopy");
1611     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
1612                                                                                 "arrayof_jint_arraycopy");
1613     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
1614     // entry_jint_arraycopy always points to the unaligned version
1615     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
1616                                                                                 "jint_disjoint_arraycopy");
1617     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
1618                                                                                 &entry_jint_arraycopy,
1619                                                                                 "jint_arraycopy");
1620 
1621     //*** jlong
1622     // It is always aligned
1623     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
1624                                                                                   "arrayof_jlong_disjoint_arraycopy");
1625     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
1626                                                                                   "arrayof_jlong_arraycopy");
1627     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
1628     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
1629 
1630     //*** oops
1631     {
1632       // With compressed oops we need unaligned versions; notice that
1633       // we overwrite entry_oop_arraycopy.
1634       bool aligned = !UseCompressedOops;
1635 
1636       StubRoutines::_arrayof_oop_disjoint_arraycopy
1637         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy");
1638       StubRoutines::_arrayof_oop_arraycopy
1639         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy");
1640       // Aligned versions without pre-barriers
1641       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
1642         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
1643                                      /*dest_uninitialized*/true);
1644       StubRoutines::_arrayof_oop_arraycopy_uninit
1645         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
1646                                      /*dest_uninitialized*/true);
1647     }
1648 
1649     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
1650     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
1651     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
1652     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
1653 
1654     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
1655     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
1656                                                                         /*dest_uninitialized*/true);
1657   }
1658 
1659   void generate_math_stubs() { Unimplemented(); }
1660 
1661   // Arguments:
1662   //
1663   // Inputs:
1664   //   c_rarg0   - source byte array address
1665   //   c_rarg1   - destination byte array address
1666   //   c_rarg2   - K (key) in little endian int array
1667   //
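       // The key length (in ints) is 4*(rounds+1): keylen == 44, 52 or 60 for
       // AES-128, AES-192 or AES-256, selecting 10, 12 or 14 rounds below.
       //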
1668   address generate_aescrypt_encryptBlock() {
1669     __ align(CodeEntryAlignment);
1670     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
1671 
1672     Label L_doLast;
1673 
1674     const Register from        = c_rarg0;  // source array address
1675     const Register to          = c_rarg1;  // destination array address
1676     const Register key         = c_rarg2;  // key array address
1677     const Register keylen      = rscratch1;
1678 
1679     address start = __ pc();
1680     __ enter();
1681 
1682     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
1683 
1684     __ ld1(v0, __ T16B, from); // get 16 bytes of input
1685 
1686     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
1687     __ rev32(v1, __ T16B, v1);
1688     __ rev32(v2, __ T16B, v2);
1689     __ rev32(v3, __ T16B, v3);
1690     __ rev32(v4, __ T16B, v4);
1691     __ aese(v0, v1);
1692     __ aesmc(v0, v0);
1693     __ aese(v0, v2);
1694     __ aesmc(v0, v0);
1695     __ aese(v0, v3);
1696     __ aesmc(v0, v0);
1697     __ aese(v0, v4);
1698     __ aesmc(v0, v0);
1699 
1700     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
1701     __ rev32(v1, __ T16B, v1);
1702     __ rev32(v2, __ T16B, v2);
1703     __ rev32(v3, __ T16B, v3);
1704     __ rev32(v4, __ T16B, v4);
1705     __ aese(v0, v1);
1706     __ aesmc(v0, v0);
1707     __ aese(v0, v2);
1708     __ aesmc(v0, v0);
1709     __ aese(v0, v3);
1710     __ aesmc(v0, v0);
1711     __ aese(v0, v4);
1712     __ aesmc(v0, v0);
1713 
1714     __ ld1(v1, v2, __ T16B, __ post(key, 32));
1715     __ rev32(v1, __ T16B, v1);
1716     __ rev32(v2, __ T16B, v2);
1717 
1718     __ cmpw(keylen, 44);
1719     __ br(Assembler::EQ, L_doLast);
1720 
1721     __ aese(v0, v1);
1722     __ aesmc(v0, v0);
1723     __ aese(v0, v2);
1724     __ aesmc(v0, v0);
1725 
1726     __ ld1(v1, v2, __ T16B, __ post(key, 32));
1727     __ rev32(v1, __ T16B, v1);
1728     __ rev32(v2, __ T16B, v2);
1729 
1730     __ cmpw(keylen, 52);
1731     __ br(Assembler::EQ, L_doLast);
1732 
1733     __ aese(v0, v1);
1734     __ aesmc(v0, v0);
1735     __ aese(v0, v2);
1736     __ aesmc(v0, v0);
1737 
1738     __ ld1(v1, v2, __ T16B, __ post(key, 32));
1739     __ rev32(v1, __ T16B, v1);
1740     __ rev32(v2, __ T16B, v2);
1741 
1742     __ BIND(L_doLast);
1743 
1744     __ aese(v0, v1);
1745     __ aesmc(v0, v0);
1746     __ aese(v0, v2);
1747 
1748     __ ld1(v1, __ T16B, key);
1749     __ rev32(v1, __ T16B, v1);
1750     __ eor(v0, __ T16B, v0, v1);
1751 
1752     __ st1(v0, __ T16B, to);
1753 
1754     __ mov(r0, 0);
1755 
1756     __ leave();
1757     __ ret(lr);
1758 
1759     return start;
1760   }
1761 
1762   // Arguments:
1763   //
1764   // Inputs:
1765   //   c_rarg0   - source byte array address
1766   //   c_rarg1   - destination byte array address
1767   //   c_rarg2   - K (key) in little endian int array
1768   //
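       // Mirrors the encrypt stub: keylen (44/52/60 ints) selects 10/12/14
       // rounds, using AESD/AESIMC instead of AESE/AESMC; the first 16 bytes
       // of the key array (loaded into v5 below) are applied last, by the
       // final eor.
       //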
1769   address generate_aescrypt_decryptBlock() {
1770     assert(UseAES, "need AES instructions");
1771     __ align(CodeEntryAlignment);
1772     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
1773     Label L_doLast;
1774 
1775     const Register from        = c_rarg0;  // source array address
1776     const Register to          = c_rarg1;  // destination array address
1777     const Register key         = c_rarg2;  // key array address
1778     const Register keylen      = rscratch1;
1779 
1780     address start = __ pc();
1781     __ enter(); // required for proper stackwalking of RuntimeStub frame
1782 
1783     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
1784 
1785     __ ld1(v0, __ T16B, from); // get 16 bytes of input
1786 
1787     __ ld1(v5, __ T16B, __ post(key, 16));
1788     __ rev32(v5, __ T16B, v5);
1789 
1790     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
1791     __ rev32(v1, __ T16B, v1);
1792     __ rev32(v2, __ T16B, v2);
1793     __ rev32(v3, __ T16B, v3);
1794     __ rev32(v4, __ T16B, v4);
1795     __ aesd(v0, v1);
1796     __ aesimc(v0, v0);
1797     __ aesd(v0, v2);
1798     __ aesimc(v0, v0);
1799     __ aesd(v0, v3);
1800     __ aesimc(v0, v0);
1801     __ aesd(v0, v4);
1802     __ aesimc(v0, v0);
1803 
1804     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
1805     __ rev32(v1, __ T16B, v1);
1806     __ rev32(v2, __ T16B, v2);
1807     __ rev32(v3, __ T16B, v3);
1808     __ rev32(v4, __ T16B, v4);
1809     __ aesd(v0, v1);
1810     __ aesimc(v0, v0);
1811     __ aesd(v0, v2);
1812     __ aesimc(v0, v0);
1813     __ aesd(v0, v3);
1814     __ aesimc(v0, v0);
1815     __ aesd(v0, v4);
1816     __ aesimc(v0, v0);
1817 
1818     __ ld1(v1, v2, __ T16B, __ post(key, 32));
1819     __ rev32(v1, __ T16B, v1);
1820     __ rev32(v2, __ T16B, v2);
1821 
1822     __ cmpw(keylen, 44);
1823     __ br(Assembler::EQ, L_doLast);
1824 
1825     __ aesd(v0, v1);
1826     __ aesimc(v0, v0);
1827     __ aesd(v0, v2);
1828     __ aesimc(v0, v0);
1829 
1830     __ ld1(v1, v2, __ T16B, __ post(key, 32));
1831     __ rev32(v1, __ T16B, v1);
1832     __ rev32(v2, __ T16B, v2);
1833 
1834     __ cmpw(keylen, 52);
1835     __ br(Assembler::EQ, L_doLast);
1836 
1837     __ aesd(v0, v1);
1838     __ aesimc(v0, v0);
1839     __ aesd(v0, v2);
1840     __ aesimc(v0, v0);
1841 
1842     __ ld1(v1, v2, __ T16B, __ post(key, 32));
1843     __ rev32(v1, __ T16B, v1);
1844     __ rev32(v2, __ T16B, v2);
1845 
1846     __ BIND(L_doLast);
1847 
1848     __ aesd(v0, v1);
1849     __ aesimc(v0, v0);
1850     __ aesd(v0, v2);
1851 
1852     __ eor(v0, __ T16B, v0, v5);
1853 
1854     __ st1(v0, __ T16B, to);
1855 
1856     __ mov(r0, 0);
1857 
1858     __ leave();
1859     __ ret(lr);
1860 
1861     return start;
1862   }
1863 
1864   // Arguments:
1865   //
1866   // Inputs:
1867   //   c_rarg0   - source byte array address
1868   //   c_rarg1   - destination byte array address
1869   //   c_rarg2   - K (key) in little endian int array
1870   //   c_rarg3   - r vector byte array address
1871   //   c_rarg4   - input length
1872   //
1873   // Output:
1874   //   r0        - input length
1875   //
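       // Illustrative recurrence implemented by the loop below:
       //   C[0] = E_K(P[0] ^ IV),  C[i] = E_K(P[i] ^ C[i-1])
       // The IV comes in via rvec, and the last ciphertext block is written
       // back to rvec on exit.
       //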
1876   address generate_cipherBlockChaining_encryptAESCrypt() {
1877     assert(UseAES, "need AES instructions");
1878     __ align(CodeEntryAlignment);
1879     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
1880 
1881     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
1882 
1883     const Register from        = c_rarg0;  // source array address
1884     const Register to          = c_rarg1;  // destination array address
1885     const Register key         = c_rarg2;  // key array address
1886     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
1887                                            // and left with the results of the last encryption block
1888     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
1889     const Register keylen      = rscratch1;
1890 
1891     address start = __ pc();
1892       __ enter();
1893 
1894       __ mov(rscratch2, len_reg);  // save the length for the return value; keylen (rscratch1) is loaded next
1895       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
1896 
1897       __ ld1(v0, __ T16B, rvec);
1898 
1899       __ cmpw(keylen, 52);
1900       __ br(Assembler::CC, L_loadkeys_44);
1901       __ br(Assembler::EQ, L_loadkeys_52);
1902 
1903       __ ld1(v17, v18, __ T16B, __ post(key, 32));
1904       __ rev32(v17, __ T16B, v17);
1905       __ rev32(v18, __ T16B, v18);
1906     __ BIND(L_loadkeys_52);
1907       __ ld1(v19, v20, __ T16B, __ post(key, 32));
1908       __ rev32(v19, __ T16B, v19);
1909       __ rev32(v20, __ T16B, v20);
1910     __ BIND(L_loadkeys_44);
1911       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
1912       __ rev32(v21, __ T16B, v21);
1913       __ rev32(v22, __ T16B, v22);
1914       __ rev32(v23, __ T16B, v23);
1915       __ rev32(v24, __ T16B, v24);
1916       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
1917       __ rev32(v25, __ T16B, v25);
1918       __ rev32(v26, __ T16B, v26);
1919       __ rev32(v27, __ T16B, v27);
1920       __ rev32(v28, __ T16B, v28);
1921       __ ld1(v29, v30, v31, __ T16B, key);
1922       __ rev32(v29, __ T16B, v29);
1923       __ rev32(v30, __ T16B, v30);
1924       __ rev32(v31, __ T16B, v31);
1925 
1926     __ BIND(L_aes_loop);
1927       __ ld1(v1, __ T16B, __ post(from, 16));
1928       __ eor(v0, __ T16B, v0, v1);
1929 
1930       __ br(Assembler::CC, L_rounds_44);
1931       __ br(Assembler::EQ, L_rounds_52);
1932 
1933       __ aese(v0, v17); __ aesmc(v0, v0);
1934       __ aese(v0, v18); __ aesmc(v0, v0);
1935     __ BIND(L_rounds_52);
1936       __ aese(v0, v19); __ aesmc(v0, v0);
1937       __ aese(v0, v20); __ aesmc(v0, v0);
1938     __ BIND(L_rounds_44);
1939       __ aese(v0, v21); __ aesmc(v0, v0);
1940       __ aese(v0, v22); __ aesmc(v0, v0);
1941       __ aese(v0, v23); __ aesmc(v0, v0);
1942       __ aese(v0, v24); __ aesmc(v0, v0);
1943       __ aese(v0, v25); __ aesmc(v0, v0);
1944       __ aese(v0, v26); __ aesmc(v0, v0);
1945       __ aese(v0, v27); __ aesmc(v0, v0);
1946       __ aese(v0, v28); __ aesmc(v0, v0);
1947       __ aese(v0, v29); __ aesmc(v0, v0);
1948       __ aese(v0, v30);
1949       __ eor(v0, __ T16B, v0, v31);
1950 
1951       __ st1(v0, __ T16B, __ post(to, 16));
1952       __ sub(len_reg, len_reg, 16);
1953       __ cbnz(len_reg, L_aes_loop);
1954 
1955       __ st1(v0, __ T16B, rvec);
1956 
1957       __ mov(r0, rscratch2);
1958 
1959       __ leave();
1960       __ ret(lr);
1961 
1962       return start;
1963   }
1964 
1965   // Arguments:
1966   //
1967   // Inputs:
1968   //   c_rarg0   - source byte array address
1969   //   c_rarg1   - destination byte array address
1970   //   c_rarg2   - K (key) in little endian int array
1971   //   c_rarg3   - r vector byte array address
1972   //   c_rarg4   - input length
1973   //
1974   // Output:
1975   //   r0        - input length
1976   //
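       // Illustrative recurrence implemented by the loop below:
       //   P[i] = D_K(C[i]) ^ C[i-1],  with C[-1] = IV from rvec
       // Each ciphertext block is copied aside (v1/v2) before decryption so
       // it can serve as the chaining value for the next block; the last one
       // is written back to rvec on exit.
       //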
1977   address generate_cipherBlockChaining_decryptAESCrypt() {
1978     assert(UseAES, "need AES instructions");
1979     __ align(CodeEntryAlignment);
1980     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
1981 
1982     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
1983 
1984     const Register from        = c_rarg0;  // source array address
1985     const Register to          = c_rarg1;  // destination array address
1986     const Register key         = c_rarg2;  // key array address
1987     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
1988                                            // and left with the results of the last encryption block
1989     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
1990     const Register keylen      = rscratch1;
1991 
1992     address start = __ pc();
1993       __ enter();
1994 
1995       __ mov(rscratch2, len_reg);
1996       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
1997 
1998       __ ld1(v2, __ T16B, rvec);
1999 
2000       __ ld1(v31, __ T16B, __ post(key, 16));
2001       __ rev32(v31, __ T16B, v31);
2002 
2003       __ cmpw(keylen, 52);
2004       __ br(Assembler::CC, L_loadkeys_44);
2005       __ br(Assembler::EQ, L_loadkeys_52);
2006 
2007       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2008       __ rev32(v17, __ T16B, v17);
2009       __ rev32(v18, __ T16B, v18);
2010     __ BIND(L_loadkeys_52);
2011       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2012       __ rev32(v19, __ T16B, v19);
2013       __ rev32(v20, __ T16B, v20);
2014     __ BIND(L_loadkeys_44);
2015       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2016       __ rev32(v21, __ T16B, v21);
2017       __ rev32(v22, __ T16B, v22);
2018       __ rev32(v23, __ T16B, v23);
2019       __ rev32(v24, __ T16B, v24);
2020       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2021       __ rev32(v25, __ T16B, v25);
2022       __ rev32(v26, __ T16B, v26);
2023       __ rev32(v27, __ T16B, v27);
2024       __ rev32(v28, __ T16B, v28);
2025       __ ld1(v29, v30, __ T16B, key);
2026       __ rev32(v29, __ T16B, v29);
2027       __ rev32(v30, __ T16B, v30);
2028 
2029     __ BIND(L_aes_loop);
2030       __ ld1(v0, __ T16B, __ post(from, 16));
2031       __ orr(v1, __ T16B, v0, v0);
2032 
2033       __ br(Assembler::CC, L_rounds_44);
2034       __ br(Assembler::EQ, L_rounds_52);
2035 
2036       __ aesd(v0, v17); __ aesimc(v0, v0);
2037       __ aesd(v0, v18); __ aesimc(v0, v0);
2038     __ BIND(L_rounds_52);
2039       __ aesd(v0, v19); __ aesimc(v0, v0);
2040       __ aesd(v0, v20); __ aesimc(v0, v0);
2041     __ BIND(L_rounds_44);
2042       __ aesd(v0, v21); __ aesimc(v0, v0);
2043       __ aesd(v0, v22); __ aesimc(v0, v0);
2044       __ aesd(v0, v23); __ aesimc(v0, v0);
2045       __ aesd(v0, v24); __ aesimc(v0, v0);
2046       __ aesd(v0, v25); __ aesimc(v0, v0);
2047       __ aesd(v0, v26); __ aesimc(v0, v0);
2048       __ aesd(v0, v27); __ aesimc(v0, v0);
2049       __ aesd(v0, v28); __ aesimc(v0, v0);
2050       __ aesd(v0, v29); __ aesimc(v0, v0);
2051       __ aesd(v0, v30);
2052       __ eor(v0, __ T16B, v0, v31);
2053       __ eor(v0, __ T16B, v0, v2);
2054 
2055       __ st1(v0, __ T16B, __ post(to, 16));
2056       __ orr(v2, __ T16B, v1, v1);
2057 
2058       __ sub(len_reg, len_reg, 16);
2059       __ cbnz(len_reg, L_aes_loop);
2060 
2061       __ st1(v2, __ T16B, rvec);
2062 
2063       __ mov(r0, rscratch2);
2064 
2065       __ leave();
2066       __ ret(lr);
2067 
2068     return start;
2069   }
2070 
2071   // Arguments:
2072   //
2073   // Inputs:
2074   //   c_rarg0   - byte[]  source+offset
2075   //   c_rarg1   - int[]   SHA.state
2076   //   c_rarg2   - int     offset
2077   //   c_rarg3   - int     limit
2078   //
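       // The round sequence below is emitted by a 20-iteration C++ loop; each
       // ARMv8 SHA1C/SHA1P/SHA1M instruction advances the hash by four of
       // SHA-1's 80 rounds, so one pass covers a 64-byte block.
       //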
2079   address generate_sha1_implCompress(bool multi_block, const char *name) {
2080     __ align(CodeEntryAlignment);
2081     StubCodeMark mark(this, "StubRoutines", name);
2082     address start = __ pc();
2083 
2084     Register buf   = c_rarg0;
2085     Register state = c_rarg1;
2086     Register ofs   = c_rarg2;
2087     Register limit = c_rarg3;
2088 
2089     Label keys;
2090     Label sha1_loop;
2091 
2092     // load the keys into v0..v3
2093     __ adr(rscratch1, keys);
2094     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2095     // load the 5-word (160-bit) state into v6, v7
2096     __ ldrq(v6, Address(state, 0));
2097     __ ldrs(v7, Address(state, 16));
2098 
2099 
2100     __ BIND(sha1_loop);
2101     // load 64 bytes of data into v16..v19
2102     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2103     __ rev32(v16, __ T16B, v16);
2104     __ rev32(v17, __ T16B, v17);
2105     __ rev32(v18, __ T16B, v18);
2106     __ rev32(v19, __ T16B, v19);
2107 
2108     // do the sha1
2109     __ addv(v4, __ T4S, v16, v0);
2110     __ orr(v20, __ T16B, v6, v6);
2111 
2112     FloatRegister d0 = v16;
2113     FloatRegister d1 = v17;
2114     FloatRegister d2 = v18;
2115     FloatRegister d3 = v19;
2116 
2117     for (int round = 0; round < 20; round++) {
2118       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2119       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2120       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2121       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2122       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2123 
2124       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2125       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2126       __ sha1h(tmp2, __ T4S, v20);
2127       if (round < 5)
2128         __ sha1c(v20, __ T4S, tmp3, tmp4);
2129       else if (round < 10 || round >= 15)
2130         __ sha1p(v20, __ T4S, tmp3, tmp4);
2131       else
2132         __ sha1m(v20, __ T4S, tmp3, tmp4);
2133       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2134 
2135       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2136     }
2137 
2138     __ addv(v7, __ T2S, v7, v21);
2139     __ addv(v6, __ T4S, v6, v20);
2140 
2141     if (multi_block) {
2142       __ add(ofs, ofs, 64);
2143       __ cmp(ofs, limit);
2144       __ br(Assembler::LE, sha1_loop);
2145       __ mov(c_rarg0, ofs); // return ofs
2146     }
2147 
2148     __ strq(v6, Address(state, 0));
2149     __ strs(v7, Address(state, 16));
2150 
2151     __ ret(lr);
2152 
2153     __ bind(keys);
2154     __ emit_int32(0x5a827999);
2155     __ emit_int32(0x6ed9eba1);
2156     __ emit_int32(0x8f1bbcdc);
2157     __ emit_int32(0xca62c1d6);
2158 
2159     return start;
2160   }
2161 
2162 
2163   // Arguments:
2164   //
2165   // Inputs:
2166   //   c_rarg0   - byte[]  source+offset
2167   //   c_rarg1   - int[]   SHA.state
2168   //   c_rarg2   - int     offset
2169   //   c_rarg3   - int     limit
2170   //
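       // The round sequence below is emitted by a 16-iteration C++ loop; the
       // SHA256H/SHA256H2 instructions perform four rounds each, covering
       // SHA-256's 64 rounds per 64-byte block, with the 64 round constants
       // kept resident in v16..v31.
       //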
2171   address generate_sha256_implCompress(bool multi_block, const char *name) {
2172     static const uint32_t round_consts[64] = {
2173       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
2174       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
2175       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
2176       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
2177       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
2178       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
2179       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
2180       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
2181       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
2182       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
2183       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
2184       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
2185       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
2186       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
2187       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
2188       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
2189     };
2190     __ align(CodeEntryAlignment);
2191     StubCodeMark mark(this, "StubRoutines", name);
2192     address start = __ pc();
2193 
2194     Register buf   = c_rarg0;
2195     Register state = c_rarg1;
2196     Register ofs   = c_rarg2;
2197     Register limit = c_rarg3;
2198 
2199     Label sha1_loop;
2200 
2201     __ stpd(v8, v9, __ pre(sp, -32));
2202     __ stpd(v10, v11, Address(sp, 16));
2203 
2204 // dga == v0
2205 // dgb == v1
2206 // dg0 == v2
2207 // dg1 == v3
2208 // dg2 == v4
2209 // t0 == v6
2210 // t1 == v7
2211 
2212     // load 16 keys to v16..v31
2213     __ lea(rscratch1, ExternalAddress((address)round_consts));
2214     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
2215     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
2216     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
2217     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
2218 
2219     // load the 8-word (256-bit) state
2220     __ ldpq(v0, v1, state);
2221 
2222     __ BIND(sha1_loop);
2223     // load 64 bytes of data into v8..v11
2224     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
2225     __ rev32(v8, __ T16B, v8);
2226     __ rev32(v9, __ T16B, v9);
2227     __ rev32(v10, __ T16B, v10);
2228     __ rev32(v11, __ T16B, v11);
2229 
2230     __ addv(v6, __ T4S, v8, v16);
2231     __ orr(v2, __ T16B, v0, v0);
2232     __ orr(v3, __ T16B, v1, v1);
2233 
2234     FloatRegister d0 = v8;
2235     FloatRegister d1 = v9;
2236     FloatRegister d2 = v10;
2237     FloatRegister d3 = v11;
2238 
2239 
2240     for (int round = 0; round < 16; round++) {
2241       FloatRegister tmp1 = (round & 1) ? v6 : v7;
2242       FloatRegister tmp2 = (round & 1) ? v7 : v6;
2243       FloatRegister tmp3 = (round & 1) ? v2 : v4;
2244       FloatRegister tmp4 = (round & 1) ? v4 : v2;
2245 
2246       if (round < 12) __ sha256su0(d0, __ T4S, d1);
2247        __ orr(v4, __ T16B, v2, v2);
2248       if (round < 15)
2249         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
2250       __ sha256h(v2, __ T4S, v3, tmp2);
2251       __ sha256h2(v3, __ T4S, v4, tmp2);
2252       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
2253 
2254       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2255     }
2256 
2257     __ addv(v0, __ T4S, v0, v2);
2258     __ addv(v1, __ T4S, v1, v3);
2259 
2260     if (multi_block) {
2261       __ add(ofs, ofs, 64);
2262       __ cmp(ofs, limit);
2263       __ br(Assembler::LE, sha1_loop);
2264       __ mov(c_rarg0, ofs); // return ofs
2265     }
2266 
2267     __ ldpd(v10, v11, Address(sp, 16));
2268     __ ldpd(v8, v9, __ post(sp, 32));
2269 
2270     __ stpq(v0, v1, state);
2271 
2272     __ ret(lr);
2273 
2274     return start;
2275   }
2276 
2277 #ifndef BUILTIN_SIM
2278   // Safefetch stubs.
2279   void generate_safefetch(const char* name, int size, address* entry,
2280                           address* fault_pc, address* continuation_pc) {
2281     // safefetch signatures:
2282     //   int      SafeFetch32(int*      adr, int      errValue);
2283     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
2284     //
2285     // arguments:
2286     //   c_rarg0 = adr
2287     //   c_rarg1 = errValue
2288     //
2289     // result:
2290     //   r0 = *adr or errValue
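         //
         // If the load at *fault_pc faults, the VM's signal handler redirects
         // execution to *continuation_pc, so the caller observes errValue
         // instead of a crash.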
2291 
2292     StubCodeMark mark(this, "StubRoutines", name);
2293 
2294     // Entry point, pc or function descriptor.
2295     *entry = __ pc();
2296 
2297     // Load *adr into c_rarg1, may fault.
2298     *fault_pc = __ pc();
2299     switch (size) {
2300       case 4:
2301         // int32_t
2302         __ ldrw(c_rarg1, Address(c_rarg0, 0));
2303         break;
2304       case 8:
2305         // int64_t
2306         __ ldr(c_rarg1, Address(c_rarg0, 0));
2307         break;
2308       default:
2309         ShouldNotReachHere();
2310     }
2311 
2312     // return errValue or *adr
2313     *continuation_pc = __ pc();
2314     __ mov(r0, c_rarg1);
2315     __ ret(lr);
2316   }
2317 #endif
2318 
2319   /**
2320    *  Arguments:
2321    *
2322    * Inputs:
2323    *   c_rarg0   - int crc
2324    *   c_rarg1   - byte* buf
2325    *   c_rarg2   - int length
2326    *
2327    * Output:
2328    *       r0    - int crc result
2329    */
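       // The heavy lifting is done by MacroAssembler::kernel_crc32 using the
       // CRC lookup tables; generate_initial() publishes their address via
       // StubRoutines::_crc_table_adr before this stub is generated.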
2330   address generate_updateBytesCRC32() {
2331     assert(UseCRC32Intrinsics, "what are we doing here?");
2332 
2333     __ align(CodeEntryAlignment);
2334     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
2335 
2336     address start = __ pc();
2337 
2338     const Register crc   = c_rarg0;  // crc
2339     const Register buf   = c_rarg1;  // source java byte array address
2340     const Register len   = c_rarg2;  // length
2341     const Register table0 = c_rarg3; // crc_table address
2342     const Register table1 = c_rarg4;
2343     const Register table2 = c_rarg5;
2344     const Register table3 = c_rarg6;
2345     const Register tmp3 = c_rarg7;
2346 
2347     BLOCK_COMMENT("Entry:");
2348     __ enter(); // required for proper stackwalking of RuntimeStub frame
2349 
2350     __ kernel_crc32(crc, buf, len,
2351               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
2352 
2353     __ leave(); // required for proper stackwalking of RuntimeStub frame
2354     __ ret(lr);
2355 
2356     return start;
2357   }
2358 
2359 #undef __
2360 #define __ masm->
2361 
2362   // Continuation point for throwing of implicit exceptions that are
2363   // not handled in the current activation. Fabricates an exception
2364   // oop and initiates normal exception dispatching in this
2365   // frame. Since we need to preserve callee-saved values (currently
2366   // only for C2, but done for C1 as well) we need a callee-saved oop
2367   // map and therefore have to make these stubs into RuntimeStubs
2368   // rather than BufferBlobs.  If the compiler needs all registers to
2369   // be preserved between the fault point and the exception handler
2370   // then it must assume responsibility for that in
2371   // AbstractCompiler::continuation_for_implicit_null_exception or
2372   // continuation_for_implicit_division_by_zero_exception. All other
2373   // implicit exceptions (e.g., NullPointerException or
2374   // AbstractMethodError on entry) are either at call sites or
2375   // otherwise assume that stack unwinding will be initiated, so
2376   // caller saved registers were assumed volatile in the compiler.
2377 
2378   address generate_throw_exception(const char* name,
2379                                    address runtime_entry,
2380                                    Register arg1 = noreg,
2381                                    Register arg2 = noreg) {
2382     // Information about frame layout at time of blocking runtime call.
2383     // Note that we only have to preserve callee-saved registers since
2384     // the compilers are responsible for supplying a continuation point
2385     // if they expect all registers to be preserved.
2386     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
2387     enum layout {
2388       rfp_off = 0,
2389       rfp_off2,
2390       return_off,
2391       return_off2,
2392       framesize // inclusive of return address
2393     };
2394 
2395     int insts_size = 512;
2396     int locs_size  = 64;
2397 
2398     CodeBuffer code(name, insts_size, locs_size);
2399     OopMapSet* oop_maps  = new OopMapSet();
2400     MacroAssembler* masm = new MacroAssembler(&code);
2401 
2402     address start = __ pc();
2403 
2404     // This is an inlined and slightly modified version of call_VM
2405     // which has the ability to fetch the return PC out of
2406     // thread-local storage and also sets up last_Java_sp slightly
2407     // differently than the real call_VM
2408 
2409     __ enter(); // Save FP and LR before call
2410 
2411     assert(is_even(framesize/2), "sp not 16-byte aligned");
2412 
2413     // lr and fp are already in place
2414     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
2415 
2416     int frame_complete = __ pc() - start;
2417 
2418     // Set up last_Java_sp and last_Java_fp
2419     address the_pc = __ pc();
2420     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
2421 
2422     // Call runtime
2423     if (arg1 != noreg) {
2424       assert(arg2 != c_rarg1, "clobbered");
2425       __ mov(c_rarg1, arg1);
2426     }
2427     if (arg2 != noreg) {
2428       __ mov(c_rarg2, arg2);
2429     }
2430     __ mov(c_rarg0, rthread);
2431     BLOCK_COMMENT("call runtime_entry");
2432     __ mov(rscratch1, runtime_entry);
2433     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
2434 
2435     // Generate oop map
2436     OopMap* map = new OopMap(framesize, 0);
2437 
2438     oop_maps->add_gc_map(the_pc - start, map);
2439 
2440     __ reset_last_Java_frame(true, true);
2441     __ maybe_isb();
2442 
2443     __ leave();
2444 
2445     // check for pending exceptions
2446 #ifdef ASSERT
2447     Label L;
2448     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
2449     __ cbnz(rscratch1, L);
2450     __ should_not_reach_here();
2451     __ bind(L);
2452 #endif // ASSERT
2453     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2454 
2455 
2456     // codeBlob framesize is in words (not VMRegImpl::slot_size)
2457     RuntimeStub* stub =
2458       RuntimeStub::new_runtime_stub(name,
2459                                     &code,
2460                                     frame_complete,
2461                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
2462                                     oop_maps, false);
2463     return stub->entry_point();
2464   }
2465 
2466   // Initialization
2467   void generate_initial() {
2468     // Generate the initial stubs and initialize the entry points
2469 
2470     // Entry points that exist on all platforms. Note: This is code
2471     // that could be shared among different platforms - however the
2472     // benefit seems to be smaller than the disadvantage of having a
2473     // much more complicated generator structure. See also the comment
2474     // in stubRoutines.hpp.
2475 
2476     StubRoutines::_forward_exception_entry = generate_forward_exception();
2477 
2478     StubRoutines::_call_stub_entry =
2479       generate_call_stub(StubRoutines::_call_stub_return_address);
2480 
2481     // is referenced by megamorphic call
2482     StubRoutines::_catch_exception_entry = generate_catch_exception();
2483 
2484     // Build this early so it's available for the interpreter.
2485     StubRoutines::_throw_StackOverflowError_entry =
2486       generate_throw_exception("StackOverflowError throw_exception",
2487                                CAST_FROM_FN_PTR(address,
2488                                                 SharedRuntime::
2489                                                 throw_StackOverflowError));
2490     if (UseCRC32Intrinsics) {
2491       // set the table address before generating the stubs that use it
2492       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
2493       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
2494     }
2495   }
2496 
2497   void generate_all() {
2498     // support for verify_oop (must happen after universe_init)
2499     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
2500     StubRoutines::_throw_AbstractMethodError_entry =
2501       generate_throw_exception("AbstractMethodError throw_exception",
2502                                CAST_FROM_FN_PTR(address,
2503                                                 SharedRuntime::
2504                                                 throw_AbstractMethodError));
2505 
2506     StubRoutines::_throw_IncompatibleClassChangeError_entry =
2507       generate_throw_exception("IncompatibleClassChangeError throw_exception",
2508                                CAST_FROM_FN_PTR(address,
2509                                                 SharedRuntime::
2510                                                 throw_IncompatibleClassChangeError));
2511 
2512     StubRoutines::_throw_NullPointerException_at_call_entry =
2513       generate_throw_exception("NullPointerException at call throw_exception",
2514                                CAST_FROM_FN_PTR(address,
2515                                                 SharedRuntime::
2516                                                 throw_NullPointerException_at_call));
2517 
2518     // arraycopy stubs used by compilers
2519     generate_arraycopy_stubs();
2520 
2521 #ifndef BUILTIN_SIM
2522     if (UseAESIntrinsics) {
2523       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
2524       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
2525       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
2526       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
2527     }
2528 
2529     if (UseSHA1Intrinsics) {
2530       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
2531       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
2532     }
2533     if (UseSHA256Intrinsics) {
2534       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
2535       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
2536     }
2537 
2538     // Safefetch stubs.
2539     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
2540                                                        &StubRoutines::_safefetch32_fault_pc,
2541                                                        &StubRoutines::_safefetch32_continuation_pc);
2542     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
2543                                                        &StubRoutines::_safefetchN_fault_pc,
2544                                                        &StubRoutines::_safefetchN_continuation_pc);
2545 #endif
2546   }
2547 
2548  public:
2549   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
2550     if (all) {
2551       generate_all();
2552     } else {
2553       generate_initial();
2554     }
2555   }
2556 }; // end class declaration
2557 
2558 void StubGenerator_generate(CodeBuffer* code, bool all) {
2559   StubGenerator g(code, all);
2560 }