1 /*
   2  * Copyright (c) 2013, Red Hat Inc.
   3  * Copyright (c) 2003, 2011, Oracle and/or its affiliates.
   4  * All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "precompiled.hpp"
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "interpreter/interpreter.hpp"
  31 #include "nativeInst_aarch64.hpp"
  32 #include "oops/instanceOop.hpp"
  33 #include "oops/method.hpp"
  34 #include "oops/objArrayKlass.hpp"
  35 #include "oops/oop.inline.hpp"
  36 #include "prims/methodHandles.hpp"
  37 #include "runtime/frame.inline.hpp"
  38 #include "runtime/handles.inline.hpp"
  39 #include "runtime/sharedRuntime.hpp"
  40 #include "runtime/stubCodeGenerator.hpp"
  41 #include "runtime/stubRoutines.hpp"
  42 #include "runtime/thread.inline.hpp"
  43 #include "utilities/top.hpp"
  44 #ifdef COMPILER2
  45 #include "opto/runtime.hpp"
  46 #endif
  47 
  48 #ifdef BUILTIN_SIM
  49 #include "../../../../../../simulator/simulator.hpp"
  50 #endif
  51 
  52 // Declaration and definition of StubGenerator (no .hpp file).
  53 // For a more detailed description of the stub routine structure
  54 // see the comment in stubRoutines.hpp
  55 
  56 #undef __
  57 #define __ _masm->
  58 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
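// TIMES_OOP scales an element index by the size of a heap oop: 4 bytes
// when compressed oops are in use, 8 bytes otherwise.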
  59 
  60 #ifdef PRODUCT
  61 #define BLOCK_COMMENT(str) /* nothing */
  62 #else
  63 #define BLOCK_COMMENT(str) __ block_comment(str)
  64 #endif
  65 
  66 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
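// n.b. BIND also emits the label name as a block comment in non-product
// builds, which makes the generated code easier to read in a disassembly.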
  67 
  68 // Stub Code definitions
  69 
  70 class StubGenerator: public StubCodeGenerator {
  71  private:
  72 
  73 #ifdef PRODUCT
  74 #define inc_counter_np(counter) ((void)0)
  75 #else
  76   void inc_counter_np_(int& counter) {
  77     __ lea(rscratch2, ExternalAddress((address)&counter));
  78     __ ldrw(rscratch1, Address(rscratch2));
  79     __ addw(rscratch1, rscratch1, 1);
  80     __ strw(rscratch1, Address(rscratch2));
  81   }
  82 #define inc_counter_np(counter) \
  83   BLOCK_COMMENT("inc_counter " #counter); \
  84   inc_counter_np_(counter);
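  // n.b. the load/add/store sequence in inc_counter_np_ is not atomic, so
  // concurrent increments may occasionally be lost; that is acceptable for
  // a statistics-only counter.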
  85 #endif
  86 
  87   // Call stubs are used to call Java from C
  88   //
  89   // Arguments:
  90   //    c_rarg0:   call wrapper address                   address
  91   //    c_rarg1:   result                                 address
  92   //    c_rarg2:   result type                            BasicType
  93   //    c_rarg3:   method                                 Method*
  94   //    c_rarg4:   (interpreter) entry point              address
  95   //    c_rarg5:   parameters                             intptr_t*
  96   //    c_rarg6:   parameter size (in words)              int
  97   //    c_rarg7:   thread                                 Thread*
  98   //
  99   // There is no return from the stub itself as any Java result
 100   // is written to result
 101   //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer, copying sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the C arguments.
 107   //
 108   // TODO: strictly do we need to save them all? they are treated as
 109   // volatile by C so could we omit saving the ones we are going to
 110   // place in global registers (thread? method?) or those we only use
 111   // during setup of the Java call?
 112   //
 113   // we don't need to save r8 which C uses as an indirect result location
 114   // return register.
 115   //
 116   // we don't need to save r9-r15 which both C and Java treat as
 117   // volatile
 118   //
  // we don't need to save r16-r18 because Java does not use them
 120   //
 121   // we save r19-r28 which Java uses as scratch registers and C
 122   // expects to be callee-save
 123   //
 124   // we don't save any FP registers since only v8-v15 are callee-save
 125   // (strictly only the f and d components) and Java uses them as
 126   // callee-save. v0-v7 are arg registers and C treats v16-v31 as
 127   // volatile (as does Java?)
 128   //
 129   // so the stub frame looks like this when we enter Java code
 130   //
 131   //     [ return_from_Java     ] <--- sp
 132   //     [ argument word n      ]
 133   //      ...
 134   // -27 [ argument word 1      ]
 135   // -26 [ saved d15            ] <--- sp_after_call
 136   // -25 [ saved d14            ]
 137   // -24 [ saved d13            ]
 138   // -23 [ saved d12            ]
 139   // -22 [ saved d11            ]
 140   // -21 [ saved d10            ]
 141   // -20 [ saved d9             ]
 142   // -19 [ saved d8             ]
 143   // -18 [ saved r28            ]
 144   // -17 [ saved r27            ]
 145   // -16 [ saved r26            ]
 146   // -15 [ saved r25            ]
 147   // -14 [ saved r24            ]
 148   // -13 [ saved r23            ]
 149   // -12 [ saved r22            ]
 150   // -11 [ saved r21            ]
 151   // -10 [ saved r20            ]
 152   //  -9 [ saved r19            ]
 153   //  -8 [ call wrapper    (r0) ]
 154   //  -7 [ result          (r1) ]
 155   //  -6 [ result type     (r2) ]
 156   //  -5 [ method          (r3) ]
 157   //  -4 [ entry point     (r4) ]
 158   //  -3 [ parameters      (r5) ]
 159   //  -2 [ parameter size  (r6) ]
 160   //  -1 [ thread (r7)          ]
 161   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 162   //   1 [ saved lr       (r30) ]
 163 
 164   // Call stub stack layout word offsets from fp
 165   enum call_stub_layout {
 166     sp_after_call_off = -26,
 167 
 168     d15_off            = -26,
 169     d14_off            = -25,
 170     d13_off            = -24,
 171     d12_off            = -23,
 172     d11_off            = -22,
 173     d10_off            = -21,
 174     d9_off             = -20,
 175     d8_off             = -19,
 176 
 177     r28_off            = -18,
 178     r27_off            = -17,
 179     r26_off            = -16,
 180     r25_off            = -15,
 181     r24_off            = -14,
 182     r23_off            = -13,
 183     r22_off            = -12,
 184     r21_off            = -11,
 185     r20_off            = -10,
 186     r19_off            =  -9,
 187     call_wrapper_off   =  -8,
 188     result_off         =  -7,
 189     result_type_off    =  -6,
 190     method_off         =  -5,
 191     entry_point_off    =  -4,
 192     parameters_off     =  -3,
 193     parameter_size_off =  -2,
 194     thread_off         =  -1,
 195     fp_f               =   0,
 196     retaddr_off        =   1,
 197   };
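  // For example, thread_off == -1 means the caller's Thread* is saved at
  // [fp - wordSize], while retaddr_off == 1 places the saved lr at
  // [fp + wordSize].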
 198 
 199   address generate_call_stub(address& return_address) {
 200     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 201            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 202            "adjust this code");
 203 
 204     StubCodeMark mark(this, "StubRoutines", "call_stub");
 205     address start = __ pc();
 206 
 207     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 208 
 209     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 210     const Address result        (rfp, result_off         * wordSize);
 211     const Address result_type   (rfp, result_type_off    * wordSize);
 212     const Address method        (rfp, method_off         * wordSize);
 213     const Address entry_point   (rfp, entry_point_off    * wordSize);
 214     const Address parameters    (rfp, parameters_off     * wordSize);
 215     const Address parameter_size(rfp, parameter_size_off * wordSize);
 216 
 217     const Address thread        (rfp, thread_off         * wordSize);
 218 
 219     const Address d15_save      (rfp, d15_off * wordSize);
 220     const Address d14_save      (rfp, d14_off * wordSize);
 221     const Address d13_save      (rfp, d13_off * wordSize);
 222     const Address d12_save      (rfp, d12_off * wordSize);
 223     const Address d11_save      (rfp, d11_off * wordSize);
 224     const Address d10_save      (rfp, d10_off * wordSize);
 225     const Address d9_save       (rfp, d9_off * wordSize);
 226     const Address d8_save       (rfp, d8_off * wordSize);
 227 
 228     const Address r28_save      (rfp, r28_off * wordSize);
 229     const Address r27_save      (rfp, r27_off * wordSize);
 230     const Address r26_save      (rfp, r26_off * wordSize);
 231     const Address r25_save      (rfp, r25_off * wordSize);
 232     const Address r24_save      (rfp, r24_off * wordSize);
 233     const Address r23_save      (rfp, r23_off * wordSize);
 234     const Address r22_save      (rfp, r22_off * wordSize);
 235     const Address r21_save      (rfp, r21_off * wordSize);
 236     const Address r20_save      (rfp, r20_off * wordSize);
 237     const Address r19_save      (rfp, r19_off * wordSize);
 238 
 239     // stub code
 240 
 241     // we need a C prolog to bootstrap the x86 caller into the sim
 242     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 243 
 244     address aarch64_entry = __ pc();
 245 
 246 #ifdef BUILTIN_SIM
 247     // Save sender's SP for stack traces.
 248     __ mov(rscratch1, sp);
 249     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 250 #endif
 251     // set up frame and move sp to end of save area
 252     __ enter();
 253     __ sub(sp, rfp, -sp_after_call_off * wordSize);
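    // sp now points at the lowest slot of the register save area (the
    // d15 save slot), i.e. at sp_after_call.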
 254 
 255     // save register parameters and Java scratch/global registers
 256     // n.b. we save thread even though it gets installed in
 257     // rthread because we want to sanity check rthread later
 258     __ str(c_rarg7,  thread);
 259     __ strw(c_rarg6, parameter_size);
 260     __ str(c_rarg5,  parameters);
 261     __ str(c_rarg4,  entry_point);
 262     __ str(c_rarg3,  method);
 263     __ str(c_rarg2,  result_type);
 264     __ str(c_rarg1,  result);
 265     __ str(c_rarg0,  call_wrapper);
 266     __ str(r19,      r19_save);
 267     __ str(r20,      r20_save);
 268     __ str(r21,      r21_save);
 269     __ str(r22,      r22_save);
 270     __ str(r23,      r23_save);
 271     __ str(r24,      r24_save);
 272     __ str(r25,      r25_save);
 273     __ str(r26,      r26_save);
 274     __ str(r27,      r27_save);
 275     __ str(r28,      r28_save);
 276 
 277     __ strd(v8,      d8_save);
 278     __ strd(v9,      d9_save);
 279     __ strd(v10,     d10_save);
 280     __ strd(v11,     d11_save);
 281     __ strd(v12,     d12_save);
 282     __ strd(v13,     d13_save);
 283     __ strd(v14,     d14_save);
 284     __ strd(v15,     d15_save);
 285 
 286     // install Java thread in global register now we have saved
 287     // whatever value it held
 288     __ mov(rthread, c_rarg7);
 289     // And method
 290     __ mov(rmethod, c_rarg3);
 291 
 292     // set up the heapbase register
 293     __ reinit_heapbase();
 294 
 295 #ifdef ASSERT
 296     // make sure we have no pending exceptions
 297     {
 298       Label L;
 299       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 300       __ cmp(rscratch1, (unsigned)NULL_WORD);
 301       __ br(Assembler::EQ, L);
 302       __ stop("StubRoutines::call_stub: entered with pending exception");
 303       __ BIND(L);
 304     }
 305 #endif
 306     // pass parameters if any
 307     __ mov(esp, sp);
 308     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 309     __ andr(sp, rscratch1, -2 * wordSize);
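    // sp now sits below the space reserved for the outgoing Java
    // parameters, rounded down to the 16-byte alignment AArch64 requires.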
 310 
 311     BLOCK_COMMENT("pass parameters if any");
 312     Label parameters_done;
 313     // parameter count is still in c_rarg6
 314     // and parameter pointer identifying param 1 is in c_rarg5
 315     __ cbzw(c_rarg6, parameters_done);
 316 
 317     address loop = __ pc();
 318     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 319     __ subsw(c_rarg6, c_rarg6, 1);
 320     __ push(rscratch1);
 321     __ br(Assembler::GT, loop);
 322 
 323     __ BIND(parameters_done);
 324 
    // call Java entry -- passing the Method* and the current sp
 326     //      rmethod: Method*
 327     //      r13: sender sp
 328     BLOCK_COMMENT("call Java function");
 329     __ mov(r13, sp);
 330     __ blr(c_rarg4);
 331 
 332     // tell the simulator we have returned to the stub
 333 
 334     // we do this here because the notify will already have been done
 335     // if we get to the next instruction via an exception
 336     //
 337     // n.b. adding this instruction here affects the calculation of
 338     // whether or not a routine returns to the call stub (used when
 339     // doing stack walks) since the normal test is to check the return
 340     // pc against the address saved below. so we may need to allow for
 341     // this extra instruction in the check.
 342 
 343     if (NotifySimulator) {
 344       __ notify(Assembler::method_reentry);
 345     }
 346     // save current address for use by exception handling code
 347 
 348     return_address = __ pc();
 349 
 350     // store result depending on type (everything that is not
 351     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 352     // n.b. this assumes Java returns an integral result in r0
 353     // and a floating result in j_farg0
 354     __ ldr(j_rarg2, result);
 355     Label is_long, is_float, is_double, exit;
 356     __ ldr(j_rarg1, result_type);
 357     __ cmp(j_rarg1, T_OBJECT);
 358     __ br(Assembler::EQ, is_long);
 359     __ cmp(j_rarg1, T_LONG);
 360     __ br(Assembler::EQ, is_long);
 361     __ cmp(j_rarg1, T_FLOAT);
 362     __ br(Assembler::EQ, is_float);
 363     __ cmp(j_rarg1, T_DOUBLE);
 364     __ br(Assembler::EQ, is_double);
 365 
 366     // handle T_INT case
 367     __ strw(r0, Address(j_rarg2));
 368 
 369     __ BIND(exit);
 370 
 371     // pop parameters
 372     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 373 
 374 #ifdef ASSERT
 375     // verify that threads correspond
 376     {
 377       Label L, S;
 378       __ ldr(rscratch1, thread);
 379       __ cmp(rthread, rscratch1);
 380       __ br(Assembler::NE, S);
 381       __ get_thread(rscratch1);
 382       __ cmp(rthread, rscratch1);
 383       __ br(Assembler::EQ, L);
 384       __ BIND(S);
 385       __ stop("StubRoutines::call_stub: threads must correspond");
 386       __ BIND(L);
 387     }
 388 #endif
 389 
 390     // restore callee-save registers
 391     __ ldrd(v15,      d15_save);
 392     __ ldrd(v14,      d14_save);
 393     __ ldrd(v13,      d13_save);
 394     __ ldrd(v12,      d12_save);
 395     __ ldrd(v11,      d11_save);
 396     __ ldrd(v10,      d10_save);
 397     __ ldrd(v9,       d9_save);
 398     __ ldrd(v8,       d8_save);
 399 
 400     __ ldr(r28,      r28_save);
 401     __ ldr(r27,      r27_save);
 402     __ ldr(r26,      r26_save);
 403     __ ldr(r25,      r25_save);
 404     __ ldr(r24,      r24_save);
 405     __ ldr(r23,      r23_save);
 406     __ ldr(r22,      r22_save);
 407     __ ldr(r21,      r21_save);
 408     __ ldr(r20,      r20_save);
 409     __ ldr(r19,      r19_save);
 410     __ ldr(c_rarg0,  call_wrapper);
 411     __ ldr(c_rarg1,  result);
 412     __ ldrw(c_rarg2, result_type);
 413     __ ldr(c_rarg3,  method);
 414     __ ldr(c_rarg4,  entry_point);
 415     __ ldr(c_rarg5,  parameters);
 416     __ ldr(c_rarg6,  parameter_size);
 417     __ ldr(c_rarg7,  thread);
 418 
 419 #ifndef PRODUCT
 420     // tell the simulator we are about to end Java execution
 421     if (NotifySimulator) {
 422       __ notify(Assembler::method_exit);
 423     }
 424 #endif
 425     // leave frame and return to caller
 426     __ leave();
 427     __ ret(lr);
 428 
 429     // handle return types different from T_INT
 430 
 431     __ BIND(is_long);
 432     __ str(r0, Address(j_rarg2, 0));
 433     __ br(Assembler::AL, exit);
 434 
 435     __ BIND(is_float);
 436     __ strs(j_farg0, Address(j_rarg2, 0));
 437     __ br(Assembler::AL, exit);
 438 
 439     __ BIND(is_double);
 440     __ strd(j_farg0, Address(j_rarg2, 0));
 441     __ br(Assembler::AL, exit);
 442 
 443     return start;
 444   }
 445 
 446   // Return point for a Java call if there's an exception thrown in
 447   // Java code.  The exception is caught and transformed into a
 448   // pending exception stored in JavaThread that can be tested from
 449   // within the VM.
 450   //
 451   // Note: Usually the parameters are removed by the callee. In case
 452   // of an exception crossing an activation frame boundary, that is
 453   // not the case if the callee is compiled code => need to setup the
 454   // rsp.
 455   //
 456   // r0: exception oop
 457 
 458   // NOTE: this is used as a target from the signal handler so it
 459   // needs an x86 prolog which returns into the current simulator
 460   // executing the generated catch_exception code. so the prolog
 461   // needs to install rax in a sim register and adjust the sim's
 462   // restart pc to enter the generated code at the start position
 463   // then return from native to simulated execution.
 464 
 465   address generate_catch_exception() {
 466     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 467     address start = __ pc();
 468 
 469     // same as in generate_call_stub():
 470     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 471     const Address thread        (rfp, thread_off         * wordSize);
 472 
 473 #ifdef ASSERT
 474     // verify that threads correspond
 475     {
 476       Label L, S;
 477       __ ldr(rscratch1, thread);
 478       __ cmp(rthread, rscratch1);
 479       __ br(Assembler::NE, S);
 480       __ get_thread(rscratch1);
 481       __ cmp(rthread, rscratch1);
 482       __ br(Assembler::EQ, L);
 483       __ bind(S);
 484       __ stop("StubRoutines::catch_exception: threads must correspond");
 485       __ bind(L);
 486     }
 487 #endif
 488 
 489     // set pending exception
 490     __ verify_oop(r0);
 491 
 492     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 493     __ mov(rscratch1, (address)__FILE__);
 494     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 495     __ movw(rscratch1, (int)__LINE__);
 496     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 497 
 498     // complete return to VM
 499     assert(StubRoutines::_call_stub_return_address != NULL,
 500            "_call_stub_return_address must have been generated before");
 501     __ b(StubRoutines::_call_stub_return_address);
 502 
 503     return start;
 504   }
 505 
 506   // Continuation point for runtime calls returning with a pending
 507   // exception.  The pending exception check happened in the runtime
 508   // or native call stub.  The pending exception in Thread is
 509   // converted into a Java-level exception.
 510   //
 511   // Contract with Java-level exception handlers:
 512   // r0: exception
 513   // r3: throwing pc
 514   //
 515   // NOTE: At entry of this stub, exception-pc must be in LR !!
 516 
 517   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog
 519 
 520   address generate_forward_exception() {
 521     StubCodeMark mark(this, "StubRoutines", "forward exception");
 522     address start = __ pc();
 523 
 524     // Upon entry, LR points to the return address returning into
 525     // Java (interpreted or compiled) code; i.e., the return address
 526     // becomes the throwing pc.
 527     //
 528     // Arguments pushed before the runtime call are still on the stack
 529     // but the exception handler will reset the stack pointer ->
 530     // ignore them.  A potential result in registers can be ignored as
 531     // well.
 532 
 533 #ifdef ASSERT
 534     // make sure this code is only executed if there is a pending exception
 535     {
 536       Label L;
 537       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 538       __ cbnz(rscratch1, L);
 539       __ stop("StubRoutines::forward exception: no pending exception (1)");
 540       __ bind(L);
 541     }
 542 #endif
 543 
 544     // compute exception handler into r19
 545 
 546     // call the VM to find the handler address associated with the
 547     // caller address. pass thread in r0 and caller pc (ret address)
 548     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 549     // the stack.
 550     __ mov(c_rarg1, lr);
 551     // lr will be trashed by the VM call so we move it to R19
 552     // (callee-saved) because we also need to pass it to the handler
 553     // returned by this call.
 554     __ mov(r19, lr);
 555     BLOCK_COMMENT("call exception_handler_for_return_address");
 556     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 557                          SharedRuntime::exception_handler_for_return_address),
 558                     rthread, c_rarg1);
 559     // we should not really care that lr is no longer the callee
 560     // address. we saved the value the handler needs in r19 so we can
 561     // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
 563     // the PC for the frame above the handler belongs to a compiled
 564     // Java method. So, we restore lr here to satisfy that assert.
 565     __ mov(lr, r19);
 566     // setup r0 & r3 & clear pending exception
 567     __ mov(r3, r19);
 568     __ mov(r19, r0);
 569     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 570     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 571 
 572 #ifdef ASSERT
 573     // make sure exception is set
 574     {
 575       Label L;
 576       __ cbnz(r0, L);
 577       __ stop("StubRoutines::forward exception: no pending exception (2)");
 578       __ bind(L);
 579     }
 580 #endif
 581 
 582     // continue at exception handler
 583     // r0: exception
 584     // r3: throwing pc
 585     // r19: exception handler
 586     __ verify_oop(r0);
 587     __ br(r19);
 588 
 589     return start;
 590   }
 591 
 592   // Non-destructive plausibility checks for oops
 593   //
 594   // Arguments:
 595   //    r0: oop to verify
 596   //    rscratch1: error message
 597   //
 598   // Stack after saving c_rarg3:
 599   //    [tos + 0]: saved c_rarg3
 600   //    [tos + 1]: saved c_rarg2
 601   //    [tos + 2]: saved lr
 602   //    [tos + 3]: saved rscratch2
 603   //    [tos + 4]: saved r0
 604   //    [tos + 5]: saved rscratch1
 605   address generate_verify_oop() {
 606 
 607     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 608     address start = __ pc();
 609 
 610     Label exit, error;
 611 
 612     // save c_rarg2 and c_rarg3
 613     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 614 
 615     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 616     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 617     __ ldr(c_rarg3, Address(c_rarg2));
 618     __ add(c_rarg3, c_rarg3, 1);
 619     __ str(c_rarg3, Address(c_rarg2));
 620 
 621     // object is in r0
 622     // make sure object is 'reasonable'
 623     __ cbz(r0, exit); // if obj is NULL it is OK
 624 
 625     // Check if the oop is in the right area of memory
 626     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 627     __ andr(c_rarg2, r0, c_rarg3);
 628     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 629 
 630     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 631     // instruction here because the flags register is live.
 632     __ eor(c_rarg2, c_rarg2, c_rarg3);
 633     __ cbnz(c_rarg2, error);
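    // the eor above leaves zero in c_rarg2 exactly when the masked address
    // bits equal verify_oop_bits, so the cbnz takes the error path only
    // when the oop is not in a plausible location.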
 634 
    // make sure klass is 'reasonable', i.e. not zero.
 636     __ load_klass(r0, r0);  // get klass
 637     __ cbz(r0, error);      // if klass is NULL it is broken
 638 
 639     // return if everything seems ok
 640     __ bind(exit);
 641 
 642     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 643     __ ret(lr);
 644 
 645     // handle errors
 646     __ bind(error);
 647     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 648 
 649     __ push(RegSet::range(r0, r29), sp);
 650     // debug(char* msg, int64_t pc, int64_t regs[])
 651     __ mov(c_rarg0, rscratch1);      // pass address of error message
 652     __ mov(c_rarg1, lr);             // pass return address
 653     __ mov(c_rarg2, sp);             // pass address of regs on stack
 654 #ifndef PRODUCT
 655     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 656 #endif
 657     BLOCK_COMMENT("call MacroAssembler::debug");
 658     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 659     __ blrt(rscratch1, 3, 0, 1);
 660 
 661     return start;
 662   }
 663 
 664   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
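  // n.b. the test above currently just branches straight to L_no_overlap,
  // so no real overlap check is performed yet; it appears to be a
  // placeholder until a proper test is written.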
 665 
 666   // Generate code for an array write pre barrier
 667   //
 668   //     addr    -  starting address
 669   //     count   -  element count
 670   //     tmp     - scratch register
 671   //
 672   //     Destroy no registers!
 673   //
 674   void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
 675     BarrierSet* bs = Universe::heap()->barrier_set();
 676     switch (bs->kind()) {
 677     case BarrierSet::G1SATBCT:
 678     case BarrierSet::G1SATBCTLogging:
      // With G1, don't generate the call if we statically know that the target is uninitialized
 680       if (!dest_uninitialized) {
 681         __ push(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
 682         if (count == c_rarg0) {
 683           if (addr == c_rarg1) {
 684             // exactly backwards!!
 685             __ stp(c_rarg0, c_rarg1, __ pre(sp, -2 * wordSize));
 686             __ ldp(c_rarg1, c_rarg0, __ post(sp, -2 * wordSize));
 687           } else {
 688             __ mov(c_rarg1, count);
 689             __ mov(c_rarg0, addr);
 690           }
 691         } else {
 692           __ mov(c_rarg0, addr);
 693           __ mov(c_rarg1, count);
 694         }
 695         __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
        __ pop(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
      }
      break;
    case BarrierSet::CardTableModRef:
    case BarrierSet::CardTableExtension:
    case BarrierSet::ModRef:
      break;
    default:
      ShouldNotReachHere();
    }
  }
 708 
 709   //
 710   // Generate code for an array write post barrier
 711   //
 712   //  Input:
 713   //     start    - register containing starting address of destination array
 714   //     end      - register containing ending address of destination array
 715   //     scratch  - scratch register
 716   //
 717   //  The input registers are overwritten.
 718   //  The ending address is inclusive.
 719   void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
 720     assert_different_registers(start, end, scratch);
 721     BarrierSet* bs = Universe::heap()->barrier_set();
 722     switch (bs->kind()) {
 723       case BarrierSet::G1SATBCT:
 724       case BarrierSet::G1SATBCTLogging:
 725 
 726         {
 727           __ push(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
 728           // must compute element count unless barrier set interface is changed (other platforms supply count)
 729           assert_different_registers(start, end, scratch);
 730           __ lea(scratch, Address(end, BytesPerHeapOop));
 731           __ sub(scratch, scratch, start);               // subtract start to get #bytes
 732           __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
 733           __ mov(c_rarg0, start);
 734           __ mov(c_rarg1, scratch);
 735           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
          __ pop(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
 737         }
 738         break;
 739       case BarrierSet::CardTableModRef:
 740       case BarrierSet::CardTableExtension:
 741         {
 742           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
 743           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
 744 
 745           Label L_loop;
 746 
 747            __ lsr(start, start, CardTableModRefBS::card_shift);
 748            __ lsr(end, end, CardTableModRefBS::card_shift);
 749            __ sub(end, end, start); // number of bytes to copy
 750 
 751           const Register count = end; // 'end' register contains bytes count now
 752           __ mov(scratch, (address)ct->byte_map_base);
 753           __ add(start, start, scratch);
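          // start now addresses the card table entry for the first card
          // spanned by the copy and count is the index of the last card
          // relative to it; the loop below walks back from the last card
          // to the first, storing zero (dirty) into each entry.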
 754           __ BIND(L_loop);
 755           __ strb(zr, Address(start, count));
 756           __ subs(count, count, 1);
 757           __ br(Assembler::HS, L_loop);
 758         }
 759         break;
 760       default:
 761         ShouldNotReachHere();
 762 
 763     }
 764   }
 765 
 766   typedef enum {
 767     copy_forwards = 1,
 768     copy_backwards = -1
 769   } copy_direction;
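  // the enum value acts as a multiplier: unit = wordSize * direction, so a
  // forward copy steps by +8 bytes per word and a backward copy by -8.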
 770 
 771   // Bulk copy of blocks of 8 words.
 772   //
 773   // count is a count of words.
 774   //
 775   // Precondition: count >= 2
 776   //
 777   // Postconditions:
 778   //
 779   // The least significant bit of count contains the remaining count
 780   // of words to copy.  The rest of count is trash.
 781   //
 782   // s and d are adjusted to point to the remaining words to copy
 783   //
 784   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 785                            copy_direction direction) {
 786     int unit = wordSize * direction;
 787 
 788     int offset;
 789     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 790       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 791 
 792     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 793     assert_different_registers(s, d, count, rscratch1);
 794 
 795     Label again, large, small;
 796     __ align(6);
 797     __ bind(start);
 798     __ cmp(count, 8);
 799     __ br(Assembler::LO, small);
 800     if (direction == copy_forwards) {
 801       __ sub(s, s, 2 * wordSize);
 802       __ sub(d, d, 2 * wordSize);
 803     }
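    // for a forward copy, s and d are biased down by two words so that the
    // ldp/stp offsets of 2, 4, 6 and 8 * unit used below address words
    // 0..7 of the block being copied.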
 804     __ subs(count, count, 16);
 805     __ br(Assembler::GE, large);
 806 
 807     // 8 <= count < 16 words.  Copy 8.
 808     __ ldp(t0, t1, Address(s, 2 * unit));
 809     __ ldp(t2, t3, Address(s, 4 * unit));
 810     __ ldp(t4, t5, Address(s, 6 * unit));
 811     __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 812 
 813     __ stp(t0, t1, Address(d, 2 * unit));
 814     __ stp(t2, t3, Address(d, 4 * unit));
 815     __ stp(t4, t5, Address(d, 6 * unit));
 816     __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 817 
 818     if (direction == copy_forwards) {
 819       __ add(s, s, 2 * wordSize);
 820       __ add(d, d, 2 * wordSize);
 821     }
 822 
 823     {
 824       Label L1, L2;
 825       __ bind(small);
 826       __ tbz(count, exact_log2(4), L1);
 827       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 828       __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 829       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 830       __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 831       __ bind(L1);
 832 
 833       __ tbz(count, 1, L2);
 834       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 835       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 836       __ bind(L2);
 837     }
 838 
 839     __ ret(lr);
 840 
 841     __ align(6);
 842     __ bind(large);
 843 
 844     // Fill 8 registers
 845     __ ldp(t0, t1, Address(s, 2 * unit));
 846     __ ldp(t2, t3, Address(s, 4 * unit));
 847     __ ldp(t4, t5, Address(s, 6 * unit));
 848     __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
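    // main copy loop, software pipelined: each iteration stores the eight
    // words loaded on the previous iteration while loading the next eight.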
 849 
 850     __ bind(again);
 851 
 852     if (direction == copy_forwards && PrefetchCopyIntervalInBytes > 0)
 853       __ prfm(Address(s, PrefetchCopyIntervalInBytes), PLDL1KEEP);
 854 
 855     __ stp(t0, t1, Address(d, 2 * unit));
 856     __ ldp(t0, t1, Address(s, 2 * unit));
 857     __ stp(t2, t3, Address(d, 4 * unit));
 858     __ ldp(t2, t3, Address(s, 4 * unit));
 859     __ stp(t4, t5, Address(d, 6 * unit));
 860     __ ldp(t4, t5, Address(s, 6 * unit));
 861     __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 862     __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 863 
 864     __ subs(count, count, 8);
 865     __ br(Assembler::HS, again);
 866 
 867     // Drain
 868     __ stp(t0, t1, Address(d, 2 * unit));
 869     __ stp(t2, t3, Address(d, 4 * unit));
 870     __ stp(t4, t5, Address(d, 6 * unit));
 871     __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 872 
 873     if (direction == copy_forwards) {
 874       __ add(s, s, 2 * wordSize);
 875       __ add(d, d, 2 * wordSize);
 876     }
 877 
 878     {
 879       Label L1, L2;
 880       __ tbz(count, exact_log2(4), L1);
 881       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 882       __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 883       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 884       __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 885       __ bind(L1);
 886 
 887       __ tbz(count, 1, L2);
 888       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 889       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 890       __ bind(L2);
 891     }
 892 
 893     __ ret(lr);
 894   }
 895 
 896   // Small copy: less than 16 bytes.
 897   //
 898   // NB: Ignores all of the bits of count which represent more than 15
 899   // bytes, so a caller doesn't have to mask them.
 900 
 901   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
 902     bool is_backwards = step < 0;
 903     size_t granularity = uabs(step);
 904     int direction = is_backwards ? -1 : 1;
 905     int unit = wordSize * direction;
 906 
 907     Label Lpair, Lword, Lint, Lshort, Lbyte;
 908 
 909     assert(granularity
 910            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 911 
 912     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
 913 
 914     // ??? I don't know if this bit-test-and-branch is the right thing
 915     // to do.  It does a lot of jumping, resulting in several
 916     // mispredicted branches.  It might make more sense to do this
 917     // with something like Duff's device with a single computed branch.
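    // e.g. for a byte copy (granularity 1) with count == 13 (0b1101), the
    // tests below copy 8 bytes, then 4 bytes, skip the 2-byte step and
    // finally copy the last byte.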
 918 
 919     __ tbz(count, 3 - exact_log2(granularity), Lword);
 920     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
 921     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
 922     __ bind(Lword);
 923 
 924     if (granularity <= sizeof (jint)) {
 925       __ tbz(count, 2 - exact_log2(granularity), Lint);
 926       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 927       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 928       __ bind(Lint);
 929     }
 930 
 931     if (granularity <= sizeof (jshort)) {
 932       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 933       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 934       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 935       __ bind(Lshort);
 936     }
 937 
 938     if (granularity <= sizeof (jbyte)) {
 939       __ tbz(count, 0, Lbyte);
 940       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 941       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 942       __ bind(Lbyte);
 943     }
 944   }
 945 
 946   Label copy_f, copy_b;
 947 
 948   // All-singing all-dancing memory copy.
 949   //
 950   // Copy count units of memory from s to d.  The size of a unit is
 951   // step, which can be positive or negative depending on the direction
 952   // of copy.  If is_aligned is false, we align the source address.
 953   //
 954 
 955   void copy_memory(bool is_aligned, Register s, Register d,
 956                    Register count, Register tmp, int step) {
 957     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 958     bool is_backwards = step < 0;
 959     int granularity = uabs(step);
 960     const Register t0 = r3, t1 = r4;
 961 
 962     if (is_backwards) {
 963       __ lea(s, Address(s, count, Address::uxtw(exact_log2(-step))));
 964       __ lea(d, Address(d, count, Address::uxtw(exact_log2(-step))));
 965     }
 966 
 967     Label done, tail;
 968 
 969     __ cmp(count, 16/granularity);
 970     __ br(Assembler::LO, tail);
 971 
 972     // Now we've got the small case out of the way we can align the
 973     // source address on a 2-word boundary.
 974 
 975     Label aligned;
 976 
 977     if (is_aligned) {
 978       // We may have to adjust by 1 word to get s 2-word-aligned.
 979       __ tbz(s, exact_log2(wordSize), aligned);
 980       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
 981       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
 982       __ sub(count, count, wordSize/granularity);
 983     } else {
 984       if (is_backwards) {
 985         __ andr(rscratch2, s, 2 * wordSize - 1);
 986       } else {
 987         __ neg(rscratch2, s);
 988         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
 989       }
 990       // rscratch2 is the byte adjustment needed to align s.
 991       __ cbz(rscratch2, aligned);
 992       __ lsr(rscratch2, rscratch2, exact_log2(granularity));
 993       __ sub(count, count, rscratch2);
 994 
 995 #if 0
 996       // ?? This code is only correct for a disjoint copy.  It may or
 997       // may not make sense to use it in that case.
 998 
 999       // Copy the first pair; s and d may not be aligned.
1000       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1001       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1002 
1003       // Align s and d, adjust count
1004       if (is_backwards) {
1005         __ sub(s, s, rscratch2);
1006         __ sub(d, d, rscratch2);
1007       } else {
1008         __ add(s, s, rscratch2);
1009         __ add(d, d, rscratch2);
1010       }
1011 #else
1012       copy_memory_small(s, d, rscratch2, rscratch1, step);
1013 #endif
1014     }
1015 
1016     __ cmp(count, 16/granularity);
1017     __ br(Assembler::LT, tail);
1018     __ bind(aligned);
1019 
1020     // s is now 2-word-aligned.
1021 
1022     // We have a count of units and some trailing bytes.  Adjust the
1023     // count and do a bulk copy of words.
1024     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1025     if (direction == copy_forwards)
1026       __ bl(copy_f);
1027     else
1028       __ bl(copy_b);
1029 
1030     // And the tail.
1031 
1032     __ bind(tail);
1033     copy_memory_small(s, d, count, tmp, step);
1034   }
1035 
1036 
1037   void clobber_registers() {
1038 #ifdef ASSERT
1039     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1040     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1041     for (Register r = r3; r <= r18; r++)
1042       if (r != rscratch1) __ mov(r, rscratch1);
1043 #endif
1044   }
1045 
1046   // Scan over array at a for count oops, verifying each one.
1047   // Preserves a and count, clobbers rscratch1 and rscratch2.
1048   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1049     Label loop, end;
1050     __ mov(rscratch1, a);
1051     __ mov(rscratch2, zr);
1052     __ bind(loop);
1053     __ cmp(rscratch2, count);
1054     __ br(Assembler::HS, end);
1055     if (size == (size_t)wordSize) {
1056       __ ldr(temp, Address(a, rscratch2, Address::uxtw(exact_log2(size))));
1057       __ verify_oop(temp);
1058     } else {
      __ ldrw(temp, Address(a, rscratch2, Address::uxtw(exact_log2(size))));
1060       __ decode_heap_oop(temp); // calls verify_oop
1061     }
1062     __ add(rscratch2, rscratch2, size);
1063     __ b(loop);
1064     __ bind(end);
1065   }
1066 
1067   // Arguments:
1068   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1069   //             ignored
1070   //   is_oop  - true => oop array, so generate store check code
1071   //   name    - stub name string
1072   //
1073   // Inputs:
1074   //   c_rarg0   - source array address
1075   //   c_rarg1   - destination array address
1076   //   c_rarg2   - element count, treated as ssize_t, can be zero
1077   //
1078   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1079   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1081   //
1082   // Side Effects:
1083   //   disjoint_int_copy_entry is set to the no-overlap entry point
1084   //   used by generate_conjoint_int_oop_copy().
1085   //
1086   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1087                                   const char *name, bool dest_uninitialized = false) {
1088     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1089     __ align(CodeEntryAlignment);
1090     StubCodeMark mark(this, "StubRoutines", name);
1091     address start = __ pc();
1092     if (entry != NULL) {
1093       *entry = __ pc();
1094       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1095       BLOCK_COMMENT("Entry:");
1096     }
1097     __ enter();
1098     if (is_oop) {
1099       __ push(RegSet::of(d, count), sp);
1100       // no registers are destroyed by this call
1101       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1102     }
1103     copy_memory(aligned, s, d, count, rscratch1, size);
1104     if (is_oop) {
1105       __ pop(RegSet::of(d, count), sp);
1106       if (VerifyOops)
1107         verify_oop_array(size, d, count, r16);
1108       __ sub(count, count, 1); // make an inclusive end pointer
1109       __ lea(count, Address(d, count, Address::uxtw(exact_log2(size))));
1110       gen_write_ref_array_post_barrier(d, count, rscratch1);
1111     }
1112     __ leave();
1113     __ ret(lr);
1114 #ifdef BUILTIN_SIM
1115     {
1116       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1117       sim->notifyCompile(const_cast<char*>(name), start);
1118     }
1119 #endif
1120     return start;
1121   }
1122 
1123   // Arguments:
1124   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1125   //             ignored
1126   //   is_oop  - true => oop array, so generate store check code
1127   //   name    - stub name string
1128   //
1129   // Inputs:
1130   //   c_rarg0   - source array address
1131   //   c_rarg1   - destination array address
1132   //   c_rarg2   - element count, treated as ssize_t, can be zero
1133   //
1134   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1135   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1137   //
1138   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1139                                  address *entry, const char *name,
1140                                  bool dest_uninitialized = false) {
1141     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1142 
1143     StubCodeMark mark(this, "StubRoutines", name);
1144     address start = __ pc();
1145 
1146     __ cmp(d, s);
1147     __ br(Assembler::LS, nooverlap_target);
1148 
1149     __ enter();
1150     if (is_oop) {
1151       __ push(RegSet::of(d, count), sp);
1152       // no registers are destroyed by this call
1153       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1154     }
1155     copy_memory(aligned, s, d, count, rscratch1, -size);
1156     if (is_oop) {
1157       __ pop(RegSet::of(d, count), sp);
1158       if (VerifyOops)
1159         verify_oop_array(size, d, count, r16);
1160       __ sub(count, count, 1); // make an inclusive end pointer
1161       __ lea(count, Address(d, count, Address::uxtw(exact_log2(size))));
1162       gen_write_ref_array_post_barrier(d, count, rscratch1);
1163     }
1164     __ leave();
1165     __ ret(lr);
1166 #ifdef BUILTIN_SIM
1167     {
1168       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1169       sim->notifyCompile(const_cast<char*>(name), start);
1170     }
1171 #endif
1172     return start;
  }
1174 
1175   // Arguments:
1176   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1177   //             ignored
1178   //   name    - stub name string
1179   //
1180   // Inputs:
1181   //   c_rarg0   - source array address
1182   //   c_rarg1   - destination array address
1183   //   c_rarg2   - element count, treated as ssize_t, can be zero
1184   //
1185   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1186   // we let the hardware handle it.  The one to eight bytes within words,
1187   // dwords or qwords that span cache line boundaries will still be loaded
1188   // and stored atomically.
1189   //
1197   // Side Effects:
1198   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1199   //   used by generate_conjoint_byte_copy().
1200   //
1201   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1202     const bool not_oop = false;
1203     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1204   }
1205 
1206   // Arguments:
1207   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1208   //             ignored
1209   //   name    - stub name string
1210   //
1211   // Inputs:
1212   //   c_rarg0   - source array address
1213   //   c_rarg1   - destination array address
1214   //   c_rarg2   - element count, treated as ssize_t, can be zero
1215   //
1216   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1217   // we let the hardware handle it.  The one to eight bytes within words,
1218   // dwords or qwords that span cache line boundaries will still be loaded
1219   // and stored atomically.
1220   //
1221   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1222                                       address* entry, const char *name) {
1223     const bool not_oop = false;
1224     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1225   }
1226 
1227   // Arguments:
1228   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1229   //             ignored
1230   //   name    - stub name string
1231   //
1232   // Inputs:
1233   //   c_rarg0   - source array address
1234   //   c_rarg1   - destination array address
1235   //   c_rarg2   - element count, treated as ssize_t, can be zero
1236   //
1237   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1238   // let the hardware handle it.  The two or four words within dwords
1239   // or qwords that span cache line boundaries will still be loaded
1240   // and stored atomically.
1241   //
1242   // Side Effects:
1243   //   disjoint_short_copy_entry is set to the no-overlap entry point
1244   //   used by generate_conjoint_short_copy().
1245   //
1246   address generate_disjoint_short_copy(bool aligned,
1247                                        address* entry, const char *name) {
1248     const bool not_oop = false;
1249     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1250   }
1251 
1252   // Arguments:
1253   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1254   //             ignored
1255   //   name    - stub name string
1256   //
1257   // Inputs:
1258   //   c_rarg0   - source array address
1259   //   c_rarg1   - destination array address
1260   //   c_rarg2   - element count, treated as ssize_t, can be zero
1261   //
1262   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1263   // let the hardware handle it.  The two or four words within dwords
1264   // or qwords that span cache line boundaries will still be loaded
1265   // and stored atomically.
1266   //
1267   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1268                                        address *entry, const char *name) {
1269     const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
1274   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1275   //             ignored
1276   //   name    - stub name string
1277   //
1278   // Inputs:
1279   //   c_rarg0   - source array address
1280   //   c_rarg1   - destination array address
1281   //   c_rarg2   - element count, treated as ssize_t, can be zero
1282   //
1283   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1284   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1286   //
1287   // Side Effects:
1288   //   disjoint_int_copy_entry is set to the no-overlap entry point
1289   //   used by generate_conjoint_int_oop_copy().
1290   //
1291   address generate_disjoint_int_copy(bool aligned, address *entry,
1292                                          const char *name, bool dest_uninitialized = false) {
1293     const bool not_oop = false;
1294     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1295   }
1296 
1297   // Arguments:
1298   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1299   //             ignored
1300   //   name    - stub name string
1301   //
1302   // Inputs:
1303   //   c_rarg0   - source array address
1304   //   c_rarg1   - destination array address
1305   //   c_rarg2   - element count, treated as ssize_t, can be zero
1306   //
1307   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1308   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1310   //
1311   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1312                                      address *entry, const char *name,
1313                                      bool dest_uninitialized = false) {
1314     const bool not_oop = false;
1315     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1316   }
1317 
1318 
1319   // Arguments:
1320   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1321   //             ignored
1322   //   name    - stub name string
1323   //
1324   // Inputs:
1325   //   c_rarg0   - source array address
1326   //   c_rarg1   - destination array address
1327   //   c_rarg2   - element count, treated as size_t, can be zero
1328   //
1329   // Side Effects:
1330   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1331   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1332   //
1333   address generate_disjoint_long_copy(bool aligned, address *entry,
1334                                           const char *name, bool dest_uninitialized = false) {
1335     const bool not_oop = false;
1336     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1337   }
1338 
1339   // Arguments:
1340   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1341   //             ignored
1342   //   name    - stub name string
1343   //
1344   // Inputs:
1345   //   c_rarg0   - source array address
1346   //   c_rarg1   - destination array address
1347   //   c_rarg2   - element count, treated as size_t, can be zero
1348   //
1349   address generate_conjoint_long_copy(bool aligned,
1350                                       address nooverlap_target, address *entry,
1351                                       const char *name, bool dest_uninitialized = false) {
1352     const bool not_oop = false;
1353     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1354   }
1355 
1356   // Arguments:
1357   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1358   //             ignored
1359   //   name    - stub name string
1360   //
1361   // Inputs:
1362   //   c_rarg0   - source array address
1363   //   c_rarg1   - destination array address
1364   //   c_rarg2   - element count, treated as size_t, can be zero
1365   //
1366   // Side Effects:
1367   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1368   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1369   //
1370   address generate_disjoint_oop_copy(bool aligned, address *entry,
1371                                      const char *name, bool dest_uninitialized = false) {
1372     const bool is_oop = true;
1373     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1374     return generate_disjoint_copy(size, aligned, is_oop, entry, name);
1375   }
1376 
1377   // Arguments:
1378   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1379   //             ignored
1380   //   name    - stub name string
1381   //
1382   // Inputs:
1383   //   c_rarg0   - source array address
1384   //   c_rarg1   - destination array address
1385   //   c_rarg2   - element count, treated as size_t, can be zero
1386   //
1387   address generate_conjoint_oop_copy(bool aligned,
1388                                      address nooverlap_target, address *entry,
1389                                      const char *name, bool dest_uninitialized = false) {
1390     const bool is_oop = true;
1391     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1392     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, name);
1393   }
1394 
1395 
1396   // Helper for generating a dynamic type check.
1397   // Smashes rscratch1.
1398   void generate_type_check(Register sub_klass,
1399                            Register super_check_offset,
1400                            Register super_klass,
1401                            Label& L_success) {
1402     assert_different_registers(sub_klass, super_check_offset, super_klass);
1403 
1404     BLOCK_COMMENT("type_check:");
1405 
1406     Label L_miss;
1407 
1408     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1409                                      super_check_offset);
1410     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1411 
1412     // Fall through on failure!
1413     __ BIND(L_miss);
1414   }
1415 
1416   //
1417   //  Generate checkcasting array copy stub
1418   //
1419   //  Input:
1420   //    c_rarg0   - source array address
1421   //    c_rarg1   - destination array address
1422   //    c_rarg2   - element count, treated as ssize_t, can be zero
1423   //    c_rarg3   - size_t ckoff (super_check_offset)
1424   //    c_rarg4   - oop ckval (super_klass)
1425   //
1426   //  Output:
1427   //    r0 ==  0  -  success
1428   //    r0 == -1^K - failure, where K is partial transfer count
1429   //
1430   address generate_checkcast_copy(const char *name, address *entry,
1431                                   bool dest_uninitialized = false) {
1432 
1433     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1434 
1435     // Input registers (after setup_arg_regs)
1436     const Register from        = c_rarg0;   // source array address
1437     const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // element count
1439     const Register ckoff       = c_rarg3;   // super_check_offset
1440     const Register ckval       = c_rarg4;   // super_klass
1441 
1442     // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // original element count
1444     const Register start_to    = r20;       // destination array start address
1445     const Register copied_oop  = r18;       // actual oop copied
1446     const Register r19_klass   = r19;       // oop._klass
1447 
1448     //---------------------------------------------------------------
1449     // Assembler stub will be used for this call to arraycopy
1450     // if the two arrays are subtypes of Object[] but the
1451     // destination array type is not equal to or a supertype
1452     // of the source type.  Each element must be separately
1453     // checked.
1454 
1455     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1456                                copied_oop, r19_klass, count_save);
1457 
1458     __ align(CodeEntryAlignment);
1459     StubCodeMark mark(this, "StubRoutines", name);
1460     address start = __ pc();
1461 
1462     __ enter(); // required for proper stackwalking of RuntimeStub frame
1463 
1464 #ifdef ASSERT
1465     // caller guarantees that the arrays really are different
1466     // otherwise, we would have to make conjoint checks
1467     { Label L;
1468       array_overlap_test(L, TIMES_OOP);
1469       __ stop("checkcast_copy within a single array");
1470       __ bind(L);
1471     }
1472 #endif //ASSERT
1473 
1474     // Caller of this entry point must set up the argument registers.
1475     if (entry != NULL) {
1476       *entry = __ pc();
1477       BLOCK_COMMENT("Entry:");
1478     }
1479 
    // Empty array: nothing to do.
1481     __ cbz(count, L_done);
1482 
1483     __ push(RegSet::of(r18, r19, r20, r21), sp);
1484 
1485 #ifdef ASSERT
1486     BLOCK_COMMENT("assert consistent ckoff/ckval");
1487     // The ckoff and ckval must be mutually consistent,
1488     // even though caller generates both.
1489     { Label L;
1490       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1491       __ ldrw(start_to, Address(ckval, sco_offset));
1492       __ cmpw(ckoff, start_to);
1493       __ br(Assembler::EQ, L);
1494       __ stop("super_check_offset inconsistent");
1495       __ bind(L);
1496     }
1497 #endif //ASSERT
1498 
1499     // save the original count
1500     __ mov(count_save, count);
1501 
1502     // Copy from low to high addresses
1503     __ mov(start_to, to);              // Save destination array start address
1504     __ b(L_load_element);
1505 
1506     // ======== begin loop ========
1507     // (Loop is rotated; its entry is L_load_element.)
1508     // Loop control:
1509     //   for (; count != 0; count--) {
1510     //     copied_oop = load_heap_oop(from++);
1511     //     ... generate_type_check ...;
1512     //     store_heap_oop(to++, copied_oop);
1513     //   }
1514     __ align(OptoLoopAlignment);
1515 
1516     __ BIND(L_store_element);
1517     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1518     __ sub(count, count, 1);
1519     __ cbz(count, L_do_card_marks);
1520 
1521     // ======== loop entry is here ========
1522     __ BIND(L_load_element);
1523     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1524     __ cbz(copied_oop, L_store_element);
1525 
    __ load_klass(r19_klass, copied_oop); // query the object klass
1527     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1528     // ======== end loop ========
1529 
1530     // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_save = total oops.
1532     // Emit GC store barriers for the oops we have copied and report
1533     // their number to the caller.
1534 
1535     __ subs(count, count_save, count);     // K = partially copied oop count
1536     __ eon(count, count, zr);                   // report (-1^K) to caller
1537     __ br(Assembler::EQ, L_done_pop);
1538 
1539     __ BIND(L_do_card_marks);
1540     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1541     gen_write_ref_array_post_barrier(start_to, to, rscratch1);
1542 
1543     __ bind(L_done_pop);
1544     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1545     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1546 
1547     __ bind(L_done);
1548     __ mov(r0, count);
1549     __ leave();
1550     __ ret(lr);
1551 
1552     return start;
1553   }
1554 
1555   // Perform range checks on the proposed arraycopy.
1556   // Kills temp, but nothing else.
1557   // Also, clean the sign bits of src_pos and dst_pos.
1558   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1559                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
1561                               Register dst_pos, // destination position (c_rarg3)
1562                               Register length,
1563                               Register temp,
1564                               Label& L_failed) { Unimplemented(); }
1565 
  // These stubs are currently only called from a simple test routine;
  // they will be implemented properly once a real caller needs them.
1569   static void fake_arraycopy_stub(address src, address dst, int count) {
1570     assert(count == 0, "huh?");
1571   }
1572 
1573 
1574   void generate_arraycopy_stubs() {
1575     address entry;
1576     address entry_jbyte_arraycopy;
1577     address entry_jshort_arraycopy;
1578     address entry_jint_arraycopy;
1579     address entry_oop_arraycopy;
1580     address entry_jlong_arraycopy;
1581     address entry_checkcast_arraycopy;
1582 
1583     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
1584     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
1585 
1586     //*** jbyte
1587     // Always need aligned and unaligned versions
1588     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
1589                                                                                   "jbyte_disjoint_arraycopy");
1590     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
1591                                                                                   &entry_jbyte_arraycopy,
1592                                                                                   "jbyte_arraycopy");
1593     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
1594                                                                                   "arrayof_jbyte_disjoint_arraycopy");
1595     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
1596                                                                                   "arrayof_jbyte_arraycopy");
1597 
1598     //*** jshort
1599     // Always need aligned and unaligned versions
1600     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
1601                                                                                     "jshort_disjoint_arraycopy");
1602     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
1603                                                                                     &entry_jshort_arraycopy,
1604                                                                                     "jshort_arraycopy");
1605     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
1606                                                                                     "arrayof_jshort_disjoint_arraycopy");
1607     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
1608                                                                                     "arrayof_jshort_arraycopy");
1609 
1610     //*** jint
1611     // Aligned versions
1612     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
1613                                                                                 "arrayof_jint_disjoint_arraycopy");
1614     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
1615                                                                                 "arrayof_jint_arraycopy");
1616     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
1617     // entry_jint_arraycopy always points to the unaligned version
1618     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
1619                                                                                 "jint_disjoint_arraycopy");
1620     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
1621                                                                                 &entry_jint_arraycopy,
1622                                                                                 "jint_arraycopy");
1623 
1624     //*** jlong
1625     // It is always aligned
1626     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
1627                                                                                   "arrayof_jlong_disjoint_arraycopy");
1628     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
1629                                                                                   "arrayof_jlong_arraycopy");
1630     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
1631     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
1632 
1633     //*** oops
1634     {
1635       // With compressed oops we need unaligned versions; notice that
1636       // we overwrite entry_oop_arraycopy.
1637       bool aligned = !UseCompressedOops;
1638 
1639       StubRoutines::_arrayof_oop_disjoint_arraycopy
1640         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy");
1641       StubRoutines::_arrayof_oop_arraycopy
1642         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy");
1643       // Aligned versions without pre-barriers
1644       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
1645         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
1646                                      /*dest_uninitialized*/true);
1647       StubRoutines::_arrayof_oop_arraycopy_uninit
1648         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
1649                                      /*dest_uninitialized*/true);
1650     }
1651 
1652     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
1653     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
1654     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
1655     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
1656 
1657     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
1658     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
1659                                                                         /*dest_uninitialized*/true);
1660   }
1661 
1662   // Arguments:
1663   //
1664   // Inputs:
1665   //   c_rarg0   - source byte array address
1666   //   c_rarg1   - destination byte array address
1667   //   c_rarg2   - K (key) in little endian int array
1668   //
1669   address generate_aescrypt_encryptBlock() {
1670     __ align(CodeEntryAlignment);
1671     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
1672 
1673     Label L_doLast;
1674 
1675     const Register from        = c_rarg0;  // source array address
1676     const Register to          = c_rarg1;  // destination array address
1677     const Register key         = c_rarg2;  // key array address
1678     const Register keylen      = rscratch1;
1679 
1680     address start = __ pc();
1681     __ enter();
1682 
1683     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
1684 
1685     __ ld1(v0, __ T16B, from); // get 16 bytes of input
1686 
1687     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
1688     __ rev32(v1, __ T16B, v1);
1689     __ rev32(v2, __ T16B, v2);
1690     __ rev32(v3, __ T16B, v3);
1691     __ rev32(v4, __ T16B, v4);
1692     __ aese(v0, v1);
1693     __ aesmc(v0, v0);
1694     __ aese(v0, v2);
1695     __ aesmc(v0, v0);
1696     __ aese(v0, v3);
1697     __ aesmc(v0, v0);
1698     __ aese(v0, v4);
1699     __ aesmc(v0, v0);
1700 
1701     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
1702     __ rev32(v1, __ T16B, v1);
1703     __ rev32(v2, __ T16B, v2);
1704     __ rev32(v3, __ T16B, v3);
1705     __ rev32(v4, __ T16B, v4);
1706     __ aese(v0, v1);
1707     __ aesmc(v0, v0);
1708     __ aese(v0, v2);
1709     __ aesmc(v0, v0);
1710     __ aese(v0, v3);
1711     __ aesmc(v0, v0);
1712     __ aese(v0, v4);
1713     __ aesmc(v0, v0);
1714 
1715     __ ld1(v1, v2, __ T16B, __ post(key, 32));
1716     __ rev32(v1, __ T16B, v1);
1717     __ rev32(v2, __ T16B, v2);
1718 
1719     __ cmpw(keylen, 44);
1720     __ br(Assembler::EQ, L_doLast);
1721 
1722     __ aese(v0, v1);
1723     __ aesmc(v0, v0);
1724     __ aese(v0, v2);
1725     __ aesmc(v0, v0);
1726 
1727     __ ld1(v1, v2, __ T16B, __ post(key, 32));
1728     __ rev32(v1, __ T16B, v1);
1729     __ rev32(v2, __ T16B, v2);
1730 
1731     __ cmpw(keylen, 52);
1732     __ br(Assembler::EQ, L_doLast);
1733 
1734     __ aese(v0, v1);
1735     __ aesmc(v0, v0);
1736     __ aese(v0, v2);
1737     __ aesmc(v0, v0);
1738 
1739     __ ld1(v1, v2, __ T16B, __ post(key, 32));
1740     __ rev32(v1, __ T16B, v1);
1741     __ rev32(v2, __ T16B, v2);
1742 
1743     __ BIND(L_doLast);
1744 
1745     __ aese(v0, v1);
1746     __ aesmc(v0, v0);
1747     __ aese(v0, v2);
1748 
1749     __ ld1(v1, __ T16B, key);
1750     __ rev32(v1, __ T16B, v1);
1751     __ eor(v0, __ T16B, v0, v1);
1752 
1753     __ st1(v0, __ T16B, to);
1754 
1755     __ mov(r0, 0);
1756 
1757     __ leave();
1758     __ ret(lr);
1759 
1760     return start;
1761   }
1762 
1763   // Arguments:
1764   //
1765   // Inputs:
1766   //   c_rarg0   - source byte array address
1767   //   c_rarg1   - destination byte array address
1768   //   c_rarg2   - K (key) in little endian int array
1769   //
1770   address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES instructions");
1772     __ align(CodeEntryAlignment);
1773     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
1774     Label L_doLast;
1775 
1776     const Register from        = c_rarg0;  // source array address
1777     const Register to          = c_rarg1;  // destination array address
1778     const Register key         = c_rarg2;  // key array address
1779     const Register keylen      = rscratch1;
1780 
1781     address start = __ pc();
1782     __ enter(); // required for proper stackwalking of RuntimeStub frame
1783 
1784     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
1785 
1786     __ ld1(v0, __ T16B, from); // get 16 bytes of input
1787 
1788     __ ld1(v5, __ T16B, __ post(key, 16));
1789     __ rev32(v5, __ T16B, v5);
1790 
1791     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
1792     __ rev32(v1, __ T16B, v1);
1793     __ rev32(v2, __ T16B, v2);
1794     __ rev32(v3, __ T16B, v3);
1795     __ rev32(v4, __ T16B, v4);
1796     __ aesd(v0, v1);
1797     __ aesimc(v0, v0);
1798     __ aesd(v0, v2);
1799     __ aesimc(v0, v0);
1800     __ aesd(v0, v3);
1801     __ aesimc(v0, v0);
1802     __ aesd(v0, v4);
1803     __ aesimc(v0, v0);
1804 
1805     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
1806     __ rev32(v1, __ T16B, v1);
1807     __ rev32(v2, __ T16B, v2);
1808     __ rev32(v3, __ T16B, v3);
1809     __ rev32(v4, __ T16B, v4);
1810     __ aesd(v0, v1);
1811     __ aesimc(v0, v0);
1812     __ aesd(v0, v2);
1813     __ aesimc(v0, v0);
1814     __ aesd(v0, v3);
1815     __ aesimc(v0, v0);
1816     __ aesd(v0, v4);
1817     __ aesimc(v0, v0);
1818 
1819     __ ld1(v1, v2, __ T16B, __ post(key, 32));
1820     __ rev32(v1, __ T16B, v1);
1821     __ rev32(v2, __ T16B, v2);
1822 
1823     __ cmpw(keylen, 44);
1824     __ br(Assembler::EQ, L_doLast);
1825 
1826     __ aesd(v0, v1);
1827     __ aesimc(v0, v0);
1828     __ aesd(v0, v2);
1829     __ aesimc(v0, v0);
1830 
1831     __ ld1(v1, v2, __ T16B, __ post(key, 32));
1832     __ rev32(v1, __ T16B, v1);
1833     __ rev32(v2, __ T16B, v2);
1834 
1835     __ cmpw(keylen, 52);
1836     __ br(Assembler::EQ, L_doLast);
1837 
1838     __ aesd(v0, v1);
1839     __ aesimc(v0, v0);
1840     __ aesd(v0, v2);
1841     __ aesimc(v0, v0);
1842 
1843     __ ld1(v1, v2, __ T16B, __ post(key, 32));
1844     __ rev32(v1, __ T16B, v1);
1845     __ rev32(v2, __ T16B, v2);
1846 
1847     __ BIND(L_doLast);
1848 
1849     __ aesd(v0, v1);
1850     __ aesimc(v0, v0);
1851     __ aesd(v0, v2);
1852 
1853     __ eor(v0, __ T16B, v0, v5);
1854 
1855     __ st1(v0, __ T16B, to);
1856 
1857     __ mov(r0, 0);
1858 
1859     __ leave();
1860     __ ret(lr);
1861 
1862     return start;
1863   }
1864 
1865   // Arguments:
1866   //
1867   // Inputs:
1868   //   c_rarg0   - source byte array address
1869   //   c_rarg1   - destination byte array address
1870   //   c_rarg2   - K (key) in little endian int array
1871   //   c_rarg3   - r vector byte array address
1872   //   c_rarg4   - input length
1873   //
1874   // Output:
1875   //   x0        - input length
1876   //
1877   address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES instructions");
1879     __ align(CodeEntryAlignment);
1880     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
1881 
1882     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
1883 
1884     const Register from        = c_rarg0;  // source array address
1885     const Register to          = c_rarg1;  // destination array address
1886     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector (IV) address
                                           // and left holding the last encrypted block
1889     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
1890     const Register keylen      = rscratch1;
1891 
1892     address start = __ pc();
1893       __ enter();
1894 
1895       __ mov(rscratch2, len_reg);
1896       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
1897 
1898       __ ld1(v0, __ T16B, rvec);
1899 
1900       __ cmpw(keylen, 52);
1901       __ br(Assembler::CC, L_loadkeys_44);
1902       __ br(Assembler::EQ, L_loadkeys_52);
1903 
1904       __ ld1(v17, v18, __ T16B, __ post(key, 32));
1905       __ rev32(v17, __ T16B, v17);
1906       __ rev32(v18, __ T16B, v18);
1907     __ BIND(L_loadkeys_52);
1908       __ ld1(v19, v20, __ T16B, __ post(key, 32));
1909       __ rev32(v19, __ T16B, v19);
1910       __ rev32(v20, __ T16B, v20);
1911     __ BIND(L_loadkeys_44);
1912       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
1913       __ rev32(v21, __ T16B, v21);
1914       __ rev32(v22, __ T16B, v22);
1915       __ rev32(v23, __ T16B, v23);
1916       __ rev32(v24, __ T16B, v24);
1917       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
1918       __ rev32(v25, __ T16B, v25);
1919       __ rev32(v26, __ T16B, v26);
1920       __ rev32(v27, __ T16B, v27);
1921       __ rev32(v28, __ T16B, v28);
1922       __ ld1(v29, v30, v31, __ T16B, key);
1923       __ rev32(v29, __ T16B, v29);
1924       __ rev32(v30, __ T16B, v30);
1925       __ rev32(v31, __ T16B, v31);
1926 
1927     __ BIND(L_aes_loop);
1928       __ ld1(v1, __ T16B, __ post(from, 16));
1929       __ eor(v0, __ T16B, v0, v1);
1930 
1931       __ br(Assembler::CC, L_rounds_44);
1932       __ br(Assembler::EQ, L_rounds_52);
1933 
1934       __ aese(v0, v17); __ aesmc(v0, v0);
1935       __ aese(v0, v18); __ aesmc(v0, v0);
1936     __ BIND(L_rounds_52);
1937       __ aese(v0, v19); __ aesmc(v0, v0);
1938       __ aese(v0, v20); __ aesmc(v0, v0);
1939     __ BIND(L_rounds_44);
1940       __ aese(v0, v21); __ aesmc(v0, v0);
1941       __ aese(v0, v22); __ aesmc(v0, v0);
1942       __ aese(v0, v23); __ aesmc(v0, v0);
1943       __ aese(v0, v24); __ aesmc(v0, v0);
1944       __ aese(v0, v25); __ aesmc(v0, v0);
1945       __ aese(v0, v26); __ aesmc(v0, v0);
1946       __ aese(v0, v27); __ aesmc(v0, v0);
1947       __ aese(v0, v28); __ aesmc(v0, v0);
1948       __ aese(v0, v29); __ aesmc(v0, v0);
1949       __ aese(v0, v30);
1950       __ eor(v0, __ T16B, v0, v31);
1951 
1952       __ st1(v0, __ T16B, __ post(to, 16));
1953       __ sub(len_reg, len_reg, 16);
1954       __ cbnz(len_reg, L_aes_loop);
1955 
1956       __ st1(v0, __ T16B, rvec);
1957 
1958       __ mov(r0, rscratch2);
1959 
1960       __ leave();
1961       __ ret(lr);
1962 
1963       return start;
1964   }
1965 
1966   // Arguments:
1967   //
1968   // Inputs:
1969   //   c_rarg0   - source byte array address
1970   //   c_rarg1   - destination byte array address
1971   //   c_rarg2   - K (key) in little endian int array
1972   //   c_rarg3   - r vector byte array address
1973   //   c_rarg4   - input length
1974   //
1975   // Output:
  //   r0        - input length
1977   //
1978   address generate_cipherBlockChaining_decryptAESCrypt() {
    assert(UseAES, "need AES instructions");
1980     __ align(CodeEntryAlignment);
1981     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
1982 
1983     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
1984 
1985     const Register from        = c_rarg0;  // source array address
1986     const Register to          = c_rarg1;  // destination array address
1987     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector (IV) address
                                           // and left holding the last input ciphertext block
1990     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
1991     const Register keylen      = rscratch1;
1992 
1993     address start = __ pc();
1994       __ enter();
1995 
1996       __ mov(rscratch2, len_reg);
1997       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
1998 
1999       __ ld1(v2, __ T16B, rvec);
2000 
2001       __ ld1(v31, __ T16B, __ post(key, 16));
2002       __ rev32(v31, __ T16B, v31);
2003 
2004       __ cmpw(keylen, 52);
2005       __ br(Assembler::CC, L_loadkeys_44);
2006       __ br(Assembler::EQ, L_loadkeys_52);
2007 
2008       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2009       __ rev32(v17, __ T16B, v17);
2010       __ rev32(v18, __ T16B, v18);
2011     __ BIND(L_loadkeys_52);
2012       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2013       __ rev32(v19, __ T16B, v19);
2014       __ rev32(v20, __ T16B, v20);
2015     __ BIND(L_loadkeys_44);
2016       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2017       __ rev32(v21, __ T16B, v21);
2018       __ rev32(v22, __ T16B, v22);
2019       __ rev32(v23, __ T16B, v23);
2020       __ rev32(v24, __ T16B, v24);
2021       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2022       __ rev32(v25, __ T16B, v25);
2023       __ rev32(v26, __ T16B, v26);
2024       __ rev32(v27, __ T16B, v27);
2025       __ rev32(v28, __ T16B, v28);
2026       __ ld1(v29, v30, __ T16B, key);
2027       __ rev32(v29, __ T16B, v29);
2028       __ rev32(v30, __ T16B, v30);
2029 
2030     __ BIND(L_aes_loop);
2031       __ ld1(v0, __ T16B, __ post(from, 16));
2032       __ orr(v1, __ T16B, v0, v0);
2033 
2034       __ br(Assembler::CC, L_rounds_44);
2035       __ br(Assembler::EQ, L_rounds_52);
2036 
2037       __ aesd(v0, v17); __ aesimc(v0, v0);
      __ aesd(v0, v18); __ aesimc(v0, v0);
2039     __ BIND(L_rounds_52);
2040       __ aesd(v0, v19); __ aesimc(v0, v0);
2041       __ aesd(v0, v20); __ aesimc(v0, v0);
2042     __ BIND(L_rounds_44);
2043       __ aesd(v0, v21); __ aesimc(v0, v0);
2044       __ aesd(v0, v22); __ aesimc(v0, v0);
2045       __ aesd(v0, v23); __ aesimc(v0, v0);
2046       __ aesd(v0, v24); __ aesimc(v0, v0);
2047       __ aesd(v0, v25); __ aesimc(v0, v0);
2048       __ aesd(v0, v26); __ aesimc(v0, v0);
2049       __ aesd(v0, v27); __ aesimc(v0, v0);
2050       __ aesd(v0, v28); __ aesimc(v0, v0);
2051       __ aesd(v0, v29); __ aesimc(v0, v0);
2052       __ aesd(v0, v30);
2053       __ eor(v0, __ T16B, v0, v31);
2054       __ eor(v0, __ T16B, v0, v2);
2055 
2056       __ st1(v0, __ T16B, __ post(to, 16));
2057       __ orr(v2, __ T16B, v1, v1);
2058 
2059       __ sub(len_reg, len_reg, 16);
2060       __ cbnz(len_reg, L_aes_loop);
2061 
2062       __ st1(v2, __ T16B, rvec);
2063 
2064       __ mov(r0, rscratch2);
2065 
2066       __ leave();
2067       __ ret(lr);
2068 
2069     return start;
2070   }
2071 
2072   // Arguments:
2073   //
2074   // Inputs:
2075   //   c_rarg0   - byte[]  source+offset
2076   //   c_rarg1   - int[]   SHA.state
2077   //   c_rarg2   - int     offset
2078   //   c_rarg3   - int     limit
2079   //
2080   address generate_sha1_implCompress(bool multi_block, const char *name) {
2081     __ align(CodeEntryAlignment);
2082     StubCodeMark mark(this, "StubRoutines", name);
2083     address start = __ pc();
2084 
2085     Register buf   = c_rarg0;
2086     Register state = c_rarg1;
2087     Register ofs   = c_rarg2;
2088     Register limit = c_rarg3;
2089 
2090     Label keys;
2091     Label sha1_loop;
2092 
2093     // load the keys into v0..v3
2094     __ adr(rscratch1, keys);
2095     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load the 5-word SHA-1 state: 4 words into v6, 1 word into v7
2097     __ ldrq(v6, Address(state, 0));
2098     __ ldrs(v7, Address(state, 16));
2099 
2100 
2101     __ BIND(sha1_loop);
2102     // load 64 bytes of data into v16..v19
2103     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2104     __ rev32(v16, __ T16B, v16);
2105     __ rev32(v17, __ T16B, v17);
2106     __ rev32(v18, __ T16B, v18);
2107     __ rev32(v19, __ T16B, v19);
2108 
2109     // do the sha1
2110     __ addv(v4, __ T4S, v16, v0);
2111     __ orr(v20, __ T16B, v6, v6);
2112 
2113     FloatRegister d0 = v16;
2114     FloatRegister d1 = v17;
2115     FloatRegister d2 = v18;
2116     FloatRegister d3 = v19;
2117 
2118     for (int round = 0; round < 20; round++) {
2119       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2120       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2121       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2122       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2123       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2124 
2125       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2126       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2127       __ sha1h(tmp2, __ T4S, v20);
2128       if (round < 5)
2129         __ sha1c(v20, __ T4S, tmp3, tmp4);
2130       else if (round < 10 || round >= 15)
2131         __ sha1p(v20, __ T4S, tmp3, tmp4);
2132       else
2133         __ sha1m(v20, __ T4S, tmp3, tmp4);
2134       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2135 
2136       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2137     }
2138 
2139     __ addv(v7, __ T2S, v7, v21);
2140     __ addv(v6, __ T4S, v6, v20);
2141 
2142     if (multi_block) {
2143       __ add(ofs, ofs, 64);
2144       __ cmp(ofs, limit);
2145       __ br(Assembler::LE, sha1_loop);
2146       __ mov(c_rarg0, ofs); // return ofs
2147     }
2148 
2149     __ strq(v6, Address(state, 0));
2150     __ strs(v7, Address(state, 16));
2151 
2152     __ ret(lr);
2153 
2154     __ bind(keys);
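    // SHA-1 round constants K0..K3 (rounds 1-20, 21-40, 41-60 and 61-80);
    // the ld4r at the top of the stub replicates each one across all four
    // lanes of v0..v3.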
2155     __ emit_int32(0x5a827999);
2156     __ emit_int32(0x6ed9eba1);
2157     __ emit_int32(0x8f1bbcdc);
2158     __ emit_int32(0xca62c1d6);
2159 
2160     return start;
2161   }
2162 
2163 
2164   // Arguments:
2165   //
2166   // Inputs:
2167   //   c_rarg0   - byte[]  source+offset
2168   //   c_rarg1   - int[]   SHA.state
2169   //   c_rarg2   - int     offset
2170   //   c_rarg3   - int     limit
2171   //
2172   address generate_sha256_implCompress(bool multi_block, const char *name) {
2173     static const uint32_t round_consts[64] = {
2174       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
2175       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
2176       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
2177       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
2178       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
2179       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
2180       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
2181       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
2182       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
2183       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
2184       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
2185       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
2186       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
2187       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
2188       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
2189       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
2190     };
2191     __ align(CodeEntryAlignment);
2192     StubCodeMark mark(this, "StubRoutines", name);
2193     address start = __ pc();
2194 
2195     Register buf   = c_rarg0;
2196     Register state = c_rarg1;
2197     Register ofs   = c_rarg2;
2198     Register limit = c_rarg3;
2199 
    Label sha256_loop;
2201 
2202     __ stpd(v8, v9, __ pre(sp, -32));
2203     __ stpd(v10, v11, Address(sp, 16));
2204 
    // dga == v0
    // dgb == v1
    // dg0 == v2
    // dg1 == v3
    // dg2 == v4
    // t0  == v6
    // t1  == v7
2212 
2213     // load 16 keys to v16..v31
2214     __ lea(rscratch1, ExternalAddress((address)round_consts));
2215     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
2216     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
2217     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
2218     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
2219 
2220     // load 8 words (256 bits) state
2221     __ ldpq(v0, v1, state);
2222 
    __ BIND(sha256_loop);
2224     // load 64 bytes of data into v8..v11
2225     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
2226     __ rev32(v8, __ T16B, v8);
2227     __ rev32(v9, __ T16B, v9);
2228     __ rev32(v10, __ T16B, v10);
2229     __ rev32(v11, __ T16B, v11);
2230 
2231     __ addv(v6, __ T4S, v8, v16);
2232     __ orr(v2, __ T16B, v0, v0);
2233     __ orr(v3, __ T16B, v1, v1);
2234 
2235     FloatRegister d0 = v8;
2236     FloatRegister d1 = v9;
2237     FloatRegister d2 = v10;
2238     FloatRegister d3 = v11;
2239 
2240 
2241     for (int round = 0; round < 16; round++) {
2242       FloatRegister tmp1 = (round & 1) ? v6 : v7;
2243       FloatRegister tmp2 = (round & 1) ? v7 : v6;
2244       FloatRegister tmp3 = (round & 1) ? v2 : v4;
2245       FloatRegister tmp4 = (round & 1) ? v4 : v2;
2246 
2247       if (round < 12) __ sha256su0(d0, __ T4S, d1);
2248        __ orr(v4, __ T16B, v2, v2);
2249       if (round < 15)
2250         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
2251       __ sha256h(v2, __ T4S, v3, tmp2);
2252       __ sha256h2(v3, __ T4S, v4, tmp2);
2253       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
2254 
2255       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2256     }
2257 
2258     __ addv(v0, __ T4S, v0, v2);
2259     __ addv(v1, __ T4S, v1, v3);
2260 
2261     if (multi_block) {
2262       __ add(ofs, ofs, 64);
2263       __ cmp(ofs, limit);
      __ br(Assembler::LE, sha256_loop);
2265       __ mov(c_rarg0, ofs); // return ofs
2266     }
2267 
2268     __ ldpd(v10, v11, Address(sp, 16));
2269     __ ldpd(v8, v9, __ post(sp, 32));
2270 
2271     __ stpq(v0, v1, state);
2272 
2273     __ ret(lr);
2274 
2275     return start;
2276   }
2277 
2278 #ifndef BUILTIN_SIM
2279   // Safefetch stubs.
2280   void generate_safefetch(const char* name, int size, address* entry,
2281                           address* fault_pc, address* continuation_pc) {
2282     // safefetch signatures:
2283     //   int      SafeFetch32(int*      adr, int      errValue);
2284     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
2285     //
2286     // arguments:
2287     //   c_rarg0 = adr
2288     //   c_rarg1 = errValue
2289     //
2290     // result:
    //   r0 = *adr or errValue
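    //
    // If the load below faults, the HotSpot signal handler recognizes the
    // faulting pc as *fault_pc and resumes execution at *continuation_pc
    // with c_rarg1 still holding errValue, so the stub returns errValue
    // instead of crashing.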
2292 
2293     StubCodeMark mark(this, "StubRoutines", name);
2294 
2295     // Entry point, pc or function descriptor.
2296     *entry = __ pc();
2297 
2298     // Load *adr into c_rarg1, may fault.
2299     *fault_pc = __ pc();
2300     switch (size) {
2301       case 4:
2302         // int32_t
2303         __ ldrw(c_rarg1, Address(c_rarg0, 0));
2304         break;
2305       case 8:
2306         // int64_t
2307         __ ldr(c_rarg1, Address(c_rarg0, 0));
2308         break;
2309       default:
2310         ShouldNotReachHere();
2311     }
2312 
2313     // return errValue or *adr
2314     *continuation_pc = __ pc();
2315     __ mov(r0, c_rarg1);
2316     __ ret(lr);
2317   }
2318 #endif
2319 
2320   /**
2321    *  Arguments:
2322    *
2323    * Inputs:
2324    *   c_rarg0   - int crc
2325    *   c_rarg1   - byte* buf
2326    *   c_rarg2   - int length
2327    *
2328    * Output:
2329    *       r0   - int crc result
2330    *
2331    * Preserves:
2332    *       r13
2333    *
2334    */
2335   address generate_updateBytesCRC32() {
2336     assert(UseCRC32Intrinsics, "what are we doing here?");
2337 
2338     __ align(CodeEntryAlignment);
2339     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
2340 
2341     address start = __ pc();
2342 
2343     const Register crc   = c_rarg0;  // crc
2344     const Register buf   = c_rarg1;  // source java byte array address
2345     const Register len   = c_rarg2;  // length
2346     const Register table0 = c_rarg3; // crc_table address
2347     const Register table1 = c_rarg4;
2348     const Register table2 = c_rarg5;
2349     const Register table3 = c_rarg6;
2350     const Register tmp3 = c_rarg7;
2351 
2352     BLOCK_COMMENT("Entry:");
2353     __ enter(); // required for proper stackwalking of RuntimeStub frame
2354 
2355     __ kernel_crc32(crc, buf, len,
2356               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
2357 
2358     __ leave(); // required for proper stackwalking of RuntimeStub frame
2359     __ ret(lr);
2360 
2361     return start;
2362   }
2363 
2364   /**
2365    *  Arguments:
2366    *
2367    *  Input:
2368    *    c_rarg0   - x address
2369    *    c_rarg1   - x length
2370    *    c_rarg2   - y address
   *    c_rarg3   - y length
2372    *    c_rarg4   - z address
2373    *    c_rarg5   - z length
2374    */
2375   address generate_multiplyToLen() {
2376     __ align(CodeEntryAlignment);
2377     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
2378 
2379     address start = __ pc();
2380     const Register x     = r0;
2381     const Register xlen  = r1;
2382     const Register y     = r2;
2383     const Register ylen  = r3;
2384     const Register z     = r4;
2385     const Register zlen  = r5;
2386 
2387     const Register tmp1  = r10;
2388     const Register tmp2  = r11;
2389     const Register tmp3  = r12;
2390     const Register tmp4  = r13;
2391     const Register tmp5  = r14;
2392     const Register tmp6  = r15;
2393     const Register tmp7  = r16;
2394 
2395     BLOCK_COMMENT("Entry:");
2396     __ enter(); // required for proper stackwalking of RuntimeStub frame
2397     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
2398     __ leave(); // required for proper stackwalking of RuntimeStub frame
2399     __ ret(lr);
2400 
2401     return start;
2402   }
2403 
2404   // Continuation point for throwing of implicit exceptions that are
2405   // not handled in the current activation. Fabricates an exception
2406   // oop and initiates normal exception dispatching in this
2407   // frame. Since we need to preserve callee-saved values (currently
2408   // only for C2, but done for C1 as well) we need a callee-saved oop
2409   // map and therefore have to make these stubs into RuntimeStubs
2410   // rather than BufferBlobs.  If the compiler needs all registers to
2411   // be preserved between the fault point and the exception handler
2412   // then it must assume responsibility for that in
2413   // AbstractCompiler::continuation_for_implicit_null_exception or
2414   // continuation_for_implicit_division_by_zero_exception. All other
2415   // implicit exceptions (e.g., NullPointerException or
2416   // AbstractMethodError on entry) are either at call sites or
2417   // otherwise assume that stack unwinding will be initiated, so
2418   // caller saved registers were assumed volatile in the compiler.
2419 
2420 #undef __
2421 #define __ masm->
2422 
2423   address generate_throw_exception(const char* name,
2424                                    address runtime_entry,
2425                                    Register arg1 = noreg,
2426                                    Register arg2 = noreg) {
2427     // Information about frame layout at time of blocking runtime call.
2428     // Note that we only have to preserve callee-saved registers since
2429     // the compilers are responsible for supplying a continuation point
2430     // if they expect all registers to be preserved.
2431     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
2432     enum layout {
2433       rfp_off = 0,
2434       rfp_off2,
2435       return_off,
2436       return_off2,
2437       framesize // inclusive of return address
2438     };
2439 
2440     int insts_size = 512;
2441     int locs_size  = 64;
2442 
2443     CodeBuffer code(name, insts_size, locs_size);
2444     OopMapSet* oop_maps  = new OopMapSet();
2445     MacroAssembler* masm = new MacroAssembler(&code);
2446 
2447     address start = __ pc();
2448 
2449     // This is an inlined and slightly modified version of call_VM
2450     // which has the ability to fetch the return PC out of
2451     // thread-local storage and also sets up last_Java_sp slightly
2452     // differently than the real call_VM
2453 
2454     __ enter(); // Save FP and LR before call
2455 
2456     assert(is_even(framesize/2), "sp not 16-byte aligned");
2457 
2458     // lr and fp are already in place
2459     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
2460 
2461     int frame_complete = __ pc() - start;
2462 
2463     // Set up last_Java_sp and last_Java_fp
2464     address the_pc = __ pc();
2465     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
2466 
2467     // Call runtime
2468     if (arg1 != noreg) {
2469       assert(arg2 != c_rarg1, "clobbered");
2470       __ mov(c_rarg1, arg1);
2471     }
2472     if (arg2 != noreg) {
2473       __ mov(c_rarg2, arg2);
2474     }
2475     __ mov(c_rarg0, rthread);
2476     BLOCK_COMMENT("call runtime_entry");
2477     __ mov(rscratch1, runtime_entry);
2478     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
2479 
2480     // Generate oop map
2481     OopMap* map = new OopMap(framesize, 0);
2482 
2483     oop_maps->add_gc_map(the_pc - start, map);
2484 
2485     __ reset_last_Java_frame(true, true);
2486     __ maybe_isb();
2487 
2488     __ leave();
2489 
2490     // check for pending exceptions
2491 #ifdef ASSERT
2492     Label L;
2493     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
2494     __ cbnz(rscratch1, L);
2495     __ should_not_reach_here();
2496     __ bind(L);
2497 #endif // ASSERT
2498     __ b(RuntimeAddress(StubRoutines::forward_exception_entry()));
2499 
2500 
2501     // codeBlob framesize is in words (not VMRegImpl::slot_size)
2502     RuntimeStub* stub =
2503       RuntimeStub::new_runtime_stub(name,
2504                                     &code,
2505                                     frame_complete,
2506                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
2507                                     oop_maps, false);
2508     return stub->entry_point();
2509   }
2510 
2511   // Initialization
2512   void generate_initial() {
    // Generate the initial stubs and initialize their entry points
2514 
    // Entry points that exist on all platforms.  Note: this is code
    // that could be shared among different platforms; however, the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure.  See also the comment
    // in stubRoutines.hpp.
2520 
2521     StubRoutines::_forward_exception_entry = generate_forward_exception();
2522 
2523     StubRoutines::_call_stub_entry =
2524       generate_call_stub(StubRoutines::_call_stub_return_address);
2525 
2526     // is referenced by megamorphic call
2527     StubRoutines::_catch_exception_entry = generate_catch_exception();
2528 
2529     // Build this early so it's available for the interpreter.
2530     StubRoutines::_throw_StackOverflowError_entry =
2531       generate_throw_exception("StackOverflowError throw_exception",
2532                                CAST_FROM_FN_PTR(address,
2533                                                 SharedRuntime::
2534                                                 throw_StackOverflowError));
2535     if (UseCRC32Intrinsics) {
      // set the table address before generating the stubs that use it
2537       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
2538       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
2539     }
2540   }
2541 
2542   void generate_all() {
2543     // support for verify_oop (must happen after universe_init)
2544     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
2545     StubRoutines::_throw_AbstractMethodError_entry =
2546       generate_throw_exception("AbstractMethodError throw_exception",
2547                                CAST_FROM_FN_PTR(address,
2548                                                 SharedRuntime::
2549                                                 throw_AbstractMethodError));
2550 
2551     StubRoutines::_throw_IncompatibleClassChangeError_entry =
2552       generate_throw_exception("IncompatibleClassChangeError throw_exception",
2553                                CAST_FROM_FN_PTR(address,
2554                                                 SharedRuntime::
2555                                                 throw_IncompatibleClassChangeError));
2556 
2557     StubRoutines::_throw_NullPointerException_at_call_entry =
2558       generate_throw_exception("NullPointerException at call throw_exception",
2559                                CAST_FROM_FN_PTR(address,
2560                                                 SharedRuntime::
2561                                                 throw_NullPointerException_at_call));
2562 
2563     // arraycopy stubs used by compilers
2564     generate_arraycopy_stubs();
2565 
2566     if (UseMultiplyToLenIntrinsic) {
2567       StubRoutines::_multiplyToLen = generate_multiplyToLen();
2568     }
2569 
2570 #ifndef BUILTIN_SIM
2571     if (UseAESIntrinsics) {
2572       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
2573       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
2574       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
2575       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
2576     }
2577 
2578     if (UseSHA1Intrinsics) {
2579       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
2580       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
2581     }
2582     if (UseSHA256Intrinsics) {
2583       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
2584       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
2585     }
2586 
2587     // Safefetch stubs.
2588     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
2589                                                        &StubRoutines::_safefetch32_fault_pc,
2590                                                        &StubRoutines::_safefetch32_continuation_pc);
2591     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
2592                                                        &StubRoutines::_safefetchN_fault_pc,
2593                                                        &StubRoutines::_safefetchN_continuation_pc);
2594 #endif
2595   }
2596 
2597  public:
2598   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
2599     if (all) {
2600       generate_all();
2601     } else {
2602       generate_initial();
2603     }
2604   }
2605 }; // end class declaration
2606 
2607 void StubGenerator_generate(CodeBuffer* code, bool all) {
2608   StubGenerator g(code, all);
2609 }