1 /*
   2  * Copyright (c) 2013, Red Hat Inc.
   3  * Copyright (c) 2003, 2011, Oracle and/or its affiliates.
   4  * All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "precompiled.hpp"
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "interpreter/interpreter.hpp"
  31 #include "nativeInst_aarch64.hpp"
  32 #include "oops/instanceOop.hpp"
  33 #include "oops/method.hpp"
  34 #include "oops/objArrayKlass.hpp"
  35 #include "oops/oop.inline.hpp"
  36 #include "prims/methodHandles.hpp"
  37 #include "runtime/frame.inline.hpp"
  38 #include "runtime/handles.inline.hpp"
  39 #include "runtime/sharedRuntime.hpp"
  40 #include "runtime/stubCodeGenerator.hpp"
  41 #include "runtime/stubRoutines.hpp"
  42 #include "runtime/thread.inline.hpp"
  43 #include "utilities/top.hpp"
  44 #ifdef COMPILER2
  45 #include "opto/runtime.hpp"
  46 #endif
  47 
  48 #ifdef BUILTIN_SIM
  49 #include "../../../../../../simulator/simulator.hpp"
  50 #endif
  51 
  52 // Declaration and definition of StubGenerator (no .hpp file).
  53 // For a more detailed description of the stub routine structure
  54 // see the comment in stubRoutines.hpp
  55 
  56 #undef __
  57 #define __ _masm->
  58 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
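// TIMES_OOP scales an int index register to a heap-oop byte offset: with
// UseCompressedOops an element is 4 bytes, so the index is sign-extended
// and shifted left by 2, otherwise by 3.  For example, element 5 of a
// narrow-oop array sits at base + (5 << 2) = base + 20.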
  59 
  60 #ifdef PRODUCT
  61 #define BLOCK_COMMENT(str) /* nothing */
  62 #else
  63 #define BLOCK_COMMENT(str) __ block_comment(str)
  64 #endif
  65 
  66 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  67 
  68 // Stub Code definitions
  69 
  70 class StubGenerator: public StubCodeGenerator {
  71  private:
  72 
  73 #ifdef PRODUCT
  74 #define inc_counter_np(counter) ((void)0)
  75 #else
  76   void inc_counter_np_(int& counter) {
  77     __ lea(rscratch2, ExternalAddress((address)&counter));
  78     __ ldrw(rscratch1, Address(rscratch2));
  79     __ addw(rscratch1, rscratch1, 1);
  80     __ strw(rscratch1, Address(rscratch2));
  81   }
  82 #define inc_counter_np(counter) \
  83   BLOCK_COMMENT("inc_counter " #counter); \
  84   inc_counter_np_(counter);
  85 #endif
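
  // Usage sketch (the counter named here is illustrative rather than taken
  // from this file): in a non-PRODUCT build a line such as
  //
  //   inc_counter_np(SharedRuntime::_jint_array_copy_ctr);
  //
  // emits a block comment plus a load/add/store of the 32-bit counter via
  // rscratch1/rscratch2; in PRODUCT builds it expands to nothing.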
  86 
  87   // Call stubs are used to call Java from C
  88   //
  89   // Arguments:
  90   //    c_rarg0:   call wrapper address                   address
  91   //    c_rarg1:   result                                 address
  92   //    c_rarg2:   result type                            BasicType
  93   //    c_rarg3:   method                                 Method*
  94   //    c_rarg4:   (interpreter) entry point              address
  95   //    c_rarg5:   parameters                             intptr_t*
  96   //    c_rarg6:   parameter size (in words)              int
  97   //    c_rarg7:   thread                                 Thread*
  98   //
  // There is no return value from the stub itself: any Java result
  // is written through the result pointer.  (A sketch of the C-side
  // signature that reaches this stub follows the frame layout below.)
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // save r29 (fp) below it, then install sp (r31) into fp so that fp
  // acts as the frame pointer.
 105   //
 106   // we save r0-r7, which accounts for all the c arguments.
 107   //
 108   // TODO: strictly do we need to save them all? they are treated as
 109   // volatile by C so could we omit saving the ones we are going to
 110   // place in global registers (thread? method?) or those we only use
 111   // during setup of the Java call?
 112   //
 113   // we don't need to save r8 which C uses as an indirect result location
 114   // return register.
 115   //
 116   // we don't need to save r9-r15 which both C and Java treat as
 117   // volatile
 118   //
  // we don't need to save r16-r18 because Java does not use them
 120   //
 121   // we save r19-r28 which Java uses as scratch registers and C
 122   // expects to be callee-save
 123   //
 124   // we don't save any FP registers since only v8-v15 are callee-save
 125   // (strictly only the f and d components) and Java uses them as
 126   // callee-save. v0-v7 are arg registers and C treats v16-v31 as
 127   // volatile (as does Java?)
 128   //
 129   // so the stub frame looks like this when we enter Java code
 130   //
 131   //     [ return_from_Java     ] <--- sp
 132   //     [ argument word n      ]
 133   //      ...
 134   // -27 [ argument word 1      ]
 135   // -26 [ saved d15            ] <--- sp_after_call
 136   // -25 [ saved d14            ]
 137   // -24 [ saved d13            ]
 138   // -23 [ saved d12            ]
 139   // -22 [ saved d11            ]
 140   // -21 [ saved d10            ]
 141   // -20 [ saved d9             ]
 142   // -19 [ saved d8             ]
 143   // -18 [ saved r28            ]
 144   // -17 [ saved r27            ]
 145   // -16 [ saved r26            ]
 146   // -15 [ saved r25            ]
 147   // -14 [ saved r24            ]
 148   // -13 [ saved r23            ]
 149   // -12 [ saved r22            ]
 150   // -11 [ saved r21            ]
 151   // -10 [ saved r20            ]
 152   //  -9 [ saved r19            ]
 153   //  -8 [ call wrapper    (r0) ]
 154   //  -7 [ result          (r1) ]
 155   //  -6 [ result type     (r2) ]
 156   //  -5 [ method          (r3) ]
 157   //  -4 [ entry point     (r4) ]
 158   //  -3 [ parameters      (r5) ]
 159   //  -2 [ parameter size  (r6) ]
 160   //  -1 [ thread (r7)          ]
 161   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 162   //   1 [ saved lr       (r30) ]
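  //
  // For reference, here is a sketch of the C-side signature that ends up
  // invoking this stub (cf. the CallStub typedef in stubRoutines.hpp;
  // reproduced here as a guide only, the header is authoritative):
  //
  //   typedef void (*CallStub)(address   link,            // call wrapper
  //                            intptr_t* result,
  //                            BasicType result_type,
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            TRAPS);
  //
  // The eight arguments arrive in c_rarg0..c_rarg7 as listed above.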
 163 
 164   // Call stub stack layout word offsets from fp
 165   enum call_stub_layout {
 166     sp_after_call_off = -26,
 167 
 168     d15_off            = -26,
 169     d14_off            = -25,
 170     d13_off            = -24,
 171     d12_off            = -23,
 172     d11_off            = -22,
 173     d10_off            = -21,
 174     d9_off             = -20,
 175     d8_off             = -19,
 176 
 177     r28_off            = -18,
 178     r27_off            = -17,
 179     r26_off            = -16,
 180     r25_off            = -15,
 181     r24_off            = -14,
 182     r23_off            = -13,
 183     r22_off            = -12,
 184     r21_off            = -11,
 185     r20_off            = -10,
 186     r19_off            =  -9,
 187     call_wrapper_off   =  -8,
 188     result_off         =  -7,
 189     result_type_off    =  -6,
 190     method_off         =  -5,
 191     entry_point_off    =  -4,
 192     parameters_off     =  -3,
 193     parameter_size_off =  -2,
 194     thread_off         =  -1,
 195     fp_f               =   0,
 196     retaddr_off        =   1,
 197   };
 198 
 199   address generate_call_stub(address& return_address) {
 200     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 201            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 202            "adjust this code");
 203 
 204     StubCodeMark mark(this, "StubRoutines", "call_stub");
 205     address start = __ pc();
 206 
 207     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 208 
 209     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 210     const Address result        (rfp, result_off         * wordSize);
 211     const Address result_type   (rfp, result_type_off    * wordSize);
 212     const Address method        (rfp, method_off         * wordSize);
 213     const Address entry_point   (rfp, entry_point_off    * wordSize);
 214     const Address parameters    (rfp, parameters_off     * wordSize);
 215     const Address parameter_size(rfp, parameter_size_off * wordSize);
 216 
 217     const Address thread        (rfp, thread_off         * wordSize);
 218 
 219     const Address d15_save      (rfp, d15_off * wordSize);
 220     const Address d14_save      (rfp, d14_off * wordSize);
 221     const Address d13_save      (rfp, d13_off * wordSize);
 222     const Address d12_save      (rfp, d12_off * wordSize);
 223     const Address d11_save      (rfp, d11_off * wordSize);
 224     const Address d10_save      (rfp, d10_off * wordSize);
 225     const Address d9_save       (rfp, d9_off * wordSize);
 226     const Address d8_save       (rfp, d8_off * wordSize);
 227 
 228     const Address r28_save      (rfp, r28_off * wordSize);
 229     const Address r27_save      (rfp, r27_off * wordSize);
 230     const Address r26_save      (rfp, r26_off * wordSize);
 231     const Address r25_save      (rfp, r25_off * wordSize);
 232     const Address r24_save      (rfp, r24_off * wordSize);
 233     const Address r23_save      (rfp, r23_off * wordSize);
 234     const Address r22_save      (rfp, r22_off * wordSize);
 235     const Address r21_save      (rfp, r21_off * wordSize);
 236     const Address r20_save      (rfp, r20_off * wordSize);
 237     const Address r19_save      (rfp, r19_off * wordSize);
 238 
 239     // stub code
 240 
 241     // we need a C prolog to bootstrap the x86 caller into the sim
 242     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 243 
 244     address aarch64_entry = __ pc();
 245 
 246 #ifdef BUILTIN_SIM
 247     // Save sender's SP for stack traces.
 248     __ mov(rscratch1, sp);
 249     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 250 #endif
 251     // set up frame and move sp to end of save area
 252     __ enter();
 253     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 254 
 255     // save register parameters and Java scratch/global registers
 256     // n.b. we save thread even though it gets installed in
 257     // rthread because we want to sanity check rthread later
 258     __ str(c_rarg7,  thread);
 259     __ strw(c_rarg6, parameter_size);
 260     __ str(c_rarg5,  parameters);
 261     __ str(c_rarg4,  entry_point);
 262     __ str(c_rarg3,  method);
 263     __ str(c_rarg2,  result_type);
 264     __ str(c_rarg1,  result);
 265     __ str(c_rarg0,  call_wrapper);
 266     __ str(r19,      r19_save);
 267     __ str(r20,      r20_save);
 268     __ str(r21,      r21_save);
 269     __ str(r22,      r22_save);
 270     __ str(r23,      r23_save);
 271     __ str(r24,      r24_save);
 272     __ str(r25,      r25_save);
 273     __ str(r26,      r26_save);
 274     __ str(r27,      r27_save);
 275     __ str(r28,      r28_save);
 276 
 277     __ strd(v8,      d8_save);
 278     __ strd(v9,      d9_save);
 279     __ strd(v10,     d10_save);
 280     __ strd(v11,     d11_save);
 281     __ strd(v12,     d12_save);
 282     __ strd(v13,     d13_save);
 283     __ strd(v14,     d14_save);
 284     __ strd(v15,     d15_save);
 285 
 286     // install Java thread in global register now we have saved
 287     // whatever value it held
 288     __ mov(rthread, c_rarg7);
 289     // And method
 290     __ mov(rmethod, c_rarg3);
 291 
 292     // set up the heapbase register
 293     __ reinit_heapbase();
 294 
 295 #ifdef ASSERT
 296     // make sure we have no pending exceptions
 297     {
 298       Label L;
 299       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 300       __ cmp(rscratch1, (unsigned)NULL_WORD);
 301       __ br(Assembler::EQ, L);
 302       __ stop("StubRoutines::call_stub: entered with pending exception");
 303       __ BIND(L);
 304     }
 305 #endif
 306     // pass parameters if any
 307     __ mov(esp, sp);
 308     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 309     __ andr(sp, rscratch1, -2 * wordSize);
 310 
 311     BLOCK_COMMENT("pass parameters if any");
 312     Label parameters_done;
 313     // parameter count is still in c_rarg6
 314     // and parameter pointer identifying param 1 is in c_rarg5
 315     __ cbzw(c_rarg6, parameters_done);
 316 
 317     address loop = __ pc();
 318     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 319     __ subsw(c_rarg6, c_rarg6, 1);
 320     __ push(rscratch1);
 321     __ br(Assembler::GT, loop);
 322 
 323     __ BIND(parameters_done);
 324 
    // call Java entry -- passing Method* and current sp
 326     //      rmethod: Method*
 327     //      r13: sender sp
 328     BLOCK_COMMENT("call Java function");
 329     __ mov(r13, sp);
 330     __ blr(c_rarg4);
 331 
 332     // tell the simulator we have returned to the stub
 333 
 334     // we do this here because the notify will already have been done
 335     // if we get to the next instruction via an exception
 336     //
 337     // n.b. adding this instruction here affects the calculation of
 338     // whether or not a routine returns to the call stub (used when
 339     // doing stack walks) since the normal test is to check the return
 340     // pc against the address saved below. so we may need to allow for
 341     // this extra instruction in the check.
 342 
 343     if (NotifySimulator) {
 344       __ notify(Assembler::method_reentry);
 345     }
 346     // save current address for use by exception handling code
 347 
 348     return_address = __ pc();
 349 
 350     // store result depending on type (everything that is not
 351     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 352     // n.b. this assumes Java returns an integral result in r0
 353     // and a floating result in j_farg0
 354     __ ldr(j_rarg2, result);
 355     Label is_long, is_float, is_double, exit;
 356     __ ldr(j_rarg1, result_type);
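    // n.b. a T_OBJECT result is stored as a full 64-bit word, so it shares
    // the is_long path below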
 357     __ cmp(j_rarg1, T_OBJECT);
 358     __ br(Assembler::EQ, is_long);
 359     __ cmp(j_rarg1, T_LONG);
 360     __ br(Assembler::EQ, is_long);
 361     __ cmp(j_rarg1, T_FLOAT);
 362     __ br(Assembler::EQ, is_float);
 363     __ cmp(j_rarg1, T_DOUBLE);
 364     __ br(Assembler::EQ, is_double);
 365 
 366     // handle T_INT case
 367     __ strw(r0, Address(j_rarg2));
 368 
 369     __ BIND(exit);
 370 
 371     // pop parameters
 372     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 373 
 374 #ifdef ASSERT
 375     // verify that threads correspond
 376     {
 377       Label L, S;
 378       __ ldr(rscratch1, thread);
 379       __ cmp(rthread, rscratch1);
 380       __ br(Assembler::NE, S);
 381       __ get_thread(rscratch1);
 382       __ cmp(rthread, rscratch1);
 383       __ br(Assembler::EQ, L);
 384       __ BIND(S);
 385       __ stop("StubRoutines::call_stub: threads must correspond");
 386       __ BIND(L);
 387     }
 388 #endif
 389 
 390     // restore callee-save registers
 391     __ ldrd(v15,      d15_save);
 392     __ ldrd(v14,      d14_save);
 393     __ ldrd(v13,      d13_save);
 394     __ ldrd(v12,      d12_save);
 395     __ ldrd(v11,      d11_save);
 396     __ ldrd(v10,      d10_save);
 397     __ ldrd(v9,       d9_save);
 398     __ ldrd(v8,       d8_save);
 399 
 400     __ ldr(r28,      r28_save);
 401     __ ldr(r27,      r27_save);
 402     __ ldr(r26,      r26_save);
 403     __ ldr(r25,      r25_save);
 404     __ ldr(r24,      r24_save);
 405     __ ldr(r23,      r23_save);
 406     __ ldr(r22,      r22_save);
 407     __ ldr(r21,      r21_save);
 408     __ ldr(r20,      r20_save);
 409     __ ldr(r19,      r19_save);
 410     __ ldr(c_rarg0,  call_wrapper);
 411     __ ldr(c_rarg1,  result);
 412     __ ldrw(c_rarg2, result_type);
 413     __ ldr(c_rarg3,  method);
 414     __ ldr(c_rarg4,  entry_point);
 415     __ ldr(c_rarg5,  parameters);
 416     __ ldr(c_rarg6,  parameter_size);
 417     __ ldr(c_rarg7,  thread);
 418 
 419 #ifndef PRODUCT
 420     // tell the simulator we are about to end Java execution
 421     if (NotifySimulator) {
 422       __ notify(Assembler::method_exit);
 423     }
 424 #endif
 425     // leave frame and return to caller
 426     __ leave();
 427     __ ret(lr);
 428 
 429     // handle return types different from T_INT
 430 
 431     __ BIND(is_long);
 432     __ str(r0, Address(j_rarg2, 0));
 433     __ br(Assembler::AL, exit);
 434 
 435     __ BIND(is_float);
 436     __ strs(j_farg0, Address(j_rarg2, 0));
 437     __ br(Assembler::AL, exit);
 438 
 439     __ BIND(is_double);
 440     __ strd(j_farg0, Address(j_rarg2, 0));
 441     __ br(Assembler::AL, exit);
 442 
 443     return start;
 444   }
 445 
 446   // Return point for a Java call if there's an exception thrown in
 447   // Java code.  The exception is caught and transformed into a
 448   // pending exception stored in JavaThread that can be tested from
 449   // within the VM.
 450   //
 451   // Note: Usually the parameters are removed by the callee. In case
 452   // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // sp.
 455   //
 456   // r0: exception oop
 457 
 458   // NOTE: this is used as a target from the signal handler so it
 459   // needs an x86 prolog which returns into the current simulator
 460   // executing the generated catch_exception code. so the prolog
 461   // needs to install rax in a sim register and adjust the sim's
 462   // restart pc to enter the generated code at the start position
 463   // then return from native to simulated execution.
 464 
 465   address generate_catch_exception() {
 466     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 467     address start = __ pc();
 468 
 469     // same as in generate_call_stub():
 470     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 471     const Address thread        (rfp, thread_off         * wordSize);
 472 
 473 #ifdef ASSERT
 474     // verify that threads correspond
 475     {
 476       Label L, S;
 477       __ ldr(rscratch1, thread);
 478       __ cmp(rthread, rscratch1);
 479       __ br(Assembler::NE, S);
 480       __ get_thread(rscratch1);
 481       __ cmp(rthread, rscratch1);
 482       __ br(Assembler::EQ, L);
 483       __ bind(S);
 484       __ stop("StubRoutines::catch_exception: threads must correspond");
 485       __ bind(L);
 486     }
 487 #endif
 488 
 489     // set pending exception
 490     __ verify_oop(r0);
 491 
 492     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 493     __ mov(rscratch1, (address)__FILE__);
 494     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 495     __ movw(rscratch1, (int)__LINE__);
 496     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 497 
 498     // complete return to VM
 499     assert(StubRoutines::_call_stub_return_address != NULL,
 500            "_call_stub_return_address must have been generated before");
 501     __ b(StubRoutines::_call_stub_return_address);
 502 
 503     return start;
 504   }
 505 
 506   // Continuation point for runtime calls returning with a pending
 507   // exception.  The pending exception check happened in the runtime
 508   // or native call stub.  The pending exception in Thread is
 509   // converted into a Java-level exception.
 510   //
 511   // Contract with Java-level exception handlers:
 512   // r0: exception
 513   // r3: throwing pc
 514   //
 515   // NOTE: At entry of this stub, exception-pc must be in LR !!
 516 
 517   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog
 519 
 520   address generate_forward_exception() {
 521     StubCodeMark mark(this, "StubRoutines", "forward exception");
 522     address start = __ pc();
 523 
 524     // Upon entry, LR points to the return address returning into
 525     // Java (interpreted or compiled) code; i.e., the return address
 526     // becomes the throwing pc.
 527     //
 528     // Arguments pushed before the runtime call are still on the stack
 529     // but the exception handler will reset the stack pointer ->
 530     // ignore them.  A potential result in registers can be ignored as
 531     // well.
 532 
 533 #ifdef ASSERT
 534     // make sure this code is only executed if there is a pending exception
 535     {
 536       Label L;
 537       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 538       __ cbnz(rscratch1, L);
 539       __ stop("StubRoutines::forward exception: no pending exception (1)");
 540       __ bind(L);
 541     }
 542 #endif
 543 
 544     // compute exception handler into r19
 545 
 546     // call the VM to find the handler address associated with the
 547     // caller address. pass thread in r0 and caller pc (ret address)
 548     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 549     // the stack.
 550     __ mov(c_rarg1, lr);
 551     // lr will be trashed by the VM call so we move it to R19
 552     // (callee-saved) because we also need to pass it to the handler
 553     // returned by this call.
 554     __ mov(r19, lr);
 555     BLOCK_COMMENT("call exception_handler_for_return_address");
 556     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 557                          SharedRuntime::exception_handler_for_return_address),
 558                     rthread, c_rarg1);
 559     // we should not really care that lr is no longer the callee
 560     // address. we saved the value the handler needs in r19 so we can
 561     // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
 563     // the PC for the frame above the handler belongs to a compiled
 564     // Java method. So, we restore lr here to satisfy that assert.
 565     __ mov(lr, r19);
 566     // setup r0 & r3 & clear pending exception
 567     __ mov(r3, r19);
 568     __ mov(r19, r0);
 569     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 570     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 571 
 572 #ifdef ASSERT
 573     // make sure exception is set
 574     {
 575       Label L;
 576       __ cbnz(r0, L);
 577       __ stop("StubRoutines::forward exception: no pending exception (2)");
 578       __ bind(L);
 579     }
 580 #endif
 581 
 582     // continue at exception handler
 583     // r0: exception
 584     // r3: throwing pc
 585     // r19: exception handler
 586     __ verify_oop(r0);
 587     __ br(r19);
 588 
 589     return start;
 590   }
 591 
 592   // Non-destructive plausibility checks for oops
 593   //
 594   // Arguments:
 595   //    r0: oop to verify
 596   //    rscratch1: error message
 597   //
 598   // Stack after saving c_rarg3:
 599   //    [tos + 0]: saved c_rarg3
 600   //    [tos + 1]: saved c_rarg2
 601   //    [tos + 2]: saved lr
 602   //    [tos + 3]: saved rscratch2
 603   //    [tos + 4]: saved r0
 604   //    [tos + 5]: saved rscratch1
 605   address generate_verify_oop() {
 606 
 607     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 608     address start = __ pc();
 609 
 610     Label exit, error;
 611 
 612     // save c_rarg2 and c_rarg3
 613     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 614 
 615     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 616     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 617     __ ldr(c_rarg3, Address(c_rarg2));
 618     __ add(c_rarg3, c_rarg3, 1);
 619     __ str(c_rarg3, Address(c_rarg2));
 620 
 621     // object is in r0
 622     // make sure object is 'reasonable'
 623     __ cbz(r0, exit); // if obj is NULL it is OK
 624 
 625     // Check if the oop is in the right area of memory
 626     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 627     __ andr(c_rarg2, r0, c_rarg3);
 628     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 629 
 630     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 631     // instruction here because the flags register is live.
 632     __ eor(c_rarg2, c_rarg2, c_rarg3);
 633     __ cbnz(c_rarg2, error);
 634 
    // make sure klass is 'reasonable', i.e. not NULL.
 636     __ load_klass(r0, r0);  // get klass
 637     __ cbz(r0, error);      // if klass is NULL it is broken
 638 
 639     // return if everything seems ok
 640     __ bind(exit);
 641 
 642     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 643     __ ret(lr);
 644 
 645     // handle errors
 646     __ bind(error);
 647     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 648 
 649     __ push(RegSet::range(r0, r29), sp);
 650     // debug(char* msg, int64_t pc, int64_t regs[])
 651     __ mov(c_rarg0, rscratch1);      // pass address of error message
 652     __ mov(c_rarg1, lr);             // pass return address
 653     __ mov(c_rarg2, sp);             // pass address of regs on stack
 654 #ifndef PRODUCT
 655     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 656 #endif
 657     BLOCK_COMMENT("call MacroAssembler::debug");
 658     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 659     __ blrt(rscratch1, 3, 0, 1);
 660 
 661     return start;
 662   }
 663 
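  // n.b. this overlap test is currently just a placeholder: it always
  // branches to L_no_overlap, so the ASSERT-only checks built on it can
  // never fire.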
 664   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 665 
 666   // Generate code for an array write pre barrier
 667   //
 668   //     addr    -  starting address
 669   //     count   -  element count
 670   //     tmp     - scratch register
 671   //
 672   //     Destroy no registers!
 673   //
 674   void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
 675     BarrierSet* bs = Universe::heap()->barrier_set();
 676     switch (bs->kind()) {
 677     case BarrierSet::G1SATBCT:
 678     case BarrierSet::G1SATBCTLogging:
      // With G1, don't generate the call if we statically know that the target is uninitialized
      if (!dest_uninitialized) {
        __ push(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
        if (count == c_rarg0) {
          if (addr == c_rarg1) {
            // exactly backwards!! swap the two arguments via the stack
            __ stp(c_rarg0, c_rarg1, __ pre(sp, -2 * wordSize));
            __ ldp(c_rarg1, c_rarg0, __ post(sp, 2 * wordSize));
          } else {
            __ mov(c_rarg1, count);
            __ mov(c_rarg0, addr);
          }
        } else {
          __ mov(c_rarg0, addr);
          __ mov(c_rarg1, count);
        }
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
        __ pop(RegSet::range(r0, r29), sp);          // integer registers except lr & sp
      }
      break;
    case BarrierSet::CardTableModRef:
    case BarrierSet::CardTableExtension:
    case BarrierSet::ModRef:
      break;
    default:
      ShouldNotReachHere();
    }
  }
 708 
 709   //
 710   // Generate code for an array write post barrier
 711   //
 712   //  Input:
 713   //     start    - register containing starting address of destination array
 714   //     end      - register containing ending address of destination array
 715   //     scratch  - scratch register
 716   //
 717   //  The input registers are overwritten.
 718   //  The ending address is inclusive.
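  //  Worked example for the card table case, assuming the usual 512-byte
  //  cards (CardTableModRefBS::card_shift == 9): if 'start' and 'end' land
  //  on cards N and N+2, then after the two shifts and the subtraction
  //  'end' holds 2, and the loop below stores zero at card-byte offsets
  //  2, 1 and 0 from the start card -- three cards dirtied in all.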
 719   void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
 720     assert_different_registers(start, end, scratch);
 721     BarrierSet* bs = Universe::heap()->barrier_set();
 722     switch (bs->kind()) {
 723       case BarrierSet::G1SATBCT:
 724       case BarrierSet::G1SATBCTLogging:
 725 
 726         {
 727           __ push(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
 728           // must compute element count unless barrier set interface is changed (other platforms supply count)
 729           assert_different_registers(start, end, scratch);
 730           __ lea(scratch, Address(end, BytesPerHeapOop));
 731           __ sub(scratch, scratch, start);               // subtract start to get #bytes
 732           __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
 733           __ mov(c_rarg0, start);
 734           __ mov(c_rarg1, scratch);
 735           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
          __ pop(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
 737         }
 738         break;
 739       case BarrierSet::CardTableModRef:
 740       case BarrierSet::CardTableExtension:
 741         {
 742           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
 743           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
 744 
 745           Label L_loop;
 746 
 747            __ lsr(start, start, CardTableModRefBS::card_shift);
 748            __ lsr(end, end, CardTableModRefBS::card_shift);
 749            __ sub(end, end, start); // number of bytes to copy
 750 
 751           const Register count = end; // 'end' register contains bytes count now
 752           __ mov(scratch, (address)ct->byte_map_base);
 753           __ add(start, start, scratch);
 754           __ membar(__ StoreStore|__ LoadStore);
 755           __ BIND(L_loop);
 756           __ strb(zr, Address(start, count));
 757           __ subs(count, count, 1);
 758           __ br(Assembler::HS, L_loop);
 759         }
 760         break;
 761       default:
 762         ShouldNotReachHere();
 763 
 764     }
 765   }
 766 
 767   typedef enum {
 768     copy_forwards = 1,
 769     copy_backwards = -1
 770   } copy_direction;
 771 
 772   // Bulk copy of blocks of 8 words.
 773   //
 774   // count is a count of words.
 775   //
 776   // Precondition: count >= 2
 777   //
 778   // Postconditions:
 779   //
 780   // The least significant bit of count contains the remaining count
 781   // of words to copy.  The rest of count is trash.
 782   //
 783   // s and d are adjusted to point to the remaining words to copy
 784   //
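  // Worked example (forward copy, count = 23): the main loop plus the
  // drain move 16 words, leaving count = -1 (its low bits are 23 mod 8,
  // i.e. 0b111); the tail then moves 4 + 2 more words, so 22 words are
  // copied and bit 0 of count tells the caller that one word remains.
  //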
 785   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 786                            copy_direction direction) {
 787     int unit = wordSize * direction;
 788 
 789     int offset;
 790     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 791       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 792 
 793     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 794     assert_different_registers(s, d, count, rscratch1);
 795 
 796     Label again, large, small;
 797     __ align(6);
 798     __ bind(start);
 799     __ cmp(count, 8);
 800     __ br(Assembler::LO, small);
 801     if (direction == copy_forwards) {
 802       __ sub(s, s, 2 * wordSize);
 803       __ sub(d, d, 2 * wordSize);
 804     }
 805     __ subs(count, count, 16);
 806     __ br(Assembler::GE, large);
 807 
 808     // 8 <= count < 16 words.  Copy 8.
 809     __ ldp(t0, t1, Address(s, 2 * unit));
 810     __ ldp(t2, t3, Address(s, 4 * unit));
 811     __ ldp(t4, t5, Address(s, 6 * unit));
 812     __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 813 
 814     __ stp(t0, t1, Address(d, 2 * unit));
 815     __ stp(t2, t3, Address(d, 4 * unit));
 816     __ stp(t4, t5, Address(d, 6 * unit));
 817     __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 818 
 819     if (direction == copy_forwards) {
 820       __ add(s, s, 2 * wordSize);
 821       __ add(d, d, 2 * wordSize);
 822     }
 823 
 824     {
 825       Label L1, L2;
 826       __ bind(small);
 827       __ tbz(count, exact_log2(4), L1);
 828       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 829       __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 830       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 831       __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 832       __ bind(L1);
 833 
 834       __ tbz(count, 1, L2);
 835       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 836       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 837       __ bind(L2);
 838     }
 839 
 840     __ ret(lr);
 841 
 842     __ align(6);
 843     __ bind(large);
 844 
 845     // Fill 8 registers
 846     __ ldp(t0, t1, Address(s, 2 * unit));
 847     __ ldp(t2, t3, Address(s, 4 * unit));
 848     __ ldp(t4, t5, Address(s, 6 * unit));
 849     __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 850 
 851     __ bind(again);
 852 
 853     if (direction == copy_forwards && PrefetchCopyIntervalInBytes > 0)
 854       __ prfm(Address(s, PrefetchCopyIntervalInBytes), PLDL1KEEP);
 855 
 856     __ stp(t0, t1, Address(d, 2 * unit));
 857     __ ldp(t0, t1, Address(s, 2 * unit));
 858     __ stp(t2, t3, Address(d, 4 * unit));
 859     __ ldp(t2, t3, Address(s, 4 * unit));
 860     __ stp(t4, t5, Address(d, 6 * unit));
 861     __ ldp(t4, t5, Address(s, 6 * unit));
 862     __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 863     __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 864 
 865     __ subs(count, count, 8);
 866     __ br(Assembler::HS, again);
 867 
 868     // Drain
 869     __ stp(t0, t1, Address(d, 2 * unit));
 870     __ stp(t2, t3, Address(d, 4 * unit));
 871     __ stp(t4, t5, Address(d, 6 * unit));
 872     __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 873 
 874     if (direction == copy_forwards) {
 875       __ add(s, s, 2 * wordSize);
 876       __ add(d, d, 2 * wordSize);
 877     }
 878 
 879     {
 880       Label L1, L2;
 881       __ tbz(count, exact_log2(4), L1);
 882       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 883       __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 884       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 885       __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 886       __ bind(L1);
 887 
 888       __ tbz(count, 1, L2);
 889       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 890       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 891       __ bind(L2);
 892     }
 893 
 894     __ ret(lr);
 895   }
 896 
 897   // Small copy: less than 16 bytes.
 898   //
 899   // NB: Ignores all of the bits of count which represent more than 15
 900   // bytes, so a caller doesn't have to mask them.
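  //
  // Worked example (jshort copy, step = 2, count = 5 = 0b101): bit 2 is
  // set so one 8-byte word (4 shorts) is moved, bit 1 is clear so the
  // 4-byte step is skipped, and bit 0 is set so a final 2-byte store
  // completes the copy -- 10 bytes moved without any loop.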
 901 
 902   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
 903     bool is_backwards = step < 0;
 904     size_t granularity = uabs(step);
 905     int direction = is_backwards ? -1 : 1;
 906     int unit = wordSize * direction;
 907 
 908     Label Lpair, Lword, Lint, Lshort, Lbyte;
 909 
 910     assert(granularity
 911            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 912 
 913     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
 914 
 915     // ??? I don't know if this bit-test-and-branch is the right thing
 916     // to do.  It does a lot of jumping, resulting in several
 917     // mispredicted branches.  It might make more sense to do this
 918     // with something like Duff's device with a single computed branch.
 919 
 920     __ tbz(count, 3 - exact_log2(granularity), Lword);
 921     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
 922     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
 923     __ bind(Lword);
 924 
 925     if (granularity <= sizeof (jint)) {
 926       __ tbz(count, 2 - exact_log2(granularity), Lint);
 927       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 928       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 929       __ bind(Lint);
 930     }
 931 
 932     if (granularity <= sizeof (jshort)) {
 933       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 934       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 935       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 936       __ bind(Lshort);
 937     }
 938 
 939     if (granularity <= sizeof (jbyte)) {
 940       __ tbz(count, 0, Lbyte);
 941       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 942       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 943       __ bind(Lbyte);
 944     }
 945   }
 946 
 947   Label copy_f, copy_b;
 948 
 949   // All-singing all-dancing memory copy.
 950   //
 951   // Copy count units of memory from s to d.  The size of a unit is
 952   // step, which can be positive or negative depending on the direction
 953   // of copy.  If is_aligned is false, we align the source address.
 954   //
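  // For example, a forward byte copy whose source address ends in ...5
  // needs (-s) & 15 = 11 bytes to reach a 16-byte boundary; those 11
  // elements are peeled off with copy_memory_small below and the rest is
  // handed to the bulk copy_f routine.
  //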
 955 
 956   void copy_memory(bool is_aligned, Register s, Register d,
 957                    Register count, Register tmp, int step) {
 958     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 959     bool is_backwards = step < 0;
 960     int granularity = uabs(step);
 961     const Register t0 = r3, t1 = r4;
 962 
 963     if (is_backwards) {
 964       __ lea(s, Address(s, count, Address::uxtw(exact_log2(-step))));
 965       __ lea(d, Address(d, count, Address::uxtw(exact_log2(-step))));
 966     }
 967 
 968     Label done, tail;
 969 
 970     __ cmp(count, 16/granularity);
 971     __ br(Assembler::LO, tail);
 972 
 973     // Now we've got the small case out of the way we can align the
 974     // source address on a 2-word boundary.
 975 
 976     Label aligned;
 977 
 978     if (is_aligned) {
 979       // We may have to adjust by 1 word to get s 2-word-aligned.
 980       __ tbz(s, exact_log2(wordSize), aligned);
 981       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
 982       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
 983       __ sub(count, count, wordSize/granularity);
 984     } else {
 985       if (is_backwards) {
 986         __ andr(rscratch2, s, 2 * wordSize - 1);
 987       } else {
 988         __ neg(rscratch2, s);
 989         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
 990       }
 991       // rscratch2 is the byte adjustment needed to align s.
 992       __ cbz(rscratch2, aligned);
 993       __ lsr(rscratch2, rscratch2, exact_log2(granularity));
 994       __ sub(count, count, rscratch2);
 995 
 996 #if 0
 997       // ?? This code is only correct for a disjoint copy.  It may or
 998       // may not make sense to use it in that case.
 999 
1000       // Copy the first pair; s and d may not be aligned.
1001       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1002       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1003 
1004       // Align s and d, adjust count
1005       if (is_backwards) {
1006         __ sub(s, s, rscratch2);
1007         __ sub(d, d, rscratch2);
1008       } else {
1009         __ add(s, s, rscratch2);
1010         __ add(d, d, rscratch2);
1011       }
1012 #else
1013       copy_memory_small(s, d, rscratch2, rscratch1, step);
1014 #endif
1015     }
1016 
1017     __ cmp(count, 16/granularity);
1018     __ br(Assembler::LT, tail);
1019     __ bind(aligned);
1020 
1021     // s is now 2-word-aligned.
1022 
1023     // We have a count of units and some trailing bytes.  Adjust the
1024     // count and do a bulk copy of words.
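    // rscratch2 receives the word count: the copy_f/copy_b blobs branched
    // to below are generated elsewhere in this file (not shown in this
    // excerpt) with rscratch2 as their count register, while 'count' keeps
    // the element count for the tail copy.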
1025     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1026     if (direction == copy_forwards)
1027       __ bl(copy_f);
1028     else
1029       __ bl(copy_b);
1030 
1031     // And the tail.
1032 
1033     __ bind(tail);
1034     copy_memory_small(s, d, count, tmp, step);
1035   }
1036 
1037 
1038   void clobber_registers() {
1039 #ifdef ASSERT
1040     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1041     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1042     for (Register r = r3; r <= r18; r++)
1043       if (r != rscratch1) __ mov(r, rscratch1);
1044 #endif
1045   }
1046 
1047   // Scan over array at a for count oops, verifying each one.
1048   // Preserves a and count, clobbers rscratch1 and rscratch2.
1049   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1050     Label loop, end;
1051     __ mov(rscratch1, a);
1052     __ mov(rscratch2, zr);
1053     __ bind(loop);
1054     __ cmp(rscratch2, count);
1055     __ br(Assembler::HS, end);
1056     if (size == (size_t)wordSize) {
1057       __ ldr(temp, Address(a, rscratch2, Address::uxtw(exact_log2(size))));
1058       __ verify_oop(temp);
1059     } else {
      __ ldrw(temp, Address(a, rscratch2, Address::uxtw(exact_log2(size))));
1061       __ decode_heap_oop(temp); // calls verify_oop
1062     }
1063     __ add(rscratch2, rscratch2, size);
1064     __ b(loop);
1065     __ bind(end);
1066   }
1067 
1068   // Arguments:
1069   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1070   //             ignored
1071   //   is_oop  - true => oop array, so generate store check code
1072   //   name    - stub name string
1073   //
1074   // Inputs:
1075   //   c_rarg0   - source array address
1076   //   c_rarg1   - destination array address
1077   //   c_rarg2   - element count, treated as ssize_t, can be zero
1078   //
1079   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1080   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1082   //
1083   // Side Effects:
1084   //   disjoint_int_copy_entry is set to the no-overlap entry point
1085   //   used by generate_conjoint_int_oop_copy().
1086   //
1087   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1088                                   const char *name, bool dest_uninitialized = false) {
1089     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1090     __ align(CodeEntryAlignment);
1091     StubCodeMark mark(this, "StubRoutines", name);
1092     address start = __ pc();
1093     if (entry != NULL) {
1094       *entry = __ pc();
1095       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1096       BLOCK_COMMENT("Entry:");
1097     }
1098     __ enter();
1099     if (is_oop) {
1100       __ push(RegSet::of(d, count), sp);
1101       // no registers are destroyed by this call
1102       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1103     }
1104     copy_memory(aligned, s, d, count, rscratch1, size);
1105     if (is_oop) {
1106       __ pop(RegSet::of(d, count), sp);
1107       if (VerifyOops)
1108         verify_oop_array(size, d, count, r16);
1109       __ sub(count, count, 1); // make an inclusive end pointer
1110       __ lea(count, Address(d, count, Address::uxtw(exact_log2(size))));
1111       gen_write_ref_array_post_barrier(d, count, rscratch1);
1112     }
1113     __ leave();
1114     __ ret(lr);
1115 #ifdef BUILTIN_SIM
1116     {
1117       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1118       sim->notifyCompile(const_cast<char*>(name), start);
1119     }
1120 #endif
1121     return start;
1122   }
1123 
1124   // Arguments:
1125   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1126   //             ignored
1127   //   is_oop  - true => oop array, so generate store check code
1128   //   name    - stub name string
1129   //
1130   // Inputs:
1131   //   c_rarg0   - source array address
1132   //   c_rarg1   - destination array address
1133   //   c_rarg2   - element count, treated as ssize_t, can be zero
1134   //
1135   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1136   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1138   //
1139   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1140                                  address *entry, const char *name,
1141                                  bool dest_uninitialized = false) {
1142     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1143 
1144     StubCodeMark mark(this, "StubRoutines", name);
1145     address start = __ pc();
1146 
1147     __ cmp(d, s);
1148     __ br(Assembler::LS, nooverlap_target);
1149 
1150     __ enter();
1151     if (is_oop) {
1152       __ push(RegSet::of(d, count), sp);
1153       // no registers are destroyed by this call
1154       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1155     }
1156     copy_memory(aligned, s, d, count, rscratch1, -size);
1157     if (is_oop) {
1158       __ pop(RegSet::of(d, count), sp);
1159       if (VerifyOops)
1160         verify_oop_array(size, d, count, r16);
1161       __ sub(count, count, 1); // make an inclusive end pointer
1162       __ lea(count, Address(d, count, Address::uxtw(exact_log2(size))));
1163       gen_write_ref_array_post_barrier(d, count, rscratch1);
1164     }
1165     __ leave();
1166     __ ret(lr);
1167 #ifdef BUILTIN_SIM
1168     {
1169       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1170       sim->notifyCompile(const_cast<char*>(name), start);
1171     }
1172 #endif
1173     return start;
  }
1175 
1176   // Arguments:
1177   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1178   //             ignored
1179   //   name    - stub name string
1180   //
1181   // Inputs:
1182   //   c_rarg0   - source array address
1183   //   c_rarg1   - destination array address
1184   //   c_rarg2   - element count, treated as ssize_t, can be zero
1185   //
1186   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1187   // we let the hardware handle it.  The one to eight bytes within words,
1188   // dwords or qwords that span cache line boundaries will still be loaded
1189   // and stored atomically.
1190   //
1198   // Side Effects:
1199   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1200   //   used by generate_conjoint_byte_copy().
1201   //
1202   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1203     const bool not_oop = false;
1204     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1205   }
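
  // Hedged usage sketch: the byte/short/int/long/oop copy generators in
  // this file are normally wired up later (in code outside this excerpt)
  // roughly as follows, with each disjoint stub publishing the no-overlap
  // entry that its conjoint partner branches to:
  //
  //   address entry;
  //   StubRoutines::_jbyte_disjoint_arraycopy =
  //     generate_disjoint_byte_copy(false, &entry, "jbyte_disjoint_arraycopy");
  //   StubRoutines::_jbyte_arraycopy =
  //     generate_conjoint_byte_copy(false, entry, NULL, "jbyte_arraycopy");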
1206 
1207   // Arguments:
1208   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1209   //             ignored
1210   //   name    - stub name string
1211   //
1212   // Inputs:
1213   //   c_rarg0   - source array address
1214   //   c_rarg1   - destination array address
1215   //   c_rarg2   - element count, treated as ssize_t, can be zero
1216   //
1217   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1218   // we let the hardware handle it.  The one to eight bytes within words,
1219   // dwords or qwords that span cache line boundaries will still be loaded
1220   // and stored atomically.
1221   //
1222   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1223                                       address* entry, const char *name) {
1224     const bool not_oop = false;
1225     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1226   }
1227 
1228   // Arguments:
1229   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1230   //             ignored
1231   //   name    - stub name string
1232   //
1233   // Inputs:
1234   //   c_rarg0   - source array address
1235   //   c_rarg1   - destination array address
1236   //   c_rarg2   - element count, treated as ssize_t, can be zero
1237   //
1238   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1239   // let the hardware handle it.  The two or four words within dwords
1240   // or qwords that span cache line boundaries will still be loaded
1241   // and stored atomically.
1242   //
1243   // Side Effects:
1244   //   disjoint_short_copy_entry is set to the no-overlap entry point
1245   //   used by generate_conjoint_short_copy().
1246   //
1247   address generate_disjoint_short_copy(bool aligned,
1248                                        address* entry, const char *name) {
1249     const bool not_oop = false;
1250     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1251   }
1252 
1253   // Arguments:
1254   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1255   //             ignored
1256   //   name    - stub name string
1257   //
1258   // Inputs:
1259   //   c_rarg0   - source array address
1260   //   c_rarg1   - destination array address
1261   //   c_rarg2   - element count, treated as ssize_t, can be zero
1262   //
1263   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1264   // let the hardware handle it.  The two or four words within dwords
1265   // or qwords that span cache line boundaries will still be loaded
1266   // and stored atomically.
1267   //
1268   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1269                                        address *entry, const char *name) {
1270     const bool not_oop = false;
1271     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
1275   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1276   //             ignored
1277   //   name    - stub name string
1278   //
1279   // Inputs:
1280   //   c_rarg0   - source array address
1281   //   c_rarg1   - destination array address
1282   //   c_rarg2   - element count, treated as ssize_t, can be zero
1283   //
1284   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1285   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1287   //
1288   // Side Effects:
1289   //   disjoint_int_copy_entry is set to the no-overlap entry point
1290   //   used by generate_conjoint_int_oop_copy().
1291   //
1292   address generate_disjoint_int_copy(bool aligned, address *entry,
1293                                          const char *name, bool dest_uninitialized = false) {
1294     const bool not_oop = false;
1295     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1296   }
1297 
1298   // Arguments:
1299   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1300   //             ignored
1301   //   name    - stub name string
1302   //
1303   // Inputs:
1304   //   c_rarg0   - source array address
1305   //   c_rarg1   - destination array address
1306   //   c_rarg2   - element count, treated as ssize_t, can be zero
1307   //
1308   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1309   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1311   //
1312   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1313                                      address *entry, const char *name,
1314                                      bool dest_uninitialized = false) {
1315     const bool not_oop = false;
1316     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1317   }
1318 
1319 
1320   // Arguments:
1321   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1322   //             ignored
1323   //   name    - stub name string
1324   //
1325   // Inputs:
1326   //   c_rarg0   - source array address
1327   //   c_rarg1   - destination array address
1328   //   c_rarg2   - element count, treated as size_t, can be zero
1329   //
1330   // Side Effects:
1331   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1332   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1333   //
1334   address generate_disjoint_long_copy(bool aligned, address *entry,
1335                                           const char *name, bool dest_uninitialized = false) {
1336     const bool not_oop = false;
1337     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1338   }
1339 
1340   // Arguments:
1341   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1342   //             ignored
1343   //   name    - stub name string
1344   //
1345   // Inputs:
1346   //   c_rarg0   - source array address
1347   //   c_rarg1   - destination array address
1348   //   c_rarg2   - element count, treated as size_t, can be zero
1349   //
1350   address generate_conjoint_long_copy(bool aligned,
1351                                       address nooverlap_target, address *entry,
1352                                       const char *name, bool dest_uninitialized = false) {
1353     const bool not_oop = false;
1354     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1355   }
1356 
1357   // Arguments:
1358   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1359   //             ignored
1360   //   name    - stub name string
1361   //
1362   // Inputs:
1363   //   c_rarg0   - source array address
1364   //   c_rarg1   - destination array address
1365   //   c_rarg2   - element count, treated as size_t, can be zero
1366   //
1367   // Side Effects:
1368   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1369   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1370   //
1371   address generate_disjoint_oop_copy(bool aligned, address *entry,
1372                                      const char *name, bool dest_uninitialized = false) {
1373     const bool is_oop = true;
1374     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1375     return generate_disjoint_copy(size, aligned, is_oop, entry, name);
1376   }
1377 
1378   // Arguments:
1379   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1380   //             ignored
1381   //   name    - stub name string
1382   //
1383   // Inputs:
1384   //   c_rarg0   - source array address
1385   //   c_rarg1   - destination array address
1386   //   c_rarg2   - element count, treated as size_t, can be zero
1387   //
1388   address generate_conjoint_oop_copy(bool aligned,
1389                                      address nooverlap_target, address *entry,
1390                                      const char *name, bool dest_uninitialized = false) {
1391     const bool is_oop = true;
1392     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1393     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, name);
1394   }
1395 
1396 
1397   // Helper for generating a dynamic type check.
1398   // Smashes rscratch1.
1399   void generate_type_check(Register sub_klass,
1400                            Register super_check_offset,
1401                            Register super_klass,
1402                            Label& L_success) {
1403     assert_different_registers(sub_klass, super_check_offset, super_klass);
1404 
1405     BLOCK_COMMENT("type_check:");
1406 
1407     Label L_miss;
1408 
1409     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1410                                      super_check_offset);
1411     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1412 
1413     // Fall through on failure!
1414     __ BIND(L_miss);
1415   }
1416 
1417   //
1418   //  Generate checkcasting array copy stub
1419   //
1420   //  Input:
1421   //    c_rarg0   - source array address
1422   //    c_rarg1   - destination array address
1423   //    c_rarg2   - element count, treated as ssize_t, can be zero
1424   //    c_rarg3   - size_t ckoff (super_check_offset)
1425   //    c_rarg4   - oop ckval (super_klass)
1426   //
1427   //  Output:
1428   //    r0 ==  0  -  success
1429   //    r0 == -1^K - failure, where K is partial transfer count
1430   //
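       //  Decoding example (illustrative, not emitted by this stub): on failure
       //  the caller can recover the number of elements actually copied as
       //    K = ~r0;    // since r0 == -1 ^ K == -1 - K
       //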
1431   address generate_checkcast_copy(const char *name, address *entry,
1432                                   bool dest_uninitialized = false) {
1433 
1434     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1435 
1436     // Input registers (after setup_arg_regs)
1437     const Register from        = c_rarg0;   // source array address
1438     const Register to          = c_rarg1;   // destination array address
1439     const Register count       = c_rarg2;   // elements count
1440     const Register ckoff       = c_rarg3;   // super_check_offset
1441     const Register ckval       = c_rarg4;   // super_klass
1442 
1443     // Registers used as temps (r18, r19, r20, r21 are saved on entry)
1444     const Register count_save  = r21;       // orig elements count
1445     const Register start_to    = r20;       // destination array start address
1446     const Register copied_oop  = r18;       // actual oop copied
1447     const Register r19_klass   = r19;       // oop._klass
1448 
1449     //---------------------------------------------------------------
1450     // Assembler stub will be used for this call to arraycopy
1451     // if the two arrays are subtypes of Object[] but the
1452     // destination array type is not equal to or a supertype
1453     // of the source type.  Each element must be separately
1454     // checked.
1455 
1456     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1457                                copied_oop, r19_klass, count_save);
1458 
1459     __ align(CodeEntryAlignment);
1460     StubCodeMark mark(this, "StubRoutines", name);
1461     address start = __ pc();
1462 
1463     __ enter(); // required for proper stackwalking of RuntimeStub frame
1464 
1465 #ifdef ASSERT
1466     // caller guarantees that the arrays really are different
1467     // otherwise, we would have to make conjoint checks
1468     { Label L;
1469       array_overlap_test(L, TIMES_OOP);
1470       __ stop("checkcast_copy within a single array");
1471       __ bind(L);
1472     }
1473 #endif //ASSERT
1474 
1475     // Caller of this entry point must set up the argument registers.
1476     if (entry != NULL) {
1477       *entry = __ pc();
1478       BLOCK_COMMENT("Entry:");
1479     }
1480 
1481     // Empty array:  Nothing to do.
1482     __ cbz(count, L_done);
1483 
1484     __ push(RegSet::of(r18, r19, r20, r21), sp);
1485 
1486 #ifdef ASSERT
1487     BLOCK_COMMENT("assert consistent ckoff/ckval");
1488     // The ckoff and ckval must be mutually consistent,
1489     // even though caller generates both.
1490     { Label L;
1491       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1492       __ ldrw(start_to, Address(ckval, sco_offset));
1493       __ cmpw(ckoff, start_to);
1494       __ br(Assembler::EQ, L);
1495       __ stop("super_check_offset inconsistent");
1496       __ bind(L);
1497     }
1498 #endif //ASSERT
1499 
1500     // save the original count
1501     __ mov(count_save, count);
1502 
1503     // Copy from low to high addresses
1504     __ mov(start_to, to);              // Save destination array start address
1505     __ b(L_load_element);
1506 
1507     // ======== begin loop ========
1508     // (Loop is rotated; its entry is L_load_element.)
1509     // Loop control:
1510     //   for (; count != 0; count--) {
1511     //     copied_oop = load_heap_oop(from++);
1512     //     ... generate_type_check ...;
1513     //     store_heap_oop(to++, copied_oop);
1514     //   }
1515     __ align(OptoLoopAlignment);
1516 
1517     __ BIND(L_store_element);
1518     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1519     __ sub(count, count, 1);
1520     __ cbz(count, L_do_card_marks);
1521 
1522     // ======== loop entry is here ========
1523     __ BIND(L_load_element);
1524     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1525     __ cbz(copied_oop, L_store_element);
1526 
1527     __ load_klass(r19_klass, copied_oop); // query the object klass
1528     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1529     // ======== end loop ========
1530 
1531     // It was a real error; we must depend on the caller to finish the job.
1532     // Register count = remaining oops, count_save = total oops.
1533     // Emit GC store barriers for the oops we have copied and report
1534     // their number to the caller.
1535 
1536     __ subs(count, count_save, count);     // K = partially copied oop count
1537     __ eon(count, count, zr);                   // report (-1^K) to caller
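         // (eon with zr is a bitwise NOT, so count now holds ~K == -1 - K;
         // eon does not set flags, so the EQ branch below still tests the
         // result of the subs above, i.e. whether K == 0.)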
1538     __ br(Assembler::EQ, L_done_pop);
1539 
1540     __ BIND(L_do_card_marks);
1541     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1542     gen_write_ref_array_post_barrier(start_to, to, rscratch1);
1543 
1544     __ bind(L_done_pop);
1545     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1546     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1547 
1548     __ bind(L_done);
1549     __ mov(r0, count);
1550     __ leave();
1551     __ ret(lr);
1552 
1553     return start;
1554   }
1555 
1556   // Perform range checks on the proposed arraycopy.
1557   // Kills temp, but nothing else.
1558   // Also, clean the sign bits of src_pos and dst_pos.
1559   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1560                               Register src_pos, // source position (c_rarg1)
1561                               Register dst,     // destination array oop (c_rarg2)
1562                               Register dst_pos, // destination position (c_rarg3)
1563                               Register length,
1564                               Register temp,
1565                               Label& L_failed) { Unimplemented(); }
1566 
1567   // These stubs get called from some dumb test routine.
1568   // I'll write them properly when they're called from
1569   // something that's actually doing something.
1570   static void fake_arraycopy_stub(address src, address dst, int count) {
1571     assert(count == 0, "huh?");
1572   }
1573 
1574 
1575   void generate_arraycopy_stubs() {
1576     address entry;
1577     address entry_jbyte_arraycopy;
1578     address entry_jshort_arraycopy;
1579     address entry_jint_arraycopy;
1580     address entry_oop_arraycopy;
1581     address entry_jlong_arraycopy;
1582     address entry_checkcast_arraycopy;
1583 
1584     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
1585     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
1586 
1587     //*** jbyte
1588     // Always need aligned and unaligned versions
1589     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
1590                                                                                   "jbyte_disjoint_arraycopy");
1591     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
1592                                                                                   &entry_jbyte_arraycopy,
1593                                                                                   "jbyte_arraycopy");
1594     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
1595                                                                                   "arrayof_jbyte_disjoint_arraycopy");
1596     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
1597                                                                                   "arrayof_jbyte_arraycopy");
1598 
1599     //*** jshort
1600     // Always need aligned and unaligned versions
1601     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
1602                                                                                     "jshort_disjoint_arraycopy");
1603     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
1604                                                                                     &entry_jshort_arraycopy,
1605                                                                                     "jshort_arraycopy");
1606     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
1607                                                                                     "arrayof_jshort_disjoint_arraycopy");
1608     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
1609                                                                                     "arrayof_jshort_arraycopy");
1610 
1611     //*** jint
1612     // Aligned versions
1613     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
1614                                                                                 "arrayof_jint_disjoint_arraycopy");
1615     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
1616                                                                                 "arrayof_jint_arraycopy");
1617     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
1618     // entry_jint_arraycopy always points to the unaligned version
1619     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
1620                                                                                 "jint_disjoint_arraycopy");
1621     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
1622                                                                                 &entry_jint_arraycopy,
1623                                                                                 "jint_arraycopy");
1624 
1625     //*** jlong
1626     // It is always aligned
1627     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
1628                                                                                   "arrayof_jlong_disjoint_arraycopy");
1629     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
1630                                                                                   "arrayof_jlong_arraycopy");
1631     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
1632     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
1633 
1634     //*** oops
1635     {
1636       // With compressed oops we need unaligned versions; notice that
1637       // we overwrite entry_oop_arraycopy.
1638       bool aligned = !UseCompressedOops;
1639 
1640       StubRoutines::_arrayof_oop_disjoint_arraycopy
1641         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy");
1642       StubRoutines::_arrayof_oop_arraycopy
1643         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy");
1644       // Aligned versions without pre-barriers
1645       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
1646         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
1647                                      /*dest_uninitialized*/true);
1648       StubRoutines::_arrayof_oop_arraycopy_uninit
1649         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
1650                                      /*dest_uninitialized*/true);
1651     }
1652 
1653     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
1654     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
1655     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
1656     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
1657 
1658     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
1659     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
1660                                                                         /*dest_uninitialized*/true);
1661   }
1662 
1663   // Arguments:
1664   //
1665   // Inputs:
1666   //   c_rarg0   - source byte array address
1667   //   c_rarg1   - destination byte array address
1668   //   c_rarg2   - K (key) in little endian int array
1669   //
1670   address generate_aescrypt_encryptBlock() {
1671     __ align(CodeEntryAlignment);
1672     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
1673 
1674     Label L_doLast;
1675 
1676     const Register from        = c_rarg0;  // source array address
1677     const Register to          = c_rarg1;  // destination array address
1678     const Register key         = c_rarg2;  // key array address
1679     const Register keylen      = rscratch1;
1680 
1681     address start = __ pc();
1682     __ enter();
1683 
1684     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
1685 
1686     __ ld1(v0, __ T16B, from); // get 16 bytes of input
1687 
1688     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
1689     __ rev32(v1, __ T16B, v1);
1690     __ rev32(v2, __ T16B, v2);
1691     __ rev32(v3, __ T16B, v3);
1692     __ rev32(v4, __ T16B, v4);
1693     __ aese(v0, v1);
1694     __ aesmc(v0, v0);
1695     __ aese(v0, v2);
1696     __ aesmc(v0, v0);
1697     __ aese(v0, v3);
1698     __ aesmc(v0, v0);
1699     __ aese(v0, v4);
1700     __ aesmc(v0, v0);
1701 
1702     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
1703     __ rev32(v1, __ T16B, v1);
1704     __ rev32(v2, __ T16B, v2);
1705     __ rev32(v3, __ T16B, v3);
1706     __ rev32(v4, __ T16B, v4);
1707     __ aese(v0, v1);
1708     __ aesmc(v0, v0);
1709     __ aese(v0, v2);
1710     __ aesmc(v0, v0);
1711     __ aese(v0, v3);
1712     __ aesmc(v0, v0);
1713     __ aese(v0, v4);
1714     __ aesmc(v0, v0);
1715 
1716     __ ld1(v1, v2, __ T16B, __ post(key, 32));
1717     __ rev32(v1, __ T16B, v1);
1718     __ rev32(v2, __ T16B, v2);
1719 
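         // The expanded key is an int[] of 4 * (rounds + 1) words: 44 for
         // AES-128, 52 for AES-192 and 60 for AES-256, so keylen determines
         // how many extra rounds are performed below.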
1720     __ cmpw(keylen, 44);
1721     __ br(Assembler::EQ, L_doLast);
1722 
1723     __ aese(v0, v1);
1724     __ aesmc(v0, v0);
1725     __ aese(v0, v2);
1726     __ aesmc(v0, v0);
1727 
1728     __ ld1(v1, v2, __ T16B, __ post(key, 32));
1729     __ rev32(v1, __ T16B, v1);
1730     __ rev32(v2, __ T16B, v2);
1731 
1732     __ cmpw(keylen, 52);
1733     __ br(Assembler::EQ, L_doLast);
1734 
1735     __ aese(v0, v1);
1736     __ aesmc(v0, v0);
1737     __ aese(v0, v2);
1738     __ aesmc(v0, v0);
1739 
1740     __ ld1(v1, v2, __ T16B, __ post(key, 32));
1741     __ rev32(v1, __ T16B, v1);
1742     __ rev32(v2, __ T16B, v2);
1743 
1744     __ BIND(L_doLast);
1745 
1746     __ aese(v0, v1);
1747     __ aesmc(v0, v0);
1748     __ aese(v0, v2);
1749 
1750     __ ld1(v1, __ T16B, key);
1751     __ rev32(v1, __ T16B, v1);
1752     __ eor(v0, __ T16B, v0, v1);
1753 
1754     __ st1(v0, __ T16B, to);
1755 
1756     __ mov(r0, 0);
1757 
1758     __ leave();
1759     __ ret(lr);
1760 
1761     return start;
1762   }
1763 
1764   // Arguments:
1765   //
1766   // Inputs:
1767   //   c_rarg0   - source byte array address
1768   //   c_rarg1   - destination byte array address
1769   //   c_rarg2   - K (key) in little endian int array
1770   //
1771   address generate_aescrypt_decryptBlock() {
1772     assert(UseAES, "need AES instruction support");
1773     __ align(CodeEntryAlignment);
1774     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
1775     Label L_doLast;
1776 
1777     const Register from        = c_rarg0;  // source array address
1778     const Register to          = c_rarg1;  // destination array address
1779     const Register key         = c_rarg2;  // key array address
1780     const Register keylen      = rscratch1;
1781 
1782     address start = __ pc();
1783     __ enter(); // required for proper stackwalking of RuntimeStub frame
1784 
1785     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
1786 
1787     __ ld1(v0, __ T16B, from); // get 16 bytes of input
1788 
1789     __ ld1(v5, __ T16B, __ post(key, 16));
1790     __ rev32(v5, __ T16B, v5);
1791 
1792     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
1793     __ rev32(v1, __ T16B, v1);
1794     __ rev32(v2, __ T16B, v2);
1795     __ rev32(v3, __ T16B, v3);
1796     __ rev32(v4, __ T16B, v4);
1797     __ aesd(v0, v1);
1798     __ aesimc(v0, v0);
1799     __ aesd(v0, v2);
1800     __ aesimc(v0, v0);
1801     __ aesd(v0, v3);
1802     __ aesimc(v0, v0);
1803     __ aesd(v0, v4);
1804     __ aesimc(v0, v0);
1805 
1806     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
1807     __ rev32(v1, __ T16B, v1);
1808     __ rev32(v2, __ T16B, v2);
1809     __ rev32(v3, __ T16B, v3);
1810     __ rev32(v4, __ T16B, v4);
1811     __ aesd(v0, v1);
1812     __ aesimc(v0, v0);
1813     __ aesd(v0, v2);
1814     __ aesimc(v0, v0);
1815     __ aesd(v0, v3);
1816     __ aesimc(v0, v0);
1817     __ aesd(v0, v4);
1818     __ aesimc(v0, v0);
1819 
1820     __ ld1(v1, v2, __ T16B, __ post(key, 32));
1821     __ rev32(v1, __ T16B, v1);
1822     __ rev32(v2, __ T16B, v2);
1823 
1824     __ cmpw(keylen, 44);
1825     __ br(Assembler::EQ, L_doLast);
1826 
1827     __ aesd(v0, v1);
1828     __ aesimc(v0, v0);
1829     __ aesd(v0, v2);
1830     __ aesimc(v0, v0);
1831 
1832     __ ld1(v1, v2, __ T16B, __ post(key, 32));
1833     __ rev32(v1, __ T16B, v1);
1834     __ rev32(v2, __ T16B, v2);
1835 
1836     __ cmpw(keylen, 52);
1837     __ br(Assembler::EQ, L_doLast);
1838 
1839     __ aesd(v0, v1);
1840     __ aesimc(v0, v0);
1841     __ aesd(v0, v2);
1842     __ aesimc(v0, v0);
1843 
1844     __ ld1(v1, v2, __ T16B, __ post(key, 32));
1845     __ rev32(v1, __ T16B, v1);
1846     __ rev32(v2, __ T16B, v2);
1847 
1848     __ BIND(L_doLast);
1849 
1850     __ aesd(v0, v1);
1851     __ aesimc(v0, v0);
1852     __ aesd(v0, v2);
1853 
1854     __ eor(v0, __ T16B, v0, v5);
1855 
1856     __ st1(v0, __ T16B, to);
1857 
1858     __ mov(r0, 0);
1859 
1860     __ leave();
1861     __ ret(lr);
1862 
1863     return start;
1864   }
1865 
1866   // Arguments:
1867   //
1868   // Inputs:
1869   //   c_rarg0   - source byte array address
1870   //   c_rarg1   - destination byte array address
1871   //   c_rarg2   - K (key) in little endian int array
1872   //   c_rarg3   - r vector byte array address
1873   //   c_rarg4   - input length
1874   //
1875   // Output:
1876   //   r0        - input length
1877   //
1878   address generate_cipherBlockChaining_encryptAESCrypt() {
1879     assert(UseAES, "need AES instruction support");
1880     __ align(CodeEntryAlignment);
1881     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
1882 
1883     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
1884 
1885     const Register from        = c_rarg0;  // source array address
1886     const Register to          = c_rarg1;  // destination array address
1887     const Register key         = c_rarg2;  // key array address
1888     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
1889                                            // and left with the results of the last encryption block
1890     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
1891     const Register keylen      = rscratch1;
1892 
1893     address start = __ pc();
1894       __ enter();
1895 
1896       __ mov(rscratch2, len_reg);
1897       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
1898 
1899       __ ld1(v0, __ T16B, rvec);
1900 
1901       __ cmpw(keylen, 52);
1902       __ br(Assembler::CC, L_loadkeys_44);
1903       __ br(Assembler::EQ, L_loadkeys_52);
1904 
1905       __ ld1(v17, v18, __ T16B, __ post(key, 32));
1906       __ rev32(v17, __ T16B, v17);
1907       __ rev32(v18, __ T16B, v18);
1908     __ BIND(L_loadkeys_52);
1909       __ ld1(v19, v20, __ T16B, __ post(key, 32));
1910       __ rev32(v19, __ T16B, v19);
1911       __ rev32(v20, __ T16B, v20);
1912     __ BIND(L_loadkeys_44);
1913       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
1914       __ rev32(v21, __ T16B, v21);
1915       __ rev32(v22, __ T16B, v22);
1916       __ rev32(v23, __ T16B, v23);
1917       __ rev32(v24, __ T16B, v24);
1918       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
1919       __ rev32(v25, __ T16B, v25);
1920       __ rev32(v26, __ T16B, v26);
1921       __ rev32(v27, __ T16B, v27);
1922       __ rev32(v28, __ T16B, v28);
1923       __ ld1(v29, v30, v31, __ T16B, key);
1924       __ rev32(v29, __ T16B, v29);
1925       __ rev32(v30, __ T16B, v30);
1926       __ rev32(v31, __ T16B, v31);
1927 
1928     __ BIND(L_aes_loop);
1929       __ ld1(v1, __ T16B, __ post(from, 16));
1930       __ eor(v0, __ T16B, v0, v1);
1931 
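           // NB: the flags here are still those set by the cmpw(keylen, 52)
           // before the loop; nothing in the loop body modifies them (the sub
           // below does not set flags), so each iteration re-selects the
           // round count for the key length.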
1932       __ br(Assembler::CC, L_rounds_44);
1933       __ br(Assembler::EQ, L_rounds_52);
1934 
1935       __ aese(v0, v17); __ aesmc(v0, v0);
1936       __ aese(v0, v18); __ aesmc(v0, v0);
1937     __ BIND(L_rounds_52);
1938       __ aese(v0, v19); __ aesmc(v0, v0);
1939       __ aese(v0, v20); __ aesmc(v0, v0);
1940     __ BIND(L_rounds_44);
1941       __ aese(v0, v21); __ aesmc(v0, v0);
1942       __ aese(v0, v22); __ aesmc(v0, v0);
1943       __ aese(v0, v23); __ aesmc(v0, v0);
1944       __ aese(v0, v24); __ aesmc(v0, v0);
1945       __ aese(v0, v25); __ aesmc(v0, v0);
1946       __ aese(v0, v26); __ aesmc(v0, v0);
1947       __ aese(v0, v27); __ aesmc(v0, v0);
1948       __ aese(v0, v28); __ aesmc(v0, v0);
1949       __ aese(v0, v29); __ aesmc(v0, v0);
1950       __ aese(v0, v30);
1951       __ eor(v0, __ T16B, v0, v31);
1952 
1953       __ st1(v0, __ T16B, __ post(to, 16));
1954       __ sub(len_reg, len_reg, 16);
1955       __ cbnz(len_reg, L_aes_loop);
1956 
1957       __ st1(v0, __ T16B, rvec);
1958 
1959       __ mov(r0, rscratch2);
1960 
1961       __ leave();
1962       __ ret(lr);
1963 
1964       return start;
1965   }
1966 
1967   // Arguments:
1968   //
1969   // Inputs:
1970   //   c_rarg0   - source byte array address
1971   //   c_rarg1   - destination byte array address
1972   //   c_rarg2   - K (key) in little endian int array
1973   //   c_rarg3   - r vector byte array address
1974   //   c_rarg4   - input length
1975   //
1976   // Output:
1977   //   r0        - input length
1978   //
1979   address generate_cipherBlockChaining_decryptAESCrypt() {
1980     assert(UseAES, "need AES instruction support");
1981     __ align(CodeEntryAlignment);
1982     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
1983 
1984     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
1985 
1986     const Register from        = c_rarg0;  // source array address
1987     const Register to          = c_rarg1;  // destination array address
1988     const Register key         = c_rarg2;  // key array address
1989     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
1990                                            // and left with the results of the last encryption block
1991     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
1992     const Register keylen      = rscratch1;
1993 
1994     address start = __ pc();
1995       __ enter();
1996 
1997       __ mov(rscratch2, len_reg);
1998       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
1999 
2000       __ ld1(v2, __ T16B, rvec);
2001 
2002       __ ld1(v31, __ T16B, __ post(key, 16));
2003       __ rev32(v31, __ T16B, v31);
2004 
2005       __ cmpw(keylen, 52);
2006       __ br(Assembler::CC, L_loadkeys_44);
2007       __ br(Assembler::EQ, L_loadkeys_52);
2008 
2009       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2010       __ rev32(v17, __ T16B, v17);
2011       __ rev32(v18, __ T16B, v18);
2012     __ BIND(L_loadkeys_52);
2013       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2014       __ rev32(v19, __ T16B, v19);
2015       __ rev32(v20, __ T16B, v20);
2016     __ BIND(L_loadkeys_44);
2017       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2018       __ rev32(v21, __ T16B, v21);
2019       __ rev32(v22, __ T16B, v22);
2020       __ rev32(v23, __ T16B, v23);
2021       __ rev32(v24, __ T16B, v24);
2022       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2023       __ rev32(v25, __ T16B, v25);
2024       __ rev32(v26, __ T16B, v26);
2025       __ rev32(v27, __ T16B, v27);
2026       __ rev32(v28, __ T16B, v28);
2027       __ ld1(v29, v30, __ T16B, key);
2028       __ rev32(v29, __ T16B, v29);
2029       __ rev32(v30, __ T16B, v30);
2030 
2031     __ BIND(L_aes_loop);
2032       __ ld1(v0, __ T16B, __ post(from, 16));
2033       __ orr(v1, __ T16B, v0, v0);
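           // keep a copy of the ciphertext block in v1: CBC decryption needs
           // it as the chaining value for the next block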
2034 
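           // as in the encrypt stub, the flags still reflect the
           // cmpw(keylen, 52) above; the loop body does not modify them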
2035       __ br(Assembler::CC, L_rounds_44);
2036       __ br(Assembler::EQ, L_rounds_52);
2037 
2038       __ aesd(v0, v17); __ aesimc(v0, v0);
2039       __ aesd(v0, v18); __ aesimc(v0, v0);
2040     __ BIND(L_rounds_52);
2041       __ aesd(v0, v19); __ aesimc(v0, v0);
2042       __ aesd(v0, v20); __ aesimc(v0, v0);
2043     __ BIND(L_rounds_44);
2044       __ aesd(v0, v21); __ aesimc(v0, v0);
2045       __ aesd(v0, v22); __ aesimc(v0, v0);
2046       __ aesd(v0, v23); __ aesimc(v0, v0);
2047       __ aesd(v0, v24); __ aesimc(v0, v0);
2048       __ aesd(v0, v25); __ aesimc(v0, v0);
2049       __ aesd(v0, v26); __ aesimc(v0, v0);
2050       __ aesd(v0, v27); __ aesimc(v0, v0);
2051       __ aesd(v0, v28); __ aesimc(v0, v0);
2052       __ aesd(v0, v29); __ aesimc(v0, v0);
2053       __ aesd(v0, v30);
2054       __ eor(v0, __ T16B, v0, v31);
2055       __ eor(v0, __ T16B, v0, v2);
2056 
2057       __ st1(v0, __ T16B, __ post(to, 16));
2058       __ orr(v2, __ T16B, v1, v1);
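           // v2 now holds the saved ciphertext block, i.e. the chaining value
           // for the next iteration (and the value written back to rvec)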
2059 
2060       __ sub(len_reg, len_reg, 16);
2061       __ cbnz(len_reg, L_aes_loop);
2062 
2063       __ st1(v2, __ T16B, rvec);
2064 
2065       __ mov(r0, rscratch2);
2066 
2067       __ leave();
2068       __ ret(lr);
2069 
2070     return start;
2071   }
2072 
2073   // Arguments:
2074   //
2075   // Inputs:
2076   //   c_rarg0   - byte[]  source+offset
2077   //   c_rarg1   - int[]   SHA.state
2078   //   c_rarg2   - int     offset
2079   //   c_rarg3   - int     limit
2080   //
2081   address generate_sha1_implCompress(bool multi_block, const char *name) {
2082     __ align(CodeEntryAlignment);
2083     StubCodeMark mark(this, "StubRoutines", name);
2084     address start = __ pc();
2085 
2086     Register buf   = c_rarg0;
2087     Register state = c_rarg1;
2088     Register ofs   = c_rarg2;
2089     Register limit = c_rarg3;
2090 
2091     Label keys;
2092     Label sha1_loop;
2093 
2094     // load the keys into v0..v3
2095     __ adr(rscratch1, keys);
2096     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2097     // load 5 words state into v6, v7
2098     __ ldrq(v6, Address(state, 0));
2099     __ ldrs(v7, Address(state, 16));
2100 
2101 
2102     __ BIND(sha1_loop);
2103     // load 64 bytes of data into v16..v19
2104     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2105     __ rev32(v16, __ T16B, v16);
2106     __ rev32(v17, __ T16B, v17);
2107     __ rev32(v18, __ T16B, v18);
2108     __ rev32(v19, __ T16B, v19);
2109 
2110     // do the sha1
2111     __ addv(v4, __ T4S, v16, v0);
2112     __ orr(v20, __ T16B, v6, v6);
2113 
2114     FloatRegister d0 = v16;
2115     FloatRegister d1 = v17;
2116     FloatRegister d2 = v18;
2117     FloatRegister d3 = v19;
2118 
2119     for (int round = 0; round < 20; round++) {
2120       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2121       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2122       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2123       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2124       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2125 
2126       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2127       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2128       __ sha1h(tmp2, __ T4S, v20);
2129       if (round < 5)
2130         __ sha1c(v20, __ T4S, tmp3, tmp4);
2131       else if (round < 10 || round >= 15)
2132         __ sha1p(v20, __ T4S, tmp3, tmp4);
2133       else
2134         __ sha1m(v20, __ T4S, tmp3, tmp4);
2135       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2136 
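           // rotate the schedule registers: d0 <- d1, d1 <- d2, d2 <- d3, d3 <- old d0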
2137       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2138     }
2139 
2140     __ addv(v7, __ T2S, v7, v21);
2141     __ addv(v6, __ T4S, v6, v20);
2142 
2143     if (multi_block) {
2144       __ add(ofs, ofs, 64);
2145       __ cmp(ofs, limit);
2146       __ br(Assembler::LE, sha1_loop);
2147       __ mov(c_rarg0, ofs); // return ofs
2148     }
2149 
2150     __ strq(v6, Address(state, 0));
2151     __ strs(v7, Address(state, 16));
2152 
2153     __ ret(lr);
2154 
2155     __ bind(keys);
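         // SHA-1 round constants K0..K3, one for each group of 20 rounds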
2156     __ emit_int32(0x5a827999);
2157     __ emit_int32(0x6ed9eba1);
2158     __ emit_int32(0x8f1bbcdc);
2159     __ emit_int32(0xca62c1d6);
2160 
2161     return start;
2162   }
2163 
2164 
2165   // Arguments:
2166   //
2167   // Inputs:
2168   //   c_rarg0   - byte[]  source+offset
2169   //   c_rarg1   - int[]   SHA.state
2170   //   c_rarg2   - int     offset
2171   //   c_rarg3   - int     limit
2172   //
2173   address generate_sha256_implCompress(bool multi_block, const char *name) {
2174     static const uint32_t round_consts[64] = {
2175       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
2176       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
2177       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
2178       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
2179       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
2180       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
2181       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
2182       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
2183       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
2184       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
2185       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
2186       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
2187       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
2188       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
2189       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
2190       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
2191     };
2192     __ align(CodeEntryAlignment);
2193     StubCodeMark mark(this, "StubRoutines", name);
2194     address start = __ pc();
2195 
2196     Register buf   = c_rarg0;
2197     Register state = c_rarg1;
2198     Register ofs   = c_rarg2;
2199     Register limit = c_rarg3;
2200 
2201     Label sha256_loop;
2202 
2203     __ stpd(v8, v9, __ pre(sp, -32));
2204     __ stpd(v10, v11, Address(sp, 16));
2205 
2206 // dga == v0
2207 // dgb == v1
2208 // dg0 == v2
2209 // dg1 == v3
2210 // dg2 == v4
2211 // t0 == v6
2212 // t1 == v7
2213 
2214     // load 16 keys to v16..v31
2215     __ lea(rscratch1, ExternalAddress((address)round_consts));
2216     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
2217     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
2218     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
2219     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
2220 
2221     // load 8 words (256 bits) state
2222     __ ldpq(v0, v1, state);
2223 
2224     __ BIND(sha256_loop);
2225     // load 64 bytes of data into v8..v11
2226     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
2227     __ rev32(v8, __ T16B, v8);
2228     __ rev32(v9, __ T16B, v9);
2229     __ rev32(v10, __ T16B, v10);
2230     __ rev32(v11, __ T16B, v11);
2231 
2232     __ addv(v6, __ T4S, v8, v16);
2233     __ orr(v2, __ T16B, v0, v0);
2234     __ orr(v3, __ T16B, v1, v1);
2235 
2236     FloatRegister d0 = v8;
2237     FloatRegister d1 = v9;
2238     FloatRegister d2 = v10;
2239     FloatRegister d3 = v11;
2240 
2241 
2242     for (int round = 0; round < 16; round++) {
2243       FloatRegister tmp1 = (round & 1) ? v6 : v7;
2244       FloatRegister tmp2 = (round & 1) ? v7 : v6;
2245       FloatRegister tmp3 = (round & 1) ? v2 : v4;
2246       FloatRegister tmp4 = (round & 1) ? v4 : v2;
2247 
2248       if (round < 12) __ sha256su0(d0, __ T4S, d1);
2249       __ orr(v4, __ T16B, v2, v2);
2250       if (round < 15)
2251         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
2252       __ sha256h(v2, __ T4S, v3, tmp2);
2253       __ sha256h2(v3, __ T4S, v4, tmp2);
2254       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
2255 
2256       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2257     }
2258 
2259     __ addv(v0, __ T4S, v0, v2);
2260     __ addv(v1, __ T4S, v1, v3);
2261 
2262     if (multi_block) {
2263       __ add(ofs, ofs, 64);
2264       __ cmp(ofs, limit);
2265       __ br(Assembler::LE, sha256_loop);
2266       __ mov(c_rarg0, ofs); // return ofs
2267     }
2268 
2269     __ ldpd(v10, v11, Address(sp, 16));
2270     __ ldpd(v8, v9, __ post(sp, 32));
2271 
2272     __ stpq(v0, v1, state);
2273 
2274     __ ret(lr);
2275 
2276     return start;
2277   }
2278 
2279 #ifndef BUILTIN_SIM
2280   // Safefetch stubs.
2281   void generate_safefetch(const char* name, int size, address* entry,
2282                           address* fault_pc, address* continuation_pc) {
2283     // safefetch signatures:
2284     //   int      SafeFetch32(int*      adr, int      errValue);
2285     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
2286     //
2287     // arguments:
2288     //   c_rarg0 = adr
2289     //   c_rarg1 = errValue
2290     //
2291     // result:
2292     //   r0 = *adr or errValue
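         //
         // Usage sketch (illustrative only): probing possibly-unmapped memory
         // without risking a crash:
         //   int v = SafeFetch32((int*) maybe_bad_ptr, -1);  // -1 if the load faults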
2293 
2294     StubCodeMark mark(this, "StubRoutines", name);
2295 
2296     // Entry point, pc or function descriptor.
2297     *entry = __ pc();
2298 
2299     // Load *adr into c_rarg1, may fault.
2300     *fault_pc = __ pc();
2301     switch (size) {
2302       case 4:
2303         // int32_t
2304         __ ldrw(c_rarg1, Address(c_rarg0, 0));
2305         break;
2306       case 8:
2307         // int64_t
2308         __ ldr(c_rarg1, Address(c_rarg0, 0));
2309         break;
2310       default:
2311         ShouldNotReachHere();
2312     }
2313 
2314     // return errValue or *adr
2315     *continuation_pc = __ pc();
2316     __ mov(r0, c_rarg1);
2317     __ ret(lr);
2318   }
2319 #endif
2320 
2321   /**
2322    *  Arguments:
2323    *
2324    * Inputs:
2325    *   c_rarg0   - int crc
2326    *   c_rarg1   - byte* buf
2327    *   c_rarg2   - int length
2328    *
2329    * Output:
2330    *       r0   - int crc result
2331    *
2332    * Preserves:
2333    *       r13
2334    *
2335    */
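       // For reference (this is not the code generated below, which processes
       // the buffer in larger chunks), a byte-at-a-time CRC-32 update is:
       //   crc = ~crc;
       //   for (int i = 0; i < len; i++)
       //     crc = table[(crc ^ buf[i]) & 0xff] ^ (crc >> 8);   // crc unsigned
       //   return ~crc;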
2336   address generate_updateBytesCRC32() {
2337     assert(UseCRC32Intrinsics, "what are we doing here?");
2338 
2339     __ align(CodeEntryAlignment);
2340     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
2341 
2342     address start = __ pc();
2343 
2344     const Register crc   = c_rarg0;  // crc
2345     const Register buf   = c_rarg1;  // source java byte array address
2346     const Register len   = c_rarg2;  // length
2347     const Register table0 = c_rarg3; // crc_table address
2348     const Register table1 = c_rarg4;
2349     const Register table2 = c_rarg5;
2350     const Register table3 = c_rarg6;
2351     const Register tmp3 = c_rarg7;
2352 
2353     BLOCK_COMMENT("Entry:");
2354     __ enter(); // required for proper stackwalking of RuntimeStub frame
2355 
2356     __ kernel_crc32(crc, buf, len,
2357               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
2358 
2359     __ leave(); // required for proper stackwalking of RuntimeStub frame
2360     __ ret(lr);
2361 
2362     return start;
2363   }
2364 
2365   /**
2366    *  Arguments:
2367    *
2368    *  Input:
2369    *    c_rarg0   - x address
2370    *    c_rarg1   - x length
2371    *    c_rarg2   - y address
2372    *    c_rarg3   - y length
2373    *    c_rarg4   - z address
2374    *    c_rarg5   - z length
2375    */
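       // This stub backs the BigInteger.multiplyToLen intrinsic: x and y are
       // int[] magnitudes (most-significant word first) and z receives their
       // xlen + ylen word product.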
2376   address generate_multiplyToLen() {
2377     __ align(CodeEntryAlignment);
2378     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
2379 
2380     address start = __ pc();
2381     const Register x     = r0;
2382     const Register xlen  = r1;
2383     const Register y     = r2;
2384     const Register ylen  = r3;
2385     const Register z     = r4;
2386     const Register zlen  = r5;
2387 
2388     const Register tmp1  = r10;
2389     const Register tmp2  = r11;
2390     const Register tmp3  = r12;
2391     const Register tmp4  = r13;
2392     const Register tmp5  = r14;
2393     const Register tmp6  = r15;
2394     const Register tmp7  = r16;
2395 
2396     BLOCK_COMMENT("Entry:");
2397     __ enter(); // required for proper stackwalking of RuntimeStub frame
2398     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
2399     __ leave(); // required for proper stackwalking of RuntimeStub frame
2400     __ ret(lr);
2401 
2402     return start;
2403   }
2404 
2405   // Continuation point for throwing of implicit exceptions that are
2406   // not handled in the current activation. Fabricates an exception
2407   // oop and initiates normal exception dispatching in this
2408   // frame. Since we need to preserve callee-saved values (currently
2409   // only for C2, but done for C1 as well) we need a callee-saved oop
2410   // map and therefore have to make these stubs into RuntimeStubs
2411   // rather than BufferBlobs.  If the compiler needs all registers to
2412   // be preserved between the fault point and the exception handler
2413   // then it must assume responsibility for that in
2414   // AbstractCompiler::continuation_for_implicit_null_exception or
2415   // continuation_for_implicit_division_by_zero_exception. All other
2416   // implicit exceptions (e.g., NullPointerException or
2417   // AbstractMethodError on entry) are either at call sites or
2418   // otherwise assume that stack unwinding will be initiated, so
2419   // caller saved registers were assumed volatile in the compiler.
2420 
2421 #undef __
2422 #define __ masm->
2423 
2424   address generate_throw_exception(const char* name,
2425                                    address runtime_entry,
2426                                    Register arg1 = noreg,
2427                                    Register arg2 = noreg) {
2428     // Information about frame layout at time of blocking runtime call.
2429     // Note that we only have to preserve callee-saved registers since
2430     // the compilers are responsible for supplying a continuation point
2431     // if they expect all registers to be preserved.
2432     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
2433     enum layout {
2434       rfp_off = 0,
2435       rfp_off2,
2436       return_off,
2437       return_off2,
2438       framesize // inclusive of return address
2439     };
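         // Each slot above is a 4-byte VMRegImpl slot, so the saved rfp and lr
         // take two slots each; framesize is converted to words when the
         // RuntimeStub is created below.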
2440 
2441     int insts_size = 512;
2442     int locs_size  = 64;
2443 
2444     CodeBuffer code(name, insts_size, locs_size);
2445     OopMapSet* oop_maps  = new OopMapSet();
2446     MacroAssembler* masm = new MacroAssembler(&code);
2447 
2448     address start = __ pc();
2449 
2450     // This is an inlined and slightly modified version of call_VM
2451     // which has the ability to fetch the return PC out of
2452     // thread-local storage and also sets up last_Java_sp slightly
2453     // differently than the real call_VM
2454 
2455     __ enter(); // Save FP and LR before call
2456 
2457     assert(is_even(framesize/2), "sp not 16-byte aligned");
2458 
2459     // lr and fp are already in place
2460     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
2461 
2462     int frame_complete = __ pc() - start;
2463 
2464     // Set up last_Java_sp and last_Java_fp
2465     address the_pc = __ pc();
2466     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
2467 
2468     // Call runtime
2469     if (arg1 != noreg) {
2470       assert(arg2 != c_rarg1, "clobbered");
2471       __ mov(c_rarg1, arg1);
2472     }
2473     if (arg2 != noreg) {
2474       __ mov(c_rarg2, arg2);
2475     }
2476     __ mov(c_rarg0, rthread);
2477     BLOCK_COMMENT("call runtime_entry");
2478     __ mov(rscratch1, runtime_entry);
2479     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
2480 
2481     // Generate oop map
2482     OopMap* map = new OopMap(framesize, 0);
2483 
2484     oop_maps->add_gc_map(the_pc - start, map);
2485 
2486     __ reset_last_Java_frame(true, true);
2487     __ maybe_isb();
2488 
2489     __ leave();
2490 
2491     // check for pending exceptions
2492 #ifdef ASSERT
2493     Label L;
2494     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
2495     __ cbnz(rscratch1, L);
2496     __ should_not_reach_here();
2497     __ bind(L);
2498 #endif // ASSERT
2499     __ b(RuntimeAddress(StubRoutines::forward_exception_entry()));
2500 
2501 
2502     // codeBlob framesize is in words (not VMRegImpl::slot_size)
2503     RuntimeStub* stub =
2504       RuntimeStub::new_runtime_stub(name,
2505                                     &code,
2506                                     frame_complete,
2507                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
2508                                     oop_maps, false);
2509     return stub->entry_point();
2510   }
2511 
2512   // Initialization
2513   void generate_initial() {
2514     // Generate initial stubs and initialize the entry points
2515 
2516     // entry points that exist on all platforms.  Note: This is code
2517     // that could be shared among different platforms - however the
2518     // benefit seems to be smaller than the disadvantage of having a
2519     // much more complicated generator structure. See also comment in
2520     // stubRoutines.hpp.
2521 
2522     StubRoutines::_forward_exception_entry = generate_forward_exception();
2523 
2524     StubRoutines::_call_stub_entry =
2525       generate_call_stub(StubRoutines::_call_stub_return_address);
2526 
2527     // is referenced by megamorphic call
2528     StubRoutines::_catch_exception_entry = generate_catch_exception();
2529 
2530     // Build this early so it's available for the interpreter.
2531     StubRoutines::_throw_StackOverflowError_entry =
2532       generate_throw_exception("StackOverflowError throw_exception",
2533                                CAST_FROM_FN_PTR(address,
2534                                                 SharedRuntime::
2535                                                 throw_StackOverflowError));
2536     if (UseCRC32Intrinsics) {
2537       // set table address before stub generation which uses it
2538       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
2539       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
2540     }
2541   }
2542 
2543   void generate_all() {
2544     // support for verify_oop (must happen after universe_init)
2545     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
2546     StubRoutines::_throw_AbstractMethodError_entry =
2547       generate_throw_exception("AbstractMethodError throw_exception",
2548                                CAST_FROM_FN_PTR(address,
2549                                                 SharedRuntime::
2550                                                 throw_AbstractMethodError));
2551 
2552     StubRoutines::_throw_IncompatibleClassChangeError_entry =
2553       generate_throw_exception("IncompatibleClassChangeError throw_exception",
2554                                CAST_FROM_FN_PTR(address,
2555                                                 SharedRuntime::
2556                                                 throw_IncompatibleClassChangeError));
2557 
2558     StubRoutines::_throw_NullPointerException_at_call_entry =
2559       generate_throw_exception("NullPointerException at call throw_exception",
2560                                CAST_FROM_FN_PTR(address,
2561                                                 SharedRuntime::
2562                                                 throw_NullPointerException_at_call));
2563 
2564     // arraycopy stubs used by compilers
2565     generate_arraycopy_stubs();
2566 
2567     if (UseMultiplyToLenIntrinsic) {
2568       StubRoutines::_multiplyToLen = generate_multiplyToLen();
2569     }
2570 
2571 #ifndef BUILTIN_SIM
2572     if (UseAESIntrinsics) {
2573       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
2574       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
2575       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
2576       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
2577     }
2578 
2579     if (UseSHA1Intrinsics) {
2580       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
2581       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
2582     }
2583     if (UseSHA256Intrinsics) {
2584       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
2585       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
2586     }
2587 
2588     // Safefetch stubs.
2589     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
2590                                                        &StubRoutines::_safefetch32_fault_pc,
2591                                                        &StubRoutines::_safefetch32_continuation_pc);
2592     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
2593                                                        &StubRoutines::_safefetchN_fault_pc,
2594                                                        &StubRoutines::_safefetchN_continuation_pc);
2595 #endif
2596   }
2597 
2598  public:
2599   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
2600     if (all) {
2601       generate_all();
2602     } else {
2603       generate_initial();
2604     }
2605   }
2606 }; // end class declaration
2607 
2608 void StubGenerator_generate(CodeBuffer* code, bool all) {
2609   StubGenerator g(code, all);
2610 }