1 /*
   2  * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shenandoah/brooksPointer.hpp"
  30 #include "gc/shenandoah/shenandoahBarrierSet.hpp"
  31 #include "gc/shenandoah/shenandoahHeap.hpp"
  32 #include "gc/shenandoah/shenandoahHeapRegion.hpp"
  33 #include "gc/shared/barrierSet.hpp"
  34 #include "gc/shared/barrierSetAssembler.hpp"
  35 #include "interpreter/interpreter.hpp"
  36 #include "nativeInst_aarch64.hpp"
  37 #include "oops/instanceOop.hpp"
  38 #include "oops/method.hpp"
  39 #include "oops/objArrayKlass.hpp"
  40 #include "oops/oop.inline.hpp"
  41 #include "prims/methodHandles.hpp"
  42 #include "runtime/frame.inline.hpp"
  43 #include "runtime/handles.inline.hpp"
  44 #include "runtime/sharedRuntime.hpp"
  45 #include "runtime/stubCodeGenerator.hpp"
  46 #include "runtime/stubRoutines.hpp"
  47 #include "runtime/thread.inline.hpp"
  48 #include "utilities/align.hpp"
  49 #ifdef COMPILER2
  50 #include "opto/runtime.hpp"
  51 #endif
  52 
  53 #ifdef BUILTIN_SIM
  54 #include "../../../../../../simulator/simulator.hpp"
  55 #endif
  56 
  57 // Declaration and definition of StubGenerator (no .hpp file).
  58 // For a more detailed description of the stub routine structure
  59 // see the comment in stubRoutines.hpp
  60 
  61 #undef __
  62 #define __ _masm->
  63 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
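      // (scales a signed 32-bit oop-array index by 4 bytes when compressed oops are in use, 8 otherwise)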
  64 
  65 #ifdef PRODUCT
  66 #define BLOCK_COMMENT(str) /* nothing */
  67 #else
  68 #define BLOCK_COMMENT(str) __ block_comment(str)
  69 #endif
  70 
  71 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  72 
  73 // Stub Code definitions
  74 
  75 class StubGenerator: public StubCodeGenerator {
  76  private:
  77 
  78 #ifdef PRODUCT
  79 #define inc_counter_np(counter) ((void)0)
  80 #else
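        // bump a 32-bit statistics counter in memory; not atomic, and only present in non-product builds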
  81   void inc_counter_np_(int& counter) {
  82     __ lea(rscratch2, ExternalAddress((address)&counter));
  83     __ ldrw(rscratch1, Address(rscratch2));
  84     __ addw(rscratch1, rscratch1, 1);
  85     __ strw(rscratch1, Address(rscratch2));
  86   }
  87 #define inc_counter_np(counter) \
  88   BLOCK_COMMENT("inc_counter " #counter); \
  89   inc_counter_np_(counter);
  90 #endif
  91 
  92   // Call stubs are used to call Java from C
  93   //
  94   // Arguments:
  95   //    c_rarg0:   call wrapper address                   address
  96   //    c_rarg1:   result                                 address
  97   //    c_rarg2:   result type                            BasicType
  98   //    c_rarg3:   method                                 Method*
  99   //    c_rarg4:   (interpreter) entry point              address
 100   //    c_rarg5:   parameters                             intptr_t*
 101   //    c_rarg6:   parameter size (in words)              int
 102   //    c_rarg7:   thread                                 Thread*
 103   //
 104   // There is no return from the stub itself as any Java result
 105   // is written to result
 106   //
 107   // we save r30 (lr) as the return PC at the base of the frame and
  108  * link r29 (fp) below it as the frame pointer, then install sp (r31)
  109  * into fp.
 110   //
 111   // we save r0-r7, which accounts for all the c arguments.
 112   //
 113   // TODO: strictly do we need to save them all? they are treated as
 114   // volatile by C so could we omit saving the ones we are going to
 115   // place in global registers (thread? method?) or those we only use
 116   // during setup of the Java call?
 117   //
 118   // we don't need to save r8 which C uses as an indirect result location
 119   // return register.
 120   //
 121   // we don't need to save r9-r15 which both C and Java treat as
 122   // volatile
 123   //
 124   // we don't need to save r16-18 because Java does not use them
 125   //
 126   // we save r19-r28 which Java uses as scratch registers and C
 127   // expects to be callee-save
 128   //
 129   // we save the bottom 64 bits of each value stored in v8-v15; it is
 130   // the responsibility of the caller to preserve larger values.
 131   //
 132   // so the stub frame looks like this when we enter Java code
 133   //
 134   //     [ return_from_Java     ] <--- sp
 135   //     [ argument word n      ]
 136   //      ...
 137   // -27 [ argument word 1      ]
 138   // -26 [ saved v15            ] <--- sp_after_call
 139   // -25 [ saved v14            ]
 140   // -24 [ saved v13            ]
 141   // -23 [ saved v12            ]
 142   // -22 [ saved v11            ]
 143   // -21 [ saved v10            ]
 144   // -20 [ saved v9             ]
 145   // -19 [ saved v8             ]
 146   // -18 [ saved r28            ]
 147   // -17 [ saved r27            ]
 148   // -16 [ saved r26            ]
 149   // -15 [ saved r25            ]
 150   // -14 [ saved r24            ]
 151   // -13 [ saved r23            ]
 152   // -12 [ saved r22            ]
 153   // -11 [ saved r21            ]
 154   // -10 [ saved r20            ]
 155   //  -9 [ saved r19            ]
 156   //  -8 [ call wrapper    (r0) ]
 157   //  -7 [ result          (r1) ]
 158   //  -6 [ result type     (r2) ]
 159   //  -5 [ method          (r3) ]
 160   //  -4 [ entry point     (r4) ]
 161   //  -3 [ parameters      (r5) ]
 162   //  -2 [ parameter size  (r6) ]
 163   //  -1 [ thread (r7)          ]
 164   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 165   //   1 [ saved lr       (r30) ]
 166 
 167   // Call stub stack layout word offsets from fp
 168   enum call_stub_layout {
 169     sp_after_call_off = -26,
 170 
 171     d15_off            = -26,
 172     d13_off            = -24,
 173     d11_off            = -22,
 174     d9_off             = -20,
 175 
 176     r28_off            = -18,
 177     r26_off            = -16,
 178     r24_off            = -14,
 179     r22_off            = -12,
 180     r20_off            = -10,
 181     call_wrapper_off   =  -8,
 182     result_off         =  -7,
 183     result_type_off    =  -6,
 184     method_off         =  -5,
 185     entry_point_off    =  -4,
 186     parameter_size_off =  -2,
 187     thread_off         =  -1,
 188     fp_f               =   0,
 189     retaddr_off        =   1,
 190   };
 191 
 192   address generate_call_stub(address& return_address) {
 193     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 194            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 195            "adjust this code");
 196 
 197     StubCodeMark mark(this, "StubRoutines", "call_stub");
 198     address start = __ pc();
 199 
 200     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 201 
 202     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 203     const Address result        (rfp, result_off         * wordSize);
 204     const Address result_type   (rfp, result_type_off    * wordSize);
 205     const Address method        (rfp, method_off         * wordSize);
 206     const Address entry_point   (rfp, entry_point_off    * wordSize);
 207     const Address parameter_size(rfp, parameter_size_off * wordSize);
 208 
 209     const Address thread        (rfp, thread_off         * wordSize);
 210 
 211     const Address d15_save      (rfp, d15_off * wordSize);
 212     const Address d13_save      (rfp, d13_off * wordSize);
 213     const Address d11_save      (rfp, d11_off * wordSize);
 214     const Address d9_save       (rfp, d9_off * wordSize);
 215 
 216     const Address r28_save      (rfp, r28_off * wordSize);
 217     const Address r26_save      (rfp, r26_off * wordSize);
 218     const Address r24_save      (rfp, r24_off * wordSize);
 219     const Address r22_save      (rfp, r22_off * wordSize);
 220     const Address r20_save      (rfp, r20_off * wordSize);
 221 
 222     // stub code
 223 
 224     // we need a C prolog to bootstrap the x86 caller into the sim
 225     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 226 
 227     address aarch64_entry = __ pc();
 228 
 229 #ifdef BUILTIN_SIM
 230     // Save sender's SP for stack traces.
 231     __ mov(rscratch1, sp);
 232     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 233 #endif
 234     // set up frame and move sp to end of save area
 235     __ enter();
 236     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 237 
 238     // save register parameters and Java scratch/global registers
 239     // n.b. we save thread even though it gets installed in
 240     // rthread because we want to sanity check rthread later
 241     __ str(c_rarg7,  thread);
 242     __ strw(c_rarg6, parameter_size);
 243     __ stp(c_rarg4, c_rarg5,  entry_point);
 244     __ stp(c_rarg2, c_rarg3,  result_type);
 245     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 246 
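          // the Java callee-saved GP registers and the low 64 bits of v8-v15
          // are stored in pairs, which is why the layout enum above only names
          // every second slot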
 247     __ stp(r20, r19,   r20_save);
 248     __ stp(r22, r21,   r22_save);
 249     __ stp(r24, r23,   r24_save);
 250     __ stp(r26, r25,   r26_save);
 251     __ stp(r28, r27,   r28_save);
 252 
 253     __ stpd(v9,  v8,   d9_save);
 254     __ stpd(v11, v10,  d11_save);
 255     __ stpd(v13, v12,  d13_save);
 256     __ stpd(v15, v14,  d15_save);
 257 
 258     // install Java thread in global register now we have saved
 259     // whatever value it held
 260     __ mov(rthread, c_rarg7);
 261     // And method
 262     __ mov(rmethod, c_rarg3);
 263 
 264     // set up the heapbase register
 265     __ reinit_heapbase();
 266 
 267 #ifdef ASSERT
 268     // make sure we have no pending exceptions
 269     {
 270       Label L;
 271       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 272       __ cmp(rscratch1, (unsigned)NULL_WORD);
 273       __ br(Assembler::EQ, L);
 274       __ stop("StubRoutines::call_stub: entered with pending exception");
 275       __ BIND(L);
 276     }
 277 #endif
 278     // pass parameters if any
 279     __ mov(esp, sp);
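          // make room below sp for the incoming parameters and round the result
          // down to a 16-byte boundary, as the AArch64 stack pointer must stay
          // 16-byte aligned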
 280     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 281     __ andr(sp, rscratch1, -2 * wordSize);
 282 
 283     BLOCK_COMMENT("pass parameters if any");
 284     Label parameters_done;
 285     // parameter count is still in c_rarg6
 286     // and parameter pointer identifying param 1 is in c_rarg5
 287     __ cbzw(c_rarg6, parameters_done);
 288 
 289     address loop = __ pc();
 290     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 291     __ subsw(c_rarg6, c_rarg6, 1);
 292     __ push(rscratch1);
 293     __ br(Assembler::GT, loop);
 294 
 295     __ BIND(parameters_done);
 296 
  297     // call Java entry -- passing the Method* and the current sp
 298     //      rmethod: Method*
 299     //      r13: sender sp
 300     BLOCK_COMMENT("call Java function");
 301     __ mov(r13, sp);
 302     __ blr(c_rarg4);
 303 
 304     // tell the simulator we have returned to the stub
 305 
 306     // we do this here because the notify will already have been done
 307     // if we get to the next instruction via an exception
 308     //
 309     // n.b. adding this instruction here affects the calculation of
 310     // whether or not a routine returns to the call stub (used when
 311     // doing stack walks) since the normal test is to check the return
 312     // pc against the address saved below. so we may need to allow for
 313     // this extra instruction in the check.
 314 
 315     if (NotifySimulator) {
 316       __ notify(Assembler::method_reentry);
 317     }
 318     // save current address for use by exception handling code
 319 
 320     return_address = __ pc();
 321 
 322     // store result depending on type (everything that is not
 323     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 324     // n.b. this assumes Java returns an integral result in r0
 325     // and a floating result in j_farg0
 326     __ ldr(j_rarg2, result);
 327     Label is_long, is_float, is_double, exit;
 328     __ ldr(j_rarg1, result_type);
 329     __ cmp(j_rarg1, T_OBJECT);
 330     __ br(Assembler::EQ, is_long);
 331     __ cmp(j_rarg1, T_LONG);
 332     __ br(Assembler::EQ, is_long);
 333     __ cmp(j_rarg1, T_FLOAT);
 334     __ br(Assembler::EQ, is_float);
 335     __ cmp(j_rarg1, T_DOUBLE);
 336     __ br(Assembler::EQ, is_double);
 337 
 338     // handle T_INT case
 339     __ strw(r0, Address(j_rarg2));
 340 
 341     __ BIND(exit);
 342 
 343     // pop parameters
 344     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 345 
 346 #ifdef ASSERT
 347     // verify that threads correspond
 348     {
 349       Label L, S;
 350       __ ldr(rscratch1, thread);
 351       __ cmp(rthread, rscratch1);
 352       __ br(Assembler::NE, S);
 353       __ get_thread(rscratch1);
 354       __ cmp(rthread, rscratch1);
 355       __ br(Assembler::EQ, L);
 356       __ BIND(S);
 357       __ stop("StubRoutines::call_stub: threads must correspond");
 358       __ BIND(L);
 359     }
 360 #endif
 361 
 362     // restore callee-save registers
 363     __ ldpd(v15, v14,  d15_save);
 364     __ ldpd(v13, v12,  d13_save);
 365     __ ldpd(v11, v10,  d11_save);
 366     __ ldpd(v9,  v8,   d9_save);
 367 
 368     __ ldp(r28, r27,   r28_save);
 369     __ ldp(r26, r25,   r26_save);
 370     __ ldp(r24, r23,   r24_save);
 371     __ ldp(r22, r21,   r22_save);
 372     __ ldp(r20, r19,   r20_save);
 373 
 374     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 375     __ ldrw(c_rarg2, result_type);
 376     __ ldr(c_rarg3,  method);
 377     __ ldp(c_rarg4, c_rarg5,  entry_point);
 378     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 379 
 380 #ifndef PRODUCT
 381     // tell the simulator we are about to end Java execution
 382     if (NotifySimulator) {
 383       __ notify(Assembler::method_exit);
 384     }
 385 #endif
 386     // leave frame and return to caller
 387     __ leave();
 388     __ ret(lr);
 389 
 390     // handle return types different from T_INT
 391 
 392     __ BIND(is_long);
 393     __ str(r0, Address(j_rarg2, 0));
 394     __ br(Assembler::AL, exit);
 395 
 396     __ BIND(is_float);
 397     __ strs(j_farg0, Address(j_rarg2, 0));
 398     __ br(Assembler::AL, exit);
 399 
 400     __ BIND(is_double);
 401     __ strd(j_farg0, Address(j_rarg2, 0));
 402     __ br(Assembler::AL, exit);
 403 
 404     return start;
 405   }
 406 
 407   // Return point for a Java call if there's an exception thrown in
 408   // Java code.  The exception is caught and transformed into a
 409   // pending exception stored in JavaThread that can be tested from
 410   // within the VM.
 411   //
 412   // Note: Usually the parameters are removed by the callee. In case
 413   // of an exception crossing an activation frame boundary, that is
 414   // not the case if the callee is compiled code => need to setup the
 415   // rsp.
 416   //
 417   // r0: exception oop
 418 
 419   // NOTE: this is used as a target from the signal handler so it
 420   // needs an x86 prolog which returns into the current simulator
 421   // executing the generated catch_exception code. so the prolog
 422   // needs to install rax in a sim register and adjust the sim's
 423   // restart pc to enter the generated code at the start position
 424   // then return from native to simulated execution.
 425 
 426   address generate_catch_exception() {
 427     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 428     address start = __ pc();
 429 
 430     // same as in generate_call_stub():
 431     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 432     const Address thread        (rfp, thread_off         * wordSize);
 433 
 434 #ifdef ASSERT
 435     // verify that threads correspond
 436     {
 437       Label L, S;
 438       __ ldr(rscratch1, thread);
 439       __ cmp(rthread, rscratch1);
 440       __ br(Assembler::NE, S);
 441       __ get_thread(rscratch1);
 442       __ cmp(rthread, rscratch1);
 443       __ br(Assembler::EQ, L);
 444       __ bind(S);
 445       __ stop("StubRoutines::catch_exception: threads must correspond");
 446       __ bind(L);
 447     }
 448 #endif
 449 
 450     // set pending exception
 451     __ verify_oop(r0);
 452 
 453     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 454     __ mov(rscratch1, (address)__FILE__);
 455     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 456     __ movw(rscratch1, (int)__LINE__);
 457     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 458 
 459     // complete return to VM
 460     assert(StubRoutines::_call_stub_return_address != NULL,
 461            "_call_stub_return_address must have been generated before");
 462     __ b(StubRoutines::_call_stub_return_address);
 463 
 464     return start;
 465   }
 466 
 467   // Continuation point for runtime calls returning with a pending
 468   // exception.  The pending exception check happened in the runtime
 469   // or native call stub.  The pending exception in Thread is
 470   // converted into a Java-level exception.
 471   //
 472   // Contract with Java-level exception handlers:
 473   // r0: exception
 474   // r3: throwing pc
 475   //
 476   // NOTE: At entry of this stub, exception-pc must be in LR !!
 477 
 478   // NOTE: this is always used as a jump target within generated code
  479   // so it just needs to be generated code with no x86 prolog
 480 
 481   address generate_forward_exception() {
 482     StubCodeMark mark(this, "StubRoutines", "forward exception");
 483     address start = __ pc();
 484 
 485     // Upon entry, LR points to the return address returning into
 486     // Java (interpreted or compiled) code; i.e., the return address
 487     // becomes the throwing pc.
 488     //
 489     // Arguments pushed before the runtime call are still on the stack
 490     // but the exception handler will reset the stack pointer ->
 491     // ignore them.  A potential result in registers can be ignored as
 492     // well.
 493 
 494 #ifdef ASSERT
 495     // make sure this code is only executed if there is a pending exception
 496     {
 497       Label L;
 498       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 499       __ cbnz(rscratch1, L);
 500       __ stop("StubRoutines::forward exception: no pending exception (1)");
 501       __ bind(L);
 502     }
 503 #endif
 504 
 505     // compute exception handler into r19
 506 
 507     // call the VM to find the handler address associated with the
 508     // caller address. pass thread in r0 and caller pc (ret address)
 509     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 510     // the stack.
 511     __ mov(c_rarg1, lr);
 512     // lr will be trashed by the VM call so we move it to R19
 513     // (callee-saved) because we also need to pass it to the handler
 514     // returned by this call.
 515     __ mov(r19, lr);
 516     BLOCK_COMMENT("call exception_handler_for_return_address");
 517     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 518                          SharedRuntime::exception_handler_for_return_address),
 519                     rthread, c_rarg1);
 520     // we should not really care that lr is no longer the callee
 521     // address. we saved the value the handler needs in r19 so we can
 522     // just copy it to r3. however, the C2 handler will push its own
 523     // frame and then calls into the VM and the VM code asserts that
 524     // the PC for the frame above the handler belongs to a compiled
 525     // Java method. So, we restore lr here to satisfy that assert.
 526     __ mov(lr, r19);
 527     // setup r0 & r3 & clear pending exception
 528     __ mov(r3, r19);
 529     __ mov(r19, r0);
 530     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 531     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 532 
 533 #ifdef ASSERT
 534     // make sure exception is set
 535     {
 536       Label L;
 537       __ cbnz(r0, L);
 538       __ stop("StubRoutines::forward exception: no pending exception (2)");
 539       __ bind(L);
 540     }
 541 #endif
 542 
 543     // continue at exception handler
 544     // r0: exception
 545     // r3: throwing pc
 546     // r19: exception handler
 547     __ verify_oop(r0);
 548     __ br(r19);
 549 
 550     return start;
 551   }
 552 
 553   // Shenandoah write barrier.
 554   //
 555   // Input:
 556   //   r0: OOP to evacuate.  Not null.
 557   //
 558   // Output:
 559   //   r0: Pointer to evacuated OOP.
 560   //
 561   // Trash rscratch1, rscratch2.  Preserve everything else.
 562 
 563   address generate_shenandoah_wb(bool c_abi, bool do_cset_test) {
 564     StubCodeMark mark(this, "StubRoutines", "shenandoah_wb");
 565 
 566     __ align(6);
 567     address start = __ pc();
 568 
 569     if (do_cset_test) {
 570       Label work;
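            // index the in-collection-set fast-test table by the oop's region
            // number; if the region is not in the collection set the oop does
            // not need to be evacuated and is returned unchanged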
 571       __ mov(rscratch2, ShenandoahHeap::in_cset_fast_test_addr());
 572       __ lsr(rscratch1, r0, ShenandoahHeapRegion::region_size_bytes_shift_jint());
 573       __ ldrb(rscratch2, Address(rscratch2, rscratch1));
 574       __ tbnz(rscratch2, 0, work);
 575       __ ret(lr);
 576       __ bind(work);
 577     }
 578 
 579     Register obj = r0;
 580 
 581     __ enter(); // required for proper stackwalking of RuntimeStub frame
 582 
 583     if (!c_abi) {
 584       __ push_call_clobbered_registers();
 585     } else {
 586       __ push_call_clobbered_fp_registers();
 587     }
 588 
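          // call the Shenandoah runtime to evacuate the object; the pointer to
          // the to-space copy is returned in r0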
 589     __ mov(lr, CAST_FROM_FN_PTR(address, ShenandoahBarrierSet::write_barrier_JRT));
 590     __ blrt(lr, 1, 0, MacroAssembler::ret_type_integral);
 591     if (!c_abi) {
 592       __ mov(rscratch1, obj);
 593       __ pop_call_clobbered_registers();
 594       __ mov(obj, rscratch1);
 595     } else {
 596       __ pop_call_clobbered_fp_registers();
 597     }
 598 
 599     __ leave(); // required for proper stackwalking of RuntimeStub frame
 600     __ ret(lr);
 601 
 602     return start;
 603   }
 604 
 605   // Non-destructive plausibility checks for oops
 606   //
 607   // Arguments:
 608   //    r0: oop to verify
 609   //    rscratch1: error message
 610   //
 611   // Stack after saving c_rarg3:
 612   //    [tos + 0]: saved c_rarg3
 613   //    [tos + 1]: saved c_rarg2
 614   //    [tos + 2]: saved lr
 615   //    [tos + 3]: saved rscratch2
 616   //    [tos + 4]: saved r0
 617   //    [tos + 5]: saved rscratch1
 618   address generate_verify_oop() {
 619 
 620     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 621     address start = __ pc();
 622 
 623     Label exit, error;
 624 
 625     // save c_rarg2 and c_rarg3
 626     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 627 
 628     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 629     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 630     __ ldr(c_rarg3, Address(c_rarg2));
 631     __ add(c_rarg3, c_rarg3, 1);
 632     __ str(c_rarg3, Address(c_rarg2));
 633 
 634     // object is in r0
 635     // make sure object is 'reasonable'
 636     __ cbz(r0, exit); // if obj is NULL it is OK
 637 
 638     // Check if the oop is in the right area of memory
 639     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 640     __ andr(c_rarg2, r0, c_rarg3);
 641     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 642 
 643     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 644     // instruction here because the flags register is live.
 645     __ eor(c_rarg2, c_rarg2, c_rarg3);
 646     __ cbnz(c_rarg2, error);
 647 
 648     // make sure klass is 'reasonable', which is not zero.
 649     __ load_klass(r0, r0);  // get klass
 650     __ cbz(r0, error);      // if klass is NULL it is broken
 651 
 652     // return if everything seems ok
 653     __ bind(exit);
 654 
 655     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 656     __ ret(lr);
 657 
 658     // handle errors
 659     __ bind(error);
 660     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 661 
 662     __ push(RegSet::range(r0, r29), sp);
 663     // debug(char* msg, int64_t pc, int64_t regs[])
 664     __ mov(c_rarg0, rscratch1);      // pass address of error message
 665     __ mov(c_rarg1, lr);             // pass return address
 666     __ mov(c_rarg2, sp);             // pass address of regs on stack
 667 #ifndef PRODUCT
 668     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 669 #endif
 670     BLOCK_COMMENT("call MacroAssembler::debug");
 671     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 672     __ blrt(rscratch1, 3, 0, 1);
 673 
 674     return start;
 675   }
 676 
 677   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 678 
 679   // The inner part of zero_words().  This is the bulk operation,
 680   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 681   // caller is responsible for zeroing the last few words.
 682   //
 683   // Inputs:
 684   // r10: the HeapWord-aligned base address of an array to zero.
 685   // r11: the count in HeapWords, r11 > 0.
 686   //
 687   // Returns r10 and r11, adjusted for the caller to clear.
 688   // r10: the base address of the tail of words left to clear.
 689   // r11: the number of words in the tail.
 690   //      r11 < MacroAssembler::zero_words_block_size.
 691 
 692   address generate_zero_blocks() {
 693     Label store_pair, loop_store_pair, done;
 694     Label base_aligned;
 695 
 696     Register base = r10, cnt = r11;
 697 
 698     __ align(CodeEntryAlignment);
 699     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 700     address start = __ pc();
 701 
 702     if (UseBlockZeroing) {
 703       int zva_length = VM_Version::zva_length();
 704 
 705       // Ensure ZVA length can be divided by 16. This is required by
 706       // the subsequent operations.
 707       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 708 
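            // if base is only 8-byte aligned, zero a single word so the block
            // zeroing below starts on a 16-byte boundary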
 709       __ tbz(base, 3, base_aligned);
 710       __ str(zr, Address(__ post(base, 8)));
 711       __ sub(cnt, cnt, 1);
 712       __ bind(base_aligned);
 713 
 714       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 715       // alignment.
 716       Label small;
 717       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
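            // cnt is in words, so scale the byte limit down by 8 (>> 3) before
            // comparing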
 718       __ subs(rscratch1, cnt, low_limit >> 3);
 719       __ br(Assembler::LT, small);
 720       __ zero_dcache_blocks(base, cnt);
 721       __ bind(small);
 722     }
 723 
 724     {
 725       // Number of stp instructions we'll unroll
 726       const int unroll =
 727         MacroAssembler::zero_words_block_size / 2;
 728       // Clear the remaining blocks.
 729       Label loop;
 730       __ subs(cnt, cnt, unroll * 2);
 731       __ br(Assembler::LT, done);
 732       __ bind(loop);
 733       for (int i = 0; i < unroll; i++)
 734         __ stp(zr, zr, __ post(base, 16));
 735       __ subs(cnt, cnt, unroll * 2);
 736       __ br(Assembler::GE, loop);
 737       __ bind(done);
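            // the exit from the loop above over-subtracted, so add back
            // unroll * 2 to leave the true remaining word count in cnt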
 738       __ add(cnt, cnt, unroll * 2);
 739     }
 740 
 741     __ ret(lr);
 742 
 743     return start;
 744   }
 745 
 746 
 747   typedef enum {
 748     copy_forwards = 1,
 749     copy_backwards = -1
 750   } copy_direction;
 751 
 752   // Bulk copy of blocks of 8 words.
 753   //
 754   // count is a count of words.
 755   //
 756   // Precondition: count >= 8
 757   //
 758   // Postconditions:
 759   //
 760   // The least significant bit of count contains the remaining count
 761   // of words to copy.  The rest of count is trash.
 762   //
 763   // s and d are adjusted to point to the remaining words to copy
 764   //
 765   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 766                            copy_direction direction) {
 767     int unit = wordSize * direction;
 768     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 769 
 770     int offset;
 771     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 772       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 773     const Register stride = r13;
 774 
 775     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 776     assert_different_registers(s, d, count, rscratch1);
 777 
 778     Label again, drain;
 779     const char *stub_name;
 780     if (direction == copy_forwards)
 781       stub_name = "forward_copy_longs";
 782     else
 783       stub_name = "backward_copy_longs";
 784     StubCodeMark mark(this, "StubRoutines", stub_name);
 785     __ align(CodeEntryAlignment);
 786     __ bind(start);
 787 
 788     Label unaligned_copy_long;
 789     if (AvoidUnalignedAccesses) {
 790       __ tbnz(d, 3, unaligned_copy_long);
 791     }
 792 
 793     if (direction == copy_forwards) {
 794       __ sub(s, s, bias);
 795       __ sub(d, d, bias);
 796     }
 797 
 798 #ifdef ASSERT
 799     // Make sure we are never given < 8 words
 800     {
 801       Label L;
 802       __ cmp(count, 8);
 803       __ br(Assembler::GE, L);
 804       __ stop("genrate_copy_longs called with < 8 words");
 805       __ bind(L);
 806     }
 807 #endif
 808 
 809     // Fill 8 registers
 810     if (UseSIMDForMemoryOps) {
 811       __ ldpq(v0, v1, Address(s, 4 * unit));
 812       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 813     } else {
 814       __ ldp(t0, t1, Address(s, 2 * unit));
 815       __ ldp(t2, t3, Address(s, 4 * unit));
 816       __ ldp(t4, t5, Address(s, 6 * unit));
 817       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 818     }
 819 
 820     __ subs(count, count, 16);
 821     __ br(Assembler::LO, drain);
 822 
 823     int prefetch = PrefetchCopyIntervalInBytes;
 824     bool use_stride = false;
 825     if (direction == copy_backwards) {
 826        use_stride = prefetch > 256;
 827        prefetch = -prefetch;
 828        if (use_stride) __ mov(stride, prefetch);
 829     }
 830 
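          // main loop: each iteration stores the 8 words loaded by the previous
          // iteration (or the initial fill above) and loads the next 8, keeping
          // loads and stores software-pipelined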
 831     __ bind(again);
 832 
 833     if (PrefetchCopyIntervalInBytes > 0)
 834       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 835 
 836     if (UseSIMDForMemoryOps) {
 837       __ stpq(v0, v1, Address(d, 4 * unit));
 838       __ ldpq(v0, v1, Address(s, 4 * unit));
 839       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 840       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 841     } else {
 842       __ stp(t0, t1, Address(d, 2 * unit));
 843       __ ldp(t0, t1, Address(s, 2 * unit));
 844       __ stp(t2, t3, Address(d, 4 * unit));
 845       __ ldp(t2, t3, Address(s, 4 * unit));
 846       __ stp(t4, t5, Address(d, 6 * unit));
 847       __ ldp(t4, t5, Address(s, 6 * unit));
 848       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 849       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 850     }
 851 
 852     __ subs(count, count, 8);
 853     __ br(Assembler::HS, again);
 854 
 855     // Drain
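          // (store the 8 words already loaded by the last loop iteration, or by
          // the initial fill if the loop was never entered)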
 856     __ bind(drain);
 857     if (UseSIMDForMemoryOps) {
 858       __ stpq(v0, v1, Address(d, 4 * unit));
 859       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 860     } else {
 861       __ stp(t0, t1, Address(d, 2 * unit));
 862       __ stp(t2, t3, Address(d, 4 * unit));
 863       __ stp(t4, t5, Address(d, 6 * unit));
 864       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 865     }
 866 
 867     {
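            // copy any remaining 4-word and/or 2-word sub-block; bits 2 and 1
            // of count indicate whether each sub-block is present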
 868       Label L1, L2;
 869       __ tbz(count, exact_log2(4), L1);
 870       if (UseSIMDForMemoryOps) {
 871         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 872         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 873       } else {
 874         __ ldp(t0, t1, Address(s, 2 * unit));
 875         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 876         __ stp(t0, t1, Address(d, 2 * unit));
 877         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 878       }
 879       __ bind(L1);
 880 
 881       if (direction == copy_forwards) {
 882         __ add(s, s, bias);
 883         __ add(d, d, bias);
 884       }
 885 
 886       __ tbz(count, 1, L2);
 887       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 888       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 889       __ bind(L2);
 890     }
 891 
 892     __ ret(lr);
 893 
 894     if (AvoidUnalignedAccesses) {
 895       Label drain, again;
 896       // Register order for storing. Order is different for backward copy.
 897 
 898       __ bind(unaligned_copy_long);
 899 
 900       // source address is even aligned, target odd aligned
 901       //
 902       // when forward copying word pairs we read long pairs at offsets
 903       // {0, 2, 4, 6} (in long words). when backwards copying we read
 904       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 905       // address by -2 in the forwards case so we can compute the
 906       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 907       // or -1.
 908       //
 909       // when forward copying we need to store 1 word, 3 pairs and
  910       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
  911       // zero offset we adjust the destination by -1, which means we
  912       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
  913       //
  914       // when backwards copying we need to store 1 word, 3 pairs and
 915       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 916       // offsets {1, 3, 5, 7, 8} * unit.
 917 
 918       if (direction == copy_forwards) {
 919         __ sub(s, s, 16);
 920         __ sub(d, d, 8);
 921       }
 922 
 923       // Fill 8 registers
 924       //
 925       // for forwards copy s was offset by -16 from the original input
 926       // value of s so the register contents are at these offsets
  927       // relative to the 64 byte block addressed by that original input
 928       // and so on for each successive 64 byte block when s is updated
 929       //
 930       // t0 at offset 0,  t1 at offset 8
 931       // t2 at offset 16, t3 at offset 24
 932       // t4 at offset 32, t5 at offset 40
 933       // t6 at offset 48, t7 at offset 56
 934 
 935       // for backwards copy s was not offset so the register contents
 936       // are at these offsets into the preceding 64 byte block
 937       // relative to that original input and so on for each successive
 938       // preceding 64 byte block when s is updated. this explains the
 939       // slightly counter-intuitive looking pattern of register usage
 940       // in the stp instructions for backwards copy.
 941       //
 942       // t0 at offset -16, t1 at offset -8
 943       // t2 at offset -32, t3 at offset -24
 944       // t4 at offset -48, t5 at offset -40
 945       // t6 at offset -64, t7 at offset -56
 946 
 947       __ ldp(t0, t1, Address(s, 2 * unit));
 948       __ ldp(t2, t3, Address(s, 4 * unit));
 949       __ ldp(t4, t5, Address(s, 6 * unit));
 950       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 951 
 952       __ subs(count, count, 16);
 953       __ br(Assembler::LO, drain);
 954 
 955       int prefetch = PrefetchCopyIntervalInBytes;
 956       bool use_stride = false;
 957       if (direction == copy_backwards) {
 958          use_stride = prefetch > 256;
 959          prefetch = -prefetch;
 960          if (use_stride) __ mov(stride, prefetch);
 961       }
 962 
 963       __ bind(again);
 964 
 965       if (PrefetchCopyIntervalInBytes > 0)
 966         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 967 
 968       if (direction == copy_forwards) {
 969        // allowing for the offset of -8 the store instructions place
  970        // registers into the target 64 byte block at the following
 971        // offsets
 972        //
 973        // t0 at offset 0
 974        // t1 at offset 8,  t2 at offset 16
 975        // t3 at offset 24, t4 at offset 32
 976        // t5 at offset 40, t6 at offset 48
 977        // t7 at offset 56
 978 
 979         __ str(t0, Address(d, 1 * unit));
 980         __ stp(t1, t2, Address(d, 2 * unit));
 981         __ ldp(t0, t1, Address(s, 2 * unit));
 982         __ stp(t3, t4, Address(d, 4 * unit));
 983         __ ldp(t2, t3, Address(s, 4 * unit));
 984         __ stp(t5, t6, Address(d, 6 * unit));
 985         __ ldp(t4, t5, Address(s, 6 * unit));
 986         __ str(t7, Address(__ pre(d, 8 * unit)));
 987         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 988       } else {
 989        // d was not offset when we started so the registers are
  990        // written into the 64 byte block preceding d with the following
 991        // offsets
 992        //
 993        // t1 at offset -8
 994        // t3 at offset -24, t0 at offset -16
  995        // t5 at offset -40, t2 at offset -32
 996        // t7 at offset -56, t4 at offset -48
 997        //                   t6 at offset -64
 998        //
 999        // note that this matches the offsets previously noted for the
1000        // loads
1001 
1002         __ str(t1, Address(d, 1 * unit));
1003         __ stp(t3, t0, Address(d, 3 * unit));
1004         __ ldp(t0, t1, Address(s, 2 * unit));
1005         __ stp(t5, t2, Address(d, 5 * unit));
1006         __ ldp(t2, t3, Address(s, 4 * unit));
1007         __ stp(t7, t4, Address(d, 7 * unit));
1008         __ ldp(t4, t5, Address(s, 6 * unit));
1009         __ str(t6, Address(__ pre(d, 8 * unit)));
1010         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1011       }
1012 
1013       __ subs(count, count, 8);
1014       __ br(Assembler::HS, again);
1015 
1016       // Drain
1017       //
1018       // this uses the same pattern of offsets and register arguments
1019       // as above
1020       __ bind(drain);
1021       if (direction == copy_forwards) {
1022         __ str(t0, Address(d, 1 * unit));
1023         __ stp(t1, t2, Address(d, 2 * unit));
1024         __ stp(t3, t4, Address(d, 4 * unit));
1025         __ stp(t5, t6, Address(d, 6 * unit));
1026         __ str(t7, Address(__ pre(d, 8 * unit)));
1027       } else {
1028         __ str(t1, Address(d, 1 * unit));
1029         __ stp(t3, t0, Address(d, 3 * unit));
1030         __ stp(t5, t2, Address(d, 5 * unit));
1031         __ stp(t7, t4, Address(d, 7 * unit));
1032         __ str(t6, Address(__ pre(d, 8 * unit)));
1033       }
1034       // now we need to copy any remaining part block which may
1035       // include a 4 word block subblock and/or a 2 word subblock.
 1036       // bits 2 and 1 in the count are the tell-tale for whether we
1037       // have each such subblock
1038       {
1039         Label L1, L2;
1040         __ tbz(count, exact_log2(4), L1);
1041        // this is the same as above but copying only 4 longs hence
 1042        // with only one intervening stp between the str instructions
1043        // but note that the offsets and registers still follow the
1044        // same pattern
1045         __ ldp(t0, t1, Address(s, 2 * unit));
1046         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1047         if (direction == copy_forwards) {
1048           __ str(t0, Address(d, 1 * unit));
1049           __ stp(t1, t2, Address(d, 2 * unit));
1050           __ str(t3, Address(__ pre(d, 4 * unit)));
1051         } else {
1052           __ str(t1, Address(d, 1 * unit));
1053           __ stp(t3, t0, Address(d, 3 * unit));
1054           __ str(t2, Address(__ pre(d, 4 * unit)));
1055         }
1056         __ bind(L1);
1057 
1058         __ tbz(count, 1, L2);
1059        // this is the same as above but copying only 2 longs hence
1060        // there is no intervening stp between the str instructions
1061        // but note that the offset and register patterns are still
1062        // the same
1063         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1064         if (direction == copy_forwards) {
1065           __ str(t0, Address(d, 1 * unit));
1066           __ str(t1, Address(__ pre(d, 2 * unit)));
1067         } else {
1068           __ str(t1, Address(d, 1 * unit));
1069           __ str(t0, Address(__ pre(d, 2 * unit)));
1070         }
1071         __ bind(L2);
1072 
1073        // for forwards copy we need to re-adjust the offsets we
 1074        // applied so that s and d follow the last words written
1075 
1076        if (direction == copy_forwards) {
1077          __ add(s, s, 16);
1078          __ add(d, d, 8);
1079        }
1080 
1081       }
1082 
1083       __ ret(lr);
1084       }
1085   }
1086 
1087   // Small copy: less than 16 bytes.
1088   //
1089   // NB: Ignores all of the bits of count which represent more than 15
1090   // bytes, so a caller doesn't have to mask them.
1091 
1092   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1093     bool is_backwards = step < 0;
1094     size_t granularity = uabs(step);
1095     int direction = is_backwards ? -1 : 1;
1096     int unit = wordSize * direction;
1097 
1098     Label Lpair, Lword, Lint, Lshort, Lbyte;
1099 
1100     assert(granularity
1101            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1102 
1103     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1104 
1105     // ??? I don't know if this bit-test-and-branch is the right thing
1106     // to do.  It does a lot of jumping, resulting in several
1107     // mispredicted branches.  It might make more sense to do this
1108     // with something like Duff's device with a single computed branch.
1109 
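          // each test below checks the bit of count that corresponds to one
          // word, int, short or byte at this granularity, copying that piece
          // when the bit is set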
1110     __ tbz(count, 3 - exact_log2(granularity), Lword);
1111     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1112     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1113     __ bind(Lword);
1114 
1115     if (granularity <= sizeof (jint)) {
1116       __ tbz(count, 2 - exact_log2(granularity), Lint);
1117       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1118       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1119       __ bind(Lint);
1120     }
1121 
1122     if (granularity <= sizeof (jshort)) {
1123       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1124       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1125       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1126       __ bind(Lshort);
1127     }
1128 
1129     if (granularity <= sizeof (jbyte)) {
1130       __ tbz(count, 0, Lbyte);
1131       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1132       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1133       __ bind(Lbyte);
1134     }
1135   }
1136 
1137   Label copy_f, copy_b;
1138 
1139   // All-singing all-dancing memory copy.
1140   //
1141   // Copy count units of memory from s to d.  The size of a unit is
1142   // step, which can be positive or negative depending on the direction
1143   // of copy.  If is_aligned is false, we align the source address.
1144   //
1145 
1146   void copy_memory(bool is_aligned, Register s, Register d,
1147                    Register count, Register tmp, int step) {
1148     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1149     bool is_backwards = step < 0;
1150     int granularity = uabs(step);
1151     const Register t0 = r3, t1 = r4;
1152 
1153     // <= 96 bytes do inline. Direction doesn't matter because we always
1154     // load all the data before writing anything
1155     Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
1156     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1157     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1158     const Register send = r17, dend = r18;
1159 
1160     if (PrefetchCopyIntervalInBytes > 0)
1161       __ prfm(Address(s, 0), PLDL1KEEP);
1162     __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
1163     __ br(Assembler::HI, copy_big);
1164 
1165     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1166     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
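          // send and dend point just past the end of the source and destination,
          // so the tails below can be copied with negative offsets from them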
1167 
1168     __ cmp(count, 16/granularity);
1169     __ br(Assembler::LS, copy16);
1170 
1171     __ cmp(count, 64/granularity);
1172     __ br(Assembler::HI, copy80);
1173 
1174     __ cmp(count, 32/granularity);
1175     __ br(Assembler::LS, copy32);
1176 
1177     // 33..64 bytes
1178     if (UseSIMDForMemoryOps) {
1179       __ ldpq(v0, v1, Address(s, 0));
1180       __ ldpq(v2, v3, Address(send, -32));
1181       __ stpq(v0, v1, Address(d, 0));
1182       __ stpq(v2, v3, Address(dend, -32));
1183     } else {
1184       __ ldp(t0, t1, Address(s, 0));
1185       __ ldp(t2, t3, Address(s, 16));
1186       __ ldp(t4, t5, Address(send, -32));
1187       __ ldp(t6, t7, Address(send, -16));
1188 
1189       __ stp(t0, t1, Address(d, 0));
1190       __ stp(t2, t3, Address(d, 16));
1191       __ stp(t4, t5, Address(dend, -32));
1192       __ stp(t6, t7, Address(dend, -16));
1193     }
1194     __ b(finish);
1195 
1196     // 17..32 bytes
1197     __ bind(copy32);
1198     __ ldp(t0, t1, Address(s, 0));
1199     __ ldp(t2, t3, Address(send, -16));
1200     __ stp(t0, t1, Address(d, 0));
1201     __ stp(t2, t3, Address(dend, -16));
1202     __ b(finish);
1203 
1204     // 65..80/96 bytes
 1205     // (96 bytes if SIMD because we do 32 bytes per instruction)
1206     __ bind(copy80);
1207     if (UseSIMDForMemoryOps) {
1208       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1209       __ ldpq(v4, v5, Address(send, -32));
1210       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1211       __ stpq(v4, v5, Address(dend, -32));
1212     } else {
1213       __ ldp(t0, t1, Address(s, 0));
1214       __ ldp(t2, t3, Address(s, 16));
1215       __ ldp(t4, t5, Address(s, 32));
1216       __ ldp(t6, t7, Address(s, 48));
1217       __ ldp(t8, t9, Address(send, -16));
1218 
1219       __ stp(t0, t1, Address(d, 0));
1220       __ stp(t2, t3, Address(d, 16));
1221       __ stp(t4, t5, Address(d, 32));
1222       __ stp(t6, t7, Address(d, 48));
1223       __ stp(t8, t9, Address(dend, -16));
1224     }
1225     __ b(finish);
1226 
1227     // 0..16 bytes
1228     __ bind(copy16);
1229     __ cmp(count, 8/granularity);
1230     __ br(Assembler::LO, copy8);
1231 
1232     // 8..16 bytes
1233     __ ldr(t0, Address(s, 0));
1234     __ ldr(t1, Address(send, -8));
1235     __ str(t0, Address(d, 0));
1236     __ str(t1, Address(dend, -8));
1237     __ b(finish);
1238 
1239     if (granularity < 8) {
1240       // 4..7 bytes
1241       __ bind(copy8);
1242       __ tbz(count, 2 - exact_log2(granularity), copy4);
1243       __ ldrw(t0, Address(s, 0));
1244       __ ldrw(t1, Address(send, -4));
1245       __ strw(t0, Address(d, 0));
1246       __ strw(t1, Address(dend, -4));
1247       __ b(finish);
1248       if (granularity < 4) {
1249         // 0..3 bytes
1250         __ bind(copy4);
1251         __ cbz(count, finish); // get rid of 0 case
1252         if (granularity == 2) {
1253           __ ldrh(t0, Address(s, 0));
1254           __ strh(t0, Address(d, 0));
1255         } else { // granularity == 1
1256           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1257           // the first and last byte.
1258           // Handle the 3 byte case by loading and storing base + count/2
1259           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
 1260           // This does mean that in the 1 byte case we load/store the same
1261           // byte 3 times.
1262           __ lsr(count, count, 1);
1263           __ ldrb(t0, Address(s, 0));
1264           __ ldrb(t1, Address(send, -1));
1265           __ ldrb(t2, Address(s, count));
1266           __ strb(t0, Address(d, 0));
1267           __ strb(t1, Address(dend, -1));
1268           __ strb(t2, Address(d, count));
1269         }
1270         __ b(finish);
1271       }
1272     }
1273 
1274     __ bind(copy_big);
1275     if (is_backwards) {
1276       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1277       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1278     }
1279 
1280     // Now we've got the small case out of the way we can align the
1281     // source address on a 2-word boundary.
1282 
1283     Label aligned;
1284 
1285     if (is_aligned) {
1286       // We may have to adjust by 1 word to get s 2-word-aligned.
1287       __ tbz(s, exact_log2(wordSize), aligned);
1288       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1289       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1290       __ sub(count, count, wordSize/granularity);
1291     } else {
1292       if (is_backwards) {
1293         __ andr(rscratch2, s, 2 * wordSize - 1);
1294       } else {
1295         __ neg(rscratch2, s);
1296         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1297       }
1298       // rscratch2 is the byte adjustment needed to align s.
1299       __ cbz(rscratch2, aligned);
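            // rscratch2 is in bytes; convert it to a count of elements before
            // subtracting it from count and copying that many elements below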
1300       int shift = exact_log2(granularity);
1301       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1302       __ sub(count, count, rscratch2);
1303 
1304 #if 0
1305       // ?? This code is only correct for a disjoint copy.  It may or
1306       // may not make sense to use it in that case.
1307 
1308       // Copy the first pair; s and d may not be aligned.
1309       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1310       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1311 
1312       // Align s and d, adjust count
1313       if (is_backwards) {
1314         __ sub(s, s, rscratch2);
1315         __ sub(d, d, rscratch2);
1316       } else {
1317         __ add(s, s, rscratch2);
1318         __ add(d, d, rscratch2);
1319       }
1320 #else
1321       copy_memory_small(s, d, rscratch2, rscratch1, step);
1322 #endif
1323     }
1324 
1325     __ bind(aligned);
1326 
1327     // s is now 2-word-aligned.
1328 
1329     // We have a count of units and some trailing bytes.  Adjust the
1330     // count and do a bulk copy of words.
1331     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1332     if (direction == copy_forwards)
1333       __ bl(copy_f);
1334     else
1335       __ bl(copy_b);
1336 
1337     // And the tail.
1338     copy_memory_small(s, d, count, tmp, step);
1339 
1340     if (granularity >= 8) __ bind(copy8);
1341     if (granularity >= 4) __ bind(copy4);
1342     __ bind(finish);
1343   }
1344 
1345 
1346   void clobber_registers() {
1347 #ifdef ASSERT
1348     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1349     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1350     for (Register r = r3; r <= r18; r++)
1351       if (r != rscratch1) __ mov(r, rscratch1);
1352 #endif
1353   }
1354 
1355   // Scan over array at a for count oops, verifying each one.
1356   // Preserves a and count, clobbers rscratch1 and rscratch2.
1357   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1358     Label loop, end;
1359     __ mov(rscratch1, a);
1360     __ mov(rscratch2, zr);
1361     __ bind(loop);
1362     __ cmp(rscratch2, count);
1363     __ br(Assembler::HS, end);
1364     if (size == (size_t)wordSize) {
1365       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1366       __ verify_oop(temp);
1367     } else {
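            // n.b. the callers in this file pass r16 as temp, so the narrow oop
            // loaded into r16 here is the value that decode_heap_oop verifies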
1368       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1369       __ decode_heap_oop(temp); // calls verify_oop
1370     }
1371     __ add(rscratch2, rscratch2, size);
1372     __ b(loop);
1373     __ bind(end);
1374   }
1375 
1376   // Arguments:
1377   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1378   //             ignored
1379   //   is_oop  - true => oop array, so generate store check code
1380   //   name    - stub name string
1381   //
1382   // Inputs:
1383   //   c_rarg0   - source array address
1384   //   c_rarg1   - destination array address
1385   //   c_rarg2   - element count, treated as ssize_t, can be zero
1386   //
1387   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1388   // the hardware handle it.  The two dwords within qwords that span
 1389   // cache line boundaries will still be loaded and stored atomically.
1390   //
1391   // Side Effects:
1392   //   disjoint_int_copy_entry is set to the no-overlap entry point
1393   //   used by generate_conjoint_int_oop_copy().
1394   //
1395   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1396                                   const char *name, bool dest_uninitialized = false) {
1397     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1398     RegSet saved_reg = RegSet::of(s, d, count);
1399     __ align(CodeEntryAlignment);
1400     StubCodeMark mark(this, "StubRoutines", name);
1401     address start = __ pc();
1402     __ enter();
1403 
1404     if (entry != NULL) {
1405       *entry = __ pc();
1406       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1407       BLOCK_COMMENT("Entry:");
1408     }
1409 
1410     DecoratorSet decorators = ARRAYCOPY_DISJOINT;
1411     if (dest_uninitialized) {
1412       decorators |= AS_DEST_NOT_INITIALIZED;
1413     }
1414     if (aligned) {
1415       decorators |= ARRAYCOPY_ALIGNED;
1416     }
1417 
1418     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1419     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg);
1420 
1421     if (is_oop) {
1422       // save regs before copy_memory
1423       __ push(RegSet::of(d, count), sp);
1424     }
1425     copy_memory(aligned, s, d, count, rscratch1, size);
1426 
1427     if (is_oop) {
1428       __ pop(RegSet::of(d, count), sp);
1429       if (VerifyOops)
1430         verify_oop_array(size, d, count, r16);
1431       __ sub(count, count, 1); // make an inclusive end pointer
1432       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1433     }
1434 
1435     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1436 
1437     __ leave();
1438     __ mov(r0, zr); // return 0
1439     __ ret(lr);
1440 #ifdef BUILTIN_SIM
1441     {
1442       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1443       sim->notifyCompile(const_cast<char*>(name), start);
1444     }
1445 #endif
1446     return start;
1447   }
1448 
1449   // Arguments:
1450   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1451   //             ignored
1452   //   is_oop  - true => oop array, so generate store check code
1453   //   name    - stub name string
1454   //
1455   // Inputs:
1456   //   c_rarg0   - source array address
1457   //   c_rarg1   - destination array address
1458   //   c_rarg2   - element count, treated as ssize_t, can be zero
1459   //
1460   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1461   // the hardware handle it.  The two dwords within qwords that span
 1462   // cache line boundaries will still be loaded and stored atomically.
1463   //
1464   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1465                                  address *entry, const char *name,
1466                                  bool dest_uninitialized = false) {
1467     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1468     RegSet saved_regs = RegSet::of(s, d, count);
1469     StubCodeMark mark(this, "StubRoutines", name);
1470     address start = __ pc();
1471     __ enter();
1472 
1473     if (entry != NULL) {
1474       *entry = __ pc();
1475       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1476       BLOCK_COMMENT("Entry:");
1477     }
1478 
1479     // use fwd copy when (d-s) above_equal (count*size)
1480     __ sub(rscratch1, d, s);
1481     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1482     __ br(Assembler::HS, nooverlap_target);
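    // The single unsigned comparison above covers both no-overlap cases:
    // if d < s the subtraction wraps to a large unsigned value, and if
    // d >= s then d - s >= count*size means the regions are disjoint.
    // Either way a forward (low-to-high) copy is safe, so we branch to the
    // no-overlap entry; only an overlapping copy with d > s falls through
    // to the backward copy below.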
1483 
1484     DecoratorSet decorators = 0;
1485     if (dest_uninitialized) {
1486       decorators |= AS_DEST_NOT_INITIALIZED;
1487     }
1488     if (aligned) {
1489       decorators |= ARRAYCOPY_ALIGNED;
1490     }
1491 
1492     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1493     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs);
1494 
1495     if (is_oop) {
1496       // save regs before copy_memory
1497       __ push(RegSet::of(d, count), sp);
1498     }
1499     copy_memory(aligned, s, d, count, rscratch1, -size);
1500     if (is_oop) {
1501       __ pop(RegSet::of(d, count), sp);
1502       if (VerifyOops)
1503         verify_oop_array(size, d, count, r16);
1504       __ sub(count, count, 1); // make an inclusive end pointer
1505       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1506     }
1507     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1508     __ leave();
1509     __ mov(r0, zr); // return 0
1510     __ ret(lr);
1511 #ifdef BUILTIN_SIM
1512     {
1513       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1514       sim->notifyCompile(const_cast<char*>(name), start);
1515     }
1516 #endif
1517     return start;
1518   }
1519 
1520   // Arguments:
1521   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1522   //             ignored
1523   //   name    - stub name string
1524   //
1525   // Inputs:
1526   //   c_rarg0   - source array address
1527   //   c_rarg1   - destination array address
1528   //   c_rarg2   - element count, treated as ssize_t, can be zero
1529   //
1530   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1531   // we let the hardware handle it.  The one to eight bytes within words,
1532   // dwords or qwords that span cache line boundaries will still be loaded
1533   // and stored atomically.
1534   //
1542   // Side Effects:
1543   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1544   //   used by generate_conjoint_byte_copy().
1545   //
1546   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1547     const bool not_oop = false;
1548     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1549   }
1550 
1551   // Arguments:
1552   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1553   //             ignored
1554   //   name    - stub name string
1555   //
1556   // Inputs:
1557   //   c_rarg0   - source array address
1558   //   c_rarg1   - destination array address
1559   //   c_rarg2   - element count, treated as ssize_t, can be zero
1560   //
1561   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1562   // we let the hardware handle it.  The one to eight bytes within words,
1563   // dwords or qwords that span cache line boundaries will still be loaded
1564   // and stored atomically.
1565   //
1566   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1567                                       address* entry, const char *name) {
1568     const bool not_oop = false;
1569     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1570   }
1571 
1572   // Arguments:
1573   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1574   //             ignored
1575   //   name    - stub name string
1576   //
1577   // Inputs:
1578   //   c_rarg0   - source array address
1579   //   c_rarg1   - destination array address
1580   //   c_rarg2   - element count, treated as ssize_t, can be zero
1581   //
1582   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1583   // let the hardware handle it.  The two or four words within dwords
1584   // or qwords that span cache line boundaries will still be loaded
1585   // and stored atomically.
1586   //
1587   // Side Effects:
1588   //   disjoint_short_copy_entry is set to the no-overlap entry point
1589   //   used by generate_conjoint_short_copy().
1590   //
1591   address generate_disjoint_short_copy(bool aligned,
1592                                        address* entry, const char *name) {
1593     const bool not_oop = false;
1594     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1595   }
1596 
1597   // Arguments:
1598   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1599   //             ignored
1600   //   name    - stub name string
1601   //
1602   // Inputs:
1603   //   c_rarg0   - source array address
1604   //   c_rarg1   - destination array address
1605   //   c_rarg2   - element count, treated as ssize_t, can be zero
1606   //
1607   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1608   // let the hardware handle it.  The two or four words within dwords
1609   // or qwords that span cache line boundaries will still be loaded
1610   // and stored atomically.
1611   //
1612   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1613                                        address *entry, const char *name) {
1614     const bool not_oop = false;
1615     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1616   }
1617
1618   // Arguments:
1619   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1620   //             ignored
1621   //   name    - stub name string
1622   //
1623   // Inputs:
1624   //   c_rarg0   - source array address
1625   //   c_rarg1   - destination array address
1626   //   c_rarg2   - element count, treated as ssize_t, can be zero
1627   //
1628   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1629   // the hardware handle it.  The two dwords within qwords that span
1630   // cache line boundaries will still be loaded and stored atomically.
1631   //
1632   // Side Effects:
1633   //   disjoint_int_copy_entry is set to the no-overlap entry point
1634   //   used by generate_conjoint_int_copy().
1635   //
1636   address generate_disjoint_int_copy(bool aligned, address *entry,
1637                                          const char *name, bool dest_uninitialized = false) {
1638     const bool not_oop = false;
1639     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1640   }
1641 
1642   // Arguments:
1643   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1644   //             ignored
1645   //   name    - stub name string
1646   //
1647   // Inputs:
1648   //   c_rarg0   - source array address
1649   //   c_rarg1   - destination array address
1650   //   c_rarg2   - element count, treated as ssize_t, can be zero
1651   //
1652   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1653   // the hardware handle it.  The two dwords within qwords that span
1654   // cache line boundaries will still be loaded and stored atomically.
1655   //
1656   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1657                                      address *entry, const char *name,
1658                                      bool dest_uninitialized = false) {
1659     const bool not_oop = false;
1660     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1661   }
1662 
1663 
1664   // Arguments:
1665   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1666   //             ignored
1667   //   name    - stub name string
1668   //
1669   // Inputs:
1670   //   c_rarg0   - source array address
1671   //   c_rarg1   - destination array address
1672   //   c_rarg2   - element count, treated as size_t, can be zero
1673   //
1674   // Side Effects:
1675   //   disjoint_long_copy_entry is set to the no-overlap entry point
1676   //   used by generate_conjoint_long_copy().
1677   //
1678   address generate_disjoint_long_copy(bool aligned, address *entry,
1679                                           const char *name, bool dest_uninitialized = false) {
1680     const bool not_oop = false;
1681     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1682   }
1683 
1684   // Arguments:
1685   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1686   //             ignored
1687   //   name    - stub name string
1688   //
1689   // Inputs:
1690   //   c_rarg0   - source array address
1691   //   c_rarg1   - destination array address
1692   //   c_rarg2   - element count, treated as size_t, can be zero
1693   //
1694   address generate_conjoint_long_copy(bool aligned,
1695                                       address nooverlap_target, address *entry,
1696                                       const char *name, bool dest_uninitialized = false) {
1697     const bool not_oop = false;
1698     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1699   }
1700 
1701   // Arguments:
1702   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1703   //             ignored
1704   //   name    - stub name string
1705   //
1706   // Inputs:
1707   //   c_rarg0   - source array address
1708   //   c_rarg1   - destination array address
1709   //   c_rarg2   - element count, treated as size_t, can be zero
1710   //
1711   // Side Effects:
1712   //   disjoint_oop_copy_entry is set to the no-overlap entry point
1713   //   used by generate_conjoint_oop_copy().
1714   //
1715   address generate_disjoint_oop_copy(bool aligned, address *entry,
1716                                      const char *name, bool dest_uninitialized) {
1717     const bool is_oop = true;
1718     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1719     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1720   }
1721 
1722   // Arguments:
1723   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1724   //             ignored
1725   //   name    - stub name string
1726   //
1727   // Inputs:
1728   //   c_rarg0   - source array address
1729   //   c_rarg1   - destination array address
1730   //   c_rarg2   - element count, treated as size_t, can be zero
1731   //
1732   address generate_conjoint_oop_copy(bool aligned,
1733                                      address nooverlap_target, address *entry,
1734                                      const char *name, bool dest_uninitialized) {
1735     const bool is_oop = true;
1736     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1737     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1738                                   name, dest_uninitialized);
1739   }
1740 
1741 
1742   // Helper for generating a dynamic type check.
1743   // Smashes rscratch1.
1744   void generate_type_check(Register sub_klass,
1745                            Register super_check_offset,
1746                            Register super_klass,
1747                            Label& L_success) {
1748     assert_different_registers(sub_klass, super_check_offset, super_klass);
1749 
1750     BLOCK_COMMENT("type_check:");
1751 
1752     Label L_miss;
1753 
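    // The fast path handles the common cases (exact match or a hit on the
    // cached super_check_offset) and branches to L_success or L_miss when it
    // can decide on its own; otherwise the slow path scans the secondary
    // supers array, branching to L_success on a hit and falling through to
    // L_miss on failure.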
1754     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1755                                      super_check_offset);
1756     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1757 
1758     // Fall through on failure!
1759     __ BIND(L_miss);
1760   }
1761 
1762   //
1763   //  Generate checkcasting array copy stub
1764   //
1765   //  Input:
1766   //    c_rarg0   - source array address
1767   //    c_rarg1   - destination array address
1768   //    c_rarg2   - element count, treated as ssize_t, can be zero
1769   //    c_rarg3   - size_t ckoff (super_check_offset)
1770   //    c_rarg4   - oop ckval (super_klass)
1771   //
1772   //  Output:
1773   //    r0 ==  0  -  success
1774   //    r0 == -1^K - failure, where K is partial transfer count
1775   //
1776   address generate_checkcast_copy(const char *name, address *entry,
1777                                   bool dest_uninitialized = false) {
1778 
1779     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1780 
1781     // Input registers (after setup_arg_regs)
1782     const Register from        = c_rarg0;   // source array address
1783     const Register to          = c_rarg1;   // destination array address
1784     const Register count       = c_rarg2;   // elements count
1785     const Register ckoff       = c_rarg3;   // super_check_offset
1786     const Register ckval       = c_rarg4;   // super_klass
1787 
1788     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1789     RegSet wb_post_saved_regs = RegSet::of(count);
1790 
1791     // Registers used as temps (r18, r19, r20 are save-on-entry)
1792     const Register count_save  = r21;       // orig elements count
1793     const Register start_to    = r20;       // destination array start address
1794     const Register copied_oop  = r18;       // actual oop copied
1795     const Register r19_klass   = r19;       // oop._klass
1796 
1797     //---------------------------------------------------------------
1798     // Assembler stub will be used for this call to arraycopy
1799     // if the two arrays are subtypes of Object[] but the
1800     // destination array type is not equal to or a supertype
1801     // of the source type.  Each element must be separately
1802     // checked.
1803 
1804     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1805                                copied_oop, r19_klass, count_save);
1806 
1807     __ align(CodeEntryAlignment);
1808     StubCodeMark mark(this, "StubRoutines", name);
1809     address start = __ pc();
1810 
1811     __ enter(); // required for proper stackwalking of RuntimeStub frame
1812 
1813 #ifdef ASSERT
1814     // caller guarantees that the arrays really are different
1815     // otherwise, we would have to make conjoint checks
1816     { Label L;
1817       array_overlap_test(L, TIMES_OOP);
1818       __ stop("checkcast_copy within a single array");
1819       __ bind(L);
1820     }
1821 #endif //ASSERT
1822 
1823     // Caller of this entry point must set up the argument registers.
1824     if (entry != NULL) {
1825       *entry = __ pc();
1826       BLOCK_COMMENT("Entry:");
1827     }
1828 
1829     // Empty array:  Nothing to do.
1830     __ cbz(count, L_done);
1831 
1832     __ push(RegSet::of(r18, r19, r20, r21), sp);
1833 
1834 #ifdef ASSERT
1835     BLOCK_COMMENT("assert consistent ckoff/ckval");
1836     // The ckoff and ckval must be mutually consistent,
1837     // even though caller generates both.
1838     { Label L;
1839       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1840       __ ldrw(start_to, Address(ckval, sco_offset));
1841       __ cmpw(ckoff, start_to);
1842       __ br(Assembler::EQ, L);
1843       __ stop("super_check_offset inconsistent");
1844       __ bind(L);
1845     }
1846 #endif //ASSERT
1847 
1848     DecoratorSet decorators = ARRAYCOPY_CHECKCAST;
1849     bool is_oop = true;
1850     if (dest_uninitialized) {
1851       decorators |= AS_DEST_NOT_INITIALIZED;
1852     }
1853 
1854     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1855     bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs);
1856 
1857     // save the original count
1858     __ mov(count_save, count);
1859 
1860     // Copy from low to high addresses
1861     __ mov(start_to, to);              // Save destination array start address
1862     __ b(L_load_element);
1863 
1864     // ======== begin loop ========
1865     // (Loop is rotated; its entry is L_load_element.)
1866     // Loop control:
1867     //   for (; count != 0; count--) {
1868     //     copied_oop = load_heap_oop(from++);
1869     //     ... generate_type_check ...;
1870     //     store_heap_oop(to++, copied_oop);
1871     //   }
1872     __ align(OptoLoopAlignment);
1873 
1874     __ BIND(L_store_element);
1875     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1876     __ sub(count, count, 1);
1877     __ cbz(count, L_do_card_marks);
1878 
1879     // ======== loop entry is here ========
1880     __ BIND(L_load_element);
1881     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1882     __ cbz(copied_oop, L_store_element);
1883 
1884     __ load_klass(r19_klass, copied_oop);// query the object klass
1885     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1886     // ======== end loop ========
1887 
1888     // It was a real error; we must depend on the caller to finish the job.
1889     // Register count = remaining oops, count_save = total oops.
1890     // Emit GC store barriers for the oops we have copied and report
1891     // their number to the caller.
1892 
1893     __ subs(count, count_save, count);     // K = partially copied oop count
1894     __ eon(count, count, zr);                   // report (-1^K) to caller
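    // EON with zr is a bitwise NOT, so count now holds -1^K.  The EQ branch
    // below still tests the flags set by the SUBS above: K == 0 means no oops
    // were copied, so the card-marking epilogue can be skipped.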
1895     __ br(Assembler::EQ, L_done_pop);
1896 
1897     __ BIND(L_do_card_marks);
1898     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1899     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1, wb_post_saved_regs);
1900 
1901     __ bind(L_done_pop);
1902     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1903     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1904 
1905     __ bind(L_done);
1906     __ mov(r0, count);
1907     __ leave();
1908     __ ret(lr);
1909 
1910     return start;
1911   }
1912 
1913   // Perform range checks on the proposed arraycopy.
1914   // Kills temp, but nothing else.
1915   // Also, clean the sign bits of src_pos and dst_pos.
1916   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1917                               Register src_pos, // source position (c_rarg1)
1918                               Register dst,     // destination array oop (c_rarg2)
1919                               Register dst_pos, // destination position (c_rarg3)
1920                               Register length,
1921                               Register temp,
1922                               Label& L_failed) {
1923     BLOCK_COMMENT("arraycopy_range_checks:");
1924 
1925     assert_different_registers(rscratch1, temp);
1926 
1927     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1928     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1929     __ addw(temp, length, src_pos);
1930     __ cmpw(temp, rscratch1);
1931     __ br(Assembler::HI, L_failed);
1932 
1933     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1934     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1935     __ addw(temp, length, dst_pos);
1936     __ cmpw(temp, rscratch1);
1937     __ br(Assembler::HI, L_failed);
1938 
1939     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1940     __ movw(src_pos, src_pos);
1941     __ movw(dst_pos, dst_pos);
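    // (Writing a w-register zero-extends into the full 64-bit register, so
    // the two movw instructions above clear the high 32 bits.)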
1942 
1943     BLOCK_COMMENT("arraycopy_range_checks done");
1944   }
1945 
1946   // This stub is currently only called from a test routine; a proper
1947   // implementation can wait until a real caller needs it.
1949   static void fake_arraycopy_stub(address src, address dst, int count) {
1950     assert(count == 0, "huh?");
1951   }
1952 
1953 
1954   //
1955   //  Generate 'unsafe' array copy stub
1956   //  Though just as safe as the other stubs, it takes an unscaled
1957   //  size_t argument instead of an element count.
1958   //
1959   //  Input:
1960   //    c_rarg0   - source array address
1961   //    c_rarg1   - destination array address
1962   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1963   //
1964   // Examines the alignment of the operands and dispatches
1965   // to a long, int, short, or byte copy loop.
1966   //
1967   address generate_unsafe_copy(const char *name,
1968                                address byte_copy_entry,
1969                                address short_copy_entry,
1970                                address int_copy_entry,
1971                                address long_copy_entry) {
1972     Label L_long_aligned, L_int_aligned, L_short_aligned;
1973     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1974 
1975     __ align(CodeEntryAlignment);
1976     StubCodeMark mark(this, "StubRoutines", name);
1977     address start = __ pc();
1978     __ enter(); // required for proper stackwalking of RuntimeStub frame
1979 
1980     // bump this on entry, not on exit:
1981     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1982 
1983     __ orr(rscratch1, s, d);
1984     __ orr(rscratch1, rscratch1, count);
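    // rscratch1 now holds (s | d | count); its low bits give the common
    // alignment of all three values, so the dispatch below is effectively:
    //   if ((bits & 7) == 0) goto long_copy;
    //   if ((bits & 3) == 0) goto int_copy;
    //   if ((bits & 1) == 0) goto short_copy;
    //   goto byte_copy;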
1985 
1986     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1987     __ cbz(rscratch1, L_long_aligned);
1988     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1989     __ cbz(rscratch1, L_int_aligned);
1990     __ tbz(rscratch1, 0, L_short_aligned);
1991     __ b(RuntimeAddress(byte_copy_entry));
1992 
1993     __ BIND(L_short_aligned);
1994     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1995     __ b(RuntimeAddress(short_copy_entry));
1996     __ BIND(L_int_aligned);
1997     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1998     __ b(RuntimeAddress(int_copy_entry));
1999     __ BIND(L_long_aligned);
2000     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2001     __ b(RuntimeAddress(long_copy_entry));
2002 
2003     return start;
2004   }
2005 
2006   //
2007   //  Generate generic array copy stubs
2008   //
2009   //  Input:
2010   //    c_rarg0    -  src oop
2011   //    c_rarg1    -  src_pos (32-bits)
2012   //    c_rarg2    -  dst oop
2013   //    c_rarg3    -  dst_pos (32-bits)
2014   //    c_rarg4    -  element count (32-bits)
2015   //
2016   //  Output:
2017   //    r0 ==  0  -  success
2018   //    r0 == -1^K - failure, where K is partial transfer count
2019   //
2020   address generate_generic_copy(const char *name,
2021                                 address byte_copy_entry, address short_copy_entry,
2022                                 address int_copy_entry, address oop_copy_entry,
2023                                 address long_copy_entry, address checkcast_copy_entry) {
2024 
2025     Label L_failed, L_failed_0, L_objArray;
2026     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2027 
2028     // Input registers
2029     const Register src        = c_rarg0;  // source array oop
2030     const Register src_pos    = c_rarg1;  // source position
2031     const Register dst        = c_rarg2;  // destination array oop
2032     const Register dst_pos    = c_rarg3;  // destination position
2033     const Register length     = c_rarg4;
2034 
2035     StubCodeMark mark(this, "StubRoutines", name);
2036 
2037     __ align(CodeEntryAlignment);
2038     address start = __ pc();
2039 
2040     __ enter(); // required for proper stackwalking of RuntimeStub frame
2041 
2042     // bump this on entry, not on exit:
2043     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2044 
2045     //-----------------------------------------------------------------------
2046     // Assembler stub will be used for this call to arraycopy
2047     // if the following conditions are met:
2048     //
2049     // (1) src and dst must not be null.
2050     // (2) src_pos must not be negative.
2051     // (3) dst_pos must not be negative.
2052     // (4) length  must not be negative.
2053     // (5) src klass and dst klass should be the same and not NULL.
2054     // (6) src and dst should be arrays.
2055     // (7) src_pos + length must not exceed length of src.
2056     // (8) dst_pos + length must not exceed length of dst.
2057     //
2058 
2059     //  if (src == NULL) return -1;
2060     __ cbz(src, L_failed);
2061 
2062     //  if (src_pos < 0) return -1;
2063     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2064 
2065     //  if (dst == NULL) return -1;
2066     __ cbz(dst, L_failed);
2067 
2068     //  if (dst_pos < 0) return -1;
2069     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2070 
2071     // registers used as temp
2072     const Register scratch_length    = r16; // elements count to copy
2073     const Register scratch_src_klass = r17; // array klass
2074     const Register lh                = r18; // layout helper
2075 
2076     //  if (length < 0) return -1;
2077     __ movw(scratch_length, length);        // length (elements count, 32-bit value)
2078     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2079 
2080     __ load_klass(scratch_src_klass, src);
2081 #ifdef ASSERT
2082     //  assert(src->klass() != NULL);
2083     {
2084       BLOCK_COMMENT("assert klasses not null {");
2085       Label L1, L2;
2086       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2087       __ bind(L1);
2088       __ stop("broken null klass");
2089       __ bind(L2);
2090       __ load_klass(rscratch1, dst);
2091       __ cbz(rscratch1, L1);     // this would be broken also
2092       BLOCK_COMMENT("} assert klasses not null done");
2093     }
2094 #endif
2095 
2096     // Load layout helper (32-bits)
2097     //
2098     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2099     // 32        30    24            16              8     2                 0
2100     //
2101     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2102     //
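    // For reference, the fields used below decode roughly as:
    //   header_size_in_bytes = (lh >> _lh_header_size_shift) & _lh_header_size_mask;
    //   log2_element_size    =  lh & _lh_log2_element_size_mask;  // shift is 0, see assert below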
2103 
2104     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2105 
2106     // Handle objArrays completely differently...
2107     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2108     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2109     __ movw(rscratch1, objArray_lh);
2110     __ eorw(rscratch2, lh, rscratch1);
2111     __ cbzw(rscratch2, L_objArray);
2112 
2113     //  if (src->klass() != dst->klass()) return -1;
2114     __ load_klass(rscratch2, dst);
2115     __ eor(rscratch2, rscratch2, scratch_src_klass);
2116     __ cbnz(rscratch2, L_failed);
2117 
2118     //  if (!src->is_Array()) return -1;
2119     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2120 
2121     // At this point, it is known to be a typeArray (array_tag 0x3).
2122 #ifdef ASSERT
2123     {
2124       BLOCK_COMMENT("assert primitive array {");
2125       Label L;
2126       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2127       __ cmpw(lh, rscratch2);
2128       __ br(Assembler::GE, L);
2129       __ stop("must be a primitive array");
2130       __ bind(L);
2131       BLOCK_COMMENT("} assert primitive array done");
2132     }
2133 #endif
2134 
2135     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2136                            rscratch2, L_failed);
2137 
2138     // TypeArrayKlass
2139     //
2140     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2141     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2142     //
2143 
2144     const Register rscratch1_offset = rscratch1;    // array offset
2145     const Register r18_elsize = lh; // element size
2146 
2147     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2148            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2149     __ add(src, src, rscratch1_offset);           // src array offset
2150     __ add(dst, dst, rscratch1_offset);           // dst array offset
2151     BLOCK_COMMENT("choose copy loop based on element size");
2152 
2153     // next registers should be set before the jump to corresponding stub
2154     const Register from     = c_rarg0;  // source array address
2155     const Register to       = c_rarg1;  // destination array address
2156     const Register count    = c_rarg2;  // elements count
2157 
2158     // 'from', 'to' and 'count' must be set in this order, because they
2159     // occupy the same registers as 'src', 'src_pos' and 'dst'.
2160 
2161     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2162 
2163     // The possible values of elsize are 0-3, i.e. exact_log2(element
2164     // size in bytes).  We do a simple bitwise binary search.
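    // elsize == 0: byte, 1: short, 2: int, 3: long.  Testing bit 1 first
    // separates {byte, short} from {int, long}; bit 0 then picks within
    // each pair.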
2165   __ BIND(L_copy_bytes);
2166     __ tbnz(r18_elsize, 1, L_copy_ints);
2167     __ tbnz(r18_elsize, 0, L_copy_shorts);
2168     __ lea(from, Address(src, src_pos));// src_addr
2169     __ lea(to,   Address(dst, dst_pos));// dst_addr
2170     __ movw(count, scratch_length); // length
2171     __ b(RuntimeAddress(byte_copy_entry));
2172 
2173   __ BIND(L_copy_shorts);
2174     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2175     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2176     __ movw(count, scratch_length); // length
2177     __ b(RuntimeAddress(short_copy_entry));
2178 
2179   __ BIND(L_copy_ints);
2180     __ tbnz(r18_elsize, 0, L_copy_longs);
2181     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2182     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2183     __ movw(count, scratch_length); // length
2184     __ b(RuntimeAddress(int_copy_entry));
2185 
2186   __ BIND(L_copy_longs);
2187 #ifdef ASSERT
2188     {
2189       BLOCK_COMMENT("assert long copy {");
2190       Label L;
2191       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2192       __ cmpw(r18_elsize, LogBytesPerLong);
2193       __ br(Assembler::EQ, L);
2194       __ stop("must be long copy, but elsize is wrong");
2195       __ bind(L);
2196       BLOCK_COMMENT("} assert long copy done");
2197     }
2198 #endif
2199     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2200     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2201     __ movw(count, scratch_length); // length
2202     __ b(RuntimeAddress(long_copy_entry));
2203 
2204     // ObjArrayKlass
2205   __ BIND(L_objArray);
2206     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2207 
2208     Label L_plain_copy, L_checkcast_copy;
2209     //  test array classes for subtyping
2210     __ load_klass(r18, dst);
2211     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2212     __ br(Assembler::NE, L_checkcast_copy);
2213 
2214     // Identically typed arrays can be copied without element-wise checks.
2215     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2216                            rscratch2, L_failed);
2217 
2218     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2219     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2220     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2221     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2222     __ movw(count, scratch_length); // length
2223   __ BIND(L_plain_copy);
2224     __ b(RuntimeAddress(oop_copy_entry));
2225 
2226   __ BIND(L_checkcast_copy);
2227     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2228     {
2229       // Before looking at dst.length, make sure dst is also an objArray.
2230       __ ldrw(rscratch1, Address(r18, lh_offset));
2231       __ movw(rscratch2, objArray_lh);
2232       __ eorw(rscratch1, rscratch1, rscratch2);
2233       __ cbnzw(rscratch1, L_failed);
2234 
2235       // It is safe to examine both src.length and dst.length.
2236       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2237                              r18, L_failed);
2238 
2239       const Register rscratch2_dst_klass = rscratch2;
2240       __ load_klass(rscratch2_dst_klass, dst); // reload
2241 
2242       // Marshal the base address arguments now, freeing registers.
2243       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2244       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2245       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2246       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2247       __ movw(count, length);           // length (reloaded)
2248       Register sco_temp = c_rarg3;      // this register is free now
2249       assert_different_registers(from, to, count, sco_temp,
2250                                  rscratch2_dst_klass, scratch_src_klass);
2251       // assert_clean_int(count, sco_temp);
2252 
2253       // Generate the type check.
2254       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2255       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2256       // assert_clean_int(sco_temp, r18);
2257       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2258 
2259       // Fetch destination element klass from the ObjArrayKlass header.
2260       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2261       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2262       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2263 
2264       // the checkcast_copy loop needs two extra arguments:
2265       assert(c_rarg3 == sco_temp, "#3 already in place");
2266       // Set up arguments for checkcast_copy_entry.
2267       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2268       __ b(RuntimeAddress(checkcast_copy_entry));
2269     }
2270 
2271   __ BIND(L_failed);
2272     __ mov(r0, -1);
2273     __ leave();   // required for proper stackwalking of RuntimeStub frame
2274     __ ret(lr);
2275 
2276     return start;
2277   }
2278 
2279   //
2280   // Generate stub for array fill. If "aligned" is true, the
2281   // "to" address is assumed to be heapword aligned.
2282   //
2283   // Arguments for generated stub:
2284   //   to:    c_rarg0
2285   //   value: c_rarg1
2286   //   count: c_rarg2 treated as signed
2287   //
2288   address generate_fill(BasicType t, bool aligned, const char *name) {
2289     __ align(CodeEntryAlignment);
2290     StubCodeMark mark(this, "StubRoutines", name);
2291     address start = __ pc();
2292 
2293     BLOCK_COMMENT("Entry:");
2294 
2295     const Register to        = c_rarg0;  // destination array address
2296     const Register value     = c_rarg1;  // value
2297     const Register count     = c_rarg2;  // elements count
2298 
2299     const Register bz_base = r10;        // base for block_zero routine
2300     const Register cnt_words = r11;      // temp register
2301 
2302     __ enter();
2303 
2304     Label L_fill_elements, L_exit1;
2305 
2306     int shift = -1;
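    // The bfi instructions below replicate the fill value across a 32-bit
    // word (e.g. a byte 0xAB becomes 0xABABABAB); a further bfi widens it
    // to 64 bits before the bulk fill so full-width stores can be used.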
2307     switch (t) {
2308       case T_BYTE:
2309         shift = 0;
2310         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2311         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2312         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2313         __ br(Assembler::LO, L_fill_elements);
2314         break;
2315       case T_SHORT:
2316         shift = 1;
2317         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2318         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2319         __ br(Assembler::LO, L_fill_elements);
2320         break;
2321       case T_INT:
2322         shift = 2;
2323         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2324         __ br(Assembler::LO, L_fill_elements);
2325         break;
2326       default: ShouldNotReachHere();
2327     }
2328 
2329     // Align the destination address to an 8-byte boundary.
2330     Label L_skip_align1, L_skip_align2, L_skip_align4;
2331     if (!aligned) {
2332       switch (t) {
2333         case T_BYTE:
2334           // One byte misalignment happens only for byte arrays.
2335           __ tbz(to, 0, L_skip_align1);
2336           __ strb(value, Address(__ post(to, 1)));
2337           __ subw(count, count, 1);
2338           __ bind(L_skip_align1);
2339           // Fallthrough
2340         case T_SHORT:
2341           // Two bytes misalignment happens only for byte and short (char) arrays.
2342           __ tbz(to, 1, L_skip_align2);
2343           __ strh(value, Address(__ post(to, 2)));
2344           __ subw(count, count, 2 >> shift);
2345           __ bind(L_skip_align2);
2346           // Fallthrough
2347         case T_INT:
2348           // Align to 8 bytes, we know we are 4 byte aligned to start.
2349           __ tbz(to, 2, L_skip_align4);
2350           __ strw(value, Address(__ post(to, 4)));
2351           __ subw(count, count, 4 >> shift);
2352           __ bind(L_skip_align4);
2353           break;
2354         default: ShouldNotReachHere();
2355       }
2356     }
2357 
2358     //
2359     //  Fill large chunks
2360     //
2361     __ lsrw(cnt_words, count, 3 - shift); // number of words
2362     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2363     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
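    // cnt_words now holds the number of 8-byte words to fill in bulk, and
    // count the residual element count (strictly less than 8 bytes' worth).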
2364     if (UseBlockZeroing) {
2365       Label non_block_zeroing, rest;
2366       // If the fill value is zero we can use the fast zero_words().
2367       __ cbnz(value, non_block_zeroing);
2368       __ mov(bz_base, to);
2369       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2370       __ zero_words(bz_base, cnt_words);
2371       __ b(rest);
2372       __ bind(non_block_zeroing);
2373       __ fill_words(to, cnt_words, value);
2374       __ bind(rest);
2375     } else {
2376       __ fill_words(to, cnt_words, value);
2377     }
2378 
2379     // Remaining count is less than 8 bytes. Fill it by a single store.
2380     // Note that the total length is no less than 8 bytes.
2381     if (t == T_BYTE || t == T_SHORT) {
2382       Label L_exit1;
2383       __ cbzw(count, L_exit1);
2384       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2385       __ str(value, Address(to, -8));    // overwrite some elements
2386       __ bind(L_exit1);
2387       __ leave();
2388       __ ret(lr);
2389     }
2390 
2391     // Handle fills of less than 8 bytes.
2392     Label L_fill_2, L_fill_4, L_exit2;
2393     __ bind(L_fill_elements);
2394     switch (t) {
2395       case T_BYTE:
2396         __ tbz(count, 0, L_fill_2);
2397         __ strb(value, Address(__ post(to, 1)));
2398         __ bind(L_fill_2);
2399         __ tbz(count, 1, L_fill_4);
2400         __ strh(value, Address(__ post(to, 2)));
2401         __ bind(L_fill_4);
2402         __ tbz(count, 2, L_exit2);
2403         __ strw(value, Address(to));
2404         break;
2405       case T_SHORT:
2406         __ tbz(count, 0, L_fill_4);
2407         __ strh(value, Address(__ post(to, 2)));
2408         __ bind(L_fill_4);
2409         __ tbz(count, 1, L_exit2);
2410         __ strw(value, Address(to));
2411         break;
2412       case T_INT:
2413         __ cbzw(count, L_exit2);
2414         __ strw(value, Address(to));
2415         break;
2416       default: ShouldNotReachHere();
2417     }
2418     __ bind(L_exit2);
2419     __ leave();
2420     __ ret(lr);
2421     return start;
2422   }
2423 
2424   void generate_arraycopy_stubs() {
2425     address entry;
2426     address entry_jbyte_arraycopy;
2427     address entry_jshort_arraycopy;
2428     address entry_jint_arraycopy;
2429     address entry_oop_arraycopy;
2430     address entry_jlong_arraycopy;
2431     address entry_checkcast_arraycopy;
2432 
2433     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2434     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2435 
2436     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2437 
2438     //*** jbyte
2439     // Always need aligned and unaligned versions
2440     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2441                                                                                   "jbyte_disjoint_arraycopy");
2442     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2443                                                                                   &entry_jbyte_arraycopy,
2444                                                                                   "jbyte_arraycopy");
2445     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2446                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2447     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2448                                                                                   "arrayof_jbyte_arraycopy");
2449 
2450     //*** jshort
2451     // Always need aligned and unaligned versions
2452     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2453                                                                                     "jshort_disjoint_arraycopy");
2454     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2455                                                                                     &entry_jshort_arraycopy,
2456                                                                                     "jshort_arraycopy");
2457     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2458                                                                                     "arrayof_jshort_disjoint_arraycopy");
2459     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2460                                                                                     "arrayof_jshort_arraycopy");
2461 
2462     //*** jint
2463     // Aligned versions
2464     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2465                                                                                 "arrayof_jint_disjoint_arraycopy");
2466     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2467                                                                                 "arrayof_jint_arraycopy");
2468     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2469     // entry_jint_arraycopy always points to the unaligned version
2470     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2471                                                                                 "jint_disjoint_arraycopy");
2472     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2473                                                                                 &entry_jint_arraycopy,
2474                                                                                 "jint_arraycopy");
2475 
2476     //*** jlong
2477     // It is always aligned
2478     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2479                                                                                   "arrayof_jlong_disjoint_arraycopy");
2480     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2481                                                                                   "arrayof_jlong_arraycopy");
2482     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2483     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2484 
2485     //*** oops
2486     {
2487       // With compressed oops we need unaligned versions; notice that
2488       // we overwrite entry_oop_arraycopy.
2489       bool aligned = !UseCompressedOops;
2490 
2491       StubRoutines::_arrayof_oop_disjoint_arraycopy
2492         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2493                                      /*dest_uninitialized*/false);
2494       StubRoutines::_arrayof_oop_arraycopy
2495         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2496                                      /*dest_uninitialized*/false);
2497       // Aligned versions without pre-barriers
2498       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2499         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2500                                      /*dest_uninitialized*/true);
2501       StubRoutines::_arrayof_oop_arraycopy_uninit
2502         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2503                                      /*dest_uninitialized*/true);
2504     }
2505 
2506     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2507     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2508     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2509     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2510 
2511     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2512     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2513                                                                         /*dest_uninitialized*/true);
2514 
2515     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2516                                                               entry_jbyte_arraycopy,
2517                                                               entry_jshort_arraycopy,
2518                                                               entry_jint_arraycopy,
2519                                                               entry_jlong_arraycopy);
2520 
2521     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2522                                                                entry_jbyte_arraycopy,
2523                                                                entry_jshort_arraycopy,
2524                                                                entry_jint_arraycopy,
2525                                                                entry_oop_arraycopy,
2526                                                                entry_jlong_arraycopy,
2527                                                                entry_checkcast_arraycopy);
2528 
2529     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2530     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2531     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2532     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2533     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2534     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2535   }
2536 
2537   void generate_math_stubs() { Unimplemented(); }
2538 
2539   // Arguments:
2540   //
2541   // Inputs:
2542   //   c_rarg0   - source byte array address
2543   //   c_rarg1   - destination byte array address
2544   //   c_rarg2   - K (key) in little endian int array
2545   //
2546   address generate_aescrypt_encryptBlock() {
2547     __ align(CodeEntryAlignment);
2548     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2549 
2550     Label L_doLast;
2551 
2552     const Register from        = c_rarg0;  // source array address
2553     const Register to          = c_rarg1;  // destination array address
2554     const Register key         = c_rarg2;  // key array address
2555     const Register keylen      = rscratch1;
2556 
2557     address start = __ pc();
2558     __ enter();
2559 
2560     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
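    // keylen is the expanded key length in ints: 44, 52 or 60 for AES-128,
    // AES-192 and AES-256 respectively; it is compared against 44 and 52 below.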
2561 
2562     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2563 
2564     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2565     __ rev32(v1, __ T16B, v1);
2566     __ rev32(v2, __ T16B, v2);
2567     __ rev32(v3, __ T16B, v3);
2568     __ rev32(v4, __ T16B, v4);
2569     __ aese(v0, v1);
2570     __ aesmc(v0, v0);
2571     __ aese(v0, v2);
2572     __ aesmc(v0, v0);
2573     __ aese(v0, v3);
2574     __ aesmc(v0, v0);
2575     __ aese(v0, v4);
2576     __ aesmc(v0, v0);
2577 
2578     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2579     __ rev32(v1, __ T16B, v1);
2580     __ rev32(v2, __ T16B, v2);
2581     __ rev32(v3, __ T16B, v3);
2582     __ rev32(v4, __ T16B, v4);
2583     __ aese(v0, v1);
2584     __ aesmc(v0, v0);
2585     __ aese(v0, v2);
2586     __ aesmc(v0, v0);
2587     __ aese(v0, v3);
2588     __ aesmc(v0, v0);
2589     __ aese(v0, v4);
2590     __ aesmc(v0, v0);
2591 
2592     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2593     __ rev32(v1, __ T16B, v1);
2594     __ rev32(v2, __ T16B, v2);
2595 
2596     __ cmpw(keylen, 44);
2597     __ br(Assembler::EQ, L_doLast);
2598 
2599     __ aese(v0, v1);
2600     __ aesmc(v0, v0);
2601     __ aese(v0, v2);
2602     __ aesmc(v0, v0);
2603 
2604     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2605     __ rev32(v1, __ T16B, v1);
2606     __ rev32(v2, __ T16B, v2);
2607 
2608     __ cmpw(keylen, 52);
2609     __ br(Assembler::EQ, L_doLast);
2610 
2611     __ aese(v0, v1);
2612     __ aesmc(v0, v0);
2613     __ aese(v0, v2);
2614     __ aesmc(v0, v0);
2615 
2616     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2617     __ rev32(v1, __ T16B, v1);
2618     __ rev32(v2, __ T16B, v2);
2619 
2620     __ BIND(L_doLast);
2621 
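    // Final two rounds: the last AESE is not followed by AESMC because the
    // last AES round omits MixColumns; the final AddRoundKey is the explicit
    // EOR with the last round key below.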
2622     __ aese(v0, v1);
2623     __ aesmc(v0, v0);
2624     __ aese(v0, v2);
2625 
2626     __ ld1(v1, __ T16B, key);
2627     __ rev32(v1, __ T16B, v1);
2628     __ eor(v0, __ T16B, v0, v1);
2629 
2630     __ st1(v0, __ T16B, to);
2631 
2632     __ mov(r0, 0);
2633 
2634     __ leave();
2635     __ ret(lr);
2636 
2637     return start;
2638   }
2639 
2640   // Arguments:
2641   //
2642   // Inputs:
2643   //   c_rarg0   - source byte array address
2644   //   c_rarg1   - destination byte array address
2645   //   c_rarg2   - K (key) in little endian int array
2646   //
2647   address generate_aescrypt_decryptBlock() {
2648     assert(UseAES, "need AES instruction support");
2649     __ align(CodeEntryAlignment);
2650     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2651     Label L_doLast;
2652 
2653     const Register from        = c_rarg0;  // source array address
2654     const Register to          = c_rarg1;  // destination array address
2655     const Register key         = c_rarg2;  // key array address
2656     const Register keylen      = rscratch1;
2657 
2658     address start = __ pc();
2659     __ enter(); // required for proper stackwalking of RuntimeStub frame
2660 
2661     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2662 
2663     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2664 
2665     __ ld1(v5, __ T16B, __ post(key, 16));
2666     __ rev32(v5, __ T16B, v5);
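    // v5 holds the first 16 bytes of the key schedule; it is applied as the
    // final whitening XOR after the last AESD round below.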
2667 
2668     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2669     __ rev32(v1, __ T16B, v1);
2670     __ rev32(v2, __ T16B, v2);
2671     __ rev32(v3, __ T16B, v3);
2672     __ rev32(v4, __ T16B, v4);
2673     __ aesd(v0, v1);
2674     __ aesimc(v0, v0);
2675     __ aesd(v0, v2);
2676     __ aesimc(v0, v0);
2677     __ aesd(v0, v3);
2678     __ aesimc(v0, v0);
2679     __ aesd(v0, v4);
2680     __ aesimc(v0, v0);
2681 
2682     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2683     __ rev32(v1, __ T16B, v1);
2684     __ rev32(v2, __ T16B, v2);
2685     __ rev32(v3, __ T16B, v3);
2686     __ rev32(v4, __ T16B, v4);
2687     __ aesd(v0, v1);
2688     __ aesimc(v0, v0);
2689     __ aesd(v0, v2);
2690     __ aesimc(v0, v0);
2691     __ aesd(v0, v3);
2692     __ aesimc(v0, v0);
2693     __ aesd(v0, v4);
2694     __ aesimc(v0, v0);
2695 
2696     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2697     __ rev32(v1, __ T16B, v1);
2698     __ rev32(v2, __ T16B, v2);
2699 
2700     __ cmpw(keylen, 44);
2701     __ br(Assembler::EQ, L_doLast);
2702 
2703     __ aesd(v0, v1);
2704     __ aesimc(v0, v0);
2705     __ aesd(v0, v2);
2706     __ aesimc(v0, v0);
2707 
2708     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2709     __ rev32(v1, __ T16B, v1);
2710     __ rev32(v2, __ T16B, v2);
2711 
2712     __ cmpw(keylen, 52);
2713     __ br(Assembler::EQ, L_doLast);
2714 
2715     __ aesd(v0, v1);
2716     __ aesimc(v0, v0);
2717     __ aesd(v0, v2);
2718     __ aesimc(v0, v0);
2719 
2720     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2721     __ rev32(v1, __ T16B, v1);
2722     __ rev32(v2, __ T16B, v2);
2723 
2724     __ BIND(L_doLast);
2725 
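    // Final two inverse rounds: the last AESD has no AESIMC (the last round
    // omits InvMixColumns), and v5 supplies the final whitening XOR.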
2726     __ aesd(v0, v1);
2727     __ aesimc(v0, v0);
2728     __ aesd(v0, v2);
2729 
2730     __ eor(v0, __ T16B, v0, v5);
2731 
2732     __ st1(v0, __ T16B, to);
2733 
2734     __ mov(r0, 0);
2735 
2736     __ leave();
2737     __ ret(lr);
2738 
2739     return start;
2740   }
2741 
2742   // Arguments:
2743   //
2744   // Inputs:
2745   //   c_rarg0   - source byte array address
2746   //   c_rarg1   - destination byte array address
2747   //   c_rarg2   - K (key) in little endian int array
2748   //   c_rarg3   - r vector byte array address
2749   //   c_rarg4   - input length
2750   //
2751   // Output:
2752   //   r0        - input length
2753   //
2754   address generate_cipherBlockChaining_encryptAESCrypt() {
2755     assert(UseAES, "need AES instruction support");
2756     __ align(CodeEntryAlignment);
2757     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2758 
2759     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2760 
2761     const Register from        = c_rarg0;  // source array address
2762     const Register to          = c_rarg1;  // destination array address
2763     const Register key         = c_rarg2;  // key array address
2764     const Register rvec        = c_rarg3;  // r vector byte array address; initialized from the
2765                                            // init vector and left holding the last encrypted block
2766     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2767     const Register keylen      = rscratch1;
2768 
2769     address start = __ pc();
2770 
2771       __ enter();
2772 
2773       __ movw(rscratch2, len_reg);
2774 
2775       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2776 
2777       __ ld1(v0, __ T16B, rvec);
2778 
2779       __ cmpw(keylen, 52);
2780       __ br(Assembler::CC, L_loadkeys_44);
2781       __ br(Assembler::EQ, L_loadkeys_52);
2782 
2783       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2784       __ rev32(v17, __ T16B, v17);
2785       __ rev32(v18, __ T16B, v18);
2786     __ BIND(L_loadkeys_52);
2787       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2788       __ rev32(v19, __ T16B, v19);
2789       __ rev32(v20, __ T16B, v20);
2790     __ BIND(L_loadkeys_44);
2791       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2792       __ rev32(v21, __ T16B, v21);
2793       __ rev32(v22, __ T16B, v22);
2794       __ rev32(v23, __ T16B, v23);
2795       __ rev32(v24, __ T16B, v24);
2796       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2797       __ rev32(v25, __ T16B, v25);
2798       __ rev32(v26, __ T16B, v26);
2799       __ rev32(v27, __ T16B, v27);
2800       __ rev32(v28, __ T16B, v28);
2801       __ ld1(v29, v30, v31, __ T16B, key);
2802       __ rev32(v29, __ T16B, v29);
2803       __ rev32(v30, __ T16B, v30);
2804       __ rev32(v31, __ T16B, v31);
2805 
2806     __ BIND(L_aes_loop);
2807       __ ld1(v1, __ T16B, __ post(from, 16));
2808       __ eor(v0, __ T16B, v0, v1);
2809 
2810       __ br(Assembler::CC, L_rounds_44);
2811       __ br(Assembler::EQ, L_rounds_52);
2812 
2813       __ aese(v0, v17); __ aesmc(v0, v0);
2814       __ aese(v0, v18); __ aesmc(v0, v0);
2815     __ BIND(L_rounds_52);
2816       __ aese(v0, v19); __ aesmc(v0, v0);
2817       __ aese(v0, v20); __ aesmc(v0, v0);
2818     __ BIND(L_rounds_44);
2819       __ aese(v0, v21); __ aesmc(v0, v0);
2820       __ aese(v0, v22); __ aesmc(v0, v0);
2821       __ aese(v0, v23); __ aesmc(v0, v0);
2822       __ aese(v0, v24); __ aesmc(v0, v0);
2823       __ aese(v0, v25); __ aesmc(v0, v0);
2824       __ aese(v0, v26); __ aesmc(v0, v0);
2825       __ aese(v0, v27); __ aesmc(v0, v0);
2826       __ aese(v0, v28); __ aesmc(v0, v0);
2827       __ aese(v0, v29); __ aesmc(v0, v0);
2828       __ aese(v0, v30);
2829       __ eor(v0, __ T16B, v0, v31);
2830 
2831       __ st1(v0, __ T16B, __ post(to, 16));
2832 
2833       __ subw(len_reg, len_reg, 16);
2834       __ cbnzw(len_reg, L_aes_loop);
2835 
2836       __ st1(v0, __ T16B, rvec);
2837 
2838       __ mov(r0, rscratch2);
2839 
2840       __ leave();
2841       __ ret(lr);
2842 
2843       return start;
2844   }
2845 
2846   // Arguments:
2847   //
2848   // Inputs:
2849   //   c_rarg0   - source byte array address
2850   //   c_rarg1   - destination byte array address
2851   //   c_rarg2   - K (key) in little endian int array
2852   //   c_rarg3   - r vector byte array address
2853   //   c_rarg4   - input length
2854   //
2855   // Output:
2856   //   r0        - input length
2857   //
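       // For reference, CBC decryption is the inverse recurrence.  A minimal
       // C-style sketch (hypothetical helpers, not the generated code):
       //
       //   for (int i = 0; i < len; i += 16) {
       //     copy_block(saved, src + i);              // remember the ciphertext block
       //     aes_decrypt_block(block, src + i, key);  // block = AES_K^-1(src[i..i+15])
       //     xor_block(dst + i, block, rvec);         // dst[i..i+15] = block ^ rvec
       //     copy_block(rvec, saved);                 // chain on the previous ciphertext
       //   }
       //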
2858   address generate_cipherBlockChaining_decryptAESCrypt() {
2859     assert(UseAES, "need AES cryptographic extension support");
2860     __ align(CodeEntryAlignment);
2861     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2862 
2863     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2864 
2865     const Register from        = c_rarg0;  // source array address
2866     const Register to          = c_rarg1;  // destination array address
2867     const Register key         = c_rarg2;  // key array address
2868     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2869                                            // and left with the last input (ciphertext) block
2870     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2871     const Register keylen      = rscratch1;
2872 
2873     address start = __ pc();
2874 
2875       __ enter();
2876 
2877       __ movw(rscratch2, len_reg);
2878 
2879       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2880 
2881       __ ld1(v2, __ T16B, rvec);
2882 
2883       __ ld1(v31, __ T16B, __ post(key, 16));
2884       __ rev32(v31, __ T16B, v31);
2885 
2886       __ cmpw(keylen, 52);
2887       __ br(Assembler::CC, L_loadkeys_44);
2888       __ br(Assembler::EQ, L_loadkeys_52);
2889 
2890       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2891       __ rev32(v17, __ T16B, v17);
2892       __ rev32(v18, __ T16B, v18);
2893     __ BIND(L_loadkeys_52);
2894       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2895       __ rev32(v19, __ T16B, v19);
2896       __ rev32(v20, __ T16B, v20);
2897     __ BIND(L_loadkeys_44);
2898       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2899       __ rev32(v21, __ T16B, v21);
2900       __ rev32(v22, __ T16B, v22);
2901       __ rev32(v23, __ T16B, v23);
2902       __ rev32(v24, __ T16B, v24);
2903       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2904       __ rev32(v25, __ T16B, v25);
2905       __ rev32(v26, __ T16B, v26);
2906       __ rev32(v27, __ T16B, v27);
2907       __ rev32(v28, __ T16B, v28);
2908       __ ld1(v29, v30, __ T16B, key);
2909       __ rev32(v29, __ T16B, v29);
2910       __ rev32(v30, __ T16B, v30);
2911 
2912     __ BIND(L_aes_loop);
2913       __ ld1(v0, __ T16B, __ post(from, 16));
2914       __ orr(v1, __ T16B, v0, v0);
2915 
2916       __ br(Assembler::CC, L_rounds_44);
2917       __ br(Assembler::EQ, L_rounds_52);
2918 
2919       __ aesd(v0, v17); __ aesimc(v0, v0);
2920       __ aesd(v0, v18); __ aesimc(v0, v0);
2921     __ BIND(L_rounds_52);
2922       __ aesd(v0, v19); __ aesimc(v0, v0);
2923       __ aesd(v0, v20); __ aesimc(v0, v0);
2924     __ BIND(L_rounds_44);
2925       __ aesd(v0, v21); __ aesimc(v0, v0);
2926       __ aesd(v0, v22); __ aesimc(v0, v0);
2927       __ aesd(v0, v23); __ aesimc(v0, v0);
2928       __ aesd(v0, v24); __ aesimc(v0, v0);
2929       __ aesd(v0, v25); __ aesimc(v0, v0);
2930       __ aesd(v0, v26); __ aesimc(v0, v0);
2931       __ aesd(v0, v27); __ aesimc(v0, v0);
2932       __ aesd(v0, v28); __ aesimc(v0, v0);
2933       __ aesd(v0, v29); __ aesimc(v0, v0);
2934       __ aesd(v0, v30);
2935       __ eor(v0, __ T16B, v0, v31);
2936       __ eor(v0, __ T16B, v0, v2);
2937 
2938       __ st1(v0, __ T16B, __ post(to, 16));
2939       __ orr(v2, __ T16B, v1, v1);
2940 
2941       __ subw(len_reg, len_reg, 16);
2942       __ cbnzw(len_reg, L_aes_loop);
2943 
2944       __ st1(v2, __ T16B, rvec);
2945 
2946       __ mov(r0, rscratch2);
2947 
2948       __ leave();
2949       __ ret(lr);
2950 
2951     return start;
2952   }
2953 
2954   // Arguments:
2955   //
2956   // Inputs:
2957   //   c_rarg0   - byte[]  source+offset
2958   //   c_rarg1   - int[]   SHA.state
2959   //   c_rarg2   - int     offset
2960   //   c_rarg3   - int     limit
2961   //
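       // Note on structure: the AArch64 SHA instructions process four of SHA-1's
       // 80 rounds at a time, so the 20-iteration round loop below covers all 80
       // rounds.  The four words emitted at the 'keys' label are the standard
       // round constants K1..K4, and sha1c/sha1p/sha1m select the usual round
       // functions:
       //
       //   Ch(b,c,d)     = (b & c) ^ (~b & d)           // rounds  0-19 (sha1c, K1)
       //   Parity(b,c,d) = b ^ c ^ d                    // rounds 20-39, 60-79 (sha1p, K2/K4)
       //   Maj(b,c,d)    = (b & c) ^ (b & d) ^ (c & d)  // rounds 40-59 (sha1m, K3)
       //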
2962   address generate_sha1_implCompress(bool multi_block, const char *name) {
2963     __ align(CodeEntryAlignment);
2964     StubCodeMark mark(this, "StubRoutines", name);
2965     address start = __ pc();
2966 
2967     Register buf   = c_rarg0;
2968     Register state = c_rarg1;
2969     Register ofs   = c_rarg2;
2970     Register limit = c_rarg3;
2971 
2972     Label keys;
2973     Label sha1_loop;
2974 
2975     // load the keys into v0..v3
2976     __ adr(rscratch1, keys);
2977     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2978     // load the 5-word state into v6, v7
2979     __ ldrq(v6, Address(state, 0));
2980     __ ldrs(v7, Address(state, 16));
2981 
2982 
2983     __ BIND(sha1_loop);
2984     // load 64 bytes of data into v16..v19
2985     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2986     __ rev32(v16, __ T16B, v16);
2987     __ rev32(v17, __ T16B, v17);
2988     __ rev32(v18, __ T16B, v18);
2989     __ rev32(v19, __ T16B, v19);
2990 
2991     // do the sha1
2992     __ addv(v4, __ T4S, v16, v0);
2993     __ orr(v20, __ T16B, v6, v6);
2994 
2995     FloatRegister d0 = v16;
2996     FloatRegister d1 = v17;
2997     FloatRegister d2 = v18;
2998     FloatRegister d3 = v19;
2999 
3000     for (int round = 0; round < 20; round++) {
3001       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3002       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3003       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3004       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3005       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3006 
3007       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3008       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3009       __ sha1h(tmp2, __ T4S, v20);
3010       if (round < 5)
3011         __ sha1c(v20, __ T4S, tmp3, tmp4);
3012       else if (round < 10 || round >= 15)
3013         __ sha1p(v20, __ T4S, tmp3, tmp4);
3014       else
3015         __ sha1m(v20, __ T4S, tmp3, tmp4);
3016       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3017 
3018       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3019     }
3020 
3021     __ addv(v7, __ T2S, v7, v21);
3022     __ addv(v6, __ T4S, v6, v20);
3023 
3024     if (multi_block) {
3025       __ add(ofs, ofs, 64);
3026       __ cmp(ofs, limit);
3027       __ br(Assembler::LE, sha1_loop);
3028       __ mov(c_rarg0, ofs); // return ofs
3029     }
3030 
3031     __ strq(v6, Address(state, 0));
3032     __ strs(v7, Address(state, 16));
3033 
3034     __ ret(lr);
3035 
3036     __ bind(keys);
3037     __ emit_int32(0x5a827999);
3038     __ emit_int32(0x6ed9eba1);
3039     __ emit_int32(0x8f1bbcdc);
3040     __ emit_int32(0xca62c1d6);
3041 
3042     return start;
3043   }
3044 
3045 
3046   // Arguments:
3047   //
3048   // Inputs:
3049   //   c_rarg0   - byte[]  source+offset
3050   //   c_rarg1   - int[]   SHA.state
3051   //   c_rarg2   - int     offset
3052   //   c_rarg3   - int     limit
3053   //
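       // Note on structure: each iteration of the 16-round loop below feeds one
       // 4-word chunk of the message schedule plus four of the 64 round constants
       // (pre-loaded into v16..v31) to sha256h/sha256h2, so 16 iterations cover
       // all 64 SHA-256 rounds.  sha256su0/sha256su1 extend the message schedule
       // in place using the standard recurrence
       //
       //   W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
       //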
3054   address generate_sha256_implCompress(bool multi_block, const char *name) {
3055     static const uint32_t round_consts[64] = {
3056       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3057       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3058       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3059       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3060       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3061       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3062       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3063       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3064       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3065       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3066       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3067       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3068       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3069       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3070       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3071       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3072     };
3073     __ align(CodeEntryAlignment);
3074     StubCodeMark mark(this, "StubRoutines", name);
3075     address start = __ pc();
3076 
3077     Register buf   = c_rarg0;
3078     Register state = c_rarg1;
3079     Register ofs   = c_rarg2;
3080     Register limit = c_rarg3;
3081 
3082     Label sha1_loop;
3083 
3084     __ stpd(v8, v9, __ pre(sp, -32));
3085     __ stpd(v10, v11, Address(sp, 16));
3086 
3087 // dga == v0
3088 // dgb == v1
3089 // dg0 == v2
3090 // dg1 == v3
3091 // dg2 == v4
3092 // t0 == v6
3093 // t1 == v7
3094 
3095     // load 16 keys to v16..v31
3096     __ lea(rscratch1, ExternalAddress((address)round_consts));
3097     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3098     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3099     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3100     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3101 
3102     // load 8 words (256 bits) of state
3103     __ ldpq(v0, v1, state);
3104 
3105     __ BIND(sha1_loop);
3106     // load 64 bytes of data into v8..v11
3107     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3108     __ rev32(v8, __ T16B, v8);
3109     __ rev32(v9, __ T16B, v9);
3110     __ rev32(v10, __ T16B, v10);
3111     __ rev32(v11, __ T16B, v11);
3112 
3113     __ addv(v6, __ T4S, v8, v16);
3114     __ orr(v2, __ T16B, v0, v0);
3115     __ orr(v3, __ T16B, v1, v1);
3116 
3117     FloatRegister d0 = v8;
3118     FloatRegister d1 = v9;
3119     FloatRegister d2 = v10;
3120     FloatRegister d3 = v11;
3121 
3122 
3123     for (int round = 0; round < 16; round++) {
3124       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3125       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3126       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3127       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3128 
3129       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3130        __ orr(v4, __ T16B, v2, v2);
3131       if (round < 15)
3132         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3133       __ sha256h(v2, __ T4S, v3, tmp2);
3134       __ sha256h2(v3, __ T4S, v4, tmp2);
3135       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3136 
3137       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3138     }
3139 
3140     __ addv(v0, __ T4S, v0, v2);
3141     __ addv(v1, __ T4S, v1, v3);
3142 
3143     if (multi_block) {
3144       __ add(ofs, ofs, 64);
3145       __ cmp(ofs, limit);
3146       __ br(Assembler::LE, sha1_loop);
3147       __ mov(c_rarg0, ofs); // return ofs
3148     }
3149 
3150     __ ldpd(v10, v11, Address(sp, 16));
3151     __ ldpd(v8, v9, __ post(sp, 32));
3152 
3153     __ stpq(v0, v1, state);
3154 
3155     __ ret(lr);
3156 
3157     return start;
3158   }
3159 
3160 #ifndef BUILTIN_SIM
3161   // Safefetch stubs.
3162   void generate_safefetch(const char* name, int size, address* entry,
3163                           address* fault_pc, address* continuation_pc) {
3164     // safefetch signatures:
3165     //   int      SafeFetch32(int*      adr, int      errValue);
3166     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3167     //
3168     // arguments:
3169     //   c_rarg0 = adr
3170     //   c_rarg1 = errValue
3171     //
3172     // result:
3173     //   r0       = *adr or errValue
3174 
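         // For illustration (a hypothetical caller, not part of this stub):
         //
         //   int v = SafeFetch32((int*)addr, 0xdeadbeef);
         //   if (v == 0xdeadbeef) { /* addr was unreadable (or really held that value) */ }
         //
         // If the load at *fault_pc faults, the signal handler resumes execution at
         // *continuation_pc, where c_rarg1 still holds errValue.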
3175     StubCodeMark mark(this, "StubRoutines", name);
3176 
3177     // Entry point, pc or function descriptor.
3178     *entry = __ pc();
3179 
3180     // Load *adr into c_rarg1, may fault.
3181     *fault_pc = __ pc();
3182     switch (size) {
3183       case 4:
3184         // int32_t
3185         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3186         break;
3187       case 8:
3188         // int64_t
3189         __ ldr(c_rarg1, Address(c_rarg0, 0));
3190         break;
3191       default:
3192         ShouldNotReachHere();
3193     }
3194 
3195     // return errValue or *adr
3196     *continuation_pc = __ pc();
3197     __ mov(r0, c_rarg1);
3198     __ ret(lr);
3199   }
3200 #endif
3201 
3202   /**
3203    *  Arguments:
3204    *
3205    * Inputs:
3206    *   c_rarg0   - int crc
3207    *   c_rarg1   - byte* buf
3208    *   c_rarg2   - int length
3209    *
3210    * Output:
3211    *       r0    - int crc result
3212    */
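       // For reference, this computes the standard zlib/java.util.zip CRC-32
       // (reflected polynomial 0xEDB88320).  A byte-at-a-time reference model
       // (not the table-parallel code kernel_crc32 emits; table[] is the usual
       // 256-entry lookup table):
       //
       //   crc = ~crc;
       //   while (length--)
       //     crc = (crc >> 8) ^ table[(crc ^ *buf++) & 0xff];
       //   return ~crc;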
3213   address generate_updateBytesCRC32() {
3214     assert(UseCRC32Intrinsics, "what are we doing here?");
3215 
3216     __ align(CodeEntryAlignment);
3217     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3218 
3219     address start = __ pc();
3220 
3221     const Register crc   = c_rarg0;  // crc
3222     const Register buf   = c_rarg1;  // source java byte array address
3223     const Register len   = c_rarg2;  // length
3224     const Register table0 = c_rarg3; // crc_table address
3225     const Register table1 = c_rarg4;
3226     const Register table2 = c_rarg5;
3227     const Register table3 = c_rarg6;
3228     const Register tmp3 = c_rarg7;
3229 
3230     BLOCK_COMMENT("Entry:");
3231     __ enter(); // required for proper stackwalking of RuntimeStub frame
3232 
3233     __ kernel_crc32(crc, buf, len,
3234               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3235 
3236     __ leave(); // required for proper stackwalking of RuntimeStub frame
3237     __ ret(lr);
3238 
3239     return start;
3240   }
3241 
3242   /**
3243    *  Arguments:
3244    *
3245    * Inputs:
3246    *   c_rarg0   - int crc
3247    *   c_rarg1   - byte* buf
3248    *   c_rarg2   - int length
3249    *   c_rarg3   - int* table
3250    *
3251    * Output:
3252    *       r0   - int crc result
3253    */
3254   address generate_updateBytesCRC32C() {
3255     assert(UseCRC32CIntrinsics, "what are we doing here?");
3256 
3257     __ align(CodeEntryAlignment);
3258     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3259 
3260     address start = __ pc();
3261 
3262     const Register crc   = c_rarg0;  // crc
3263     const Register buf   = c_rarg1;  // source java byte array address
3264     const Register len   = c_rarg2;  // length
3265     const Register table0 = c_rarg3; // crc_table address
3266     const Register table1 = c_rarg4;
3267     const Register table2 = c_rarg5;
3268     const Register table3 = c_rarg6;
3269     const Register tmp3 = c_rarg7;
3270 
3271     BLOCK_COMMENT("Entry:");
3272     __ enter(); // required for proper stackwalking of RuntimeStub frame
3273 
3274     __ kernel_crc32c(crc, buf, len,
3275               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3276 
3277     __ leave(); // required for proper stackwalking of RuntimeStub frame
3278     __ ret(lr);
3279 
3280     return start;
3281   }
3282 
3283   /**
3284    *  Arguments:
3285    *
3286    *  Inputs:
3287    *   c_rarg0   - int   adler
3288    *   c_rarg1   - byte* buff
3289    *   c_rarg2   - int   len
3290    *
3291    * Output:
3292    *   c_rarg0   - int adler result
3293    */
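       // For reference, Adler-32 maintains two sums modulo BASE = 65521; a scalar
       // reference model (not the generated code) is:
       //
       //   s1 = adler & 0xffff;  s2 = (adler >> 16) & 0xffff;
       //   for (int i = 0; i < len; i++) {
       //     s1 = (s1 + buff[i]) % 65521;
       //     s2 = (s2 + s1)      % 65521;
       //   }
       //   return (s2 << 16) | s1;
       //
       // The stub below defers the expensive modulo by processing at most NMAX
       // bytes between reductions.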
3294   address generate_updateBytesAdler32() {
3295     __ align(CodeEntryAlignment);
3296     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3297     address start = __ pc();
3298 
3299     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3300 
3301     // Aliases
3302     Register adler  = c_rarg0;
3303     Register s1     = c_rarg0;
3304     Register s2     = c_rarg3;
3305     Register buff   = c_rarg1;
3306     Register len    = c_rarg2;
3307     Register nmax  = r4;
3308     Register base = r5;
3309     Register count = r6;
3310     Register temp0 = rscratch1;
3311     Register temp1 = rscratch2;
3312     Register temp2 = r7;
3313 
3314     // Max number of bytes we can process before having to take the mod
3315     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3316     unsigned long BASE = 0xfff1;
3317     unsigned long NMAX = 0x15B0;
3318 
3319     __ mov(base, BASE);
3320     __ mov(nmax, NMAX);
3321 
3322     // s1 is initialized to the lower 16 bits of adler
3323     // s2 is initialized to the upper 16 bits of adler
3324     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3325     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3326 
3327     // The pipelined loop needs at least 16 elements for one iteration.
3328     // It does check this itself, but it is more efficient to skip straight to the cleanup loop.
3329     __ cmp(len, 16);
3330     __ br(Assembler::HS, L_nmax);
3331     __ cbz(len, L_combine);
3332 
3333     __ bind(L_simple_by1_loop);
3334     __ ldrb(temp0, Address(__ post(buff, 1)));
3335     __ add(s1, s1, temp0);
3336     __ add(s2, s2, s1);
3337     __ subs(len, len, 1);
3338     __ br(Assembler::HI, L_simple_by1_loop);
3339 
3340     // s1 = s1 % BASE
3341     __ subs(temp0, s1, base);
3342     __ csel(s1, temp0, s1, Assembler::HS);
3343 
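         // The reductions below avoid a division: since 2^16 == 15 (mod BASE),
         //   x mod BASE  can be derived from  (x >> 16) * 15 + (x & 0xffff)
         // (applied once or twice depending on the magnitude of x), followed by a
         // single conditional subtract of BASE.  The multiply by 15 is itself done
         // as (t << 4) - t.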
3344     // s2 = s2 % BASE
3345     __ lsr(temp0, s2, 16);
3346     __ lsl(temp1, temp0, 4);
3347     __ sub(temp1, temp1, temp0);
3348     __ add(s2, temp1, s2, ext::uxth);
3349 
3350     __ subs(temp0, s2, base);
3351     __ csel(s2, temp0, s2, Assembler::HS);
3352 
3353     __ b(L_combine);
3354 
3355     __ bind(L_nmax);
3356     __ subs(len, len, nmax);
3357     __ sub(count, nmax, 16);
3358     __ br(Assembler::LO, L_by16);
3359 
3360     __ bind(L_nmax_loop);
3361 
3362     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3363 
3364     __ add(s1, s1, temp0, ext::uxtb);
3365     __ ubfx(temp2, temp0, 8, 8);
3366     __ add(s2, s2, s1);
3367     __ add(s1, s1, temp2);
3368     __ ubfx(temp2, temp0, 16, 8);
3369     __ add(s2, s2, s1);
3370     __ add(s1, s1, temp2);
3371     __ ubfx(temp2, temp0, 24, 8);
3372     __ add(s2, s2, s1);
3373     __ add(s1, s1, temp2);
3374     __ ubfx(temp2, temp0, 32, 8);
3375     __ add(s2, s2, s1);
3376     __ add(s1, s1, temp2);
3377     __ ubfx(temp2, temp0, 40, 8);
3378     __ add(s2, s2, s1);
3379     __ add(s1, s1, temp2);
3380     __ ubfx(temp2, temp0, 48, 8);
3381     __ add(s2, s2, s1);
3382     __ add(s1, s1, temp2);
3383     __ add(s2, s2, s1);
3384     __ add(s1, s1, temp0, Assembler::LSR, 56);
3385     __ add(s2, s2, s1);
3386 
3387     __ add(s1, s1, temp1, ext::uxtb);
3388     __ ubfx(temp2, temp1, 8, 8);
3389     __ add(s2, s2, s1);
3390     __ add(s1, s1, temp2);
3391     __ ubfx(temp2, temp1, 16, 8);
3392     __ add(s2, s2, s1);
3393     __ add(s1, s1, temp2);
3394     __ ubfx(temp2, temp1, 24, 8);
3395     __ add(s2, s2, s1);
3396     __ add(s1, s1, temp2);
3397     __ ubfx(temp2, temp1, 32, 8);
3398     __ add(s2, s2, s1);
3399     __ add(s1, s1, temp2);
3400     __ ubfx(temp2, temp1, 40, 8);
3401     __ add(s2, s2, s1);
3402     __ add(s1, s1, temp2);
3403     __ ubfx(temp2, temp1, 48, 8);
3404     __ add(s2, s2, s1);
3405     __ add(s1, s1, temp2);
3406     __ add(s2, s2, s1);
3407     __ add(s1, s1, temp1, Assembler::LSR, 56);
3408     __ add(s2, s2, s1);
3409 
3410     __ subs(count, count, 16);
3411     __ br(Assembler::HS, L_nmax_loop);
3412 
3413     // s1 = s1 % BASE
3414     __ lsr(temp0, s1, 16);
3415     __ lsl(temp1, temp0, 4);
3416     __ sub(temp1, temp1, temp0);
3417     __ add(temp1, temp1, s1, ext::uxth);
3418 
3419     __ lsr(temp0, temp1, 16);
3420     __ lsl(s1, temp0, 4);
3421     __ sub(s1, s1, temp0);
3422     __ add(s1, s1, temp1, ext::uxth);
3423 
3424     __ subs(temp0, s1, base);
3425     __ csel(s1, temp0, s1, Assembler::HS);
3426 
3427     // s2 = s2 % BASE
3428     __ lsr(temp0, s2, 16);
3429     __ lsl(temp1, temp0, 4);
3430     __ sub(temp1, temp1, temp0);
3431     __ add(temp1, temp1, s2, ext::uxth);
3432 
3433     __ lsr(temp0, temp1, 16);
3434     __ lsl(s2, temp0, 4);
3435     __ sub(s2, s2, temp0);
3436     __ add(s2, s2, temp1, ext::uxth);
3437 
3438     __ subs(temp0, s2, base);
3439     __ csel(s2, temp0, s2, Assembler::HS);
3440 
3441     __ subs(len, len, nmax);
3442     __ sub(count, nmax, 16);
3443     __ br(Assembler::HS, L_nmax_loop);
3444 
3445     __ bind(L_by16);
3446     __ adds(len, len, count);
3447     __ br(Assembler::LO, L_by1);
3448 
3449     __ bind(L_by16_loop);
3450 
3451     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3452 
3453     __ add(s1, s1, temp0, ext::uxtb);
3454     __ ubfx(temp2, temp0, 8, 8);
3455     __ add(s2, s2, s1);
3456     __ add(s1, s1, temp2);
3457     __ ubfx(temp2, temp0, 16, 8);
3458     __ add(s2, s2, s1);
3459     __ add(s1, s1, temp2);
3460     __ ubfx(temp2, temp0, 24, 8);
3461     __ add(s2, s2, s1);
3462     __ add(s1, s1, temp2);
3463     __ ubfx(temp2, temp0, 32, 8);
3464     __ add(s2, s2, s1);
3465     __ add(s1, s1, temp2);
3466     __ ubfx(temp2, temp0, 40, 8);
3467     __ add(s2, s2, s1);
3468     __ add(s1, s1, temp2);
3469     __ ubfx(temp2, temp0, 48, 8);
3470     __ add(s2, s2, s1);
3471     __ add(s1, s1, temp2);
3472     __ add(s2, s2, s1);
3473     __ add(s1, s1, temp0, Assembler::LSR, 56);
3474     __ add(s2, s2, s1);
3475 
3476     __ add(s1, s1, temp1, ext::uxtb);
3477     __ ubfx(temp2, temp1, 8, 8);
3478     __ add(s2, s2, s1);
3479     __ add(s1, s1, temp2);
3480     __ ubfx(temp2, temp1, 16, 8);
3481     __ add(s2, s2, s1);
3482     __ add(s1, s1, temp2);
3483     __ ubfx(temp2, temp1, 24, 8);
3484     __ add(s2, s2, s1);
3485     __ add(s1, s1, temp2);
3486     __ ubfx(temp2, temp1, 32, 8);
3487     __ add(s2, s2, s1);
3488     __ add(s1, s1, temp2);
3489     __ ubfx(temp2, temp1, 40, 8);
3490     __ add(s2, s2, s1);
3491     __ add(s1, s1, temp2);
3492     __ ubfx(temp2, temp1, 48, 8);
3493     __ add(s2, s2, s1);
3494     __ add(s1, s1, temp2);
3495     __ add(s2, s2, s1);
3496     __ add(s1, s1, temp1, Assembler::LSR, 56);
3497     __ add(s2, s2, s1);
3498 
3499     __ subs(len, len, 16);
3500     __ br(Assembler::HS, L_by16_loop);
3501 
3502     __ bind(L_by1);
3503     __ adds(len, len, 15);
3504     __ br(Assembler::LO, L_do_mod);
3505 
3506     __ bind(L_by1_loop);
3507     __ ldrb(temp0, Address(__ post(buff, 1)));
3508     __ add(s1, temp0, s1);
3509     __ add(s2, s2, s1);
3510     __ subs(len, len, 1);
3511     __ br(Assembler::HS, L_by1_loop);
3512 
3513     __ bind(L_do_mod);
3514     // s1 = s1 % BASE
3515     __ lsr(temp0, s1, 16);
3516     __ lsl(temp1, temp0, 4);
3517     __ sub(temp1, temp1, temp0);
3518     __ add(temp1, temp1, s1, ext::uxth);
3519 
3520     __ lsr(temp0, temp1, 16);
3521     __ lsl(s1, temp0, 4);
3522     __ sub(s1, s1, temp0);
3523     __ add(s1, s1, temp1, ext::uxth);
3524 
3525     __ subs(temp0, s1, base);
3526     __ csel(s1, temp0, s1, Assembler::HS);
3527 
3528     // s2 = s2 % BASE
3529     __ lsr(temp0, s2, 16);
3530     __ lsl(temp1, temp0, 4);
3531     __ sub(temp1, temp1, temp0);
3532     __ add(temp1, temp1, s2, ext::uxth);
3533 
3534     __ lsr(temp0, temp1, 16);
3535     __ lsl(s2, temp0, 4);
3536     __ sub(s2, s2, temp0);
3537     __ add(s2, s2, temp1, ext::uxth);
3538 
3539     __ subs(temp0, s2, base);
3540     __ csel(s2, temp0, s2, Assembler::HS);
3541 
3542     // Combine lower bits and higher bits
3543     __ bind(L_combine);
3544     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3545 
3546     __ ret(lr);
3547 
3548     return start;
3549   }
3550 
3551   /**
3552    *  Arguments:
3553    *
3554    *  Input:
3555    *    c_rarg0   - x address
3556    *    c_rarg1   - x length
3557    *    c_rarg2   - y address
3558    *    c_rarg3   - y length
3559    *    c_rarg4   - z address
3560    *    c_rarg5   - z length
3561    */
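       // For reference, multiply_to_len computes the full product z = x * y over
       // 32-bit limbs.  Schematically (limb ordering, bounds and z's
       // zero-initialization simplified; not the generated code):
       //
       //   for (int i = 0; i < xlen; i++) {
       //     uint64_t carry = 0;
       //     for (int j = 0; j < ylen; j++) {
       //       uint64_t p = (uint64_t)x[i] * y[j] + z[i + j] + carry;
       //       z[i + j] = (uint32_t)p;
       //       carry    = p >> 32;
       //     }
       //     z[i + ylen] = (uint32_t)carry;
       //   }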
3562   address generate_multiplyToLen() {
3563     __ align(CodeEntryAlignment);
3564     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3565 
3566     address start = __ pc();
3567     const Register x     = r0;
3568     const Register xlen  = r1;
3569     const Register y     = r2;
3570     const Register ylen  = r3;
3571     const Register z     = r4;
3572     const Register zlen  = r5;
3573 
3574     const Register tmp1  = r10;
3575     const Register tmp2  = r11;
3576     const Register tmp3  = r12;
3577     const Register tmp4  = r13;
3578     const Register tmp5  = r14;
3579     const Register tmp6  = r15;
3580     const Register tmp7  = r16;
3581 
3582     BLOCK_COMMENT("Entry:");
3583     __ enter(); // required for proper stackwalking of RuntimeStub frame
3584     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3585     __ leave(); // required for proper stackwalking of RuntimeStub frame
3586     __ ret(lr);
3587 
3588     return start;
3589   }
3590 
3591   address generate_squareToLen() {
3592     // The squareToLen algorithm for sizes 1..127 described in the Java code is
3593     // faster than multiply_to_len on some CPUs and slower on others, but
3594     // multiply_to_len shows slightly better overall results.
3595     __ align(CodeEntryAlignment);
3596     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3597     address start = __ pc();
3598 
3599     const Register x     = r0;
3600     const Register xlen  = r1;
3601     const Register z     = r2;
3602     const Register zlen  = r3;
3603     const Register y     = r4; // == x
3604     const Register ylen  = r5; // == xlen
3605 
3606     const Register tmp1  = r10;
3607     const Register tmp2  = r11;
3608     const Register tmp3  = r12;
3609     const Register tmp4  = r13;
3610     const Register tmp5  = r14;
3611     const Register tmp6  = r15;
3612     const Register tmp7  = r16;
3613 
3614     RegSet spilled_regs = RegSet::of(y, ylen);
3615     BLOCK_COMMENT("Entry:");
3616     __ enter();
3617     __ push(spilled_regs, sp);
3618     __ mov(y, x);
3619     __ mov(ylen, xlen);
3620     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3621     __ pop(spilled_regs, sp);
3622     __ leave();
3623     __ ret(lr);
3624     return start;
3625   }
3626 
3627   address generate_mulAdd() {
3628     __ align(CodeEntryAlignment);
3629     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3630 
3631     address start = __ pc();
3632 
3633     const Register out     = r0;
3634     const Register in      = r1;
3635     const Register offset  = r2;
3636     const Register len     = r3;
3637     const Register k       = r4;
3638 
3639     BLOCK_COMMENT("Entry:");
3640     __ enter();
3641     __ mul_add(out, in, offset, len, k);
3642     __ leave();
3643     __ ret(lr);
3644 
3645     return start;
3646   }
3647 
3648   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3649                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3650                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3651     // Karatsuba multiplication performs a 128*128 -> 256-bit
3652     // multiplication in three 128-bit multiplications and a few
3653     // additions.
3654     //
3655     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3656     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3657     //
3658     // Inputs:
3659     //
3660     // A0 in a.d[0]     (subkey)
3661     // A1 in a.d[1]
3662     // (A1+A0) in a1_xor_a0.d[0]
3663     //
3664     // B0 in b.d[0]     (state)
3665     // B1 in b.d[1]
3666 
3667     __ ext(tmp1, __ T16B, b, b, 0x08);
3668     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3669     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3670     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3671     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3672 
3673     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3674     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3675     __ eor(tmp2, __ T16B, tmp2, tmp4);
3676     __ eor(tmp2, __ T16B, tmp2, tmp3);
3677 
3678     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3679     __ ins(result_hi, __ D, tmp2, 0, 1);
3680     __ ins(result_lo, __ D, tmp2, 1, 0);
3681   }
3682 
3683   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3684                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3685     const FloatRegister t0 = result;
3686 
3687     // The GCM field polynomial f is z^128 + p(z), where p =
3688     // z^7+z^2+z+1.
3689     //
3690     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3691     //
3692     // so, given that the product we're reducing is
3693     //    a == lo + hi * z^128
3694     // substituting,
3695     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3696     //
3697     // we reduce by multiplying hi by p(z) and subtracting the result
3698     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3699     // bits we can do this with two 64-bit multiplications, lo*p and
3700     // hi*p.
3701 
3702     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3703     __ ext(t1, __ T16B, t0, z, 8);
3704     __ eor(hi, __ T16B, hi, t1);
3705     __ ext(t1, __ T16B, z, t0, 8);
3706     __ eor(lo, __ T16B, lo, t1);
3707     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3708     __ eor(result, __ T16B, lo, t0);
3709   }
3710 
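       // has_negatives(ary1 = r1, len = r2) -> result = r0
       // Returns 1 iff any byte in the array has its sign bit set.  The reference
       // model (not the generated code) is simply:
       //
       //   bool has_negatives = false;
       //   for (int i = 0; i < len; i++)
       //     has_negatives |= (ary1[i] & 0x80) != 0;
       //
       // The stub checks 8, 16 or 64 bytes at a time by OR-ing words together and
       // testing the result against UPPER_BIT_MASK (0x8080808080808080).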
3711   address generate_has_negatives(address &has_negatives_long) {
3712     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3713     const int large_loop_size = 64;
3714     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3715     int dcache_line = VM_Version::dcache_line_size();
3716 
3717     Register ary1 = r1, len = r2, result = r0;
3718 
3719     __ align(CodeEntryAlignment);
3720     address entry = __ pc();
3721 
3722     __ enter();
3723 
3724   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3725         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3726 
3727   __ cmp(len, 15);
3728   __ br(Assembler::GT, LEN_OVER_15);
3729   // The only case in which execution falls into this code is when the pointer is
3730   // near the end of a memory page and we have to avoid reading the next page.
3731   __ add(ary1, ary1, len);
3732   __ subs(len, len, 8);
3733   __ br(Assembler::GT, LEN_OVER_8);
3734   __ ldr(rscratch2, Address(ary1, -8));
3735   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3736   __ lsrv(rscratch2, rscratch2, rscratch1);
3737   __ tst(rscratch2, UPPER_BIT_MASK);
3738   __ cset(result, Assembler::NE);
3739   __ leave();
3740   __ ret(lr);
3741   __ bind(LEN_OVER_8);
3742   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3743   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3744   __ tst(rscratch2, UPPER_BIT_MASK);
3745   __ br(Assembler::NE, RET_TRUE_NO_POP);
3746   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3747   __ lsrv(rscratch1, rscratch1, rscratch2);
3748   __ tst(rscratch1, UPPER_BIT_MASK);
3749   __ cset(result, Assembler::NE);
3750   __ leave();
3751   __ ret(lr);
3752 
3753   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3754   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3755 
3756   has_negatives_long = __ pc(); // 2nd entry point
3757 
3758   __ enter();
3759 
3760   __ bind(LEN_OVER_15);
3761     __ push(spilled_regs, sp);
3762     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3763     __ cbz(rscratch2, ALIGNED);
3764     __ ldp(tmp6, tmp1, Address(ary1));
3765     __ mov(tmp5, 16);
3766     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3767     __ add(ary1, ary1, rscratch1);
3768     __ sub(len, len, rscratch1);
3769     __ orr(tmp6, tmp6, tmp1);
3770     __ tst(tmp6, UPPER_BIT_MASK);
3771     __ br(Assembler::NE, RET_TRUE);
3772 
3773   __ bind(ALIGNED);
3774     __ cmp(len, large_loop_size);
3775     __ br(Assembler::LT, CHECK_16);
3776     // Perform a 16-byte load in the pre-loop as an early return to handle the case
3777     // where an initially aligned large array has negative values in its first bytes,
3778     // since otherwise LARGE_LOOP would do 4 reads instead of 1 in the worst case,
3779     // which is slower. Cases with negative bytes further ahead are not affected
3780     // much; in fact, they get faster due to the early loads and the fewer
3781     // instructions and branches in LARGE_LOOP.
3782     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3783     __ sub(len, len, 16);
3784     __ orr(tmp6, tmp6, tmp1);
3785     __ tst(tmp6, UPPER_BIT_MASK);
3786     __ br(Assembler::NE, RET_TRUE);
3787     __ cmp(len, large_loop_size);
3788     __ br(Assembler::LT, CHECK_16);
3789 
3790     if (SoftwarePrefetchHintDistance >= 0
3791         && SoftwarePrefetchHintDistance >= dcache_line) {
3792       // initial prefetch
3793       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3794     }
3795   __ bind(LARGE_LOOP);
3796     if (SoftwarePrefetchHintDistance >= 0) {
3797       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3798     }
3799     // Issue the load instructions first, since that can save a few CPU/memory cycles.
3800     // Also, instead of 4 triples of "orr(...); addr(...); cbnz(...);" (one per ldp),
3801     // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves
3802     // instructions and has fewer branches; the drawback is that this disables the
3803     // early return, so all 64 bytes are loaded and checked every time.
3804     __ ldp(tmp2, tmp3, Address(ary1));
3805     __ ldp(tmp4, tmp5, Address(ary1, 16));
3806     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3807     __ ldp(tmp6, tmp1, Address(ary1, 48));
3808     __ add(ary1, ary1, large_loop_size);
3809     __ sub(len, len, large_loop_size);
3810     __ orr(tmp2, tmp2, tmp3);
3811     __ orr(tmp4, tmp4, tmp5);
3812     __ orr(rscratch1, rscratch1, rscratch2);
3813     __ orr(tmp6, tmp6, tmp1);
3814     __ orr(tmp2, tmp2, tmp4);
3815     __ orr(rscratch1, rscratch1, tmp6);
3816     __ orr(tmp2, tmp2, rscratch1);
3817     __ tst(tmp2, UPPER_BIT_MASK);
3818     __ br(Assembler::NE, RET_TRUE);
3819     __ cmp(len, large_loop_size);
3820     __ br(Assembler::GE, LARGE_LOOP);
3821 
3822   __ bind(CHECK_16); // small 16-byte load pre-loop
3823     __ cmp(len, 16);
3824     __ br(Assembler::LT, POST_LOOP16);
3825 
3826   __ bind(LOOP16); // small 16-byte load loop
3827     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3828     __ sub(len, len, 16);
3829     __ orr(tmp2, tmp2, tmp3);
3830     __ tst(tmp2, UPPER_BIT_MASK);
3831     __ br(Assembler::NE, RET_TRUE);
3832     __ cmp(len, 16);
3833     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3834 
3835   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3836     __ cmp(len, 8);
3837     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3838     __ ldr(tmp3, Address(__ post(ary1, 8)));
3839     __ sub(len, len, 8);
3840     __ tst(tmp3, UPPER_BIT_MASK);
3841     __ br(Assembler::NE, RET_TRUE);
3842 
3843   __ bind(POST_LOOP16_LOAD_TAIL);
3844     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3845     __ ldr(tmp1, Address(ary1));
3846     __ mov(tmp2, 64);
3847     __ sub(tmp4, tmp2, len, __ LSL, 3);
3848     __ lslv(tmp1, tmp1, tmp4);
3849     __ tst(tmp1, UPPER_BIT_MASK);
3850     __ br(Assembler::NE, RET_TRUE);
3851     // Fallthrough
3852 
3853   __ bind(RET_FALSE);
3854     __ pop(spilled_regs, sp);
3855     __ leave();
3856     __ mov(result, zr);
3857     __ ret(lr);
3858 
3859   __ bind(RET_TRUE);
3860     __ pop(spilled_regs, sp);
3861   __ bind(RET_TRUE_NO_POP);
3862     __ leave();
3863     __ mov(result, 1);
3864     __ ret(lr);
3865 
3866   __ bind(DONE);
3867     __ pop(spilled_regs, sp);
3868     __ leave();
3869     __ ret(lr);
3870     return entry;
3871   }
3872 
3873   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3874         bool usePrefetch, Label &NOT_EQUAL) {
3875     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3876         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3877         tmp7 = r12, tmp8 = r13;
3878     Label LOOP;
3879 
3880     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3881     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3882     __ bind(LOOP);
3883     if (usePrefetch) {
3884       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3885       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3886     }
3887     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3888     __ eor(tmp1, tmp1, tmp2);
3889     __ eor(tmp3, tmp3, tmp4);
3890     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3891     __ orr(tmp1, tmp1, tmp3);
3892     __ cbnz(tmp1, NOT_EQUAL);
3893     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3894     __ eor(tmp5, tmp5, tmp6);
3895     __ eor(tmp7, tmp7, tmp8);
3896     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3897     __ orr(tmp5, tmp5, tmp7);
3898     __ cbnz(tmp5, NOT_EQUAL);
3899     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3900     __ eor(tmp1, tmp1, tmp2);
3901     __ eor(tmp3, tmp3, tmp4);
3902     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3903     __ orr(tmp1, tmp1, tmp3);
3904     __ cbnz(tmp1, NOT_EQUAL);
3905     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3906     __ eor(tmp5, tmp5, tmp6);
3907     __ sub(cnt1, cnt1, 8 * wordSize);
3908     __ eor(tmp7, tmp7, tmp8);
3909     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3910     __ cmp(cnt1, loopThreshold);
3911     __ orr(tmp5, tmp5, tmp7);
3912     __ cbnz(tmp5, NOT_EQUAL);
3913     __ br(__ GE, LOOP);
3914     // post-loop
3915     __ eor(tmp1, tmp1, tmp2);
3916     __ eor(tmp3, tmp3, tmp4);
3917     __ orr(tmp1, tmp1, tmp3);
3918     __ sub(cnt1, cnt1, 2 * wordSize);
3919     __ cbnz(tmp1, NOT_EQUAL);
3920   }
3921 
3922   void generate_large_array_equals_loop_simd(int loopThreshold,
3923         bool usePrefetch, Label &NOT_EQUAL) {
3924     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3925         tmp2 = rscratch2;
3926     Label LOOP;
3927 
3928     __ bind(LOOP);
3929     if (usePrefetch) {
3930       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3931       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3932     }
3933     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3934     __ sub(cnt1, cnt1, 8 * wordSize);
3935     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3936     __ cmp(cnt1, loopThreshold);
3937     __ eor(v0, __ T16B, v0, v4);
3938     __ eor(v1, __ T16B, v1, v5);
3939     __ eor(v2, __ T16B, v2, v6);
3940     __ eor(v3, __ T16B, v3, v7);
3941     __ orr(v0, __ T16B, v0, v1);
3942     __ orr(v1, __ T16B, v2, v3);
3943     __ orr(v0, __ T16B, v0, v1);
3944     __ umov(tmp1, v0, __ D, 0);
3945     __ umov(tmp2, v0, __ D, 1);
3946     __ orr(tmp1, tmp1, tmp2);
3947     __ cbnz(tmp1, NOT_EQUAL);
3948     __ br(__ GE, LOOP);
3949   }
3950 
3951   // a1 = r1 - array1 address
3952   // a2 = r2 - array2 address
3953   // result = r0 - return value. Already contains "false"
3954   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
3955   // r3-r5 are reserved temporary registers
3956   address generate_large_array_equals() {
3957     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3958     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3959         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3960         tmp7 = r12, tmp8 = r13;
3961     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3962         SMALL_LOOP, POST_LOOP;
3963     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3964     // threshold chosen so that at least 32 prefetched bytes are actually used
3965     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3966     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3967     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3968     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3969         tmp5, tmp6, tmp7, tmp8);
3970 
3971     __ align(CodeEntryAlignment);
3972     address entry = __ pc();
3973     __ enter();
3974     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
3975     // also advance pointers to use post-increment instead of pre-increment
3976     __ add(a1, a1, wordSize);
3977     __ add(a2, a2, wordSize);
3978     if (AvoidUnalignedAccesses) {
3979       // Both implementations (SIMD and non-SIMD) use relatively large load
3980       // instructions (ld1/ldp), which carry a large penalty (up to 2x execution
3981       // time) on some CPUs when the address is not at least 16-byte aligned.
3982       // Arrays are currently 8-byte aligned, so if needed we can do one extra
3983       // 8-byte load on the first address to make it 16-byte aligned.
3984       Label ALIGNED16;
3985       __ tbz(a1, 3, ALIGNED16);
3986       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3987       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3988       __ sub(cnt1, cnt1, wordSize);
3989       __ eor(tmp1, tmp1, tmp2);
3990       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3991       __ bind(ALIGNED16);
3992     }
3993     if (UseSIMDForArrayEquals) {
3994       if (SoftwarePrefetchHintDistance >= 0) {
3995         __ cmp(cnt1, prefetchLoopThreshold);
3996         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3997         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3998             /* prfm = */ true, NOT_EQUAL);
3999         __ cmp(cnt1, nonPrefetchLoopThreshold);
4000         __ br(__ LT, TAIL);
4001       }
4002       __ bind(NO_PREFETCH_LARGE_LOOP);
4003       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
4004           /* prfm = */ false, NOT_EQUAL);
4005     } else {
4006       __ push(spilled_regs, sp);
4007       if (SoftwarePrefetchHintDistance >= 0) {
4008         __ cmp(cnt1, prefetchLoopThreshold);
4009         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4010         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
4011             /* prfm = */ true, NOT_EQUAL);
4012         __ cmp(cnt1, nonPrefetchLoopThreshold);
4013         __ br(__ LT, TAIL);
4014       }
4015       __ bind(NO_PREFETCH_LARGE_LOOP);
4016       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
4017           /* prfm = */ false, NOT_EQUAL);
4018     }
4019     __ bind(TAIL);
4020       __ cbz(cnt1, EQUAL);
4021       __ subs(cnt1, cnt1, wordSize);
4022       __ br(__ LE, POST_LOOP);
4023     __ bind(SMALL_LOOP);
4024       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4025       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4026       __ subs(cnt1, cnt1, wordSize);
4027       __ eor(tmp1, tmp1, tmp2);
4028       __ cbnz(tmp1, NOT_EQUAL);
4029       __ br(__ GT, SMALL_LOOP);
4030     __ bind(POST_LOOP);
4031       __ ldr(tmp1, Address(a1, cnt1));
4032       __ ldr(tmp2, Address(a2, cnt1));
4033       __ eor(tmp1, tmp1, tmp2);
4034       __ cbnz(tmp1, NOT_EQUAL);
4035     __ bind(EQUAL);
4036       __ mov(result, true);
4037     __ bind(NOT_EQUAL);
4038       if (!UseSIMDForArrayEquals) {
4039         __ pop(spilled_regs, sp);
4040       }
4041     __ bind(NOT_EQUAL_NO_POP);
4042     __ leave();
4043     __ ret(lr);
4044     return entry;
4045   }
4046 
4047 
4048   /**
4049    *  Arguments:
4050    *
4051    *  Input:
4052    *  c_rarg0   - current state address
4053    *  c_rarg1   - H key address
4054    *  c_rarg2   - data address
4055    *  c_rarg3   - number of blocks
4056    *
4057    *  Output:
4058    *  Updated state at c_rarg0
4059    */
4060   address generate_ghash_processBlocks() {
4061     // Bafflingly, GCM uses little-endian for the byte order, but
4062     // big-endian for the bit order.  For example, the polynomial 1 is
4063     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4064     //
4065     // So, we must either reverse the bytes in each word and do
4066     // everything big-endian or reverse the bits in each byte and do
4067     // it little-endian.  On AArch64 it's more idiomatic to reverse
4068     // the bits in each byte (we have an instruction, RBIT, to do
4069     // that) and keep the data in little-endian bit order throughout the
4070     // calculation, bit-reversing the inputs and outputs.
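         //
         // For reference, each iteration of the loop below performs one GHASH step
         // over GF(2^128) (a sketch, not the generated code):
         //
         //   state = (state ^ data[i]) * H   mod (z^128 + z^7 + z^2 + z + 1)
         //
         // with the multiplication done by Karatsuba in ghash_multiply() and the
         // reduction by ghash_reduce().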
4071 
4072     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4073     __ align(wordSize * 2);
4074     address p = __ pc();
4075     __ emit_int64(0x87);  // The low-order bits of the field
4076                           // polynomial (i.e. p = z^7+z^2+z+1)
4077                           // repeated in the low and high parts of a
4078                           // 128-bit vector
4079     __ emit_int64(0x87);
4080 
4081     __ align(CodeEntryAlignment);
4082     address start = __ pc();
4083 
4084     Register state   = c_rarg0;
4085     Register subkeyH = c_rarg1;
4086     Register data    = c_rarg2;
4087     Register blocks  = c_rarg3;
4088 
4089     FloatRegister vzr = v30;
4090     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4091 
4092     __ ldrq(v0, Address(state));
4093     __ ldrq(v1, Address(subkeyH));
4094 
4095     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4096     __ rbit(v0, __ T16B, v0);
4097     __ rev64(v1, __ T16B, v1);
4098     __ rbit(v1, __ T16B, v1);
4099 
4100     __ ldrq(v26, p);
4101 
4102     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4103     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4104 
4105     {
4106       Label L_ghash_loop;
4107       __ bind(L_ghash_loop);
4108 
4109       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4110                                                  // reversing each byte
4111       __ rbit(v2, __ T16B, v2);
4112       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4113 
4114       // Multiply state in v2 by subkey in v1
4115       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4116                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4117                      /*temps*/v6, v20, v18, v21);
4118       // Reduce v7:v5 by the field polynomial
4119       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4120 
4121       __ sub(blocks, blocks, 1);
4122       __ cbnz(blocks, L_ghash_loop);
4123     }
4124 
4125     // The bit-reversed result is at this point in v0
4126     __ rev64(v1, __ T16B, v0);
4127     __ rbit(v1, __ T16B, v1);
4128 
4129     __ st1(v1, __ T16B, state);
4130     __ ret(lr);
4131 
4132     return start;
4133   }
4134 
4135   // Continuation point for throwing of implicit exceptions that are
4136   // not handled in the current activation. Fabricates an exception
4137   // oop and initiates normal exception dispatching in this
4138   // frame. Since we need to preserve callee-saved values (currently
4139   // only for C2, but done for C1 as well) we need a callee-saved oop
4140   // map and therefore have to make these stubs into RuntimeStubs
4141   // rather than BufferBlobs.  If the compiler needs all registers to
4142   // be preserved between the fault point and the exception handler
4143   // then it must assume responsibility for that in
4144   // AbstractCompiler::continuation_for_implicit_null_exception or
4145   // continuation_for_implicit_division_by_zero_exception. All other
4146   // implicit exceptions (e.g., NullPointerException or
4147   // AbstractMethodError on entry) are either at call sites or
4148   // otherwise assume that stack unwinding will be initiated, so
4149   // caller saved registers were assumed volatile in the compiler.
4150 
4151 #undef __
4152 #define __ masm->
4153 
4154   address generate_throw_exception(const char* name,
4155                                    address runtime_entry,
4156                                    Register arg1 = noreg,
4157                                    Register arg2 = noreg) {
4158     // Information about frame layout at time of blocking runtime call.
4159     // Note that we only have to preserve callee-saved registers since
4160     // the compilers are responsible for supplying a continuation point
4161     // if they expect all registers to be preserved.
4162     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4163     enum layout {
4164       rfp_off = 0,
4165       rfp_off2,
4166       return_off,
4167       return_off2,
4168       framesize // inclusive of return address
4169     };
4170 
4171     int insts_size = 512;
4172     int locs_size  = 64;
4173 
4174     CodeBuffer code(name, insts_size, locs_size);
4175     OopMapSet* oop_maps  = new OopMapSet();
4176     MacroAssembler* masm = new MacroAssembler(&code);
4177 
4178     address start = __ pc();
4179 
4180     // This is an inlined and slightly modified version of call_VM
4181     // which has the ability to fetch the return PC out of
4182     // thread-local storage and also sets up last_Java_sp slightly
4183     // differently than the real call_VM
4184 
4185     __ enter(); // Save FP and LR before call
4186 
4187     assert(is_even(framesize/2), "sp not 16-byte aligned");
4188 
4189     // lr and fp are already in place
4190     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4191 
4192     int frame_complete = __ pc() - start;
4193 
4194     // Set up last_Java_sp and last_Java_fp
4195     address the_pc = __ pc();
4196     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
4197 
4198     // Call runtime
4199     if (arg1 != noreg) {
4200       assert(arg2 != c_rarg1, "clobbered");
4201       __ mov(c_rarg1, arg1);
4202     }
4203     if (arg2 != noreg) {
4204       __ mov(c_rarg2, arg2);
4205     }
4206     __ mov(c_rarg0, rthread);
4207     BLOCK_COMMENT("call runtime_entry");
4208     __ mov(rscratch1, runtime_entry);
4209     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
4210 
4211     // Generate oop map
4212     OopMap* map = new OopMap(framesize, 0);
4213 
4214     oop_maps->add_gc_map(the_pc - start, map);
4215 
4216     __ reset_last_Java_frame(true);
4217     __ maybe_isb();
4218 
4219     __ leave();
4220 
4221     // check for pending exceptions
4222 #ifdef ASSERT
4223     Label L;
4224     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4225     __ cbnz(rscratch1, L);
4226     __ should_not_reach_here();
4227     __ bind(L);
4228 #endif // ASSERT
4229     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4230 
4231 
4232     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4233     RuntimeStub* stub =
4234       RuntimeStub::new_runtime_stub(name,
4235                                     &code,
4236                                     frame_complete,
4237                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4238                                     oop_maps, false);
4239     return stub->entry_point();
4240   }
4241 
4242   class MontgomeryMultiplyGenerator : public MacroAssembler {
4243 
4244     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4245       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4246 
4247     RegSet _toSave;
4248     bool _squaring;
4249 
4250   public:
4251     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4252       : MacroAssembler(as->code()), _squaring(squaring) {
4253 
4254       // Register allocation
4255 
4256       Register reg = c_rarg0;
4257       Pa_base = reg;       // Argument registers
4258       if (squaring)
4259         Pb_base = Pa_base;
4260       else
4261         Pb_base = ++reg;
4262       Pn_base = ++reg;
4263       Rlen = ++reg;
4264       inv = ++reg;
4265       Pm_base = ++reg;
4266 
4267                           // Working registers:
4268       Ra =  ++reg;        // The current digit of a, b, n, and m.
4269       Rb =  ++reg;
4270       Rm =  ++reg;
4271       Rn =  ++reg;
4272 
4273       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4274       Pb =  ++reg;
4275       Pm =  ++reg;
4276       Pn =  ++reg;
4277 
4278       t0 =  ++reg;        // Three registers which form a
4279       t1 =  ++reg;        // triple-precision accumulator.
4280       t2 =  ++reg;
4281 
4282       Ri =  ++reg;        // Inner and outer loop indexes.
4283       Rj =  ++reg;
4284 
4285       Rhi_ab = ++reg;     // Product registers: low and high parts
4286       Rlo_ab = ++reg;     // of a*b and m*n.
4287       Rhi_mn = ++reg;
4288       Rlo_mn = ++reg;
4289 
4290       // r19 and up are callee-saved.
4291       _toSave = RegSet::range(r19, reg) + Pm_base;
4292     }
4293 
4294   private:
4295     void save_regs() {
4296       push(_toSave, sp);
4297     }
4298 
4299     void restore_regs() {
4300       pop(_toSave, sp);
4301     }
4302 
4303     template <typename T>
4304     void unroll_2(Register count, T block) {
4305       Label loop, end, odd;
4306       tbnz(count, 0, odd);
4307       cbz(count, end);
4308       align(16);
4309       bind(loop);
4310       (this->*block)();
4311       bind(odd);
4312       (this->*block)();
4313       subs(count, count, 2);
4314       br(Assembler::GT, loop);
4315       bind(end);
4316     }
4317 
4318     template <typename T>
4319     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4320       Label loop, end, odd;
4321       tbnz(count, 0, odd);
4322       cbz(count, end);
4323       align(16);
4324       bind(loop);
4325       (this->*block)(d, s, tmp);
4326       bind(odd);
4327       (this->*block)(d, s, tmp);
4328       subs(count, count, 2);
4329       br(Assembler::GT, loop);
4330       bind(end);
4331     }
4332 
4333     void pre1(RegisterOrConstant i) {
4334       block_comment("pre1");
4335       // Pa = Pa_base;
4336       // Pb = Pb_base + i;
4337       // Pm = Pm_base;
4338       // Pn = Pn_base + i;
4339       // Ra = *Pa;
4340       // Rb = *Pb;
4341       // Rm = *Pm;
4342       // Rn = *Pn;
4343       ldr(Ra, Address(Pa_base));
4344       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4345       ldr(Rm, Address(Pm_base));
4346       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4347       lea(Pa, Address(Pa_base));
4348       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4349       lea(Pm, Address(Pm_base));
4350       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4351 
4352       // Zero the m*n result.
4353       mov(Rhi_mn, zr);
4354       mov(Rlo_mn, zr);
4355     }
4356 
4357     // The core multiply-accumulate step of a Montgomery
4358     // multiplication.  The idea is to schedule operations as a
4359     // pipeline so that instructions with long latencies (loads and
4360     // multiplies) have time to complete before their results are
4361     // used.  This most benefits in-order implementations of the
4362     // architecture but out-of-order ones also benefit.
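         //
         // The commented-out MACC(a, b, t0, t1, t2) steps denote a multiply-
         // accumulate into the triple-precision accumulator t2:t1:t0, roughly
         // (using a hypothetical 128-bit type for illustration):
         //
         //   unsigned __int128 p = (unsigned __int128)a * b;  // umulh/mul pair
         //   t0 += (uint64_t)p;                               // adds
         //   t1 += (uint64_t)(p >> 64) + carry;               // adcs
         //   t2 += carry;                                     // adc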
4363     void step() {
4364       block_comment("step");
4365       // MACC(Ra, Rb, t0, t1, t2);
4366       // Ra = *++Pa;
4367       // Rb = *--Pb;
4368       umulh(Rhi_ab, Ra, Rb);
4369       mul(Rlo_ab, Ra, Rb);
4370       ldr(Ra, pre(Pa, wordSize));
4371       ldr(Rb, pre(Pb, -wordSize));
4372       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
4373                                        // previous iteration.
4374       // MACC(Rm, Rn, t0, t1, t2);
4375       // Rm = *++Pm;
4376       // Rn = *--Pn;
4377       umulh(Rhi_mn, Rm, Rn);
4378       mul(Rlo_mn, Rm, Rn);
4379       ldr(Rm, pre(Pm, wordSize));
4380       ldr(Rn, pre(Pn, -wordSize));
4381       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4382     }
4383 
4384     void post1() {
4385       block_comment("post1");
4386 
4387       // MACC(Ra, Rb, t0, t1, t2);
4388       // Ra = *++Pa;
4389       // Rb = *--Pb;
4390       umulh(Rhi_ab, Ra, Rb);
4391       mul(Rlo_ab, Ra, Rb);
4392       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4393       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4394 
4395       // *Pm = Rm = t0 * inv;
4396       mul(Rm, t0, inv);
4397       str(Rm, Address(Pm));
4398 
4399       // MACC(Rm, Rn, t0, t1, t2);
4400       // t0 = t1; t1 = t2; t2 = 0;
4401       umulh(Rhi_mn, Rm, Rn);
4402 
4403 #ifndef PRODUCT
4404       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4405       {
4406         mul(Rlo_mn, Rm, Rn);
4407         add(Rlo_mn, t0, Rlo_mn);
4408         Label ok;
4409         cbz(Rlo_mn, ok); {
4410           stop("broken Montgomery multiply");
4411         } bind(ok);
4412       }
4413 #endif
4414       // We have very carefully set things up so that
4415       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4416       // the lower half of Rm * Rn because we know the result already:
4417       // it must be -t0.  t0 + (-t0) must generate a carry iff
4418       // t0 != 0.  So, rather than do a mul and an adds we just set
4419       // the carry flag iff t0 is nonzero.
4420       //
4421       // mul(Rlo_mn, Rm, Rn);
4422       // adds(zr, t0, Rlo_mn);
4423       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4424       adcs(t0, t1, Rhi_mn);
4425       adc(t1, t2, zr);
4426       mov(t2, zr);
4427     }
4428 
4429     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
4430       block_comment("pre2");
4431       // Pa = Pa_base + i-len;
4432       // Pb = Pb_base + len;
4433       // Pm = Pm_base + i-len;
4434       // Pn = Pn_base + len;
4435 
4436       if (i.is_register()) {
4437         sub(Rj, i.as_register(), len);
4438       } else {
4439         mov(Rj, i.as_constant());
4440         sub(Rj, Rj, len);
4441       }
4442       // Rj == i-len
4443 
4444       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
4445       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
4446       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4447       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
4448 
4449       // Ra = *++Pa;
4450       // Rb = *--Pb;
4451       // Rm = *++Pm;
4452       // Rn = *--Pn;
4453       ldr(Ra, pre(Pa, wordSize));
4454       ldr(Rb, pre(Pb, -wordSize));
4455       ldr(Rm, pre(Pm, wordSize));
4456       ldr(Rn, pre(Pn, -wordSize));
4457 
4458       mov(Rhi_mn, zr);
4459       mov(Rlo_mn, zr);
4460     }
4461 
4462     void post2(RegisterOrConstant i, RegisterOrConstant len) {
4463       block_comment("post2");
4464       if (i.is_constant()) {
4465         mov(Rj, i.as_constant()-len.as_constant());
4466       } else {
4467         sub(Rj, i.as_register(), len);
4468       }
4469 
4470       adds(t0, t0, Rlo_mn); // The pending m*n, low part
4471 
4472       // As soon as we know the least significant digit of our result,
4473       // store it.
4474       // Pm_base[i-len] = t0;
4475       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4476 
4477       // t0 = t1; t1 = t2; t2 = 0;
4478       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
4479       adc(t1, t2, zr);
4480       mov(t2, zr);
4481     }
4482 
4483     // A carry in t0 after Montgomery multiplication means that we
4484     // should subtract multiples of n from our result in m.  We'll
4485     // keep doing that until there is no carry.
4486     void normalize(RegisterOrConstant len) {
4487       block_comment("normalize");
4488       // while (t0)
4489       //   t0 = sub(Pm_base, Pn_base, t0, len);
4490       Label loop, post, again;
4491       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
4492       cbz(t0, post); {
4493         bind(again); {
4494           mov(i, zr);
4495           mov(cnt, len);
4496           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4497           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4498           subs(zr, zr, zr); // set carry flag, i.e. no borrow
4499           align(16);
4500           bind(loop); {
4501             sbcs(Rm, Rm, Rn);
4502             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4503             add(i, i, 1);
4504             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4505             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4506             sub(cnt, cnt, 1);
4507           } cbnz(cnt, loop);
4508           sbc(t0, t0, zr);
4509         } cbnz(t0, again);
4510       } bind(post);
4511     }
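    // The C pseudocode after generate_multiply() and generate_square()
    // calls
    //   t0 = sub(Pm_base, Pn_base, t0, len);
    // without spelling sub() out.  In C, approximately (a sketch of the
    // word-wise borrowing subtract that the loop above implements):
    //
    //   unsigned long sub(unsigned long Pm[], unsigned long Pn[],
    //                     unsigned long t0, int len) {
    //     unsigned long borrow = 0;
    //     for (int i = 0; i < len; i++) {
    //       unsigned long a = Pm[i], b = Pn[i];
    //       Pm[i] = a - b - borrow;
    //       borrow = (a < b) || (borrow && a == b);
    //     }
    //     return t0 - borrow;          // the final sbc(t0, t0, zr)
    //   }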
4512 
4513     // Move memory at s to d, reversing words.
4514     //    Increments d to end of copied memory
4515     //    Destroys tmp1, tmp2
4516     //    Preserves len
4517     //    Leaves s pointing to the address which was in d at start
4518     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
4519       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
4520 
4521       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
4522       mov(tmp1, len);
4523       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
4524       sub(s, d, len, ext::uxtw, LogBytesPerWord);
4525     }
4526     // where
4527     void reverse1(Register d, Register s, Register tmp) {
4528       ldr(tmp, pre(s, -wordSize));
4529       ror(tmp, tmp, 32);
4530       str(tmp, post(d, wordSize));
4531     }
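    // In C, reverse() is approximately the following (a sketch that ignores
    // the register bookkeeping which leaves s pointing at the old d):
    //
    //   // Copy len 64-bit words from s to d, reversing the word order and
    //   // swapping the two 32-bit halves within each word (the ror by 32),
    //   // converting between the caller's 32-bit-element layout and the
    //   // 64-bit digit order used internally.
    //   void reverse(unsigned long *d, unsigned long *s, int len) {
    //     unsigned long *end = s + len;      // walk s from its last word
    //     for (int i = 0; i < len; i++) {
    //       unsigned long w = *--end;
    //       *d++ = (w << 32) | (w >> 32);    // rotate each word by 32 bits
    //     }
    //   }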
4532 
4533     void step_squaring() {
4534       // An extra ACC: when squaring, the a*b cross product is accumulated twice.
4535       step();
4536       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4537     }
4538 
4539     void last_squaring(RegisterOrConstant i) {
4540       Label dont;
4541       // if ((i & 1) == 0) {
4542       tbnz(i.as_register(), 0, dont); {
4543         // MACC(Ra, Rb, t0, t1, t2);
4544         // Ra = *++Pa;
4545         // Rb = *--Pb;
4546         umulh(Rhi_ab, Ra, Rb);
4547         mul(Rlo_ab, Ra, Rb);
4548         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4549       } bind(dont);
4550     }
4551 
4552     void extra_step_squaring() {
4553       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4554 
4555       // MACC(Rm, Rn, t0, t1, t2);
4556       // Rm = *++Pm;
4557       // Rn = *--Pn;
4558       umulh(Rhi_mn, Rm, Rn);
4559       mul(Rlo_mn, Rm, Rn);
4560       ldr(Rm, pre(Pm, wordSize));
4561       ldr(Rn, pre(Pn, -wordSize));
4562     }
4563 
4564     void post1_squaring() {
4565       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4566 
4567       // *Pm = Rm = t0 * inv;
4568       mul(Rm, t0, inv);
4569       str(Rm, Address(Pm));
4570 
4571       // MACC(Rm, Rn, t0, t1, t2);
4572       // t0 = t1; t1 = t2; t2 = 0;
4573       umulh(Rhi_mn, Rm, Rn);
4574 
4575 #ifndef PRODUCT
4576       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4577       {
4578         mul(Rlo_mn, Rm, Rn);
4579         add(Rlo_mn, t0, Rlo_mn);
4580         Label ok;
4581         cbz(Rlo_mn, ok); {
4582           stop("broken Montgomery multiply");
4583         } bind(ok);
4584       }
4585 #endif
4586       // We have very carefully set things up so that
4587       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4588       // the lower half of Rm * Rn because we know the result already:
4589       // it must be -t0.  t0 + (-t0) must generate a carry iff
4590       // t0 != 0.  So, rather than do a mul and an adds we just set
4591       // the carry flag iff t0 is nonzero.
4592       //
4593       // mul(Rlo_mn, Rm, Rn);
4594       // adds(zr, t0, Rlo_mn);
4595       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4596       adcs(t0, t1, Rhi_mn);
4597       adc(t1, t2, zr);
4598       mov(t2, zr);
4599     }
4600 
4601     void acc(Register Rhi, Register Rlo,
4602              Register t0, Register t1, Register t2) {
4603       adds(t0, t0, Rlo);
4604       adcs(t1, t1, Rhi);
4605       adc(t2, t2, zr);
4606     }
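    // acc() adds a 128-bit value (a product's high and low halves) into the
    // triple-precision accumulator t2:t1:t0.  The MACC() used in the C
    // pseudocode below is umulh/mul followed by acc(), and could be sketched
    // as follows (assuming a compiler that provides unsigned __int128):
    //
    //   void MACC(unsigned long a, unsigned long b,
    //             unsigned long &t0, unsigned long &t1, unsigned long &t2) {
    //     unsigned __int128 p = (unsigned __int128)a * b + t0;
    //     t0 = (unsigned long)p;
    //     p = (p >> 64) + t1;
    //     t1 = (unsigned long)p;
    //     t2 += (unsigned long)(p >> 64);    // carry into the top word
    //   }
    //
    // MACC2(), used only in the squaring pseudocode, accumulates the same
    // product twice, which is what step_squaring() above does by adding the
    // a*b product a second time.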
4607 
4608   public:
4609     /**
4610      * Fast Montgomery multiplication.  The derivation of the
4611      * algorithm is in A Cryptographic Library for the Motorola
4612      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
4613      *
4614      * Arguments:
4615      *
4616      * Inputs for multiplication:
4617      *   c_rarg0   - int array elements a
4618      *   c_rarg1   - int array elements b
4619      *   c_rarg2   - int array elements n (the modulus)
4620      *   c_rarg3   - int length
4621      *   c_rarg4   - int inv
4622      *   c_rarg5   - int array elements m (the result)
4623      *
4624      * Inputs for squaring:
4625      *   c_rarg0   - int array elements a
4626      *   c_rarg1   - int array elements n (the modulus)
4627      *   c_rarg2   - int length
4628      *   c_rarg3   - int inv
4629      *   c_rarg4   - int array elements m (the result)
4630      *
4631      */
4632     address generate_multiply() {
4633       Label argh, nothing;
4634       bind(argh);
4635       stop("MontgomeryMultiply total_allocation must be <= 8192");
4636 
4637       align(CodeEntryAlignment);
4638       address entry = pc();
4639 
4640       cbzw(Rlen, nothing);
4641 
4642       enter();
4643 
4644       // Make room.
4645       cmpw(Rlen, 512);
4646       br(Assembler::HI, argh);
4647       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4648       andr(sp, Ra, -2 * wordSize);
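      // The scratch area must hold reversed copies of a, b and n plus the
      // result m: up to four arrays of Rlen ints, i.e. Rlen * 16 bytes.
      // With Rlen <= 512 that is the 8192-byte limit checked above.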
4649 
4650       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4651 
4652       {
4653         // Copy input args, reversing as we go.  We use Ra as a
4654         // temporary variable.
4655         reverse(Ra, Pa_base, Rlen, t0, t1);
4656         if (!_squaring)
4657           reverse(Ra, Pb_base, Rlen, t0, t1);
4658         reverse(Ra, Pn_base, Rlen, t0, t1);
4659       }
4660 
4661       // Push all callee-saved registers and also Pm_base, which we'll need
4662       // at the end.
4663       save_regs();
4664 
4665 #ifndef PRODUCT
4666       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
4667       {
4668         ldr(Rn, Address(Pn_base, 0));
4669         mul(Rlo_mn, Rn, inv);
4670         cmp(Rlo_mn, -1);
4671         Label ok;
4672         br(EQ, ok); {
4673           stop("broken inverse in Montgomery multiply");
4674         } bind(ok);
4675       }
4676 #endif
4677 
4678       mov(Pm_base, Ra);
4679 
4680       mov(t0, zr);
4681       mov(t1, zr);
4682       mov(t2, zr);
4683 
4684       block_comment("for (int i = 0; i < len; i++) {");
4685       mov(Ri, zr); {
4686         Label loop, end;
4687         cmpw(Ri, Rlen);
4688         br(Assembler::GE, end);
4689 
4690         bind(loop);
4691         pre1(Ri);
4692 
4693         block_comment("  for (j = i; j; j--) {"); {
4694           movw(Rj, Ri);
4695           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4696         } block_comment("  } // j");
4697 
4698         post1();
4699         addw(Ri, Ri, 1);
4700         cmpw(Ri, Rlen);
4701         br(Assembler::LT, loop);
4702         bind(end);
4703         block_comment("} // i");
4704       }
4705 
4706       block_comment("for (int i = len; i < 2*len; i++) {");
4707       mov(Ri, Rlen); {
4708         Label loop, end;
4709         cmpw(Ri, Rlen, Assembler::LSL, 1);
4710         br(Assembler::GE, end);
4711 
4712         bind(loop);
4713         pre2(Ri, Rlen);
4714 
4715         block_comment("  for (j = len*2-i-1; j; j--) {"); {
4716           lslw(Rj, Rlen, 1);
4717           subw(Rj, Rj, Ri);
4718           subw(Rj, Rj, 1);
4719           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4720         } block_comment("  } // j");
4721 
4722         post2(Ri, Rlen);
4723         addw(Ri, Ri, 1);
4724         cmpw(Ri, Rlen, Assembler::LSL, 1);
4725         br(Assembler::LT, loop);
4726         bind(end);
4727       }
4728       block_comment("} // i");
4729 
4730       normalize(Rlen);
4731 
4732       mov(Ra, Pm_base);  // Save Pm_base in Ra
4733       restore_regs();  // Restore caller's Pm_base
4734 
4735       // Copy our result into caller's Pm_base
4736       reverse(Pm_base, Ra, Rlen, t0, t1);
4737 
4738       leave();
4739       bind(nothing);
4740       ret(lr);
4741 
4742       return entry;
4743     }
4744     // In C, approximately:
4745 
4746     // void
4747     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
4748     //                     unsigned long Pn_base[], unsigned long Pm_base[],
4749     //                     unsigned long inv, int len) {
4750     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4751     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4752     //   unsigned long Ra, Rb, Rn, Rm;
4753 
4754     //   int i;
4755 
4756     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4757 
4758     //   for (i = 0; i < len; i++) {
4759     //     int j;
4760 
4761     //     Pa = Pa_base;
4762     //     Pb = Pb_base + i;
4763     //     Pm = Pm_base;
4764     //     Pn = Pn_base + i;
4765 
4766     //     Ra = *Pa;
4767     //     Rb = *Pb;
4768     //     Rm = *Pm;
4769     //     Rn = *Pn;
4770 
4771     //     int iters = i;
4772     //     for (j = 0; iters--; j++) {
4773     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4774     //       MACC(Ra, Rb, t0, t1, t2);
4775     //       Ra = *++Pa;
4776     //       Rb = *--Pb;
4777     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4778     //       MACC(Rm, Rn, t0, t1, t2);
4779     //       Rm = *++Pm;
4780     //       Rn = *--Pn;
4781     //     }
4782 
4783     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
4784     //     MACC(Ra, Rb, t0, t1, t2);
4785     //     *Pm = Rm = t0 * inv;
4786     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4787     //     MACC(Rm, Rn, t0, t1, t2);
4788 
4789     //     assert(t0 == 0, "broken Montgomery multiply");
4790 
4791     //     t0 = t1; t1 = t2; t2 = 0;
4792     //   }
4793 
4794     //   for (i = len; i < 2*len; i++) {
4795     //     int j;
4796 
4797     //     Pa = Pa_base + i-len;
4798     //     Pb = Pb_base + len;
4799     //     Pm = Pm_base + i-len;
4800     //     Pn = Pn_base + len;
4801 
4802     //     Ra = *++Pa;
4803     //     Rb = *--Pb;
4804     //     Rm = *++Pm;
4805     //     Rn = *--Pn;
4806 
4807     //     int iters = len*2-i-1;
4808     //     for (j = i-len+1; iters--; j++) {
4809     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4810     //       MACC(Ra, Rb, t0, t1, t2);
4811     //       Ra = *++Pa;
4812     //       Rb = *--Pb;
4813     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4814     //       MACC(Rm, Rn, t0, t1, t2);
4815     //       Rm = *++Pm;
4816     //       Rn = *--Pn;
4817     //     }
4818 
4819     //     Pm_base[i-len] = t0;
4820     //     t0 = t1; t1 = t2; t2 = 0;
4821     //   }
4822 
4823     //   while (t0)
4824     //     t0 = sub(Pm_base, Pn_base, t0, len);
4825     // }
4826 
4827     /**
4828      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
4829      * multiplies than Montgomery multiplication so it should be up to
4830      * 25% faster.  However, its loop control is more complex and it
4831      * may actually run slower on some machines.
4832      *
4833      * Arguments:
4834      *
4835      * Inputs:
4836      *   c_rarg0   - int array elements a
4837      *   c_rarg1   - int array elements n (the modulus)
4838      *   c_rarg2   - int length
4839      *   c_rarg3   - int inv
4840      *   c_rarg4   - int array elements m (the result)
4841      *
4842      */
4843     address generate_square() {
4844       Label argh;
4845       bind(argh);
4846       stop("MontgomeryMultiply total_allocation must be <= 8192");
4847 
4848       align(CodeEntryAlignment);
4849       address entry = pc();
4850 
4851       enter();
4852 
4853       // Make room.
4854       cmpw(Rlen, 512);
4855       br(Assembler::HI, argh);
4856       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4857       andr(sp, Ra, -2 * wordSize);
4858 
4859       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4860 
4861       {
4862         // Copy input args, reversing as we go.  We use Ra as a
4863         // temporary variable.
4864         reverse(Ra, Pa_base, Rlen, t0, t1);
4865         reverse(Ra, Pn_base, Rlen, t0, t1);
4866       }
4867 
4868       // Push all callee-saved registers and also Pm_base, which we'll need
4869       // at the end.
4870       save_regs();
4871 
4872       mov(Pm_base, Ra);
4873 
4874       mov(t0, zr);
4875       mov(t1, zr);
4876       mov(t2, zr);
4877 
4878       block_comment("for (int i = 0; i < len; i++) {");
4879       mov(Ri, zr); {
4880         Label loop, end;
4881         bind(loop);
4882         cmp(Ri, Rlen);
4883         br(Assembler::GE, end);
4884 
4885         pre1(Ri);
4886 
4887         block_comment("  for (j = (i+1)/2; j; j--) {"); {
4888           add(Rj, Ri, 1);
4889           lsr(Rj, Rj, 1);
4890           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4891         } block_comment("  } // j");
4892 
4893         last_squaring(Ri);
4894 
4895         block_comment("  for (j = i/2; j; j--) {"); {
4896           lsr(Rj, Ri, 1);
4897           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4898         } block_comment("  } // j");
4899 
4900         post1_squaring();
4901         add(Ri, Ri, 1);
4902         cmp(Ri, Rlen);
4903         br(Assembler::LT, loop);
4904 
4905         bind(end);
4906         block_comment("} // i");
4907       }
4908 
4909       block_comment("for (int i = len; i < 2*len; i++) {");
4910       mov(Ri, Rlen); {
4911         Label loop, end;
4912         bind(loop);
4913         cmp(Ri, Rlen, Assembler::LSL, 1);
4914         br(Assembler::GE, end);
4915 
4916         pre2(Ri, Rlen);
4917 
4918         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4919           lsl(Rj, Rlen, 1);
4920           sub(Rj, Rj, Ri);
4921           sub(Rj, Rj, 1);
4922           lsr(Rj, Rj, 1);
4923           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4924         } block_comment("  } // j");
4925 
4926         last_squaring(Ri);
4927 
4928         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4929           lsl(Rj, Rlen, 1);
4930           sub(Rj, Rj, Ri);
4931           lsr(Rj, Rj, 1);
4932           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4933         } block_comment("  } // j");
4934 
4935         post2(Ri, Rlen);
4936         add(Ri, Ri, 1);
4937         cmp(Ri, Rlen, Assembler::LSL, 1);
4938 
4939         br(Assembler::LT, loop);
4940         bind(end);
4941         block_comment("} // i");
4942       }
4943 
4944       normalize(Rlen);
4945 
4946       mov(Ra, Pm_base);  // Save Pm_base in Ra
4947       restore_regs();  // Restore caller's Pm_base
4948 
4949       // Copy our result into caller's Pm_base
4950       reverse(Pm_base, Ra, Rlen, t0, t1);
4951 
4952       leave();
4953       ret(lr);
4954 
4955       return entry;
4956     }
4957     // In C, approximately:
4958 
4959     // void
4960     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4961     //                   unsigned long Pm_base[], unsigned long inv, int len) {
4962     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4963     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4964     //   unsigned long Ra, Rb, Rn, Rm;
4965 
4966     //   int i;
4967 
4968     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4969 
4970     //   for (i = 0; i < len; i++) {
4971     //     int j;
4972 
4973     //     Pa = Pa_base;
4974     //     Pb = Pa_base + i;
4975     //     Pm = Pm_base;
4976     //     Pn = Pn_base + i;
4977 
4978     //     Ra = *Pa;
4979     //     Rb = *Pb;
4980     //     Rm = *Pm;
4981     //     Rn = *Pn;
4982 
4983     //     int iters = (i+1)/2;
4984     //     for (j = 0; iters--; j++) {
4985     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4986     //       MACC2(Ra, Rb, t0, t1, t2);
4987     //       Ra = *++Pa;
4988     //       Rb = *--Pb;
4989     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4990     //       MACC(Rm, Rn, t0, t1, t2);
4991     //       Rm = *++Pm;
4992     //       Rn = *--Pn;
4993     //     }
4994     //     if ((i & 1) == 0) {
4995     //       assert(Ra == Pa_base[j], "must be");
4996     //       MACC(Ra, Ra, t0, t1, t2);
4997     //     }
4998     //     iters = i/2;
4999     //     assert(iters == i-j, "must be");
5000     //     for (; iters--; j++) {
5001     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5002     //       MACC(Rm, Rn, t0, t1, t2);
5003     //       Rm = *++Pm;
5004     //       Rn = *--Pn;
5005     //     }
5006 
5007     //     *Pm = Rm = t0 * inv;
5008     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5009     //     MACC(Rm, Rn, t0, t1, t2);
5010 
5011     //     assert(t0 == 0, "broken Montgomery multiply");
5012 
5013     //     t0 = t1; t1 = t2; t2 = 0;
5014     //   }
5015 
5016     //   for (i = len; i < 2*len; i++) {
5017     //     int start = i-len+1;
5018     //     int end = start + (len - start)/2;
5019     //     int j;
5020 
5021     //     Pa = Pa_base + i-len;
5022     //     Pb = Pa_base + len;
5023     //     Pm = Pm_base + i-len;
5024     //     Pn = Pn_base + len;
5025 
5026     //     Ra = *++Pa;
5027     //     Rb = *--Pb;
5028     //     Rm = *++Pm;
5029     //     Rn = *--Pn;
5030 
5031     //     int iters = (2*len-i-1)/2;
5032     //     assert(iters == end-start, "must be");
5033     //     for (j = start; iters--; j++) {
5034     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5035     //       MACC2(Ra, Rb, t0, t1, t2);
5036     //       Ra = *++Pa;
5037     //       Rb = *--Pb;
5038     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5039     //       MACC(Rm, Rn, t0, t1, t2);
5040     //       Rm = *++Pm;
5041     //       Rn = *--Pn;
5042     //     }
5043     //     if ((i & 1) == 0) {
5044     //       assert(Ra == Pa_base[j], "must be");
5045     //       MACC(Ra, Ra, t0, t1, t2);
5046     //     }
5047     //     iters =  (2*len-i)/2;
5048     //     assert(iters == len-j, "must be");
5049     //     for (; iters--; j++) {
5050     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5051     //       MACC(Rm, Rn, t0, t1, t2);
5052     //       Rm = *++Pm;
5053     //       Rn = *--Pn;
5054     //     }
5055     //     Pm_base[i-len] = t0;
5056     //     t0 = t1; t1 = t2; t2 = 0;
5057     //   }
5058 
5059     //   while (t0)
5060     //     t0 = sub(Pm_base, Pn_base, t0, len);
5061     // }
5062   };
5063 
5064 
5065   // Initialization
5066   void generate_initial() {
5067     // Generates initial stubs and initializes the entry points
5068 
5069     // Entry points that exist on all platforms.  Note: this is code
5070     // that could be shared among different platforms - however the
5071     // benefit seems to be smaller than the disadvantage of having a
5072     // much more complicated generator structure.  See also the comment
5073     // in stubRoutines.hpp.
5074 
5075     StubRoutines::_forward_exception_entry = generate_forward_exception();
5076 
5077     StubRoutines::_call_stub_entry =
5078       generate_call_stub(StubRoutines::_call_stub_return_address);
5079 
5080     // This entry is referenced by megamorphic calls.
5081     StubRoutines::_catch_exception_entry = generate_catch_exception();
5082 
5083     // Build this early so it's available for the interpreter.
5084     StubRoutines::_throw_StackOverflowError_entry =
5085       generate_throw_exception("StackOverflowError throw_exception",
5086                                CAST_FROM_FN_PTR(address,
5087                                                 SharedRuntime::throw_StackOverflowError));
5088     StubRoutines::_throw_delayed_StackOverflowError_entry =
5089       generate_throw_exception("delayed StackOverflowError throw_exception",
5090                                CAST_FROM_FN_PTR(address,
5091                                                 SharedRuntime::throw_delayed_StackOverflowError));
5092     if (UseCRC32Intrinsics) {
5093       // Set the table address before generating the stubs that use it.
5094       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5095       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5096     }
5097 
5098     if (UseCRC32CIntrinsics) {
5099       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5100     }
5101   }
5102 
5103   void generate_all() {
5104     // support for verify_oop (must happen after universe_init)
5105     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
5106     StubRoutines::_throw_AbstractMethodError_entry =
5107       generate_throw_exception("AbstractMethodError throw_exception",
5108                                CAST_FROM_FN_PTR(address,
5109                                                 SharedRuntime::
5110                                                 throw_AbstractMethodError));
5111 
5112     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5113       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5114                                CAST_FROM_FN_PTR(address,
5115                                                 SharedRuntime::
5116                                                 throw_IncompatibleClassChangeError));
5117 
5118     StubRoutines::_throw_NullPointerException_at_call_entry =
5119       generate_throw_exception("NullPointerException at call throw_exception",
5120                                CAST_FROM_FN_PTR(address,
5121                                                 SharedRuntime::
5122                                                 throw_NullPointerException_at_call));
5123 
5124     // arraycopy stubs used by compilers
5125     generate_arraycopy_stubs();
5126 
5127     // has negatives stub for large arrays.
5128     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5129 
5130     // array equals stub for large arrays.
5131     if (!UseSimpleArrayEquals) {
5132       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5133     }
5134 
5135     if (UseMultiplyToLenIntrinsic) {
5136       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5137     }
5138 
5139     if (UseSquareToLenIntrinsic) {
5140       StubRoutines::_squareToLen = generate_squareToLen();
5141     }
5142 
5143     if (UseMulAddIntrinsic) {
5144       StubRoutines::_mulAdd = generate_mulAdd();
5145     }
5146 
5147     if (UseMontgomeryMultiplyIntrinsic) {
5148       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5149       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5150       StubRoutines::_montgomeryMultiply = g.generate_multiply();
5151     }
5152 
5153     if (UseMontgomerySquareIntrinsic) {
5154       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5155       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5156       // We use generate_multiply() rather than generate_square()
5157       // because it's faster for the sizes of modulus we care about.
5158       StubRoutines::_montgomerySquare = g.generate_multiply();
5159     }
5160 
5161     if (UseShenandoahGC && ShenandoahWriteBarrier) {
5162       StubRoutines::aarch64::_shenandoah_wb = generate_shenandoah_wb(false, true);
5163       StubRoutines::_shenandoah_wb_C = generate_shenandoah_wb(true, !ShenandoahWriteBarrierCsetTestInIR);
5164     }
5165 
5166 #ifndef BUILTIN_SIM
5167     // generate GHASH intrinsics code
5168     if (UseGHASHIntrinsics) {
5169       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5170     }
5171 
5172     if (UseAESIntrinsics) {
5173       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5174       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5175       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5176       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
5177     }
5178 
5179     if (UseSHA1Intrinsics) {
5180       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5181       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5182     }
5183     if (UseSHA256Intrinsics) {
5184       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5185       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5186     }
5187 
5188     // generate Adler32 intrinsics code
5189     if (UseAdler32Intrinsics) {
5190       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
5191     }
5192 
5193     // Safefetch stubs.
5194     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5195                                                        &StubRoutines::_safefetch32_fault_pc,
5196                                                        &StubRoutines::_safefetch32_continuation_pc);
5197     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5198                                                        &StubRoutines::_safefetchN_fault_pc,
5199                                                        &StubRoutines::_safefetchN_continuation_pc);
5200 #endif
5201     StubRoutines::aarch64::set_completed();
5202   }
5203 
5204  public:
5205   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5206     if (all) {
5207       generate_all();
5208     } else {
5209       generate_initial();
5210     }
5211   }
5212 }; // end class declaration
5213 
5214 void StubGenerator_generate(CodeBuffer* code, bool all) {
5215   StubGenerator g(code, all);
5216 }