1 /*
   2  * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "nativeInst_aarch64.hpp"
  33 #include "oops/instanceOop.hpp"
  34 #include "oops/method.hpp"
  35 #include "oops/objArrayKlass.hpp"
  36 #include "oops/oop.inline.hpp"
  37 #include "prims/methodHandles.hpp"
  38 #include "runtime/frame.inline.hpp"
  39 #include "runtime/handles.inline.hpp"
  40 #include "runtime/sharedRuntime.hpp"
  41 #include "runtime/stubCodeGenerator.hpp"
  42 #include "runtime/stubRoutines.hpp"
  43 #include "runtime/thread.inline.hpp"
  44 #include "utilities/align.hpp"
  45 #ifdef COMPILER2
  46 #include "opto/runtime.hpp"
  47 #endif
  48 
  49 #ifdef BUILTIN_SIM
  50 #include "../../../../../../simulator/simulator.hpp"
  51 #endif
  52 
  53 // Declaration and definition of StubGenerator (no .hpp file).
  54 // For a more detailed description of the stub routine structure
  55 // see the comment in stubRoutines.hpp
  56 
  57 #undef __
  58 #define __ _masm->
  59 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  60 
  61 #ifdef PRODUCT
  62 #define BLOCK_COMMENT(str) /* nothing */
  63 #else
  64 #define BLOCK_COMMENT(str) __ block_comment(str)
  65 #endif
  66 
  67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  68 
  69 // Stub Code definitions
  70 
  71 class StubGenerator: public StubCodeGenerator {
  72  private:
  73 
  74 #ifdef PRODUCT
  75 #define inc_counter_np(counter) ((void)0)
  76 #else
  77   void inc_counter_np_(int& counter) {
  78     __ lea(rscratch2, ExternalAddress((address)&counter));
  79     __ ldrw(rscratch1, Address(rscratch2));
  80     __ addw(rscratch1, rscratch1, 1);
  81     __ strw(rscratch1, Address(rscratch2));
  82   }
  83 #define inc_counter_np(counter) \
  84   BLOCK_COMMENT("inc_counter " #counter); \
  85   inc_counter_np_(counter);
  86 #endif
  87 
  88   // Call stubs are used to call Java from C
  89   //
  90   // Arguments:
  91   //    c_rarg0:   call wrapper address                   address
  92   //    c_rarg1:   result                                 address
  93   //    c_rarg2:   result type                            BasicType
  94   //    c_rarg3:   method                                 Method*
  95   //    c_rarg4:   (interpreter) entry point              address
  96   //    c_rarg5:   parameters                             intptr_t*
  97   //    c_rarg6:   parameter size (in words)              int
  98   //    c_rarg7:   thread                                 Thread*
  99   //
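  // For reference, the C-side caller reaches this stub through a
  // function pointer whose type is declared in stubRoutines.hpp,
  // roughly:
  //
  //   typedef void (*CallStub)(address   link,
  //                            intptr_t* result,
  //                            BasicType result_type,
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            TRAPS);
  //
  // so the eight arguments listed above arrive in c_rarg0..c_rarg7.
  //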
 100   // There is no return from the stub itself as any Java result
 101   // is written to result
 102   //
 103   // we save r30 (lr) as the return PC at the base of the frame and
 104   // link r29 (fp) below it as the frame pointer, then install sp
 105   // (r31) into fp so that fp addresses the new frame.
 106   //
 107   // we save r0-r7, which accounts for all the c arguments.
 108   //
 109   // TODO: strictly do we need to save them all? they are treated as
 110   // volatile by C so could we omit saving the ones we are going to
 111   // place in global registers (thread? method?) or those we only use
 112   // during setup of the Java call?
 113   //
 114   // we don't need to save r8 which C uses as an indirect result location
 115   // return register.
 116   //
 117   // we don't need to save r9-r15 which both C and Java treat as
 118   // volatile
 119   //
 120   // we don't need to save r16-18 because Java does not use them
 121   //
 122   // we save r19-r28 which Java uses as scratch registers and C
 123   // expects to be callee-save
 124   //
 125   // we save the bottom 64 bits of each value stored in v8-v15; it is
 126   // the responsibility of the caller to preserve larger values.
 127   //
 128   // so the stub frame looks like this when we enter Java code
 129   //
 130   //     [ return_from_Java     ] <--- sp
 131   //     [ argument word n      ]
 132   //      ...
 133   // -27 [ argument word 1      ]
 134   // -26 [ saved v15            ] <--- sp_after_call
 135   // -25 [ saved v14            ]
 136   // -24 [ saved v13            ]
 137   // -23 [ saved v12            ]
 138   // -22 [ saved v11            ]
 139   // -21 [ saved v10            ]
 140   // -20 [ saved v9             ]
 141   // -19 [ saved v8             ]
 142   // -18 [ saved r28            ]
 143   // -17 [ saved r27            ]
 144   // -16 [ saved r26            ]
 145   // -15 [ saved r25            ]
 146   // -14 [ saved r24            ]
 147   // -13 [ saved r23            ]
 148   // -12 [ saved r22            ]
 149   // -11 [ saved r21            ]
 150   // -10 [ saved r20            ]
 151   //  -9 [ saved r19            ]
 152   //  -8 [ call wrapper    (r0) ]
 153   //  -7 [ result          (r1) ]
 154   //  -6 [ result type     (r2) ]
 155   //  -5 [ method          (r3) ]
 156   //  -4 [ entry point     (r4) ]
 157   //  -3 [ parameters      (r5) ]
 158   //  -2 [ parameter size  (r6) ]
 159   //  -1 [ thread (r7)          ]
 160   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 161   //   1 [ saved lr       (r30) ]
 162 
 163   // Call stub stack layout word offsets from fp
 164   enum call_stub_layout {
 165     sp_after_call_off = -26,
 166 
 167     d15_off            = -26,
 168     d13_off            = -24,
 169     d11_off            = -22,
 170     d9_off             = -20,
 171 
 172     r28_off            = -18,
 173     r26_off            = -16,
 174     r24_off            = -14,
 175     r22_off            = -12,
 176     r20_off            = -10,
 177     call_wrapper_off   =  -8,
 178     result_off         =  -7,
 179     result_type_off    =  -6,
 180     method_off         =  -5,
 181     entry_point_off    =  -4,
 182     parameter_size_off =  -2,
 183     thread_off         =  -1,
 184     fp_f               =   0,
 185     retaddr_off        =   1,
 186   };
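  // n.b. these are word offsets from rfp, so for example the saved
  // thread slot lives at rfp + thread_off * wordSize = rfp - 8 and
  // sp_after_call corresponds to rfp - 26 * wordSize = rfp - 208.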
 187 
 188   address generate_call_stub(address& return_address) {
 189     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 190            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 191            "adjust this code");
 192 
 193     StubCodeMark mark(this, "StubRoutines", "call_stub");
 194     address start = __ pc();
 195 
 196     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 197 
 198     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 199     const Address result        (rfp, result_off         * wordSize);
 200     const Address result_type   (rfp, result_type_off    * wordSize);
 201     const Address method        (rfp, method_off         * wordSize);
 202     const Address entry_point   (rfp, entry_point_off    * wordSize);
 203     const Address parameter_size(rfp, parameter_size_off * wordSize);
 204 
 205     const Address thread        (rfp, thread_off         * wordSize);
 206 
 207     const Address d15_save      (rfp, d15_off * wordSize);
 208     const Address d13_save      (rfp, d13_off * wordSize);
 209     const Address d11_save      (rfp, d11_off * wordSize);
 210     const Address d9_save       (rfp, d9_off * wordSize);
 211 
 212     const Address r28_save      (rfp, r28_off * wordSize);
 213     const Address r26_save      (rfp, r26_off * wordSize);
 214     const Address r24_save      (rfp, r24_off * wordSize);
 215     const Address r22_save      (rfp, r22_off * wordSize);
 216     const Address r20_save      (rfp, r20_off * wordSize);
 217 
 218     // stub code
 219 
 220     // we need a C prolog to bootstrap the x86 caller into the sim
 221     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 222 
 223     address aarch64_entry = __ pc();
 224 
 225 #ifdef BUILTIN_SIM
 226     // Save sender's SP for stack traces.
 227     __ mov(rscratch1, sp);
 228     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 229 #endif
 230     // set up frame and move sp to end of save area
 231     __ enter();
 232     __ sub(sp, rfp, -sp_after_call_off * wordSize);
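    // (-sp_after_call_off == 26, so sp now sits 26 words below rfp,
    // i.e. exactly at the sp_after_call slot in the layout above)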
 233 
 234     // save register parameters and Java scratch/global registers
 235     // n.b. we save thread even though it gets installed in
 236     // rthread because we want to sanity check rthread later
 237     __ str(c_rarg7,  thread);
 238     __ strw(c_rarg6, parameter_size);
 239     __ stp(c_rarg4, c_rarg5,  entry_point);
 240     __ stp(c_rarg2, c_rarg3,  result_type);
 241     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 242 
 243     __ stp(r20, r19,   r20_save);
 244     __ stp(r22, r21,   r22_save);
 245     __ stp(r24, r23,   r24_save);
 246     __ stp(r26, r25,   r26_save);
 247     __ stp(r28, r27,   r28_save);
 248 
 249     __ stpd(v9,  v8,   d9_save);
 250     __ stpd(v11, v10,  d11_save);
 251     __ stpd(v13, v12,  d13_save);
 252     __ stpd(v15, v14,  d15_save);
 253 
 254     // install Java thread in global register now we have saved
 255     // whatever value it held
 256     __ mov(rthread, c_rarg7);
 257     // And method
 258     __ mov(rmethod, c_rarg3);
 259 
 260     // set up the heapbase register
 261     __ reinit_heapbase();
 262 
 263 #ifdef ASSERT
 264     // make sure we have no pending exceptions
 265     {
 266       Label L;
 267       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 268       __ cmp(rscratch1, (unsigned)NULL_WORD);
 269       __ br(Assembler::EQ, L);
 270       __ stop("StubRoutines::call_stub: entered with pending exception");
 271       __ BIND(L);
 272     }
 273 #endif
 274     // pass parameters if any
 275     __ mov(esp, sp);
 276     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 277     __ andr(sp, rscratch1, -2 * wordSize);
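    // round sp down to a 16 byte boundary; the AArch64 ABI requires sp
    // to be 16 byte aligned whenever it is used to access memory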
 278 
 279     BLOCK_COMMENT("pass parameters if any");
 280     Label parameters_done;
 281     // parameter count is still in c_rarg6
 282     // and parameter pointer identifying param 1 is in c_rarg5
 283     __ cbzw(c_rarg6, parameters_done);
 284 
 285     address loop = __ pc();
 286     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 287     __ subsw(c_rarg6, c_rarg6, 1);
 288     __ push(rscratch1);
 289     __ br(Assembler::GT, loop);
 290 
 291     __ BIND(parameters_done);
 292 
 293     // call Java entry -- passing Method* and current sp
 294     //      rmethod: Method*
 295     //      r13: sender sp
 296     BLOCK_COMMENT("call Java function");
 297     __ mov(r13, sp);
 298     __ blr(c_rarg4);
 299 
 300     // tell the simulator we have returned to the stub
 301 
 302     // we do this here because the notify will already have been done
 303     // if we get to the next instruction via an exception
 304     //
 305     // n.b. adding this instruction here affects the calculation of
 306     // whether or not a routine returns to the call stub (used when
 307     // doing stack walks) since the normal test is to check the return
 308     // pc against the address saved below. so we may need to allow for
 309     // this extra instruction in the check.
 310 
 311     if (NotifySimulator) {
 312       __ notify(Assembler::method_reentry);
 313     }
 314     // save current address for use by exception handling code
 315 
 316     return_address = __ pc();
 317 
 318     // store result depending on type (everything that is not
 319     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 320     // n.b. this assumes Java returns an integral result in r0
 321     // and a floating result in j_farg0
 322     __ ldr(j_rarg2, result);
 323     Label is_long, is_float, is_double, exit;
 324     __ ldr(j_rarg1, result_type);
 325     __ cmp(j_rarg1, T_OBJECT);
 326     __ br(Assembler::EQ, is_long);
 327     __ cmp(j_rarg1, T_LONG);
 328     __ br(Assembler::EQ, is_long);
 329     __ cmp(j_rarg1, T_FLOAT);
 330     __ br(Assembler::EQ, is_float);
 331     __ cmp(j_rarg1, T_DOUBLE);
 332     __ br(Assembler::EQ, is_double);
 333 
 334     // handle T_INT case
 335     __ strw(r0, Address(j_rarg2));
 336 
 337     __ BIND(exit);
 338 
 339     // pop parameters
 340     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 341 
 342 #ifdef ASSERT
 343     // verify that threads correspond
 344     {
 345       Label L, S;
 346       __ ldr(rscratch1, thread);
 347       __ cmp(rthread, rscratch1);
 348       __ br(Assembler::NE, S);
 349       __ get_thread(rscratch1);
 350       __ cmp(rthread, rscratch1);
 351       __ br(Assembler::EQ, L);
 352       __ BIND(S);
 353       __ stop("StubRoutines::call_stub: threads must correspond");
 354       __ BIND(L);
 355     }
 356 #endif
 357 
 358     // restore callee-save registers
 359     __ ldpd(v15, v14,  d15_save);
 360     __ ldpd(v13, v12,  d13_save);
 361     __ ldpd(v11, v10,  d11_save);
 362     __ ldpd(v9,  v8,   d9_save);
 363 
 364     __ ldp(r28, r27,   r28_save);
 365     __ ldp(r26, r25,   r26_save);
 366     __ ldp(r24, r23,   r24_save);
 367     __ ldp(r22, r21,   r22_save);
 368     __ ldp(r20, r19,   r20_save);
 369 
 370     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 371     __ ldrw(c_rarg2, result_type);
 372     __ ldr(c_rarg3,  method);
 373     __ ldp(c_rarg4, c_rarg5,  entry_point);
 374     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 375 
 376 #ifndef PRODUCT
 377     // tell the simulator we are about to end Java execution
 378     if (NotifySimulator) {
 379       __ notify(Assembler::method_exit);
 380     }
 381 #endif
 382     // leave frame and return to caller
 383     __ leave();
 384     __ ret(lr);
 385 
 386     // handle return types different from T_INT
 387 
 388     __ BIND(is_long);
 389     __ str(r0, Address(j_rarg2, 0));
 390     __ br(Assembler::AL, exit);
 391 
 392     __ BIND(is_float);
 393     __ strs(j_farg0, Address(j_rarg2, 0));
 394     __ br(Assembler::AL, exit);
 395 
 396     __ BIND(is_double);
 397     __ strd(j_farg0, Address(j_rarg2, 0));
 398     __ br(Assembler::AL, exit);
 399 
 400     return start;
 401   }
 402 
 403   // Return point for a Java call if there's an exception thrown in
 404   // Java code.  The exception is caught and transformed into a
 405   // pending exception stored in JavaThread that can be tested from
 406   // within the VM.
 407   //
 408   // Note: Usually the parameters are removed by the callee. In case
 409   // of an exception crossing an activation frame boundary, that is
 410   // not the case if the callee is compiled code => need to setup the
 411   // rsp.
 412   //
 413   // r0: exception oop
 414 
 415   // NOTE: this is used as a target from the signal handler so it
 416   // needs an x86 prolog which returns into the current simulator
 417   // executing the generated catch_exception code. so the prolog
 418   // needs to install rax in a sim register and adjust the sim's
 419   // restart pc to enter the generated code at the start position
 420   // then return from native to simulated execution.
 421 
 422   address generate_catch_exception() {
 423     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 424     address start = __ pc();
 425 
 426     // same as in generate_call_stub():
 427     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 428     const Address thread        (rfp, thread_off         * wordSize);
 429 
 430 #ifdef ASSERT
 431     // verify that threads correspond
 432     {
 433       Label L, S;
 434       __ ldr(rscratch1, thread);
 435       __ cmp(rthread, rscratch1);
 436       __ br(Assembler::NE, S);
 437       __ get_thread(rscratch1);
 438       __ cmp(rthread, rscratch1);
 439       __ br(Assembler::EQ, L);
 440       __ bind(S);
 441       __ stop("StubRoutines::catch_exception: threads must correspond");
 442       __ bind(L);
 443     }
 444 #endif
 445 
 446     // set pending exception
 447     __ verify_oop(r0);
 448 
 449     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 450     __ mov(rscratch1, (address)__FILE__);
 451     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 452     __ movw(rscratch1, (int)__LINE__);
 453     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 454 
 455     // complete return to VM
 456     assert(StubRoutines::_call_stub_return_address != NULL,
 457            "_call_stub_return_address must have been generated before");
 458     __ b(StubRoutines::_call_stub_return_address);
 459 
 460     return start;
 461   }
 462 
 463   // Continuation point for runtime calls returning with a pending
 464   // exception.  The pending exception check happened in the runtime
 465   // or native call stub.  The pending exception in Thread is
 466   // converted into a Java-level exception.
 467   //
 468   // Contract with Java-level exception handlers:
 469   // r0: exception
 470   // r3: throwing pc
 471   //
 472   // NOTE: At entry of this stub, exception-pc must be in LR !!
 473 
 474   // NOTE: this is always used as a jump target within generated code
 475   // so it just needs to be generated code with no x86 prolog
 476 
 477   address generate_forward_exception() {
 478     StubCodeMark mark(this, "StubRoutines", "forward exception");
 479     address start = __ pc();
 480 
 481     // Upon entry, LR points to the return address returning into
 482     // Java (interpreted or compiled) code; i.e., the return address
 483     // becomes the throwing pc.
 484     //
 485     // Arguments pushed before the runtime call are still on the stack
 486     // but the exception handler will reset the stack pointer ->
 487     // ignore them.  A potential result in registers can be ignored as
 488     // well.
 489 
 490 #ifdef ASSERT
 491     // make sure this code is only executed if there is a pending exception
 492     {
 493       Label L;
 494       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 495       __ cbnz(rscratch1, L);
 496       __ stop("StubRoutines::forward exception: no pending exception (1)");
 497       __ bind(L);
 498     }
 499 #endif
 500 
 501     // compute exception handler into r19
 502 
 503     // call the VM to find the handler address associated with the
 504     // caller address. pass thread in r0 and caller pc (ret address)
 505     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 506     // the stack.
 507     __ mov(c_rarg1, lr);
 508     // lr will be trashed by the VM call so we move it to R19
 509     // (callee-saved) because we also need to pass it to the handler
 510     // returned by this call.
 511     __ mov(r19, lr);
 512     BLOCK_COMMENT("call exception_handler_for_return_address");
 513     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 514                          SharedRuntime::exception_handler_for_return_address),
 515                     rthread, c_rarg1);
 516     // we should not really care that lr is no longer the callee
 517     // address. we saved the value the handler needs in r19 so we can
 518     // just copy it to r3. however, the C2 handler will push its own
 519     // frame and then call into the VM and the VM code asserts that
 520     // the PC for the frame above the handler belongs to a compiled
 521     // Java method. So, we restore lr here to satisfy that assert.
 522     __ mov(lr, r19);
 523     // setup r0 & r3 & clear pending exception
 524     __ mov(r3, r19);
 525     __ mov(r19, r0);
 526     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 527     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 528 
 529 #ifdef ASSERT
 530     // make sure exception is set
 531     {
 532       Label L;
 533       __ cbnz(r0, L);
 534       __ stop("StubRoutines::forward exception: no pending exception (2)");
 535       __ bind(L);
 536     }
 537 #endif
 538 
 539     // continue at exception handler
 540     // r0: exception
 541     // r3: throwing pc
 542     // r19: exception handler
 543     __ verify_oop(r0);
 544     __ br(r19);
 545 
 546     return start;
 547   }
 548 
 549   // Non-destructive plausibility checks for oops
 550   //
 551   // Arguments:
 552   //    r0: oop to verify
 553   //    rscratch1: error message
 554   //
 555   // Stack after saving c_rarg3:
 556   //    [tos + 0]: saved c_rarg3
 557   //    [tos + 1]: saved c_rarg2
 558   //    [tos + 2]: saved lr
 559   //    [tos + 3]: saved rscratch2
 560   //    [tos + 4]: saved r0
 561   //    [tos + 5]: saved rscratch1
 562   address generate_verify_oop() {
 563 
 564     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 565     address start = __ pc();
 566 
 567     Label exit, error;
 568 
 569     // save c_rarg2 and c_rarg3
 570     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 571 
 572     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 573     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 574     __ ldr(c_rarg3, Address(c_rarg2));
 575     __ add(c_rarg3, c_rarg3, 1);
 576     __ str(c_rarg3, Address(c_rarg2));
 577 
 578     // object is in r0
 579     // make sure object is 'reasonable'
 580     __ cbz(r0, exit); // if obj is NULL it is OK
 581 
 582     // Check if the oop is in the right area of memory
 583     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 584     __ andr(c_rarg2, r0, c_rarg3);
 585     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 586 
 587     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 588     // instruction here because the flags register is live.
 589     __ eor(c_rarg2, c_rarg2, c_rarg3);
 590     __ cbnz(c_rarg2, error);
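    // (a ^ b) == 0 exactly when a == b, so the eor/cbnz pair above is
    // an equality test that leaves the condition flags untouched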
 591 
 592     // make sure klass is 'reasonable', i.e. not NULL.
 593     __ load_klass(r0, r0);  // get klass
 594     __ cbz(r0, error);      // if klass is NULL it is broken
 595 
 596     // return if everything seems ok
 597     __ bind(exit);
 598 
 599     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 600     __ ret(lr);
 601 
 602     // handle errors
 603     __ bind(error);
 604     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 605 
 606     __ push(RegSet::range(r0, r29), sp);
 607     // debug(char* msg, int64_t pc, int64_t regs[])
 608     __ mov(c_rarg0, rscratch1);      // pass address of error message
 609     __ mov(c_rarg1, lr);             // pass return address
 610     __ mov(c_rarg2, sp);             // pass address of regs on stack
 611 #ifndef PRODUCT
 612     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 613 #endif
 614     BLOCK_COMMENT("call MacroAssembler::debug");
 615     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 616     __ blrt(rscratch1, 3, 0, 1);
 617 
 618     return start;
 619   }
 620 
 621   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 622 
 623   // The inner part of zero_words().  This is the bulk operation,
 624   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 625   // caller is responsible for zeroing the last few words.
 626   //
 627   // Inputs:
 628   // r10: the HeapWord-aligned base address of an array to zero.
 629   // r11: the count in HeapWords, r11 > 0.
 630   //
 631   // Returns r10 and r11, adjusted for the caller to clear.
 632   // r10: the base address of the tail of words left to clear.
 633   // r11: the number of words in the tail.
 634   //      r11 < MacroAssembler::zero_words_block_size.
 635 
 636   address generate_zero_blocks() {
 637     Label store_pair, loop_store_pair, done;
 638     Label base_aligned;
 639 
 640     Register base = r10, cnt = r11;
 641 
 642     __ align(CodeEntryAlignment);
 643     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 644     address start = __ pc();
 645 
 646     if (UseBlockZeroing) {
 647       int zva_length = VM_Version::zva_length();
 648 
 649       // Ensure the ZVA length is a multiple of 16. This is required by
 650       // the subsequent operations.
 651       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 652 
 653       __ tbz(base, 3, base_aligned);
 654       __ str(zr, Address(__ post(base, 8)));
 655       __ sub(cnt, cnt, 1);
 656       __ bind(base_aligned);
 657 
 658       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 659       // alignment.
 660       Label small;
 661       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 662       __ subs(rscratch1, cnt, low_limit >> 3);
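      // (low_limit is in bytes but cnt is in words, hence the >> 3)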
 663       __ br(Assembler::LT, small);
 664       __ zero_dcache_blocks(base, cnt);
 665       __ bind(small);
 666     }
 667 
 668     {
 669       // Number of stp instructions we'll unroll
 670       const int unroll =
 671         MacroAssembler::zero_words_block_size / 2;
 672       // Clear the remaining blocks.
 673       Label loop;
 674       __ subs(cnt, cnt, unroll * 2);
 675       __ br(Assembler::LT, done);
 676       __ bind(loop);
 677       for (int i = 0; i < unroll; i++)
 678         __ stp(zr, zr, __ post(base, 16));
 679       __ subs(cnt, cnt, unroll * 2);
 680       __ br(Assembler::GE, loop);
 681       __ bind(done);
 682       __ add(cnt, cnt, unroll * 2);
 683     }
 684 
 685     __ ret(lr);
 686 
 687     return start;
 688   }
 689 
 690 
 691   typedef enum {
 692     copy_forwards = 1,
 693     copy_backwards = -1
 694   } copy_direction;
 695 
 696   // Bulk copy of blocks of 8 words.
 697   //
 698   // count is a count of words.
 699   //
 700   // Precondition: count >= 8
 701   //
 702   // Postconditions:
 703   //
 704   // The least significant bit of count contains the remaining count
 705   // of words to copy.  The rest of count is trash.
 706   //
 707   // s and d are adjusted to point to the remaining words to copy
 708   //
 709   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 710                            copy_direction direction) {
 711     int unit = wordSize * direction;
 712     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 713 
 714     int offset;
 715     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 716       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 717     const Register stride = r13;
 718 
 719     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 720     assert_different_registers(s, d, count, rscratch1);
 721 
 722     Label again, drain;
 723     const char *stub_name;
 724     if (direction == copy_forwards)
 725       stub_name = "forward_copy_longs";
 726     else
 727       stub_name = "backward_copy_longs";
 728     StubCodeMark mark(this, "StubRoutines", stub_name);
 729     __ align(CodeEntryAlignment);
 730     __ bind(start);
 731 
 732     Label unaligned_copy_long;
 733     if (AvoidUnalignedAccesses) {
 734       __ tbnz(d, 3, unaligned_copy_long);
 735     }
 736 
 737     if (direction == copy_forwards) {
 738       __ sub(s, s, bias);
 739       __ sub(d, d, bias);
 740     }
 741 
 742 #ifdef ASSERT
 743     // Make sure we are never given < 8 words
 744     {
 745       Label L;
 746       __ cmp(count, 8);
 747       __ br(Assembler::GE, L);
 748       __ stop("generate_copy_longs called with < 8 words");
 749       __ bind(L);
 750     }
 751 #endif
 752 
 753     // Fill 8 registers
 754     if (UseSIMDForMemoryOps) {
 755       __ ldpq(v0, v1, Address(s, 4 * unit));
 756       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 757     } else {
 758       __ ldp(t0, t1, Address(s, 2 * unit));
 759       __ ldp(t2, t3, Address(s, 4 * unit));
 760       __ ldp(t4, t5, Address(s, 6 * unit));
 761       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 762     }
 763 
 764     __ subs(count, count, 16);
 765     __ br(Assembler::LO, drain);
 766 
 767     int prefetch = PrefetchCopyIntervalInBytes;
 768     bool use_stride = false;
 769     if (direction == copy_backwards) {
 770        use_stride = prefetch > 256;
 771        prefetch = -prefetch;
 772        if (use_stride) __ mov(stride, prefetch);
 773     }
 774 
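    // The copy loop below is software pipelined: each iteration stores
    // the eight words loaded by the previous iteration and loads the
    // next eight, and the drain code after the loop flushes the final
    // batch of loaded values.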
 775     __ bind(again);
 776 
 777     if (PrefetchCopyIntervalInBytes > 0)
 778       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 779 
 780     if (UseSIMDForMemoryOps) {
 781       __ stpq(v0, v1, Address(d, 4 * unit));
 782       __ ldpq(v0, v1, Address(s, 4 * unit));
 783       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 784       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 785     } else {
 786       __ stp(t0, t1, Address(d, 2 * unit));
 787       __ ldp(t0, t1, Address(s, 2 * unit));
 788       __ stp(t2, t3, Address(d, 4 * unit));
 789       __ ldp(t2, t3, Address(s, 4 * unit));
 790       __ stp(t4, t5, Address(d, 6 * unit));
 791       __ ldp(t4, t5, Address(s, 6 * unit));
 792       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 793       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 794     }
 795 
 796     __ subs(count, count, 8);
 797     __ br(Assembler::HS, again);
 798 
 799     // Drain
 800     __ bind(drain);
 801     if (UseSIMDForMemoryOps) {
 802       __ stpq(v0, v1, Address(d, 4 * unit));
 803       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 804     } else {
 805       __ stp(t0, t1, Address(d, 2 * unit));
 806       __ stp(t2, t3, Address(d, 4 * unit));
 807       __ stp(t4, t5, Address(d, 6 * unit));
 808       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 809     }
 810 
 811     {
 812       Label L1, L2;
 813       __ tbz(count, exact_log2(4), L1);
 814       if (UseSIMDForMemoryOps) {
 815         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 816         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 817       } else {
 818         __ ldp(t0, t1, Address(s, 2 * unit));
 819         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 820         __ stp(t0, t1, Address(d, 2 * unit));
 821         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 822       }
 823       __ bind(L1);
 824 
 825       if (direction == copy_forwards) {
 826         __ add(s, s, bias);
 827         __ add(d, d, bias);
 828       }
 829 
 830       __ tbz(count, 1, L2);
 831       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 832       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 833       __ bind(L2);
 834     }
 835 
 836     __ ret(lr);
 837 
 838     if (AvoidUnalignedAccesses) {
 839       Label drain, again;
 840       // Register order for storing. Order is different for backward copy.
 841 
 842       __ bind(unaligned_copy_long);
 843 
 844       // source address is even (16 byte) aligned, target is odd (8 byte) aligned
 845       //
 846       // when forward copying word pairs we read long pairs at offsets
 847       // {0, 2, 4, 6} (in long words). when backwards copying we read
 848       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 849       // address by -2 in the forwards case so we can compute the
 850       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 851       // or -1.
 852       //
 853       // when forward copying we need to store 1 word, 3 pairs and
 854       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 855       // zero offset we adjust the destination by -1 which means we
 856       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 857       //
 858       // When backwards copying we need to store 1 word, 3 pairs and
 859       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 860       // offsets {1, 3, 5, 7, 8} * unit.
 861 
 862       if (direction == copy_forwards) {
 863         __ sub(s, s, 16);
 864         __ sub(d, d, 8);
 865       }
 866 
 867       // Fill 8 registers
 868       //
 869       // for forwards copy s was offset by -16 from the original input
 870       // value of s so the register contents are at these offsets
 871       // relative to the 64 byte block addressed by that original input
 872       // and so on for each successive 64 byte block when s is updated
 873       //
 874       // t0 at offset 0,  t1 at offset 8
 875       // t2 at offset 16, t3 at offset 24
 876       // t4 at offset 32, t5 at offset 40
 877       // t6 at offset 48, t7 at offset 56
 878 
 879       // for backwards copy s was not offset so the register contents
 880       // are at these offsets into the preceding 64 byte block
 881       // relative to that original input and so on for each successive
 882       // preceding 64 byte block when s is updated. this explains the
 883       // slightly counter-intuitive looking pattern of register usage
 884       // in the stp instructions for backwards copy.
 885       //
 886       // t0 at offset -16, t1 at offset -8
 887       // t2 at offset -32, t3 at offset -24
 888       // t4 at offset -48, t5 at offset -40
 889       // t6 at offset -64, t7 at offset -56
 890 
 891       __ ldp(t0, t1, Address(s, 2 * unit));
 892       __ ldp(t2, t3, Address(s, 4 * unit));
 893       __ ldp(t4, t5, Address(s, 6 * unit));
 894       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 895 
 896       __ subs(count, count, 16);
 897       __ br(Assembler::LO, drain);
 898 
 899       int prefetch = PrefetchCopyIntervalInBytes;
 900       bool use_stride = false;
 901       if (direction == copy_backwards) {
 902          use_stride = prefetch > 256;
 903          prefetch = -prefetch;
 904          if (use_stride) __ mov(stride, prefetch);
 905       }
 906 
 907       __ bind(again);
 908 
 909       if (PrefetchCopyIntervalInBytes > 0)
 910         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 911 
 912       if (direction == copy_forwards) {
 913        // allowing for the offset of -8 the store instructions place
 914        // registers into the target 64 byte block at the following
 915        // offsets
 916        //
 917        // t0 at offset 0
 918        // t1 at offset 8,  t2 at offset 16
 919        // t3 at offset 24, t4 at offset 32
 920        // t5 at offset 40, t6 at offset 48
 921        // t7 at offset 56
 922 
 923         __ str(t0, Address(d, 1 * unit));
 924         __ stp(t1, t2, Address(d, 2 * unit));
 925         __ ldp(t0, t1, Address(s, 2 * unit));
 926         __ stp(t3, t4, Address(d, 4 * unit));
 927         __ ldp(t2, t3, Address(s, 4 * unit));
 928         __ stp(t5, t6, Address(d, 6 * unit));
 929         __ ldp(t4, t5, Address(s, 6 * unit));
 930         __ str(t7, Address(__ pre(d, 8 * unit)));
 931         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 932       } else {
 933        // d was not offset when we started so the registers are
 934        // written into the 64 byte block preceding d with the following
 935        // offsets
 936        //
 937        // t1 at offset -8
 938        // t3 at offset -24, t0 at offset -16
 939        // t5 at offset -40, t2 at offset -32
 940        // t7 at offset -56, t4 at offset -48
 941        //                   t6 at offset -64
 942        //
 943        // note that this matches the offsets previously noted for the
 944        // loads
 945 
 946         __ str(t1, Address(d, 1 * unit));
 947         __ stp(t3, t0, Address(d, 3 * unit));
 948         __ ldp(t0, t1, Address(s, 2 * unit));
 949         __ stp(t5, t2, Address(d, 5 * unit));
 950         __ ldp(t2, t3, Address(s, 4 * unit));
 951         __ stp(t7, t4, Address(d, 7 * unit));
 952         __ ldp(t4, t5, Address(s, 6 * unit));
 953         __ str(t6, Address(__ pre(d, 8 * unit)));
 954         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 955       }
 956 
 957       __ subs(count, count, 8);
 958       __ br(Assembler::HS, again);
 959 
 960       // Drain
 961       //
 962       // this uses the same pattern of offsets and register arguments
 963       // as above
 964       __ bind(drain);
 965       if (direction == copy_forwards) {
 966         __ str(t0, Address(d, 1 * unit));
 967         __ stp(t1, t2, Address(d, 2 * unit));
 968         __ stp(t3, t4, Address(d, 4 * unit));
 969         __ stp(t5, t6, Address(d, 6 * unit));
 970         __ str(t7, Address(__ pre(d, 8 * unit)));
 971       } else {
 972         __ str(t1, Address(d, 1 * unit));
 973         __ stp(t3, t0, Address(d, 3 * unit));
 974         __ stp(t5, t2, Address(d, 5 * unit));
 975         __ stp(t7, t4, Address(d, 7 * unit));
 976         __ str(t6, Address(__ pre(d, 8 * unit)));
 977       }
 978       // now we need to copy any remaining part block which may
 979       // include a 4 word subblock and/or a 2 word subblock.
 980       // bits 2 and 1 in the count tell us whether we have each
 981       // such subblock
 982       {
 983         Label L1, L2;
 984         __ tbz(count, exact_log2(4), L1);
 985        // this is the same as above but copying only 4 longs hence
 986        // with only one intervening stp between the str instructions
 987        // but note that the offsets and registers still follow the
 988        // same pattern
 989         __ ldp(t0, t1, Address(s, 2 * unit));
 990         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 991         if (direction == copy_forwards) {
 992           __ str(t0, Address(d, 1 * unit));
 993           __ stp(t1, t2, Address(d, 2 * unit));
 994           __ str(t3, Address(__ pre(d, 4 * unit)));
 995         } else {
 996           __ str(t1, Address(d, 1 * unit));
 997           __ stp(t3, t0, Address(d, 3 * unit));
 998           __ str(t2, Address(__ pre(d, 4 * unit)));
 999         }
1000         __ bind(L1);
1001 
1002         __ tbz(count, 1, L2);
1003        // this is the same as above but copying only 2 longs hence
1004        // there is no intervening stp between the str instructions
1005        // but note that the offset and register patterns are still
1006        // the same
1007         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1008         if (direction == copy_forwards) {
1009           __ str(t0, Address(d, 1 * unit));
1010           __ str(t1, Address(__ pre(d, 2 * unit)));
1011         } else {
1012           __ str(t1, Address(d, 1 * unit));
1013           __ str(t0, Address(__ pre(d, 2 * unit)));
1014         }
1015         __ bind(L2);
1016 
1017        // for forwards copy we need to re-adjust the offsets we
1018        // applied so that s and d follow the last words written
1019 
1020        if (direction == copy_forwards) {
1021          __ add(s, s, 16);
1022          __ add(d, d, 8);
1023        }
1024 
1025       }
1026 
1027       __ ret(lr);
1028     }
1029   }
1030 
1031   // Small copy: less than 16 bytes.
1032   //
1033   // NB: Ignores all of the bits of count which represent more than 15
1034   // bytes, so a caller doesn't have to mask them.
1035 
1036   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1037     bool is_backwards = step < 0;
1038     size_t granularity = uabs(step);
1039     int direction = is_backwards ? -1 : 1;
1040     int unit = wordSize * direction;
1041 
1042     Label Lpair, Lword, Lint, Lshort, Lbyte;
1043 
1044     assert(granularity
1045            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1046 
1047     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
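    // The tail copy is driven by testing individual bits of count: bit
    // (3 - log2(granularity)) says whether an 8 byte chunk remains, the
    // next lower bit whether a 4 byte chunk remains, and so on down to
    // a single element. For example a jshort copy (granularity 2) with
    // count == 5 has 10 bytes left and copies one 8 byte chunk (bit 2
    // set) followed by one 2 byte chunk (bit 0 set).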
1048 
1049     // ??? I don't know if this bit-test-and-branch is the right thing
1050     // to do.  It does a lot of jumping, resulting in several
1051     // mispredicted branches.  It might make more sense to do this
1052     // with something like Duff's device with a single computed branch.
1053 
1054     __ tbz(count, 3 - exact_log2(granularity), Lword);
1055     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1056     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1057     __ bind(Lword);
1058 
1059     if (granularity <= sizeof (jint)) {
1060       __ tbz(count, 2 - exact_log2(granularity), Lint);
1061       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1062       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1063       __ bind(Lint);
1064     }
1065 
1066     if (granularity <= sizeof (jshort)) {
1067       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1068       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1069       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1070       __ bind(Lshort);
1071     }
1072 
1073     if (granularity <= sizeof (jbyte)) {
1074       __ tbz(count, 0, Lbyte);
1075       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1076       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1077       __ bind(Lbyte);
1078     }
1079   }
1080 
1081   Label copy_f, copy_b;
1082 
1083   // All-singing all-dancing memory copy.
1084   //
1085   // Copy count units of memory from s to d.  The size of a unit is
1086   // step, which can be positive or negative depending on the direction
1087   // of copy.  If is_aligned is false, we align the source address.
1088   //
1089 
1090   void copy_memory(bool is_aligned, Register s, Register d,
1091                    Register count, Register tmp, int step) {
1092     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1093     bool is_backwards = step < 0;
1094     int granularity = uabs(step);
1095     const Register t0 = r3, t1 = r4;
1096 
1097     // <= 96 bytes do inline. Direction doesn't matter because we always
1098     // load all the data before writing anything
1099     Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
1100     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1101     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1102     const Register send = r17, dend = r18;
1103 
1104     if (PrefetchCopyIntervalInBytes > 0)
1105       __ prfm(Address(s, 0), PLDL1KEEP);
1106     __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
1107     __ br(Assembler::HI, copy_big);
1108 
1109     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1110     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
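    // send and dend point just past the last element of the source and
    // destination. The small cases below read from both ends and may
    // store overlapping chunks; that is safe because, as noted above,
    // every load is issued before any store.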
1111 
1112     __ cmp(count, 16/granularity);
1113     __ br(Assembler::LS, copy16);
1114 
1115     __ cmp(count, 64/granularity);
1116     __ br(Assembler::HI, copy80);
1117 
1118     __ cmp(count, 32/granularity);
1119     __ br(Assembler::LS, copy32);
1120 
1121     // 33..64 bytes
1122     if (UseSIMDForMemoryOps) {
1123       __ ldpq(v0, v1, Address(s, 0));
1124       __ ldpq(v2, v3, Address(send, -32));
1125       __ stpq(v0, v1, Address(d, 0));
1126       __ stpq(v2, v3, Address(dend, -32));
1127     } else {
1128       __ ldp(t0, t1, Address(s, 0));
1129       __ ldp(t2, t3, Address(s, 16));
1130       __ ldp(t4, t5, Address(send, -32));
1131       __ ldp(t6, t7, Address(send, -16));
1132 
1133       __ stp(t0, t1, Address(d, 0));
1134       __ stp(t2, t3, Address(d, 16));
1135       __ stp(t4, t5, Address(dend, -32));
1136       __ stp(t6, t7, Address(dend, -16));
1137     }
1138     __ b(finish);
1139 
1140     // 17..32 bytes
1141     __ bind(copy32);
1142     __ ldp(t0, t1, Address(s, 0));
1143     __ ldp(t2, t3, Address(send, -16));
1144     __ stp(t0, t1, Address(d, 0));
1145     __ stp(t2, t3, Address(dend, -16));
1146     __ b(finish);
1147 
1148     // 65..80/96 bytes
1149     // (96 bytes if SIMD because we do 32 bytes per instruction)
1150     __ bind(copy80);
1151     if (UseSIMDForMemoryOps) {
1152       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1153       __ ldpq(v4, v5, Address(send, -32));
1154       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1155       __ stpq(v4, v5, Address(dend, -32));
1156     } else {
1157       __ ldp(t0, t1, Address(s, 0));
1158       __ ldp(t2, t3, Address(s, 16));
1159       __ ldp(t4, t5, Address(s, 32));
1160       __ ldp(t6, t7, Address(s, 48));
1161       __ ldp(t8, t9, Address(send, -16));
1162 
1163       __ stp(t0, t1, Address(d, 0));
1164       __ stp(t2, t3, Address(d, 16));
1165       __ stp(t4, t5, Address(d, 32));
1166       __ stp(t6, t7, Address(d, 48));
1167       __ stp(t8, t9, Address(dend, -16));
1168     }
1169     __ b(finish);
1170 
1171     // 0..16 bytes
1172     __ bind(copy16);
1173     __ cmp(count, 8/granularity);
1174     __ br(Assembler::LO, copy8);
1175 
1176     // 8..16 bytes
1177     __ ldr(t0, Address(s, 0));
1178     __ ldr(t1, Address(send, -8));
1179     __ str(t0, Address(d, 0));
1180     __ str(t1, Address(dend, -8));
1181     __ b(finish);
1182 
1183     if (granularity < 8) {
1184       // 4..7 bytes
1185       __ bind(copy8);
1186       __ tbz(count, 2 - exact_log2(granularity), copy4);
1187       __ ldrw(t0, Address(s, 0));
1188       __ ldrw(t1, Address(send, -4));
1189       __ strw(t0, Address(d, 0));
1190       __ strw(t1, Address(dend, -4));
1191       __ b(finish);
1192       if (granularity < 4) {
1193         // 0..3 bytes
1194         __ bind(copy4);
1195         __ cbz(count, finish); // get rid of 0 case
1196         if (granularity == 2) {
1197           __ ldrh(t0, Address(s, 0));
1198           __ strh(t0, Address(d, 0));
1199         } else { // granularity == 1
1200           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1201           // the first and last byte.
1202           // Handle the 3 byte case by loading and storing base + count/2
1203           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1204           // This means that in the 1 byte case we load/store the same
1205           // byte 3 times.
1206           __ lsr(count, count, 1);
1207           __ ldrb(t0, Address(s, 0));
1208           __ ldrb(t1, Address(send, -1));
1209           __ ldrb(t2, Address(s, count));
1210           __ strb(t0, Address(d, 0));
1211           __ strb(t1, Address(dend, -1));
1212           __ strb(t2, Address(d, count));
1213         }
1214         __ b(finish);
1215       }
1216     }
1217 
1218     __ bind(copy_big);
1219     if (is_backwards) {
1220       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1221       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1222     }
1223 
1224     // Now that we've got the small case out of the way we can align the
1225     // source address on a 2-word boundary.
1226 
1227     Label aligned;
1228 
1229     if (is_aligned) {
1230       // We may have to adjust by 1 word to get s 2-word-aligned.
1231       __ tbz(s, exact_log2(wordSize), aligned);
1232       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1233       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1234       __ sub(count, count, wordSize/granularity);
1235     } else {
1236       if (is_backwards) {
1237         __ andr(rscratch2, s, 2 * wordSize - 1);
1238       } else {
1239         __ neg(rscratch2, s);
1240         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1241       }
1242       // rscratch2 is the byte adjustment needed to align s.
1243       __ cbz(rscratch2, aligned);
1244       int shift = exact_log2(granularity);
1245       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1246       __ sub(count, count, rscratch2);
1247 
1248 #if 0
1249       // ?? This code is only correct for a disjoint copy.  It may or
1250       // may not make sense to use it in that case.
1251 
1252       // Copy the first pair; s and d may not be aligned.
1253       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1254       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1255 
1256       // Align s and d, adjust count
1257       if (is_backwards) {
1258         __ sub(s, s, rscratch2);
1259         __ sub(d, d, rscratch2);
1260       } else {
1261         __ add(s, s, rscratch2);
1262         __ add(d, d, rscratch2);
1263       }
1264 #else
1265       copy_memory_small(s, d, rscratch2, rscratch1, step);
1266 #endif
1267     }
1268 
1269     __ bind(aligned);
1270 
1271     // s is now 2-word-aligned.
1272 
1273     // We have a count of units and some trailing bytes.  Adjust the
1274     // count and do a bulk copy of words.
1275     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1276     if (direction == copy_forwards)
1277       __ bl(copy_f);
1278     else
1279       __ bl(copy_b);
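    // copy_f and copy_b are bound to the bulk copy loops emitted by
    // generate_copy_longs; they take the word count computed into
    // rscratch2 above and return with s and d adjusted past the words
    // they copied, leaving only the sub-word tail.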
1280 
1281     // And the tail.
1282     copy_memory_small(s, d, count, tmp, step);
1283 
1284     if (granularity >= 8) __ bind(copy8);
1285     if (granularity >= 4) __ bind(copy4);
1286     __ bind(finish);
1287   }
1288 
1289 
1290   void clobber_registers() {
1291 #ifdef ASSERT
1292     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1293     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1294     for (Register r = r3; r <= r18; r++)
1295       if (r != rscratch1) __ mov(r, rscratch1);
1296 #endif
1297   }
1298 
1299   // Scan over array at a for count oops, verifying each one.
1300   // Preserves a and count, clobbers rscratch1 and rscratch2.
1301   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1302     Label loop, end;
1303     __ mov(rscratch1, a);
1304     __ mov(rscratch2, zr);
1305     __ bind(loop);
1306     __ cmp(rscratch2, count);
1307     __ br(Assembler::HS, end);
1308     if (size == (size_t)wordSize) {
1309       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1310       __ verify_oop(temp);
1311     } else {
1312       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1313       __ decode_heap_oop(temp); // calls verify_oop
1314     }
1315     __ add(rscratch2, rscratch2, size);
1316     __ b(loop);
1317     __ bind(end);
1318   }
1319 
1320   // Arguments:
1321   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1322   //             ignored
1323   //   is_oop  - true => oop array, so generate store check code
1324   //   name    - stub name string
1325   //
1326   // Inputs:
1327   //   c_rarg0   - source array address
1328   //   c_rarg1   - destination array address
1329   //   c_rarg2   - element count, treated as ssize_t, can be zero
1330   //
1331   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1332   // the hardware handle it.  The two dwords within qwords that span
1333   // cache line boundaries will still be loaded and stored atomically.
1334   //
1335   // Side Effects:
1336   //   disjoint_int_copy_entry is set to the no-overlap entry point
1337   //   used by generate_conjoint_int_oop_copy().
1338   //
1339   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1340                                   const char *name, bool dest_uninitialized = false) {
1341     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1342     RegSet saved_reg = RegSet::of(s, d, count);
1343     __ align(CodeEntryAlignment);
1344     StubCodeMark mark(this, "StubRoutines", name);
1345     address start = __ pc();
1346     __ enter();
1347 
1348     if (entry != NULL) {
1349       *entry = __ pc();
1350       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1351       BLOCK_COMMENT("Entry:");
1352     }
1353 
1354     DecoratorSet decorators = ARRAYCOPY_DISJOINT;
1355     if (dest_uninitialized) {
1356       decorators |= AS_DEST_NOT_INITIALIZED;
1357     }
1358     if (aligned) {
1359       decorators |= ARRAYCOPY_ALIGNED;
1360     }
1361 
1362     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1363     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg);
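    // the barrier set assembler hook above lets the active GC emit any
    // pre-copy barrier it needs when copying an oop array; for plain
    // primitive copies it typically expands to nothing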
1364 
1365     if (is_oop) {
1366       // save regs before copy_memory
1367       __ push(RegSet::of(d, count), sp);
1368     }
1369     copy_memory(aligned, s, d, count, rscratch1, size);
1370 
1371     if (is_oop) {
1372       __ pop(RegSet::of(d, count), sp);
1373       if (VerifyOops)
1374         verify_oop_array(size, d, count, r16);
1375       __ sub(count, count, 1); // make an inclusive end pointer
1376       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1377     }
1378 
1379     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1380 
1381     __ leave();
1382     __ mov(r0, zr); // return 0
1383     __ ret(lr);
1384 #ifdef BUILTIN_SIM
1385     {
1386       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1387       sim->notifyCompile(const_cast<char*>(name), start);
1388     }
1389 #endif
1390     return start;
1391   }
1392 
1393   // Arguments:
1394   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1395   //             ignored
1396   //   is_oop  - true => oop array, so generate store check code
1397   //   name    - stub name string
1398   //
1399   // Inputs:
1400   //   c_rarg0   - source array address
1401   //   c_rarg1   - destination array address
1402   //   c_rarg2   - element count, treated as ssize_t, can be zero
1403   //
1404   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1405   // the hardware handle it.  The two dwords within qwords that span
1406   // cache line boundaries will still be loaded and stored atomically.
1407   //
1408   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1409                                  address *entry, const char *name,
1410                                  bool dest_uninitialized = false) {
1411     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1412     RegSet saved_regs = RegSet::of(s, d, count);
1413     StubCodeMark mark(this, "StubRoutines", name);
1414     address start = __ pc();
1415     __ enter();
1416 
1417     if (entry != NULL) {
1418       *entry = __ pc();
1419       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1420       BLOCK_COMMENT("Entry:");
1421     }
1422 
1423     // use fwd copy when (d-s) above_equal (count*size)
1424     __ sub(rscratch1, d, s);
1425     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1426     __ br(Assembler::HS, nooverlap_target);
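    // The single unsigned compare above covers both cases where a
    // forward copy is safe: d >= s + count*size (disjoint regions) and
    // d < s (d - s wraps to a large unsigned value). Only when
    // s <= d < s + count*size do we fall through and copy backwards.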
1427 
1428     DecoratorSet decorators = 0;
1429     if (dest_uninitialized) {
1430       decorators |= AS_DEST_NOT_INITIALIZED;
1431     }
1432     if (aligned) {
1433       decorators |= ARRAYCOPY_ALIGNED;
1434     }
1435 
1436     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1437     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs);
1438 
1439     if (is_oop) {
1440       // save regs before copy_memory
1441       __ push(RegSet::of(d, count), sp);
1442     }
1443     copy_memory(aligned, s, d, count, rscratch1, -size);
1444     if (is_oop) {
1445       __ pop(RegSet::of(d, count), sp);
1446       if (VerifyOops)
1447         verify_oop_array(size, d, count, r16);
1448       __ sub(count, count, 1); // make an inclusive end pointer
1449       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1450     }
1451     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1452     __ leave();
1453     __ mov(r0, zr); // return 0
1454     __ ret(lr);
1455 #ifdef BUILTIN_SIM
1456     {
1457       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1458       sim->notifyCompile(const_cast<char*>(name), start);
1459     }
1460 #endif
1461     return start;
1462   }
1463 
1464   // Arguments:
1465   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1466   //             ignored
1467   //   name    - stub name string
1468   //
1469   // Inputs:
1470   //   c_rarg0   - source array address
1471   //   c_rarg1   - destination array address
1472   //   c_rarg2   - element count, treated as ssize_t, can be zero
1473   //
1474   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1475   // we let the hardware handle it.  The one to eight bytes within words,
1476   // dwords or qwords that span cache line boundaries will still be loaded
1477   // and stored atomically.
1478   //
1479   // Side Effects:
1487   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1488   //   used by generate_conjoint_byte_copy().
1489   //
1490   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1491     const bool not_oop = false;
1492     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1493   }
1494 
1495   // Arguments:
1496   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1497   //             ignored
1498   //   name    - stub name string
1499   //
1500   // Inputs:
1501   //   c_rarg0   - source array address
1502   //   c_rarg1   - destination array address
1503   //   c_rarg2   - element count, treated as ssize_t, can be zero
1504   //
1505   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1506   // we let the hardware handle it.  The one to eight bytes within words,
1507   // dwords or qwords that span cache line boundaries will still be loaded
1508   // and stored atomically.
1509   //
1510   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1511                                       address* entry, const char *name) {
1512     const bool not_oop = false;
1513     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1514   }
1515 
1516   // Arguments:
1517   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1518   //             ignored
1519   //   name    - stub name string
1520   //
1521   // Inputs:
1522   //   c_rarg0   - source array address
1523   //   c_rarg1   - destination array address
1524   //   c_rarg2   - element count, treated as ssize_t, can be zero
1525   //
1526   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1527   // let the hardware handle it.  The two or four words within dwords
1528   // or qwords that span cache line boundaries will still be loaded
1529   // and stored atomically.
1530   //
1531   // Side Effects:
1532   //   disjoint_short_copy_entry is set to the no-overlap entry point
1533   //   used by generate_conjoint_short_copy().
1534   //
1535   address generate_disjoint_short_copy(bool aligned,
1536                                        address* entry, const char *name) {
1537     const bool not_oop = false;
1538     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1539   }
1540 
1541   // Arguments:
1542   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1543   //             ignored
1544   //   name    - stub name string
1545   //
1546   // Inputs:
1547   //   c_rarg0   - source array address
1548   //   c_rarg1   - destination array address
1549   //   c_rarg2   - element count, treated as ssize_t, can be zero
1550   //
1551   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1552   // let the hardware handle it.  The two or four words within dwords
1553   // or qwords that span cache line boundaries will still be loaded
1554   // and stored atomically.
1555   //
1556   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1557                                        address *entry, const char *name) {
1558     const bool not_oop = false;
1559     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1560   }
1561 
1562   // Arguments:
1563   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1564   //             ignored
1565   //   name    - stub name string
1566   //
1567   // Inputs:
1568   //   c_rarg0   - source array address
1569   //   c_rarg1   - destination array address
1570   //   c_rarg2   - element count, treated as ssize_t, can be zero
1571   //
1572   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1573   // the hardware handle it.  The two dwords within qwords that span
1574   // cache line boundaries will still be loaded and stored atomically.
1575   //
1576   // Side Effects:
1577   //   disjoint_int_copy_entry is set to the no-overlap entry point
1578   //   used by generate_conjoint_int_oop_copy().
1579   //
1580   address generate_disjoint_int_copy(bool aligned, address *entry,
1581                                          const char *name, bool dest_uninitialized = false) {
1582     const bool not_oop = false;
1583     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1584   }
1585 
1586   // Arguments:
1587   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1588   //             ignored
1589   //   name    - stub name string
1590   //
1591   // Inputs:
1592   //   c_rarg0   - source array address
1593   //   c_rarg1   - destination array address
1594   //   c_rarg2   - element count, treated as ssize_t, can be zero
1595   //
1596   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1597   // the hardware handle it.  The two dwords within qwords that span
1598   // cache line boundaries will still be loaded and stored atomically.
1599   //
1600   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1601                                      address *entry, const char *name,
1602                                      bool dest_uninitialized = false) {
1603     const bool not_oop = false;
1604     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1605   }
1606 
1607 
1608   // Arguments:
1609   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1610   //             ignored
1611   //   name    - stub name string
1612   //
1613   // Inputs:
1614   //   c_rarg0   - source array address
1615   //   c_rarg1   - destination array address
1616   //   c_rarg2   - element count, treated as size_t, can be zero
1617   //
1618   // Side Effects:
1619   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1620   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1621   //
1622   address generate_disjoint_long_copy(bool aligned, address *entry,
1623                                           const char *name, bool dest_uninitialized = false) {
1624     const bool not_oop = false;
1625     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1626   }
1627 
1628   // Arguments:
1629   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1630   //             ignored
1631   //   name    - stub name string
1632   //
1633   // Inputs:
1634   //   c_rarg0   - source array address
1635   //   c_rarg1   - destination array address
1636   //   c_rarg2   - element count, treated as size_t, can be zero
1637   //
1638   address generate_conjoint_long_copy(bool aligned,
1639                                       address nooverlap_target, address *entry,
1640                                       const char *name, bool dest_uninitialized = false) {
1641     const bool not_oop = false;
1642     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1643   }
1644 
1645   // Arguments:
1646   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1647   //             ignored
1648   //   name    - stub name string
1649   //
1650   // Inputs:
1651   //   c_rarg0   - source array address
1652   //   c_rarg1   - destination array address
1653   //   c_rarg2   - element count, treated as size_t, can be zero
1654   //
1655   // Side Effects:
1656   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1657   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1658   //
1659   address generate_disjoint_oop_copy(bool aligned, address *entry,
1660                                      const char *name, bool dest_uninitialized) {
1661     const bool is_oop = true;
1662     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1663     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1664   }
1665 
1666   // Arguments:
1667   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1668   //             ignored
1669   //   name    - stub name string
1670   //
1671   // Inputs:
1672   //   c_rarg0   - source array address
1673   //   c_rarg1   - destination array address
1674   //   c_rarg2   - element count, treated as size_t, can be zero
1675   //
1676   address generate_conjoint_oop_copy(bool aligned,
1677                                      address nooverlap_target, address *entry,
1678                                      const char *name, bool dest_uninitialized) {
1679     const bool is_oop = true;
1680     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1681     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1682                                   name, dest_uninitialized);
1683   }
1684 
1685 
1686   // Helper for generating a dynamic type check.
1687   // Smashes rscratch1.
1688   void generate_type_check(Register sub_klass,
1689                            Register super_check_offset,
1690                            Register super_klass,
1691                            Label& L_success) {
1692     assert_different_registers(sub_klass, super_check_offset, super_klass);
1693 
1694     BLOCK_COMMENT("type_check:");
1695 
1696     Label L_miss;
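         // The fast path consults the super_check_offset cache slot; if that is
         // inconclusive, the slow path scans the secondary supers list.  Both
         // jump to L_success on a hit; on a miss control reaches L_miss below.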
1697 
1698     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1699                                      super_check_offset);
1700     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1701 
1702     // Fall through on failure!
1703     __ BIND(L_miss);
1704   }
1705 
1706   //
1707   //  Generate checkcasting array copy stub
1708   //
1709   //  Input:
1710   //    c_rarg0   - source array address
1711   //    c_rarg1   - destination array address
1712   //    c_rarg2   - element count, treated as ssize_t, can be zero
1713   //    c_rarg3   - size_t ckoff (super_check_offset)
1714   //    c_rarg4   - oop ckval (super_klass)
1715   //
1716   //  Output:
1717   //    r0 ==  0  -  success
1718   //    r0 == -1^K - failure, where K is partial transfer count
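       //               (for example, a failure after K == 2 copied elements
       //                returns r0 == ~2 == -3; the caller recovers K as ~r0)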
1719   //
1720   address generate_checkcast_copy(const char *name, address *entry,
1721                                   bool dest_uninitialized = false) {
1722 
1723     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1724 
1725     // Input registers (after setup_arg_regs)
1726     const Register from        = c_rarg0;   // source array address
1727     const Register to          = c_rarg1;   // destination array address
1728     const Register count       = c_rarg2;   // elements count
1729     const Register ckoff       = c_rarg3;   // super_check_offset
1730     const Register ckval       = c_rarg4;   // super_klass
1731 
1732     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1733     RegSet wb_post_saved_regs = RegSet::of(count);
1734 
1735     // Registers used as temps (r18, r19, r20 are save-on-entry)
1736     const Register count_save  = r21;       // orig elements count
1737     const Register start_to    = r20;       // destination array start address
1738     const Register copied_oop  = r18;       // actual oop copied
1739     const Register r19_klass   = r19;       // oop._klass
1740 
1741     //---------------------------------------------------------------
1742     // Assembler stub will be used for this call to arraycopy
1743     // if the two arrays are subtypes of Object[] but the
1744     // destination array type is not equal to or a supertype
1745     // of the source type.  Each element must be separately
1746     // checked.
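         // (For example, when copying from an Object[] whose elements happen
         // to all be Strings into a String[], each element must still be
         // type-checked individually.)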
1747 
1748     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1749                                copied_oop, r19_klass, count_save);
1750 
1751     __ align(CodeEntryAlignment);
1752     StubCodeMark mark(this, "StubRoutines", name);
1753     address start = __ pc();
1754 
1755     __ enter(); // required for proper stackwalking of RuntimeStub frame
1756 
1757 #ifdef ASSERT
1758     // caller guarantees that the arrays really are different
1759     // otherwise, we would have to make conjoint checks
1760     { Label L;
1761       array_overlap_test(L, TIMES_OOP);
1762       __ stop("checkcast_copy within a single array");
1763       __ bind(L);
1764     }
1765 #endif //ASSERT
1766 
1767     // Caller of this entry point must set up the argument registers.
1768     if (entry != NULL) {
1769       *entry = __ pc();
1770       BLOCK_COMMENT("Entry:");
1771     }
1772 
1773      // Empty array:  Nothing to do.
1774     __ cbz(count, L_done);
1775 
1776     __ push(RegSet::of(r18, r19, r20, r21), sp);
1777 
1778 #ifdef ASSERT
1779     BLOCK_COMMENT("assert consistent ckoff/ckval");
1780     // The ckoff and ckval must be mutually consistent,
1781     // even though caller generates both.
1782     { Label L;
1783       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1784       __ ldrw(start_to, Address(ckval, sco_offset));
1785       __ cmpw(ckoff, start_to);
1786       __ br(Assembler::EQ, L);
1787       __ stop("super_check_offset inconsistent");
1788       __ bind(L);
1789     }
1790 #endif //ASSERT
1791 
1792     DecoratorSet decorators = ARRAYCOPY_CHECKCAST;
1793     bool is_oop = true;
1794     if (dest_uninitialized) {
1795       decorators |= AS_DEST_NOT_INITIALIZED;
1796     }
1797 
1798     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1799     bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs);
1800 
1801     // save the original count
1802     __ mov(count_save, count);
1803 
1804     // Copy from low to high addresses
1805     __ mov(start_to, to);              // Save destination array start address
1806     __ b(L_load_element);
1807 
1808     // ======== begin loop ========
1809     // (Loop is rotated; its entry is L_load_element.)
1810     // Loop control:
1811     //   for (; count != 0; count--) {
1812     //     copied_oop = load_heap_oop(from++);
1813     //     ... generate_type_check ...;
1814     //     store_heap_oop(to++, copied_oop);
1815     //   }
1816     __ align(OptoLoopAlignment);
1817 
1818     __ BIND(L_store_element);
1819     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1820     __ sub(count, count, 1);
1821     __ cbz(count, L_do_card_marks);
1822 
1823     // ======== loop entry is here ========
1824     __ BIND(L_load_element);
1825     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1826     __ cbz(copied_oop, L_store_element);
1827 
1828     __ load_klass(r19_klass, copied_oop);// query the object klass
1829     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1830     // ======== end loop ========
1831 
1832     // It was a real error; we must depend on the caller to finish the job.
1833     // Register count = remaining oops, count_orig = total oops.
1834     // Emit GC store barriers for the oops we have copied and report
1835     // their number to the caller.
1836 
1837     __ subs(count, count_save, count);     // K = partially copied oop count
1838     __ eon(count, count, zr);                   // report (-1^K) to caller
1839     __ br(Assembler::EQ, L_done_pop);
1840 
1841     __ BIND(L_do_card_marks);
1842     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1843     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1, wb_post_saved_regs);
1844 
1845     __ bind(L_done_pop);
1846     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1847     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1848 
1849     __ bind(L_done);
1850     __ mov(r0, count);
1851     __ leave();
1852     __ ret(lr);
1853 
1854     return start;
1855   }
1856 
1857   // Perform range checks on the proposed arraycopy.
1858   // Kills temp, but nothing else.
1859   // Also, clean the sign bits of src_pos and dst_pos.
1860   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1861                               Register src_pos, // source position (c_rarg1)
1862                               Register dst,     // destination array oop (c_rarg2)
1863                               Register dst_pos, // destination position (c_rarg3)
1864                               Register length,
1865                               Register temp,
1866                               Label& L_failed) {
1867     BLOCK_COMMENT("arraycopy_range_checks:");
1868 
1869     assert_different_registers(rscratch1, temp);
1870 
1871     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1872     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1873     __ addw(temp, length, src_pos);
1874     __ cmpw(temp, rscratch1);
1875     __ br(Assembler::HI, L_failed);
1876 
1877     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1878     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1879     __ addw(temp, length, dst_pos);
1880     __ cmpw(temp, rscratch1);
1881     __ br(Assembler::HI, L_failed);
1882 
1883     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1884     __ movw(src_pos, src_pos);
1885     __ movw(dst_pos, dst_pos);
1886 
1887     BLOCK_COMMENT("arraycopy_range_checks done");
1888   }
1889 
1890   // These stubs are currently only called from a simple test routine;
1891   // they will be implemented properly when they are called from
1892   // code that actually does something.
1893   static void fake_arraycopy_stub(address src, address dst, int count) {
1894     assert(count == 0, "huh?");
1895   }
1896 
1897 
1898   //
1899   //  Generate 'unsafe' array copy stub
1900   //  Though just as safe as the other stubs, it takes an unscaled
1901   //  size_t argument instead of an element count.
1902   //
1903   //  Input:
1904   //    c_rarg0   - source array address
1905   //    c_rarg1   - destination array address
1906   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1907   //
1908   // Examines the alignment of the operands and dispatches
1909   // to a long, int, short, or byte copy loop.
1910   //
1911   address generate_unsafe_copy(const char *name,
1912                                address byte_copy_entry,
1913                                address short_copy_entry,
1914                                address int_copy_entry,
1915                                address long_copy_entry) {
1916     Label L_long_aligned, L_int_aligned, L_short_aligned;
1917     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1918 
1919     __ align(CodeEntryAlignment);
1920     StubCodeMark mark(this, "StubRoutines", name);
1921     address start = __ pc();
1922     __ enter(); // required for proper stackwalking of RuntimeStub frame
1923 
1924     // bump this on entry, not on exit:
1925     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1926 
1927     __ orr(rscratch1, s, d);
1928     __ orr(rscratch1, rscratch1, count);
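         // rscratch1 now holds s | d | count, so its low bits give the coarsest
         // alignment common to both addresses and the byte count; the masks and
         // tests below pick the widest element size that alignment allows.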
1929 
1930     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1931     __ cbz(rscratch1, L_long_aligned);
1932     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1933     __ cbz(rscratch1, L_int_aligned);
1934     __ tbz(rscratch1, 0, L_short_aligned);
1935     __ b(RuntimeAddress(byte_copy_entry));
1936 
1937     __ BIND(L_short_aligned);
1938     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1939     __ b(RuntimeAddress(short_copy_entry));
1940     __ BIND(L_int_aligned);
1941     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1942     __ b(RuntimeAddress(int_copy_entry));
1943     __ BIND(L_long_aligned);
1944     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1945     __ b(RuntimeAddress(long_copy_entry));
1946 
1947     return start;
1948   }
1949 
1950   //
1951   //  Generate generic array copy stubs
1952   //
1953   //  Input:
1954   //    c_rarg0    -  src oop
1955   //    c_rarg1    -  src_pos (32-bits)
1956   //    c_rarg2    -  dst oop
1957   //    c_rarg3    -  dst_pos (32-bits)
1958   //    c_rarg4    -  element count (32-bits)
1959   //
1960   //  Output:
1961   //    r0 ==  0  -  success
1962   //    r0 == -1^K - failure, where K is partial transfer count
1963   //
1964   address generate_generic_copy(const char *name,
1965                                 address byte_copy_entry, address short_copy_entry,
1966                                 address int_copy_entry, address oop_copy_entry,
1967                                 address long_copy_entry, address checkcast_copy_entry) {
1968 
1969     Label L_failed, L_failed_0, L_objArray;
1970     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1971 
1972     // Input registers
1973     const Register src        = c_rarg0;  // source array oop
1974     const Register src_pos    = c_rarg1;  // source position
1975     const Register dst        = c_rarg2;  // destination array oop
1976     const Register dst_pos    = c_rarg3;  // destination position
1977     const Register length     = c_rarg4;
1978 
1979     StubCodeMark mark(this, "StubRoutines", name);
1980 
1981     __ align(CodeEntryAlignment);
1982     address start = __ pc();
1983 
1984     __ enter(); // required for proper stackwalking of RuntimeStub frame
1985 
1986     // bump this on entry, not on exit:
1987     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1988 
1989     //-----------------------------------------------------------------------
1990     // Assembler stub will be used for this call to arraycopy
1991     // if the following conditions are met:
1992     //
1993     // (1) src and dst must not be null.
1994     // (2) src_pos must not be negative.
1995     // (3) dst_pos must not be negative.
1996     // (4) length  must not be negative.
1997     // (5) src klass and dst klass should be the same and not NULL.
1998     // (6) src and dst should be arrays.
1999     // (7) src_pos + length must not exceed length of src.
2000     // (8) dst_pos + length must not exceed length of dst.
2001     //
2002 
2003     //  if (src == NULL) return -1;
2004     __ cbz(src, L_failed);
2005 
2006     //  if (src_pos < 0) return -1;
2007     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2008 
2009     //  if (dst == NULL) return -1;
2010     __ cbz(dst, L_failed);
2011 
2012     //  if (dst_pos < 0) return -1;
2013     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2014 
2015     // registers used as temp
2016     const Register scratch_length    = r16; // elements count to copy
2017     const Register scratch_src_klass = r17; // array klass
2018     const Register lh                = r18; // layout helper
2019 
2020     //  if (length < 0) return -1;
2021     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2022     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2023 
2024     __ load_klass(scratch_src_klass, src);
2025 #ifdef ASSERT
2026     //  assert(src->klass() != NULL);
2027     {
2028       BLOCK_COMMENT("assert klasses not null {");
2029       Label L1, L2;
2030       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2031       __ bind(L1);
2032       __ stop("broken null klass");
2033       __ bind(L2);
2034       __ load_klass(rscratch1, dst);
2035       __ cbz(rscratch1, L1);     // this would be broken also
2036       BLOCK_COMMENT("} assert klasses not null done");
2037     }
2038 #endif
2039 
2040     // Load layout helper (32-bits)
2041     //
2042     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2043     // 32        30    24            16              8     2                 0
2044     //
2045     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2046     //
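         // For example, a jbyte[] has array_tag 0x3 and log2_element_size 0,
         // so lh is negative and its low bits give the shift used to scale
         // src_pos/dst_pos below.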
2047 
2048     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2049 
2050     // Handle objArrays completely differently...
2051     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2052     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2053     __ movw(rscratch1, objArray_lh);
2054     __ eorw(rscratch2, lh, rscratch1);
2055     __ cbzw(rscratch2, L_objArray);
2056 
2057     //  if (src->klass() != dst->klass()) return -1;
2058     __ load_klass(rscratch2, dst);
2059     __ eor(rscratch2, rscratch2, scratch_src_klass);
2060     __ cbnz(rscratch2, L_failed);
2061 
2062     //  if (!src->is_Array()) return -1;
2063     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2064 
2065     // At this point, it is known to be a typeArray (array_tag 0x3).
2066 #ifdef ASSERT
2067     {
2068       BLOCK_COMMENT("assert primitive array {");
2069       Label L;
2070       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2071       __ cmpw(lh, rscratch2);
2072       __ br(Assembler::GE, L);
2073       __ stop("must be a primitive array");
2074       __ bind(L);
2075       BLOCK_COMMENT("} assert primitive array done");
2076     }
2077 #endif
2078 
2079     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2080                            rscratch2, L_failed);
2081 
2082     // TypeArrayKlass
2083     //
2084     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2085     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2086     //
2087 
2088     const Register rscratch1_offset = rscratch1;    // array offset
2089     const Register r18_elsize = lh; // element size
2090 
2091     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2092            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2093     __ add(src, src, rscratch1_offset);           // src array offset
2094     __ add(dst, dst, rscratch1_offset);           // dst array offset
2095     BLOCK_COMMENT("choose copy loop based on element size");
2096 
2097     // The following registers must be set before the jump to the corresponding stub
2098     const Register from     = c_rarg0;  // source array address
2099     const Register to       = c_rarg1;  // destination array address
2100     const Register count    = c_rarg2;  // elements count
2101 
2102     // 'from', 'to' and 'count' must be assigned in this order, since they
2103     // occupy the same registers as 'src', 'src_pos' and 'dst'.
2104 
2105     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2106 
2107     // The possible values of elsize are 0-3, i.e. exact_log2(element
2108     // size in bytes).  We do a simple bitwise binary search.
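         // Bit 1 of elsize separates {byte, short} (0, 1) from {int, long} (2, 3);
         // bit 0 then selects within each pair.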
2109   __ BIND(L_copy_bytes);
2110     __ tbnz(r18_elsize, 1, L_copy_ints);
2111     __ tbnz(r18_elsize, 0, L_copy_shorts);
2112     __ lea(from, Address(src, src_pos));// src_addr
2113     __ lea(to,   Address(dst, dst_pos));// dst_addr
2114     __ movw(count, scratch_length); // length
2115     __ b(RuntimeAddress(byte_copy_entry));
2116 
2117   __ BIND(L_copy_shorts);
2118     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2119     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2120     __ movw(count, scratch_length); // length
2121     __ b(RuntimeAddress(short_copy_entry));
2122 
2123   __ BIND(L_copy_ints);
2124     __ tbnz(r18_elsize, 0, L_copy_longs);
2125     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2126     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2127     __ movw(count, scratch_length); // length
2128     __ b(RuntimeAddress(int_copy_entry));
2129 
2130   __ BIND(L_copy_longs);
2131 #ifdef ASSERT
2132     {
2133       BLOCK_COMMENT("assert long copy {");
2134       Label L;
2135       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2136       __ cmpw(r18_elsize, LogBytesPerLong);
2137       __ br(Assembler::EQ, L);
2138       __ stop("must be long copy, but elsize is wrong");
2139       __ bind(L);
2140       BLOCK_COMMENT("} assert long copy done");
2141     }
2142 #endif
2143     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2144     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2145     __ movw(count, scratch_length); // length
2146     __ b(RuntimeAddress(long_copy_entry));
2147 
2148     // ObjArrayKlass
2149   __ BIND(L_objArray);
2150     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2151 
2152     Label L_plain_copy, L_checkcast_copy;
2153     //  test array classes for subtyping
2154     __ load_klass(r18, dst);
2155     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2156     __ br(Assembler::NE, L_checkcast_copy);
2157 
2158     // Identically typed arrays can be copied without element-wise checks.
2159     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2160                            rscratch2, L_failed);
2161 
2162     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2163     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2164     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2165     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2166     __ movw(count, scratch_length); // length
2167   __ BIND(L_plain_copy);
2168     __ b(RuntimeAddress(oop_copy_entry));
2169 
2170   __ BIND(L_checkcast_copy);
2171     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2172     {
2173       // Before looking at dst.length, make sure dst is also an objArray.
2174       __ ldrw(rscratch1, Address(r18, lh_offset));
2175       __ movw(rscratch2, objArray_lh);
2176       __ eorw(rscratch1, rscratch1, rscratch2);
2177       __ cbnzw(rscratch1, L_failed);
2178 
2179       // It is safe to examine both src.length and dst.length.
2180       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2181                              r18, L_failed);
2182 
2183       const Register rscratch2_dst_klass = rscratch2;
2184       __ load_klass(rscratch2_dst_klass, dst); // reload
2185 
2186       // Marshal the base address arguments now, freeing registers.
2187       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2188       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2189       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2190       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2191       __ movw(count, length);           // length (reloaded)
2192       Register sco_temp = c_rarg3;      // this register is free now
2193       assert_different_registers(from, to, count, sco_temp,
2194                                  rscratch2_dst_klass, scratch_src_klass);
2195       // assert_clean_int(count, sco_temp);
2196 
2197       // Generate the type check.
2198       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2199       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2200       // assert_clean_int(sco_temp, r18);
2201       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2202 
2203       // Fetch destination element klass from the ObjArrayKlass header.
2204       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2205       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2206       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2207 
2208       // the checkcast_copy loop needs two extra arguments:
2209       assert(c_rarg3 == sco_temp, "#3 already in place");
2210       // Set up arguments for checkcast_copy_entry.
2211       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2212       __ b(RuntimeAddress(checkcast_copy_entry));
2213     }
2214 
2215   __ BIND(L_failed);
2216     __ mov(r0, -1);
2217     __ leave();   // required for proper stackwalking of RuntimeStub frame
2218     __ ret(lr);
2219 
2220     return start;
2221   }
2222 
2223   //
2224   // Generate stub for array fill. If "aligned" is true, the
2225   // "to" address is assumed to be heapword aligned.
2226   //
2227   // Arguments for generated stub:
2228   //   to:    c_rarg0
2229   //   value: c_rarg1
2230   //   count: c_rarg2 treated as signed
2231   //
2232   address generate_fill(BasicType t, bool aligned, const char *name) {
2233     __ align(CodeEntryAlignment);
2234     StubCodeMark mark(this, "StubRoutines", name);
2235     address start = __ pc();
2236 
2237     BLOCK_COMMENT("Entry:");
2238 
2239     const Register to        = c_rarg0;  // destination array address
2240     const Register value     = c_rarg1;  // value
2241     const Register count     = c_rarg2;  // elements count
2242 
2243     const Register bz_base = r10;        // base for block_zero routine
2244     const Register cnt_words = r11;      // temp register
2245 
2246     __ enter();
2247 
2248     Label L_fill_elements, L_exit1;
2249 
2250     int shift = -1;
2251     switch (t) {
2252       case T_BYTE:
2253         shift = 0;
2254         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2255         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2256         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2257         __ br(Assembler::LO, L_fill_elements);
2258         break;
2259       case T_SHORT:
2260         shift = 1;
2261         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2262         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2263         __ br(Assembler::LO, L_fill_elements);
2264         break;
2265       case T_INT:
2266         shift = 2;
2267         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2268         __ br(Assembler::LO, L_fill_elements);
2269         break;
2270       default: ShouldNotReachHere();
2271     }
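         // At this point 'value' holds the fill pattern replicated across the
         // low 32 bits (it is widened to 64 bits just before the bulk fill),
         // and fills shorter than 8 bytes have branched to L_fill_elements.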
2272 
2273     // Align source address at 8 bytes address boundary.
2274     Label L_skip_align1, L_skip_align2, L_skip_align4;
2275     if (!aligned) {
2276       switch (t) {
2277         case T_BYTE:
2278           // One byte misalignment happens only for byte arrays.
2279           __ tbz(to, 0, L_skip_align1);
2280           __ strb(value, Address(__ post(to, 1)));
2281           __ subw(count, count, 1);
2282           __ bind(L_skip_align1);
2283           // Fallthrough
2284         case T_SHORT:
2285           // Two bytes misalignment happens only for byte and short (char) arrays.
2286           __ tbz(to, 1, L_skip_align2);
2287           __ strh(value, Address(__ post(to, 2)));
2288           __ subw(count, count, 2 >> shift);
2289           __ bind(L_skip_align2);
2290           // Fallthrough
2291         case T_INT:
2292           // Align to 8 bytes, we know we are 4 byte aligned to start.
2293           __ tbz(to, 2, L_skip_align4);
2294           __ strw(value, Address(__ post(to, 4)));
2295           __ subw(count, count, 4 >> shift);
2296           __ bind(L_skip_align4);
2297           break;
2298         default: ShouldNotReachHere();
2299       }
2300     }
2301 
2302     //
2303     //  Fill large chunks
2304     //
2305     __ lsrw(cnt_words, count, 3 - shift); // number of words
2306     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2307     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2308     if (UseBlockZeroing) {
2309       Label non_block_zeroing, rest;
2310       // If the fill value is zero we can use the fast zero_words().
2311       __ cbnz(value, non_block_zeroing);
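           // The zero path keeps the original base in bz_base and advances 'to'
           // by hand, so that on both paths 'to' ends up just past the
           // bulk-filled region for the tail code below.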
2312       __ mov(bz_base, to);
2313       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2314       __ zero_words(bz_base, cnt_words);
2315       __ b(rest);
2316       __ bind(non_block_zeroing);
2317       __ fill_words(to, cnt_words, value);
2318       __ bind(rest);
2319     } else {
2320       __ fill_words(to, cnt_words, value);
2321     }
2322 
2323     // Remaining count is less than 8 bytes. Fill it by a single store.
2324     // Note that the total length is no less than 8 bytes.
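         // The single 8-byte store below is positioned to end at the last
         // element, so it may rewrite a few bytes the bulk loop already wrote;
         // that is harmless because they contain the same value.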
2325     if (t == T_BYTE || t == T_SHORT) {
2326       Label L_exit1;
2327       __ cbzw(count, L_exit1);
2328       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2329       __ str(value, Address(to, -8));    // overwrite some elements
2330       __ bind(L_exit1);
2331       __ leave();
2332       __ ret(lr);
2333     }
2334 
2335     // Handle fills of less than 8 bytes.
2336     Label L_fill_2, L_fill_4, L_exit2;
2337     __ bind(L_fill_elements);
2338     switch (t) {
2339       case T_BYTE:
2340         __ tbz(count, 0, L_fill_2);
2341         __ strb(value, Address(__ post(to, 1)));
2342         __ bind(L_fill_2);
2343         __ tbz(count, 1, L_fill_4);
2344         __ strh(value, Address(__ post(to, 2)));
2345         __ bind(L_fill_4);
2346         __ tbz(count, 2, L_exit2);
2347         __ strw(value, Address(to));
2348         break;
2349       case T_SHORT:
2350         __ tbz(count, 0, L_fill_4);
2351         __ strh(value, Address(__ post(to, 2)));
2352         __ bind(L_fill_4);
2353         __ tbz(count, 1, L_exit2);
2354         __ strw(value, Address(to));
2355         break;
2356       case T_INT:
2357         __ cbzw(count, L_exit2);
2358         __ strw(value, Address(to));
2359         break;
2360       default: ShouldNotReachHere();
2361     }
2362     __ bind(L_exit2);
2363     __ leave();
2364     __ ret(lr);
2365     return start;
2366   }
2367 
2368   void generate_arraycopy_stubs() {
2369     address entry;
2370     address entry_jbyte_arraycopy;
2371     address entry_jshort_arraycopy;
2372     address entry_jint_arraycopy;
2373     address entry_oop_arraycopy;
2374     address entry_jlong_arraycopy;
2375     address entry_checkcast_arraycopy;
2376 
2377     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2378     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2379 
2380     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2381 
2382     //*** jbyte
2383     // Always need aligned and unaligned versions
2384     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2385                                                                                   "jbyte_disjoint_arraycopy");
2386     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2387                                                                                   &entry_jbyte_arraycopy,
2388                                                                                   "jbyte_arraycopy");
2389     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2390                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2391     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2392                                                                                   "arrayof_jbyte_arraycopy");
2393 
2394     //*** jshort
2395     // Always need aligned and unaligned versions
2396     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2397                                                                                     "jshort_disjoint_arraycopy");
2398     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2399                                                                                     &entry_jshort_arraycopy,
2400                                                                                     "jshort_arraycopy");
2401     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2402                                                                                     "arrayof_jshort_disjoint_arraycopy");
2403     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2404                                                                                     "arrayof_jshort_arraycopy");
2405 
2406     //*** jint
2407     // Aligned versions
2408     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2409                                                                                 "arrayof_jint_disjoint_arraycopy");
2410     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2411                                                                                 "arrayof_jint_arraycopy");
2412     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2413     // entry_jint_arraycopy always points to the unaligned version
2414     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2415                                                                                 "jint_disjoint_arraycopy");
2416     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2417                                                                                 &entry_jint_arraycopy,
2418                                                                                 "jint_arraycopy");
2419 
2420     //*** jlong
2421     // It is always aligned
2422     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2423                                                                                   "arrayof_jlong_disjoint_arraycopy");
2424     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2425                                                                                   "arrayof_jlong_arraycopy");
2426     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2427     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2428 
2429     //*** oops
2430     {
2431       // With compressed oops we need unaligned versions; notice that
2432       // we overwrite entry_oop_arraycopy.
2433       bool aligned = !UseCompressedOops;
2434 
2435       StubRoutines::_arrayof_oop_disjoint_arraycopy
2436         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2437                                      /*dest_uninitialized*/false);
2438       StubRoutines::_arrayof_oop_arraycopy
2439         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2440                                      /*dest_uninitialized*/false);
2441       // Aligned versions without pre-barriers
2442       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2443         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2444                                      /*dest_uninitialized*/true);
2445       StubRoutines::_arrayof_oop_arraycopy_uninit
2446         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2447                                      /*dest_uninitialized*/true);
2448     }
2449 
2450     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2451     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2452     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2453     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2454 
2455     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2456     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2457                                                                         /*dest_uninitialized*/true);
2458 
2459     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2460                                                               entry_jbyte_arraycopy,
2461                                                               entry_jshort_arraycopy,
2462                                                               entry_jint_arraycopy,
2463                                                               entry_jlong_arraycopy);
2464 
2465     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2466                                                                entry_jbyte_arraycopy,
2467                                                                entry_jshort_arraycopy,
2468                                                                entry_jint_arraycopy,
2469                                                                entry_oop_arraycopy,
2470                                                                entry_jlong_arraycopy,
2471                                                                entry_checkcast_arraycopy);
2472 
2473     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2474     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2475     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2476     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2477     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2478     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2479   }
2480 
2481   void generate_math_stubs() { Unimplemented(); }
2482 
2483   // Arguments:
2484   //
2485   // Inputs:
2486   //   c_rarg0   - source byte array address
2487   //   c_rarg1   - destination byte array address
2488   //   c_rarg2   - K (key) in little endian int array
2489   //
2490   address generate_aescrypt_encryptBlock() {
2491     __ align(CodeEntryAlignment);
2492     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2493 
2494     Label L_doLast;
2495 
2496     const Register from        = c_rarg0;  // source array address
2497     const Register to          = c_rarg1;  // destination array address
2498     const Register key         = c_rarg2;  // key array address
2499     const Register keylen      = rscratch1;
2500 
2501     address start = __ pc();
2502     __ enter();
2503 
2504     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
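         // keylen is the length of the expanded key int[]: 44, 52 or 60 words
         // for 128-, 192- and 256-bit AES keys, which is why the code below
         // compares against 44 and 52 to choose 10, 12 or 14 rounds.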
2505 
2506     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2507 
2508     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2509     __ rev32(v1, __ T16B, v1);
2510     __ rev32(v2, __ T16B, v2);
2511     __ rev32(v3, __ T16B, v3);
2512     __ rev32(v4, __ T16B, v4);
2513     __ aese(v0, v1);
2514     __ aesmc(v0, v0);
2515     __ aese(v0, v2);
2516     __ aesmc(v0, v0);
2517     __ aese(v0, v3);
2518     __ aesmc(v0, v0);
2519     __ aese(v0, v4);
2520     __ aesmc(v0, v0);
2521 
2522     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2523     __ rev32(v1, __ T16B, v1);
2524     __ rev32(v2, __ T16B, v2);
2525     __ rev32(v3, __ T16B, v3);
2526     __ rev32(v4, __ T16B, v4);
2527     __ aese(v0, v1);
2528     __ aesmc(v0, v0);
2529     __ aese(v0, v2);
2530     __ aesmc(v0, v0);
2531     __ aese(v0, v3);
2532     __ aesmc(v0, v0);
2533     __ aese(v0, v4);
2534     __ aesmc(v0, v0);
2535 
2536     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2537     __ rev32(v1, __ T16B, v1);
2538     __ rev32(v2, __ T16B, v2);
2539 
2540     __ cmpw(keylen, 44);
2541     __ br(Assembler::EQ, L_doLast);
2542 
2543     __ aese(v0, v1);
2544     __ aesmc(v0, v0);
2545     __ aese(v0, v2);
2546     __ aesmc(v0, v0);
2547 
2548     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2549     __ rev32(v1, __ T16B, v1);
2550     __ rev32(v2, __ T16B, v2);
2551 
2552     __ cmpw(keylen, 52);
2553     __ br(Assembler::EQ, L_doLast);
2554 
2555     __ aese(v0, v1);
2556     __ aesmc(v0, v0);
2557     __ aese(v0, v2);
2558     __ aesmc(v0, v0);
2559 
2560     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2561     __ rev32(v1, __ T16B, v1);
2562     __ rev32(v2, __ T16B, v2);
2563 
2564     __ BIND(L_doLast);
2565 
2566     __ aese(v0, v1);
2567     __ aesmc(v0, v0);
2568     __ aese(v0, v2);
2569 
2570     __ ld1(v1, __ T16B, key);
2571     __ rev32(v1, __ T16B, v1);
2572     __ eor(v0, __ T16B, v0, v1);
2573 
2574     __ st1(v0, __ T16B, to);
2575 
2576     __ mov(r0, 0);
2577 
2578     __ leave();
2579     __ ret(lr);
2580 
2581     return start;
2582   }
2583 
2584   // Arguments:
2585   //
2586   // Inputs:
2587   //   c_rarg0   - source byte array address
2588   //   c_rarg1   - destination byte array address
2589   //   c_rarg2   - K (key) in little endian int array
2590   //
2591   address generate_aescrypt_decryptBlock() {
2592     assert(UseAES, "need AES instructions and misaligned SSE support");
2593     __ align(CodeEntryAlignment);
2594     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2595     Label L_doLast;
2596 
2597     const Register from        = c_rarg0;  // source array address
2598     const Register to          = c_rarg1;  // destination array address
2599     const Register key         = c_rarg2;  // key array address
2600     const Register keylen      = rscratch1;
2601 
2602     address start = __ pc();
2603     __ enter(); // required for proper stackwalking of RuntimeStub frame
2604 
2605     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2606 
2607     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2608 
2609     __ ld1(v5, __ T16B, __ post(key, 16));
2610     __ rev32(v5, __ T16B, v5);
2611 
2612     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2613     __ rev32(v1, __ T16B, v1);
2614     __ rev32(v2, __ T16B, v2);
2615     __ rev32(v3, __ T16B, v3);
2616     __ rev32(v4, __ T16B, v4);
2617     __ aesd(v0, v1);
2618     __ aesimc(v0, v0);
2619     __ aesd(v0, v2);
2620     __ aesimc(v0, v0);
2621     __ aesd(v0, v3);
2622     __ aesimc(v0, v0);
2623     __ aesd(v0, v4);
2624     __ aesimc(v0, v0);
2625 
2626     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2627     __ rev32(v1, __ T16B, v1);
2628     __ rev32(v2, __ T16B, v2);
2629     __ rev32(v3, __ T16B, v3);
2630     __ rev32(v4, __ T16B, v4);
2631     __ aesd(v0, v1);
2632     __ aesimc(v0, v0);
2633     __ aesd(v0, v2);
2634     __ aesimc(v0, v0);
2635     __ aesd(v0, v3);
2636     __ aesimc(v0, v0);
2637     __ aesd(v0, v4);
2638     __ aesimc(v0, v0);
2639 
2640     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2641     __ rev32(v1, __ T16B, v1);
2642     __ rev32(v2, __ T16B, v2);
2643 
2644     __ cmpw(keylen, 44);
2645     __ br(Assembler::EQ, L_doLast);
2646 
2647     __ aesd(v0, v1);
2648     __ aesimc(v0, v0);
2649     __ aesd(v0, v2);
2650     __ aesimc(v0, v0);
2651 
2652     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2653     __ rev32(v1, __ T16B, v1);
2654     __ rev32(v2, __ T16B, v2);
2655 
2656     __ cmpw(keylen, 52);
2657     __ br(Assembler::EQ, L_doLast);
2658 
2659     __ aesd(v0, v1);
2660     __ aesimc(v0, v0);
2661     __ aesd(v0, v2);
2662     __ aesimc(v0, v0);
2663 
2664     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2665     __ rev32(v1, __ T16B, v1);
2666     __ rev32(v2, __ T16B, v2);
2667 
2668     __ BIND(L_doLast);
2669 
2670     __ aesd(v0, v1);
2671     __ aesimc(v0, v0);
2672     __ aesd(v0, v2);
2673 
2674     __ eor(v0, __ T16B, v0, v5);
2675 
2676     __ st1(v0, __ T16B, to);
2677 
2678     __ mov(r0, 0);
2679 
2680     __ leave();
2681     __ ret(lr);
2682 
2683     return start;
2684   }
2685 
2686   // Arguments:
2687   //
2688   // Inputs:
2689   //   c_rarg0   - source byte array address
2690   //   c_rarg1   - destination byte array address
2691   //   c_rarg2   - K (key) in little endian int array
2692   //   c_rarg3   - r vector byte array address
2693   //   c_rarg4   - input length
2694   //
2695   // Output:
2696   //   r0        - input length
2697   //
2698   address generate_cipherBlockChaining_encryptAESCrypt() {
2699     assert(UseAES, "need AES instructions and misaligned SSE support");
2700     __ align(CodeEntryAlignment);
2701     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2702 
2703     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2704 
2705     const Register from        = c_rarg0;  // source array address
2706     const Register to          = c_rarg1;  // destination array address
2707     const Register key         = c_rarg2;  // key array address
2708     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector (IV) address,
2709                                            // and left with the result of the last encrypted block
2710     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2711     const Register keylen      = rscratch1;
2712 
2713     address start = __ pc();
2714 
2715       __ enter();
2716 
2717       __ movw(rscratch2, len_reg);
2718 
2719       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2720 
2721       __ ld1(v0, __ T16B, rvec);
2722 
2723       __ cmpw(keylen, 52);
2724       __ br(Assembler::CC, L_loadkeys_44);
2725       __ br(Assembler::EQ, L_loadkeys_52);
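           // 44/52/60-word key schedules need 11/13/15 round keys; the two
           // branches above skip the extra loads so every key size falls
           // through into the common tail of round keys (v21..v31).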
2726 
2727       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2728       __ rev32(v17, __ T16B, v17);
2729       __ rev32(v18, __ T16B, v18);
2730     __ BIND(L_loadkeys_52);
2731       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2732       __ rev32(v19, __ T16B, v19);
2733       __ rev32(v20, __ T16B, v20);
2734     __ BIND(L_loadkeys_44);
2735       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2736       __ rev32(v21, __ T16B, v21);
2737       __ rev32(v22, __ T16B, v22);
2738       __ rev32(v23, __ T16B, v23);
2739       __ rev32(v24, __ T16B, v24);
2740       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2741       __ rev32(v25, __ T16B, v25);
2742       __ rev32(v26, __ T16B, v26);
2743       __ rev32(v27, __ T16B, v27);
2744       __ rev32(v28, __ T16B, v28);
2745       __ ld1(v29, v30, v31, __ T16B, key);
2746       __ rev32(v29, __ T16B, v29);
2747       __ rev32(v30, __ T16B, v30);
2748       __ rev32(v31, __ T16B, v31);
2749 
2750     __ BIND(L_aes_loop);
2751       __ ld1(v1, __ T16B, __ post(from, 16));
2752       __ eor(v0, __ T16B, v0, v1);
2753 
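           // None of the vector loads, rev32s or AES instructions here touch
           // the condition flags, so these branches still see the result of
           // the cmpw(keylen, 52) performed before the keys were loaded.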
2754       __ br(Assembler::CC, L_rounds_44);
2755       __ br(Assembler::EQ, L_rounds_52);
2756 
2757       __ aese(v0, v17); __ aesmc(v0, v0);
2758       __ aese(v0, v18); __ aesmc(v0, v0);
2759     __ BIND(L_rounds_52);
2760       __ aese(v0, v19); __ aesmc(v0, v0);
2761       __ aese(v0, v20); __ aesmc(v0, v0);
2762     __ BIND(L_rounds_44);
2763       __ aese(v0, v21); __ aesmc(v0, v0);
2764       __ aese(v0, v22); __ aesmc(v0, v0);
2765       __ aese(v0, v23); __ aesmc(v0, v0);
2766       __ aese(v0, v24); __ aesmc(v0, v0);
2767       __ aese(v0, v25); __ aesmc(v0, v0);
2768       __ aese(v0, v26); __ aesmc(v0, v0);
2769       __ aese(v0, v27); __ aesmc(v0, v0);
2770       __ aese(v0, v28); __ aesmc(v0, v0);
2771       __ aese(v0, v29); __ aesmc(v0, v0);
2772       __ aese(v0, v30);
2773       __ eor(v0, __ T16B, v0, v31);
2774 
2775       __ st1(v0, __ T16B, __ post(to, 16));
2776 
2777       __ subw(len_reg, len_reg, 16);
2778       __ cbnzw(len_reg, L_aes_loop);
2779 
2780       __ st1(v0, __ T16B, rvec);
2781 
2782       __ mov(r0, rscratch2);
2783 
2784       __ leave();
2785       __ ret(lr);
2786 
2787       return start;
2788   }
2789 
2790   // Arguments:
2791   //
2792   // Inputs:
2793   //   c_rarg0   - source byte array address
2794   //   c_rarg1   - destination byte array address
2795   //   c_rarg2   - K (key) in little endian int array
2796   //   c_rarg3   - r vector byte array address
2797   //   c_rarg4   - input length
2798   //
2799   // Output:
2800   //   r0        - input length
2801   //
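       // CBC decryption computes P[i] = AES_decrypt(C[i]) ^ C[i-1], where C[-1] is
       // the initialization vector.  A scalar sketch of what the stub below does
       // (illustrative only; aes_decrypt_block and its key type are hypothetical
       // helpers, not HotSpot APIs):
       //
       //   void cbc_decrypt(const uint8_t* in, uint8_t* out, int len,
       //                    const uint32_t* key, uint8_t rvec[16]) {
       //     uint8_t prev[16], saved[16];
       //     memcpy(prev, rvec, 16);
       //     for (int i = 0; i < len; i += 16) {
       //       memcpy(saved, in + i, 16);            // keep ciphertext: next chain value
       //       aes_decrypt_block(in + i, out + i, key);
       //       for (int j = 0; j < 16; j++) out[i + j] ^= prev[j];
       //       memcpy(prev, saved, 16);
       //     }
       //     memcpy(rvec, prev, 16);                 // chain for the next call
       //   }
       //
       // In the generated code v2 plays the role of 'prev' and v1 the role of 'saved'.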
2802   address generate_cipherBlockChaining_decryptAESCrypt() {
2803     assert(UseAES, "need AES cryptographic extension support");
2804     __ align(CodeEntryAlignment);
2805     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2806 
2807     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2808 
2809     const Register from        = c_rarg0;  // source array address
2810     const Register to          = c_rarg1;  // destination array address
2811     const Register key         = c_rarg2;  // key array address
2812     const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector array address,
2813                                            // and left holding the last input ciphertext block (the IV for the next call)
2814     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2815     const Register keylen      = rscratch1;
2816 
2817     address start = __ pc();
2818 
2819       __ enter();
2820 
2821       __ movw(rscratch2, len_reg);
2822 
2823       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2824 
2825       __ ld1(v2, __ T16B, rvec);
2826 
2827       __ ld1(v31, __ T16B, __ post(key, 16));
2828       __ rev32(v31, __ T16B, v31);
2829 
2830       __ cmpw(keylen, 52);
2831       __ br(Assembler::CC, L_loadkeys_44);
2832       __ br(Assembler::EQ, L_loadkeys_52);
2833 
2834       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2835       __ rev32(v17, __ T16B, v17);
2836       __ rev32(v18, __ T16B, v18);
2837     __ BIND(L_loadkeys_52);
2838       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2839       __ rev32(v19, __ T16B, v19);
2840       __ rev32(v20, __ T16B, v20);
2841     __ BIND(L_loadkeys_44);
2842       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2843       __ rev32(v21, __ T16B, v21);
2844       __ rev32(v22, __ T16B, v22);
2845       __ rev32(v23, __ T16B, v23);
2846       __ rev32(v24, __ T16B, v24);
2847       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2848       __ rev32(v25, __ T16B, v25);
2849       __ rev32(v26, __ T16B, v26);
2850       __ rev32(v27, __ T16B, v27);
2851       __ rev32(v28, __ T16B, v28);
2852       __ ld1(v29, v30, __ T16B, key);
2853       __ rev32(v29, __ T16B, v29);
2854       __ rev32(v30, __ T16B, v30);
2855 
2856     __ BIND(L_aes_loop);
2857       __ ld1(v0, __ T16B, __ post(from, 16));
2858       __ orr(v1, __ T16B, v0, v0);
2859 
2860       __ br(Assembler::CC, L_rounds_44);
2861       __ br(Assembler::EQ, L_rounds_52);
2862 
2863       __ aesd(v0, v17); __ aesimc(v0, v0);
2864       __ aesd(v0, v18); __ aesimc(v0, v0);
2865     __ BIND(L_rounds_52);
2866       __ aesd(v0, v19); __ aesimc(v0, v0);
2867       __ aesd(v0, v20); __ aesimc(v0, v0);
2868     __ BIND(L_rounds_44);
2869       __ aesd(v0, v21); __ aesimc(v0, v0);
2870       __ aesd(v0, v22); __ aesimc(v0, v0);
2871       __ aesd(v0, v23); __ aesimc(v0, v0);
2872       __ aesd(v0, v24); __ aesimc(v0, v0);
2873       __ aesd(v0, v25); __ aesimc(v0, v0);
2874       __ aesd(v0, v26); __ aesimc(v0, v0);
2875       __ aesd(v0, v27); __ aesimc(v0, v0);
2876       __ aesd(v0, v28); __ aesimc(v0, v0);
2877       __ aesd(v0, v29); __ aesimc(v0, v0);
2878       __ aesd(v0, v30);
2879       __ eor(v0, __ T16B, v0, v31);
2880       __ eor(v0, __ T16B, v0, v2);
2881 
2882       __ st1(v0, __ T16B, __ post(to, 16));
2883       __ orr(v2, __ T16B, v1, v1);
2884 
2885       __ subw(len_reg, len_reg, 16);
2886       __ cbnzw(len_reg, L_aes_loop);
2887 
2888       __ st1(v2, __ T16B, rvec);
2889 
2890       __ mov(r0, rscratch2);
2891 
2892       __ leave();
2893       __ ret(lr);
2894 
2895     return start;
2896   }
2897 
2898   // Arguments:
2899   //
2900   // Inputs:
2901   //   c_rarg0   - byte[]  source+offset
2902   //   c_rarg1   - int[]   SHA.state
2903   //   c_rarg2   - int     offset
2904   //   c_rarg3   - int     limit
2905   //
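       // The round loop below retires four SHA-1 steps per iteration, so its 20
       // iterations cover all 80 steps.  The sha1c/sha1p/sha1m selection mirrors
       // the per-step boolean function from FIPS 180-4 (scalar sketch,
       // illustrative only):
       //
       //   uint32_t f(int step, uint32_t b, uint32_t c, uint32_t d) {
       //     if (step < 20) return (b & c) | (~b & d);          // Ch     -> sha1c
       //     if (step < 40) return b ^ c ^ d;                   // Parity -> sha1p
       //     if (step < 60) return (b & c) | (b & d) | (c & d); // Maj    -> sha1m
       //     return b ^ c ^ d;                                  // Parity -> sha1p
       //   }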
2906   address generate_sha1_implCompress(bool multi_block, const char *name) {
2907     __ align(CodeEntryAlignment);
2908     StubCodeMark mark(this, "StubRoutines", name);
2909     address start = __ pc();
2910 
2911     Register buf   = c_rarg0;
2912     Register state = c_rarg1;
2913     Register ofs   = c_rarg2;
2914     Register limit = c_rarg3;
2915 
2916     Label keys;
2917     Label sha1_loop;
2918 
2919     // load the keys into v0..v3
2920     __ adr(rscratch1, keys);
2921     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2922     // load the 5-word state into v6, v7
2923     __ ldrq(v6, Address(state, 0));
2924     __ ldrs(v7, Address(state, 16));
2925 
2926 
2927     __ BIND(sha1_loop);
2928     // load 64 bytes of data into v16..v19
2929     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2930     __ rev32(v16, __ T16B, v16);
2931     __ rev32(v17, __ T16B, v17);
2932     __ rev32(v18, __ T16B, v18);
2933     __ rev32(v19, __ T16B, v19);
2934 
2935     // do the sha1
2936     __ addv(v4, __ T4S, v16, v0);
2937     __ orr(v20, __ T16B, v6, v6);
2938 
2939     FloatRegister d0 = v16;
2940     FloatRegister d1 = v17;
2941     FloatRegister d2 = v18;
2942     FloatRegister d3 = v19;
2943 
2944     for (int round = 0; round < 20; round++) {
2945       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2946       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2947       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2948       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2949       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2950 
2951       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2952       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2953       __ sha1h(tmp2, __ T4S, v20);
2954       if (round < 5)
2955         __ sha1c(v20, __ T4S, tmp3, tmp4);
2956       else if (round < 10 || round >= 15)
2957         __ sha1p(v20, __ T4S, tmp3, tmp4);
2958       else
2959         __ sha1m(v20, __ T4S, tmp3, tmp4);
2960       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2961 
2962       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2963     }
2964 
2965     __ addv(v7, __ T2S, v7, v21);
2966     __ addv(v6, __ T4S, v6, v20);
2967 
2968     if (multi_block) {
2969       __ add(ofs, ofs, 64);
2970       __ cmp(ofs, limit);
2971       __ br(Assembler::LE, sha1_loop);
2972       __ mov(c_rarg0, ofs); // return ofs
2973     }
2974 
2975     __ strq(v6, Address(state, 0));
2976     __ strs(v7, Address(state, 16));
2977 
2978     __ ret(lr);
2979 
2980     __ bind(keys);
2981     __ emit_int32(0x5a827999);
2982     __ emit_int32(0x6ed9eba1);
2983     __ emit_int32(0x8f1bbcdc);
2984     __ emit_int32(0xca62c1d6);
2985 
2986     return start;
2987   }
2988 
2989 
2990   // Arguments:
2991   //
2992   // Inputs:
2993   //   c_rarg0   - byte[]  source+offset
2994   //   c_rarg1   - int[]   SHA.state
2995   //   c_rarg2   - int     offset
2996   //   c_rarg3   - int     limit
2997   //
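       // The round loop below runs 16 iterations; each sha256h/sha256h2 pair
       // retires four of the 64 SHA-256 rounds, consuming four message words and
       // the matching four round constants (preloaded into v16..v31 from
       // round_consts).  sha256su0/sha256su1 extend the message schedule, whose
       // scalar form is (sketch; ror is a hypothetical 32-bit rotate helper):
       //
       //   // W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
       //   uint32_t sigma0(uint32_t x) { return ror(x,  7) ^ ror(x, 18) ^ (x >>  3); }
       //   uint32_t sigma1(uint32_t x) { return ror(x, 17) ^ ror(x, 19) ^ (x >> 10); }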
2998   address generate_sha256_implCompress(bool multi_block, const char *name) {
2999     static const uint32_t round_consts[64] = {
3000       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3001       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3002       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3003       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3004       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3005       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3006       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3007       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3008       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3009       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3010       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3011       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3012       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3013       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3014       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3015       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3016     };
3017     __ align(CodeEntryAlignment);
3018     StubCodeMark mark(this, "StubRoutines", name);
3019     address start = __ pc();
3020 
3021     Register buf   = c_rarg0;
3022     Register state = c_rarg1;
3023     Register ofs   = c_rarg2;
3024     Register limit = c_rarg3;
3025 
3026     Label sha256_loop;
3027 
3028     __ stpd(v8, v9, __ pre(sp, -32));
3029     __ stpd(v10, v11, Address(sp, 16));
3030 
3031 // dga == v0
3032 // dgb == v1
3033 // dg0 == v2
3034 // dg1 == v3
3035 // dg2 == v4
3036 // t0 == v6
3037 // t1 == v7
3038 
3039     // load the 64 round constants into v16..v31 (four per vector)
3040     __ lea(rscratch1, ExternalAddress((address)round_consts));
3041     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3042     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3043     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3044     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3045 
3046     // load the 8-word (256-bit) state
3047     __ ldpq(v0, v1, state);
3048 
3049     __ BIND(sha256_loop);
3050     // load 64 bytes of data into v8..v11
3051     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3052     __ rev32(v8, __ T16B, v8);
3053     __ rev32(v9, __ T16B, v9);
3054     __ rev32(v10, __ T16B, v10);
3055     __ rev32(v11, __ T16B, v11);
3056 
3057     __ addv(v6, __ T4S, v8, v16);
3058     __ orr(v2, __ T16B, v0, v0);
3059     __ orr(v3, __ T16B, v1, v1);
3060 
3061     FloatRegister d0 = v8;
3062     FloatRegister d1 = v9;
3063     FloatRegister d2 = v10;
3064     FloatRegister d3 = v11;
3065 
3066 
3067     for (int round = 0; round < 16; round++) {
3068       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3069       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3070       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3071       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3072 
3073       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3074        __ orr(v4, __ T16B, v2, v2);
3075       if (round < 15)
3076         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3077       __ sha256h(v2, __ T4S, v3, tmp2);
3078       __ sha256h2(v3, __ T4S, v4, tmp2);
3079       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3080 
3081       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3082     }
3083 
3084     __ addv(v0, __ T4S, v0, v2);
3085     __ addv(v1, __ T4S, v1, v3);
3086 
3087     if (multi_block) {
3088       __ add(ofs, ofs, 64);
3089       __ cmp(ofs, limit);
3090       __ br(Assembler::LE, sha256_loop);
3091       __ mov(c_rarg0, ofs); // return ofs
3092     }
3093 
3094     __ ldpd(v10, v11, Address(sp, 16));
3095     __ ldpd(v8, v9, __ post(sp, 32));
3096 
3097     __ stpq(v0, v1, state);
3098 
3099     __ ret(lr);
3100 
3101     return start;
3102   }
3103 
3104 #ifndef BUILTIN_SIM
3105   // Safefetch stubs.
3106   void generate_safefetch(const char* name, int size, address* entry,
3107                           address* fault_pc, address* continuation_pc) {
3108     // safefetch signatures:
3109     //   int      SafeFetch32(int*      adr, int      errValue);
3110     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3111     //
3112     // arguments:
3113     //   c_rarg0 = adr
3114     //   c_rarg1 = errValue
3115     //
3116     // result:
3117     //   r0       = *adr or errValue
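         //
         // Typical use (illustrative): int v = SafeFetch32((int*) addr, -1);
         // yields -1 when addr is not readable.  If the load below faults, the
         // VM's signal handler recognizes fault_pc and resumes execution at
         // continuation_pc with errValue still in c_rarg1.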
3118 
3119     StubCodeMark mark(this, "StubRoutines", name);
3120 
3121     // Entry point, pc or function descriptor.
3122     *entry = __ pc();
3123 
3124     // Load *adr into c_rarg1, may fault.
3125     *fault_pc = __ pc();
3126     switch (size) {
3127       case 4:
3128         // int32_t
3129         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3130         break;
3131       case 8:
3132         // int64_t
3133         __ ldr(c_rarg1, Address(c_rarg0, 0));
3134         break;
3135       default:
3136         ShouldNotReachHere();
3137     }
3138 
3139     // return errValue or *adr
3140     *continuation_pc = __ pc();
3141     __ mov(r0, c_rarg1);
3142     __ ret(lr);
3143   }
3144 #endif
3145 
3146   /**
3147    *  Arguments:
3148    *
3149    * Inputs:
3150    *   c_rarg0   - int crc
3151    *   c_rarg1   - byte* buf
3152    *   c_rarg2   - int length
3153    *
3154    * Output:
3155    *       r0    - int crc result
3156    */
3157   address generate_updateBytesCRC32() {
3158     assert(UseCRC32Intrinsics, "what are we doing here?");
3159 
3160     __ align(CodeEntryAlignment);
3161     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3162 
3163     address start = __ pc();
3164 
3165     const Register crc   = c_rarg0;  // crc
3166     const Register buf   = c_rarg1;  // source java byte array address
3167     const Register len   = c_rarg2;  // length
3168     const Register table0 = c_rarg3; // crc_table address
3169     const Register table1 = c_rarg4;
3170     const Register table2 = c_rarg5;
3171     const Register table3 = c_rarg6;
3172     const Register tmp3 = c_rarg7;
3173 
3174     BLOCK_COMMENT("Entry:");
3175     __ enter(); // required for proper stackwalking of RuntimeStub frame
3176 
3177     __ kernel_crc32(crc, buf, len,
3178               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3179 
3180     __ leave(); // required for proper stackwalking of RuntimeStub frame
3181     __ ret(lr);
3182 
3183     return start;
3184   }
3185 
3186   /**
3187    *  Arguments:
3188    *
3189    * Inputs:
3190    *   c_rarg0   - int crc
3191    *   c_rarg1   - byte* buf
3192    *   c_rarg2   - int length
3193    *   c_rarg3   - int* table
3194    *
3195    * Output:
3196    *       r0   - int crc result
3197    */
3198   address generate_updateBytesCRC32C() {
3199     assert(UseCRC32CIntrinsics, "what are we doing here?");
3200 
3201     __ align(CodeEntryAlignment);
3202     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3203 
3204     address start = __ pc();
3205 
3206     const Register crc   = c_rarg0;  // crc
3207     const Register buf   = c_rarg1;  // source java byte array address
3208     const Register len   = c_rarg2;  // length
3209     const Register table0 = c_rarg3; // crc_table address
3210     const Register table1 = c_rarg4;
3211     const Register table2 = c_rarg5;
3212     const Register table3 = c_rarg6;
3213     const Register tmp3 = c_rarg7;
3214 
3215     BLOCK_COMMENT("Entry:");
3216     __ enter(); // required for proper stackwalking of RuntimeStub frame
3217 
3218     __ kernel_crc32c(crc, buf, len,
3219               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3220 
3221     __ leave(); // required for proper stackwalking of RuntimeStub frame
3222     __ ret(lr);
3223 
3224     return start;
3225   }
3226 
3227   /**
3228    *  Arguments:
3229    *
3230    *  Inputs:
3231    *   c_rarg0   - int   adler
3232    *   c_rarg1   - byte* buff
3233    *   c_rarg2   - int   len
3234    *
3235    * Output:
3236    *   c_rarg0   - int adler result
3237    */
3238   address generate_updateBytesAdler32() {
3239     __ align(CodeEntryAlignment);
3240     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3241     address start = __ pc();
3242 
3243     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3244 
3245     // Aliases
3246     Register adler  = c_rarg0;
3247     Register s1     = c_rarg0;
3248     Register s2     = c_rarg3;
3249     Register buff   = c_rarg1;
3250     Register len    = c_rarg2;
3251     Register nmax  = r4;
3252     Register base = r5;
3253     Register count = r6;
3254     Register temp0 = rscratch1;
3255     Register temp1 = rscratch2;
3256     Register temp2 = r7;
3257 
3258     // Max number of bytes we can process before having to take the mod
3259     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3260     unsigned long BASE = 0xfff1;
3261     unsigned long NMAX = 0x15B0;
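         // BASE = 65521 = 2^16 - 15, so 2^16 == 15 (mod BASE) and any value x below
         // 2^32 (guaranteed here by the NMAX bound) can be reduced without a
         // division.  The "% BASE" sequences below implement, roughly (sketch only):
         //
         //   uint64_t mod_base(uint64_t x) {          // x < 2^32
         //     x = (x >> 16) * 15 + (x & 0xffff);     // fold the high half down
         //     x = (x >> 16) * 15 + (x & 0xffff);     // fold once more
         //     return x >= 0xfff1 ? x - 0xfff1 : x;   // final conditional subtract
         //   }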
3262 
3263     __ mov(base, BASE);
3264     __ mov(nmax, NMAX);
3265 
3266     // s1 is initialized to the lower 16 bits of adler
3267     // s2 is initialized to the upper 16 bits of adler
3268     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3269     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3270 
3271     // The pipelined loop needs at least 16 elements for one iteration.
3272     // It does check this itself, but it is more efficient to skip straight to the cleanup loop.
3273     __ cmp(len, 16);
3274     __ br(Assembler::HS, L_nmax);
3275     __ cbz(len, L_combine);
3276 
3277     __ bind(L_simple_by1_loop);
3278     __ ldrb(temp0, Address(__ post(buff, 1)));
3279     __ add(s1, s1, temp0);
3280     __ add(s2, s2, s1);
3281     __ subs(len, len, 1);
3282     __ br(Assembler::HI, L_simple_by1_loop);
3283 
3284     // s1 = s1 % BASE
3285     __ subs(temp0, s1, base);
3286     __ csel(s1, temp0, s1, Assembler::HS);
3287 
3288     // s2 = s2 % BASE
3289     __ lsr(temp0, s2, 16);
3290     __ lsl(temp1, temp0, 4);
3291     __ sub(temp1, temp1, temp0);
3292     __ add(s2, temp1, s2, ext::uxth);
3293 
3294     __ subs(temp0, s2, base);
3295     __ csel(s2, temp0, s2, Assembler::HS);
3296 
3297     __ b(L_combine);
3298 
3299     __ bind(L_nmax);
3300     __ subs(len, len, nmax);
3301     __ sub(count, nmax, 16);
3302     __ br(Assembler::LO, L_by16);
3303 
3304     __ bind(L_nmax_loop);
3305 
3306     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3307 
3308     __ add(s1, s1, temp0, ext::uxtb);
3309     __ ubfx(temp2, temp0, 8, 8);
3310     __ add(s2, s2, s1);
3311     __ add(s1, s1, temp2);
3312     __ ubfx(temp2, temp0, 16, 8);
3313     __ add(s2, s2, s1);
3314     __ add(s1, s1, temp2);
3315     __ ubfx(temp2, temp0, 24, 8);
3316     __ add(s2, s2, s1);
3317     __ add(s1, s1, temp2);
3318     __ ubfx(temp2, temp0, 32, 8);
3319     __ add(s2, s2, s1);
3320     __ add(s1, s1, temp2);
3321     __ ubfx(temp2, temp0, 40, 8);
3322     __ add(s2, s2, s1);
3323     __ add(s1, s1, temp2);
3324     __ ubfx(temp2, temp0, 48, 8);
3325     __ add(s2, s2, s1);
3326     __ add(s1, s1, temp2);
3327     __ add(s2, s2, s1);
3328     __ add(s1, s1, temp0, Assembler::LSR, 56);
3329     __ add(s2, s2, s1);
3330 
3331     __ add(s1, s1, temp1, ext::uxtb);
3332     __ ubfx(temp2, temp1, 8, 8);
3333     __ add(s2, s2, s1);
3334     __ add(s1, s1, temp2);
3335     __ ubfx(temp2, temp1, 16, 8);
3336     __ add(s2, s2, s1);
3337     __ add(s1, s1, temp2);
3338     __ ubfx(temp2, temp1, 24, 8);
3339     __ add(s2, s2, s1);
3340     __ add(s1, s1, temp2);
3341     __ ubfx(temp2, temp1, 32, 8);
3342     __ add(s2, s2, s1);
3343     __ add(s1, s1, temp2);
3344     __ ubfx(temp2, temp1, 40, 8);
3345     __ add(s2, s2, s1);
3346     __ add(s1, s1, temp2);
3347     __ ubfx(temp2, temp1, 48, 8);
3348     __ add(s2, s2, s1);
3349     __ add(s1, s1, temp2);
3350     __ add(s2, s2, s1);
3351     __ add(s1, s1, temp1, Assembler::LSR, 56);
3352     __ add(s2, s2, s1);
3353 
3354     __ subs(count, count, 16);
3355     __ br(Assembler::HS, L_nmax_loop);
3356 
3357     // s1 = s1 % BASE
3358     __ lsr(temp0, s1, 16);
3359     __ lsl(temp1, temp0, 4);
3360     __ sub(temp1, temp1, temp0);
3361     __ add(temp1, temp1, s1, ext::uxth);
3362 
3363     __ lsr(temp0, temp1, 16);
3364     __ lsl(s1, temp0, 4);
3365     __ sub(s1, s1, temp0);
3366     __ add(s1, s1, temp1, ext::uxth);
3367 
3368     __ subs(temp0, s1, base);
3369     __ csel(s1, temp0, s1, Assembler::HS);
3370 
3371     // s2 = s2 % BASE
3372     __ lsr(temp0, s2, 16);
3373     __ lsl(temp1, temp0, 4);
3374     __ sub(temp1, temp1, temp0);
3375     __ add(temp1, temp1, s2, ext::uxth);
3376 
3377     __ lsr(temp0, temp1, 16);
3378     __ lsl(s2, temp0, 4);
3379     __ sub(s2, s2, temp0);
3380     __ add(s2, s2, temp1, ext::uxth);
3381 
3382     __ subs(temp0, s2, base);
3383     __ csel(s2, temp0, s2, Assembler::HS);
3384 
3385     __ subs(len, len, nmax);
3386     __ sub(count, nmax, 16);
3387     __ br(Assembler::HS, L_nmax_loop);
3388 
3389     __ bind(L_by16);
3390     __ adds(len, len, count);
3391     __ br(Assembler::LO, L_by1);
3392 
3393     __ bind(L_by16_loop);
3394 
3395     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3396 
3397     __ add(s1, s1, temp0, ext::uxtb);
3398     __ ubfx(temp2, temp0, 8, 8);
3399     __ add(s2, s2, s1);
3400     __ add(s1, s1, temp2);
3401     __ ubfx(temp2, temp0, 16, 8);
3402     __ add(s2, s2, s1);
3403     __ add(s1, s1, temp2);
3404     __ ubfx(temp2, temp0, 24, 8);
3405     __ add(s2, s2, s1);
3406     __ add(s1, s1, temp2);
3407     __ ubfx(temp2, temp0, 32, 8);
3408     __ add(s2, s2, s1);
3409     __ add(s1, s1, temp2);
3410     __ ubfx(temp2, temp0, 40, 8);
3411     __ add(s2, s2, s1);
3412     __ add(s1, s1, temp2);
3413     __ ubfx(temp2, temp0, 48, 8);
3414     __ add(s2, s2, s1);
3415     __ add(s1, s1, temp2);
3416     __ add(s2, s2, s1);
3417     __ add(s1, s1, temp0, Assembler::LSR, 56);
3418     __ add(s2, s2, s1);
3419 
3420     __ add(s1, s1, temp1, ext::uxtb);
3421     __ ubfx(temp2, temp1, 8, 8);
3422     __ add(s2, s2, s1);
3423     __ add(s1, s1, temp2);
3424     __ ubfx(temp2, temp1, 16, 8);
3425     __ add(s2, s2, s1);
3426     __ add(s1, s1, temp2);
3427     __ ubfx(temp2, temp1, 24, 8);
3428     __ add(s2, s2, s1);
3429     __ add(s1, s1, temp2);
3430     __ ubfx(temp2, temp1, 32, 8);
3431     __ add(s2, s2, s1);
3432     __ add(s1, s1, temp2);
3433     __ ubfx(temp2, temp1, 40, 8);
3434     __ add(s2, s2, s1);
3435     __ add(s1, s1, temp2);
3436     __ ubfx(temp2, temp1, 48, 8);
3437     __ add(s2, s2, s1);
3438     __ add(s1, s1, temp2);
3439     __ add(s2, s2, s1);
3440     __ add(s1, s1, temp1, Assembler::LSR, 56);
3441     __ add(s2, s2, s1);
3442 
3443     __ subs(len, len, 16);
3444     __ br(Assembler::HS, L_by16_loop);
3445 
3446     __ bind(L_by1);
3447     __ adds(len, len, 15);
3448     __ br(Assembler::LO, L_do_mod);
3449 
3450     __ bind(L_by1_loop);
3451     __ ldrb(temp0, Address(__ post(buff, 1)));
3452     __ add(s1, temp0, s1);
3453     __ add(s2, s2, s1);
3454     __ subs(len, len, 1);
3455     __ br(Assembler::HS, L_by1_loop);
3456 
3457     __ bind(L_do_mod);
3458     // s1 = s1 % BASE
3459     __ lsr(temp0, s1, 16);
3460     __ lsl(temp1, temp0, 4);
3461     __ sub(temp1, temp1, temp0);
3462     __ add(temp1, temp1, s1, ext::uxth);
3463 
3464     __ lsr(temp0, temp1, 16);
3465     __ lsl(s1, temp0, 4);
3466     __ sub(s1, s1, temp0);
3467     __ add(s1, s1, temp1, ext::uxth);
3468 
3469     __ subs(temp0, s1, base);
3470     __ csel(s1, temp0, s1, Assembler::HS);
3471 
3472     // s2 = s2 % BASE
3473     __ lsr(temp0, s2, 16);
3474     __ lsl(temp1, temp0, 4);
3475     __ sub(temp1, temp1, temp0);
3476     __ add(temp1, temp1, s2, ext::uxth);
3477 
3478     __ lsr(temp0, temp1, 16);
3479     __ lsl(s2, temp0, 4);
3480     __ sub(s2, s2, temp0);
3481     __ add(s2, s2, temp1, ext::uxth);
3482 
3483     __ subs(temp0, s2, base);
3484     __ csel(s2, temp0, s2, Assembler::HS);
3485 
3486     // Combine lower bits and higher bits
3487     __ bind(L_combine);
3488     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3489 
3490     __ ret(lr);
3491 
3492     return start;
3493   }
3494 
3495   /**
3496    *  Arguments:
3497    *
3498    *  Input:
3499    *    c_rarg0   - x address
3500    *    c_rarg1   - x length
3501    *    c_rarg2   - y address
3502    *    c_rarg3   - y length
3503    *    c_rarg4   - z address
3504    *    c_rarg5   - z length
3505    */
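       //
       // The stub computes the same product as BigInteger.multiplyToLen: x, y and
       // z are big-endian uint32_t[] magnitudes and z has room for xlen + ylen
       // words.  Scalar schoolbook equivalent (sketch only; assumes z is zeroed):
       //
       //   for (int i = xlen - 1; i >= 0; i--) {
       //     uint64_t carry = 0;
       //     for (int j = ylen - 1, k = ylen + i; j >= 0; j--, k--) {
       //       uint64_t product = (uint64_t) x[i] * y[j] + z[k] + carry;
       //       z[k]  = (uint32_t) product;
       //       carry = product >> 32;
       //     }
       //     z[i] = (uint32_t) carry;
       //   }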
3506   address generate_multiplyToLen() {
3507     __ align(CodeEntryAlignment);
3508     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3509 
3510     address start = __ pc();
3511     const Register x     = r0;
3512     const Register xlen  = r1;
3513     const Register y     = r2;
3514     const Register ylen  = r3;
3515     const Register z     = r4;
3516     const Register zlen  = r5;
3517 
3518     const Register tmp1  = r10;
3519     const Register tmp2  = r11;
3520     const Register tmp3  = r12;
3521     const Register tmp4  = r13;
3522     const Register tmp5  = r14;
3523     const Register tmp6  = r15;
3524     const Register tmp7  = r16;
3525 
3526     BLOCK_COMMENT("Entry:");
3527     __ enter(); // required for proper stackwalking of RuntimeStub frame
3528     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3529     __ leave(); // required for proper stackwalking of RuntimeStub frame
3530     __ ret(lr);
3531 
3532     return start;
3533   }
3534 
3535   address generate_squareToLen() {
3536     // The squareToLen algorithm for sizes 1..127 described in the Java code is
3537     // faster than multiply_to_len on some CPUs and slower on others, but
3538     // multiply_to_len shows slightly better results overall, so it is used here.
3539     __ align(CodeEntryAlignment);
3540     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3541     address start = __ pc();
3542 
3543     const Register x     = r0;
3544     const Register xlen  = r1;
3545     const Register z     = r2;
3546     const Register zlen  = r3;
3547     const Register y     = r4; // == x
3548     const Register ylen  = r5; // == xlen
3549 
3550     const Register tmp1  = r10;
3551     const Register tmp2  = r11;
3552     const Register tmp3  = r12;
3553     const Register tmp4  = r13;
3554     const Register tmp5  = r14;
3555     const Register tmp6  = r15;
3556     const Register tmp7  = r16;
3557 
3558     RegSet spilled_regs = RegSet::of(y, ylen);
3559     BLOCK_COMMENT("Entry:");
3560     __ enter();
3561     __ push(spilled_regs, sp);
3562     __ mov(y, x);
3563     __ mov(ylen, xlen);
3564     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3565     __ pop(spilled_regs, sp);
3566     __ leave();
3567     __ ret(lr);
3568     return start;
3569   }
3570 
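       // mul_add multiplies the uint32_t[] 'in' (most significant word first) by
       // the scalar k, adds the product into 'out' around the given offset and
       // returns the final carry in r0, mirroring BigInteger.mulAdd.  Scalar
       // sketch (illustrative only; the exact offset convention follows the
       // intrinsic's caller):
       //
       //   uint64_t carry = 0;
       //   for (int j = len - 1; j >= 0; j--) {
       //     uint64_t product = (uint64_t) in[j] * k + out[offset] + carry;
       //     out[offset--] = (uint32_t) product;
       //     carry = product >> 32;
       //   }
       //   return (uint32_t) carry;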
3571   address generate_mulAdd() {
3572     __ align(CodeEntryAlignment);
3573     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3574 
3575     address start = __ pc();
3576 
3577     const Register out     = r0;
3578     const Register in      = r1;
3579     const Register offset  = r2;
3580     const Register len     = r3;
3581     const Register k       = r4;
3582 
3583     BLOCK_COMMENT("Entry:");
3584     __ enter();
3585     __ mul_add(out, in, offset, len, k);
3586     __ leave();
3587     __ ret(lr);
3588 
3589     return start;
3590   }
3591 
3592   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3593                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3594                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3595     // Karatsuba multiplication performs a 128*128 -> 256-bit
3596     // multiplication using three 64*64 -> 128-bit multiplications and a few
3597     // additions (XORs, since the arithmetic is carry-less).
3598     //
3599     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3600     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3601     //
3602     // Inputs:
3603     //
3604     // A0 in a.d[0]     (subkey)
3605     // A1 in a.d[1]
3606     // (A1+A0) in a1_xor_a0.d[0]
3607     //
3608     // B0 in b.d[0]     (state)
3609     // B1 in b.d[1]
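         //
         // In scalar terms, with 64-bit halves and CLMUL(x, y) standing for an
         // abstract 64x64 -> 128-bit carry-less multiply (sketch only):
         //
         //   c   = CLMUL(a1, b1);                 // C1:C0
         //   d   = CLMUL(a0, b0);                 // D1:D0
         //   e   = CLMUL(a1 ^ a0, b1 ^ b0);       // E1:E0
         //   mid = c ^ d ^ e;                     // middle 128 bits
         //   result = (c << 128) ^ (mid << 64) ^ d;   // 256-bit product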
3610 
3611     __ ext(tmp1, __ T16B, b, b, 0x08);
3612     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3613     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3614     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3615     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3616 
3617     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3618     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3619     __ eor(tmp2, __ T16B, tmp2, tmp4);
3620     __ eor(tmp2, __ T16B, tmp2, tmp3);
3621 
3622     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3623     __ ins(result_hi, __ D, tmp2, 0, 1);
3624     __ ins(result_lo, __ D, tmp2, 1, 0);
3625   }
3626 
3627   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3628                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3629     const FloatRegister t0 = result;
3630 
3631     // The GCM field polynomial f is z^128 + p(z), where p =
3632     // z^7+z^2+z+1.
3633     //
3634     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3635     //
3636     // so, given that the product we're reducing is
3637     //    a == lo + hi * z^128
3638     // substituting,
3639     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3640     //
3641     // we reduce by multiplying hi by p(z) and subtracting the result
3642     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3643     // bits we can do this with two 64-bit multiplications, lo*p and
3644     // hi*p.
3645 
3646     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3647     __ ext(t1, __ T16B, t0, z, 8);
3648     __ eor(hi, __ T16B, hi, t1);
3649     __ ext(t1, __ T16B, z, t0, 8);
3650     __ eor(lo, __ T16B, lo, t1);
3651     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3652     __ eor(result, __ T16B, lo, t0);
3653   }
3654 
3655   address generate_has_negatives(address &has_negatives_long) {
3656     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3657     const int large_loop_size = 64;
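         // A byte is "negative" iff its sign bit is set, so OR-ing words together
         // and AND-ing the result with 0x8080808080808080 detects a negative byte
         // anywhere in the range, e.g. (sketch):
         //
         //   bool any_negative = ((w0 | w1 | ... | w7) & UPPER_BIT_MASK) != 0;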
3658     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3659     int dcache_line = VM_Version::dcache_line_size();
3660 
3661     Register ary1 = r1, len = r2, result = r0;
3662 
3663     __ align(CodeEntryAlignment);
3664     address entry = __ pc();
3665 
3666     __ enter();
3667 
3668   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3669         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3670 
3671   __ cmp(len, 15);
3672   __ br(Assembler::GT, LEN_OVER_15);
3673   // The only case when execution falls into this code is when the pointer is near
3674   // the end of a memory page and we have to avoid reading the next page
3675   __ add(ary1, ary1, len);
3676   __ subs(len, len, 8);
3677   __ br(Assembler::GT, LEN_OVER_8);
3678   __ ldr(rscratch2, Address(ary1, -8));
3679   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3680   __ lsrv(rscratch2, rscratch2, rscratch1);
3681   __ tst(rscratch2, UPPER_BIT_MASK);
3682   __ cset(result, Assembler::NE);
3683   __ leave();
3684   __ ret(lr);
3685   __ bind(LEN_OVER_8);
3686   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3687   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
3688   __ tst(rscratch2, UPPER_BIT_MASK);
3689   __ br(Assembler::NE, RET_TRUE_NO_POP);
3690   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3691   __ lsrv(rscratch1, rscratch1, rscratch2);
3692   __ tst(rscratch1, UPPER_BIT_MASK);
3693   __ cset(result, Assembler::NE);
3694   __ leave();
3695   __ ret(lr);
3696 
3697   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3698   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3699 
3700   has_negatives_long = __ pc(); // 2nd entry point
3701 
3702   __ enter();
3703 
3704   __ bind(LEN_OVER_15);
3705     __ push(spilled_regs, sp);
3706     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3707     __ cbz(rscratch2, ALIGNED);
3708     __ ldp(tmp6, tmp1, Address(ary1));
3709     __ mov(tmp5, 16);
3710     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3711     __ add(ary1, ary1, rscratch1);
3712     __ sub(len, len, rscratch1);
3713     __ orr(tmp6, tmp6, tmp1);
3714     __ tst(tmp6, UPPER_BIT_MASK);
3715     __ br(Assembler::NE, RET_TRUE);
3716 
3717   __ bind(ALIGNED);
3718     __ cmp(len, large_loop_size);
3719     __ br(Assembler::LT, CHECK_16);
3720     // Perform a 16-byte load as an early-return check in the pre-loop to handle the
3721     // case where an initially aligned large array has negative values in its first
3722     // bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 (in the worst case),
3723     // which is slower. Cases with negative bytes further ahead are not affected
3724     // much; in fact they get faster due to the early loads, fewer instructions and
3725     // fewer branches in LARGE_LOOP.
3726     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3727     __ sub(len, len, 16);
3728     __ orr(tmp6, tmp6, tmp1);
3729     __ tst(tmp6, UPPER_BIT_MASK);
3730     __ br(Assembler::NE, RET_TRUE);
3731     __ cmp(len, large_loop_size);
3732     __ br(Assembler::LT, CHECK_16);
3733 
3734     if (SoftwarePrefetchHintDistance >= 0
3735         && SoftwarePrefetchHintDistance >= dcache_line) {
3736       // initial prefetch
3737       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3738     }
3739   __ bind(LARGE_LOOP);
3740     if (SoftwarePrefetchHintDistance >= 0) {
3741       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3742     }
3743     // Issue the load instructions first, since that can save a few CPU/memory cycles.
3744     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one per ldp) it is
3745     // better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves 3
3746     // instructions and has fewer branches, but this approach disables the
3747     // early return, so all 64 bytes are loaded and checked every time.
3748     __ ldp(tmp2, tmp3, Address(ary1));
3749     __ ldp(tmp4, tmp5, Address(ary1, 16));
3750     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3751     __ ldp(tmp6, tmp1, Address(ary1, 48));
3752     __ add(ary1, ary1, large_loop_size);
3753     __ sub(len, len, large_loop_size);
3754     __ orr(tmp2, tmp2, tmp3);
3755     __ orr(tmp4, tmp4, tmp5);
3756     __ orr(rscratch1, rscratch1, rscratch2);
3757     __ orr(tmp6, tmp6, tmp1);
3758     __ orr(tmp2, tmp2, tmp4);
3759     __ orr(rscratch1, rscratch1, tmp6);
3760     __ orr(tmp2, tmp2, rscratch1);
3761     __ tst(tmp2, UPPER_BIT_MASK);
3762     __ br(Assembler::NE, RET_TRUE);
3763     __ cmp(len, large_loop_size);
3764     __ br(Assembler::GE, LARGE_LOOP);
3765 
3766   __ bind(CHECK_16); // small 16-byte load pre-loop
3767     __ cmp(len, 16);
3768     __ br(Assembler::LT, POST_LOOP16);
3769 
3770   __ bind(LOOP16); // small 16-byte load loop
3771     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3772     __ sub(len, len, 16);
3773     __ orr(tmp2, tmp2, tmp3);
3774     __ tst(tmp2, UPPER_BIT_MASK);
3775     __ br(Assembler::NE, RET_TRUE);
3776     __ cmp(len, 16);
3777     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3778 
3779   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3780     __ cmp(len, 8);
3781     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3782     __ ldr(tmp3, Address(__ post(ary1, 8)));
3783     __ sub(len, len, 8);
3784     __ tst(tmp3, UPPER_BIT_MASK);
3785     __ br(Assembler::NE, RET_TRUE);
3786 
3787   __ bind(POST_LOOP16_LOAD_TAIL);
3788     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3789     __ ldr(tmp1, Address(ary1));
3790     __ mov(tmp2, 64);
3791     __ sub(tmp4, tmp2, len, __ LSL, 3);
3792     __ lslv(tmp1, tmp1, tmp4);
3793     __ tst(tmp1, UPPER_BIT_MASK);
3794     __ br(Assembler::NE, RET_TRUE);
3795     // Fallthrough
3796 
3797   __ bind(RET_FALSE);
3798     __ pop(spilled_regs, sp);
3799     __ leave();
3800     __ mov(result, zr);
3801     __ ret(lr);
3802 
3803   __ bind(RET_TRUE);
3804     __ pop(spilled_regs, sp);
3805   __ bind(RET_TRUE_NO_POP);
3806     __ leave();
3807     __ mov(result, 1);
3808     __ ret(lr);
3809 
3810   __ bind(DONE);
3811     __ pop(spilled_regs, sp);
3812     __ leave();
3813     __ ret(lr);
3814     return entry;
3815   }
3816 
3817   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3818         bool usePrefetch, Label &NOT_EQUAL) {
3819     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3820         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3821         tmp7 = r12, tmp8 = r13;
3822     Label LOOP;
3823 
3824     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3825     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3826     __ bind(LOOP);
3827     if (usePrefetch) {
3828       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3829       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3830     }
3831     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3832     __ eor(tmp1, tmp1, tmp2);
3833     __ eor(tmp3, tmp3, tmp4);
3834     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3835     __ orr(tmp1, tmp1, tmp3);
3836     __ cbnz(tmp1, NOT_EQUAL);
3837     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3838     __ eor(tmp5, tmp5, tmp6);
3839     __ eor(tmp7, tmp7, tmp8);
3840     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3841     __ orr(tmp5, tmp5, tmp7);
3842     __ cbnz(tmp5, NOT_EQUAL);
3843     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3844     __ eor(tmp1, tmp1, tmp2);
3845     __ eor(tmp3, tmp3, tmp4);
3846     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3847     __ orr(tmp1, tmp1, tmp3);
3848     __ cbnz(tmp1, NOT_EQUAL);
3849     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3850     __ eor(tmp5, tmp5, tmp6);
3851     __ sub(cnt1, cnt1, 8 * wordSize);
3852     __ eor(tmp7, tmp7, tmp8);
3853     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3854     __ cmp(cnt1, loopThreshold);
3855     __ orr(tmp5, tmp5, tmp7);
3856     __ cbnz(tmp5, NOT_EQUAL);
3857     __ br(__ GE, LOOP);
3858     // post-loop
3859     __ eor(tmp1, tmp1, tmp2);
3860     __ eor(tmp3, tmp3, tmp4);
3861     __ orr(tmp1, tmp1, tmp3);
3862     __ sub(cnt1, cnt1, 2 * wordSize);
3863     __ cbnz(tmp1, NOT_EQUAL);
3864   }
3865 
3866   void generate_large_array_equals_loop_simd(int loopThreshold,
3867         bool usePrefetch, Label &NOT_EQUAL) {
3868     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3869         tmp2 = rscratch2;
3870     Label LOOP;
3871 
3872     __ bind(LOOP);
3873     if (usePrefetch) {
3874       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3875       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3876     }
3877     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3878     __ sub(cnt1, cnt1, 8 * wordSize);
3879     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3880     __ cmp(cnt1, loopThreshold);
3881     __ eor(v0, __ T16B, v0, v4);
3882     __ eor(v1, __ T16B, v1, v5);
3883     __ eor(v2, __ T16B, v2, v6);
3884     __ eor(v3, __ T16B, v3, v7);
3885     __ orr(v0, __ T16B, v0, v1);
3886     __ orr(v1, __ T16B, v2, v3);
3887     __ orr(v0, __ T16B, v0, v1);
3888     __ umov(tmp1, v0, __ D, 0);
3889     __ umov(tmp2, v0, __ D, 1);
3890     __ orr(tmp1, tmp1, tmp2);
3891     __ cbnz(tmp1, NOT_EQUAL);
3892     __ br(__ GE, LOOP);
3893   }
3894 
3895   // a1 = r1 - array1 address
3896   // a2 = r2 - array2 address
3897   // result = r0 - return value. Already contains "false"
3898   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
3899   // r3-r5 are reserved temporary registers
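       // The loops below compare the arrays by XOR-ing corresponding words and
       // OR-accumulating the differences; any nonzero bit means the arrays differ.
       // Scalar equivalent (sketch):
       //
       //   uint64_t diff = 0;
       //   for (size_t i = 0; i < n; i++) diff |= a1[i] ^ a2[i];
       //   bool equal = (diff == 0);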
3900   address generate_large_array_equals() {
3901     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3902     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3903         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3904         tmp7 = r12, tmp8 = r13;
3905     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3906         SMALL_LOOP, POST_LOOP;
3907     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3908     // calculate if at least 32 prefetched bytes are used
3909     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3910     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3911     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3912     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3913         tmp5, tmp6, tmp7, tmp8);
3914 
3915     __ align(CodeEntryAlignment);
3916     address entry = __ pc();
3917     __ enter();
3918     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
3919     // also advance pointers to use post-increment instead of pre-increment
3920     __ add(a1, a1, wordSize);
3921     __ add(a2, a2, wordSize);
3922     if (AvoidUnalignedAccesses) {
3923       // Both implementations (SIMD/non-SIMD) use relatively wide load
3924       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution time)
3925       // on some CPUs when the address is not at least 16-byte aligned.
3926       // Arrays are currently 8-byte aligned, so do an additional 8-byte load,
3927       // if needed, for at least the first address to make it 16-byte aligned.
3928       Label ALIGNED16;
3929       __ tbz(a1, 3, ALIGNED16);
3930       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3931       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3932       __ sub(cnt1, cnt1, wordSize);
3933       __ eor(tmp1, tmp1, tmp2);
3934       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3935       __ bind(ALIGNED16);
3936     }
3937     if (UseSIMDForArrayEquals) {
3938       if (SoftwarePrefetchHintDistance >= 0) {
3939         __ cmp(cnt1, prefetchLoopThreshold);
3940         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3941         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3942             /* prfm = */ true, NOT_EQUAL);
3943         __ cmp(cnt1, nonPrefetchLoopThreshold);
3944         __ br(__ LT, TAIL);
3945       }
3946       __ bind(NO_PREFETCH_LARGE_LOOP);
3947       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3948           /* prfm = */ false, NOT_EQUAL);
3949     } else {
3950       __ push(spilled_regs, sp);
3951       if (SoftwarePrefetchHintDistance >= 0) {
3952         __ cmp(cnt1, prefetchLoopThreshold);
3953         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3954         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3955             /* prfm = */ true, NOT_EQUAL);
3956         __ cmp(cnt1, nonPrefetchLoopThreshold);
3957         __ br(__ LT, TAIL);
3958       }
3959       __ bind(NO_PREFETCH_LARGE_LOOP);
3960       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3961           /* prfm = */ false, NOT_EQUAL);
3962     }
3963     __ bind(TAIL);
3964       __ cbz(cnt1, EQUAL);
3965       __ subs(cnt1, cnt1, wordSize);
3966       __ br(__ LE, POST_LOOP);
3967     __ bind(SMALL_LOOP);
3968       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3969       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3970       __ subs(cnt1, cnt1, wordSize);
3971       __ eor(tmp1, tmp1, tmp2);
3972       __ cbnz(tmp1, NOT_EQUAL);
3973       __ br(__ GT, SMALL_LOOP);
3974     __ bind(POST_LOOP);
3975       __ ldr(tmp1, Address(a1, cnt1));
3976       __ ldr(tmp2, Address(a2, cnt1));
3977       __ eor(tmp1, tmp1, tmp2);
3978       __ cbnz(tmp1, NOT_EQUAL);
3979     __ bind(EQUAL);
3980       __ mov(result, true);
3981     __ bind(NOT_EQUAL);
3982       if (!UseSIMDForArrayEquals) {
3983         __ pop(spilled_regs, sp);
3984       }
3985     __ bind(NOT_EQUAL_NO_POP);
3986     __ leave();
3987     __ ret(lr);
3988     return entry;
3989   }
3990 
3991 
3992   /**
3993    *  Arguments:
3994    *
3995    *  Input:
3996    *  c_rarg0   - current state address
3997    *  c_rarg1   - H key address
3998    *  c_rarg2   - data address
3999    *  c_rarg3   - number of blocks
4000    *
4001    *  Output:
4002    *  Updated state at c_rarg0
4003    */
4004   address generate_ghash_processBlocks() {
4005     // Bafflingly, GCM uses little-endian for the byte order, but
4006     // big-endian for the bit order.  For example, the polynomial 1 is
4007     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4008     //
4009     // So, we must either reverse the bytes in each word and do
4010     // everything big-endian or reverse the bits in each byte and do
4011     // it little-endian.  On AArch64 it's more idiomatic to reverse
4012     // the bits in each byte (we have an instruction, RBIT, to do
4013     // that) and keep the data in little-endian bit order throughout the
4014     // calculation, bit-reversing the inputs and outputs.
4015 
4016     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4017     __ align(wordSize * 2);
4018     address p = __ pc();
4019     __ emit_int64(0x87);  // The low-order bits of the field
4020                           // polynomial (i.e. p = z^7+z^2+z+1)
4021                           // repeated in the low and high parts of a
4022                           // 128-bit vector
4023     __ emit_int64(0x87);
4024 
4025     __ align(CodeEntryAlignment);
4026     address start = __ pc();
4027 
4028     Register state   = c_rarg0;
4029     Register subkeyH = c_rarg1;
4030     Register data    = c_rarg2;
4031     Register blocks  = c_rarg3;
4032 
4033     FloatRegister vzr = v30;
4034     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4035 
4036     __ ldrq(v0, Address(state));
4037     __ ldrq(v1, Address(subkeyH));
4038 
4039     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4040     __ rbit(v0, __ T16B, v0);
4041     __ rev64(v1, __ T16B, v1);
4042     __ rbit(v1, __ T16B, v1);
4043 
4044     __ ldrq(v26, p);
4045 
4046     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v16
4047     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4048 
4049     {
4050       Label L_ghash_loop;
4051       __ bind(L_ghash_loop);
4052 
4053       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4054                                                  // reversing each byte
4055       __ rbit(v2, __ T16B, v2);
4056       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4057 
4058       // Multiply state in v2 by subkey in v1
4059       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4060                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4061                      /*temps*/v6, v20, v18, v21);
4062       // Reduce v7:v5 by the field polynomial
4063       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4064 
4065       __ sub(blocks, blocks, 1);
4066       __ cbnz(blocks, L_ghash_loop);
4067     }
4068 
4069     // The bit-reversed result is at this point in v0
4070     __ rev64(v1, __ T16B, v0);
4071     __ rbit(v1, __ T16B, v1);
4072 
4073     __ st1(v1, __ T16B, state);
4074     __ ret(lr);
4075 
4076     return start;
4077   }
4078 
4079   // Continuation point for throwing of implicit exceptions that are
4080   // not handled in the current activation. Fabricates an exception
4081   // oop and initiates normal exception dispatching in this
4082   // frame. Since we need to preserve callee-saved values (currently
4083   // only for C2, but done for C1 as well) we need a callee-saved oop
4084   // map and therefore have to make these stubs into RuntimeStubs
4085   // rather than BufferBlobs.  If the compiler needs all registers to
4086   // be preserved between the fault point and the exception handler
4087   // then it must assume responsibility for that in
4088   // AbstractCompiler::continuation_for_implicit_null_exception or
4089   // continuation_for_implicit_division_by_zero_exception. All other
4090   // implicit exceptions (e.g., NullPointerException or
4091   // AbstractMethodError on entry) are either at call sites or
4092   // otherwise assume that stack unwinding will be initiated, so
4093   // caller saved registers were assumed volatile in the compiler.
4094 
4095 #undef __
4096 #define __ masm->
4097 
4098   address generate_throw_exception(const char* name,
4099                                    address runtime_entry,
4100                                    Register arg1 = noreg,
4101                                    Register arg2 = noreg) {
4102     // Information about frame layout at time of blocking runtime call.
4103     // Note that we only have to preserve callee-saved registers since
4104     // the compilers are responsible for supplying a continuation point
4105     // if they expect all registers to be preserved.
4106     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4107     enum layout {
4108       rfp_off = 0,
4109       rfp_off2,
4110       return_off,
4111       return_off2,
4112       framesize // inclusive of return address
4113     };
4114 
4115     int insts_size = 512;
4116     int locs_size  = 64;
4117 
4118     CodeBuffer code(name, insts_size, locs_size);
4119     OopMapSet* oop_maps  = new OopMapSet();
4120     MacroAssembler* masm = new MacroAssembler(&code);
4121 
4122     address start = __ pc();
4123 
4124     // This is an inlined and slightly modified version of call_VM
4125     // which has the ability to fetch the return PC out of
4126     // thread-local storage and also sets up last_Java_sp slightly
4127     // differently than the real call_VM
4128 
4129     __ enter(); // Save FP and LR before call
4130 
4131     assert(is_even(framesize/2), "sp not 16-byte aligned");
4132 
4133     // lr and fp are already in place
4134     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4135 
4136     int frame_complete = __ pc() - start;
4137 
4138     // Set up last_Java_sp and last_Java_fp
4139     address the_pc = __ pc();
4140     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
4141 
4142     // Call runtime
4143     if (arg1 != noreg) {
4144       assert(arg2 != c_rarg1, "clobbered");
4145       __ mov(c_rarg1, arg1);
4146     }
4147     if (arg2 != noreg) {
4148       __ mov(c_rarg2, arg2);
4149     }
4150     __ mov(c_rarg0, rthread);
4151     BLOCK_COMMENT("call runtime_entry");
4152     __ mov(rscratch1, runtime_entry);
4153     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
4154 
4155     // Generate oop map
4156     OopMap* map = new OopMap(framesize, 0);
4157 
4158     oop_maps->add_gc_map(the_pc - start, map);
4159 
4160     __ reset_last_Java_frame(true);
4161     __ maybe_isb();
4162 
4163     __ leave();
4164 
4165     // check for pending exceptions
4166 #ifdef ASSERT
4167     Label L;
4168     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4169     __ cbnz(rscratch1, L);
4170     __ should_not_reach_here();
4171     __ bind(L);
4172 #endif // ASSERT
4173     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4174 
4175 
4176     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4177     RuntimeStub* stub =
4178       RuntimeStub::new_runtime_stub(name,
4179                                     &code,
4180                                     frame_complete,
4181                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4182                                     oop_maps, false);
4183     return stub->entry_point();
4184   }
4185 
4186   class MontgomeryMultiplyGenerator : public MacroAssembler {
4187 
4188     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4189       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4190 
4191     RegSet _toSave;
4192     bool _squaring;
4193 
4194   public:
4195     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4196       : MacroAssembler(as->code()), _squaring(squaring) {
4197 
4198       // Register allocation
4199 
4200       Register reg = c_rarg0;
4201       Pa_base = reg;       // Argument registers
4202       if (squaring)
4203         Pb_base = Pa_base;
4204       else
4205         Pb_base = ++reg;
4206       Pn_base = ++reg;
4207       Rlen= ++reg;
4208       inv = ++reg;
4209       Pm_base = ++reg;
4210 
4211                           // Working registers:
4212       Ra =  ++reg;        // The current digit of a, b, n, and m.
4213       Rb =  ++reg;
4214       Rm =  ++reg;
4215       Rn =  ++reg;
4216 
4217       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4218       Pb =  ++reg;
4219       Pm =  ++reg;
4220       Pn =  ++reg;
4221 
4222       t0 =  ++reg;        // Three registers which form a
4223       t1 =  ++reg;        // triple-precision accumulator.
4224       t2 =  ++reg;
4225 
4226       Ri =  ++reg;        // Inner and outer loop indexes.
4227       Rj =  ++reg;
4228 
4229       Rhi_ab = ++reg;     // Product registers: low and high parts
4230       Rlo_ab = ++reg;     // of a*b and m*n.
4231       Rhi_mn = ++reg;
4232       Rlo_mn = ++reg;
4233 
4234       // r19 and up are callee-saved.
4235       _toSave = RegSet::range(r19, reg) + Pm_base;
4236     }
4237 
4238   private:
4239     void save_regs() {
4240       push(_toSave, sp);
4241     }
4242 
4243     void restore_regs() {
4244       pop(_toSave, sp);
4245     }
4246 
4247     template <typename T>
4248     void unroll_2(Register count, T block) {
4249       Label loop, end, odd;
4250       tbnz(count, 0, odd);
4251       cbz(count, end);
4252       align(16);
4253       bind(loop);
4254       (this->*block)();
4255       bind(odd);
4256       (this->*block)();
4257       subs(count, count, 2);
4258       br(Assembler::GT, loop);
4259       bind(end);
4260     }
4261 
4262     template <typename T>
4263     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4264       Label loop, end, odd;
4265       tbnz(count, 0, odd);
4266       cbz(count, end);
4267       align(16);
4268       bind(loop);
4269       (this->*block)(d, s, tmp);
4270       bind(odd);
4271       (this->*block)(d, s, tmp);
4272       subs(count, count, 2);
4273       br(Assembler::GT, loop);
4274       bind(end);
4275     }
4276 
4277     void pre1(RegisterOrConstant i) {
4278       block_comment("pre1");
4279       // Pa = Pa_base;
4280       // Pb = Pb_base + i;
4281       // Pm = Pm_base;
4282       // Pn = Pn_base + i;
4283       // Ra = *Pa;
4284       // Rb = *Pb;
4285       // Rm = *Pm;
4286       // Rn = *Pn;
4287       ldr(Ra, Address(Pa_base));
4288       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4289       ldr(Rm, Address(Pm_base));
4290       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4291       lea(Pa, Address(Pa_base));
4292       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4293       lea(Pm, Address(Pm_base));
4294       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4295 
4296       // Zero the m*n result.
4297       mov(Rhi_mn, zr);
4298       mov(Rlo_mn, zr);
4299     }
4300 
4301     // The core multiply-accumulate step of a Montgomery
4302     // multiplication.  The idea is to schedule operations as a
4303     // pipeline so that instructions with long latencies (loads and
4304     // multiplies) have time to complete before their results are
4305     // used.  This most benefits in-order implementations of the
4306     // architecture but out-of-order ones also benefit.
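         //
         // In the pseudocode comments below, MACC(a, b, t0, t1, t2) denotes the
         // triple-precision multiply-accumulate t2:t1:t0 += a * b, roughly
         // (sketch only, using GCC's unsigned __int128 for illustration):
         //
         //   void MACC(uint64_t a, uint64_t b, uint64_t& t0, uint64_t& t1, uint64_t& t2) {
         //     unsigned __int128 p = (unsigned __int128) a * b;
         //     unsigned __int128 s = (unsigned __int128) t0 + (uint64_t) p;    // low word
         //     t0 = (uint64_t) s;
         //     s  = (s >> 64) + t1 + (uint64_t) (p >> 64);                     // high word + carry
         //     t1 = (uint64_t) s;
         //     t2 += (uint64_t) (s >> 64);                                     // top word
         //   }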
4307     void step() {
4308       block_comment("step");
4309       // MACC(Ra, Rb, t0, t1, t2);
4310       // Ra = *++Pa;
4311       // Rb = *--Pb;
4312       umulh(Rhi_ab, Ra, Rb);
4313       mul(Rlo_ab, Ra, Rb);
4314       ldr(Ra, pre(Pa, wordSize));
4315       ldr(Rb, pre(Pb, -wordSize));
4316       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
4317                                        // previous iteration.
4318       // MACC(Rm, Rn, t0, t1, t2);
4319       // Rm = *++Pm;
4320       // Rn = *--Pn;
4321       umulh(Rhi_mn, Rm, Rn);
4322       mul(Rlo_mn, Rm, Rn);
4323       ldr(Rm, pre(Pm, wordSize));
4324       ldr(Rn, pre(Pn, -wordSize));
4325       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4326     }
4327 
4328     void post1() {
4329       block_comment("post1");
4330 
4331       // MACC(Ra, Rb, t0, t1, t2);
4332       // Ra = *++Pa;
4333       // Rb = *--Pb;
4334       umulh(Rhi_ab, Ra, Rb);
4335       mul(Rlo_ab, Ra, Rb);
4336       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4337       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4338 
4339       // *Pm = Rm = t0 * inv;
4340       mul(Rm, t0, inv);
4341       str(Rm, Address(Pm));
4342 
4343       // MACC(Rm, Rn, t0, t1, t2);
4344       // t0 = t1; t1 = t2; t2 = 0;
4345       umulh(Rhi_mn, Rm, Rn);
4346 
4347 #ifndef PRODUCT
4348       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4349       {
4350         mul(Rlo_mn, Rm, Rn);
4351         add(Rlo_mn, t0, Rlo_mn);
4352         Label ok;
4353         cbz(Rlo_mn, ok); {
4354           stop("broken Montgomery multiply");
4355         } bind(ok);
4356       }
4357 #endif
4358       // We have very carefully set things up so that
4359       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4360       // the lower half of Rm * Rn because we know the result already:
4361       // it must be -t0.  t0 + (-t0) must generate a carry iff
4362       // t0 != 0.  So, rather than do a mul and an adds we just set
4363       // the carry flag iff t0 is nonzero.
4364       //
4365       // mul(Rlo_mn, Rm, Rn);
4366       // adds(zr, t0, Rlo_mn);
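           // (On AArch64, SUBS sets the carry flag when there is no borrow,
           // i.e. when t0 >= 1 unsigned, which is exactly t0 != 0.)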
4367       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4368       adcs(t0, t1, Rhi_mn);
4369       adc(t1, t2, zr);
4370       mov(t2, zr);
4371     }
4372 
4373     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
4374       block_comment("pre2");
4375       // Pa = Pa_base + i-len;
4376       // Pb = Pb_base + len;
4377       // Pm = Pm_base + i-len;
4378       // Pn = Pn_base + len;
4379 
4380       if (i.is_register()) {
4381         sub(Rj, i.as_register(), len);
4382       } else {
4383         mov(Rj, i.as_constant());
4384         sub(Rj, Rj, len);
4385       }
4386       // Rj == i-len
4387 
4388       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
4389       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
4390       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4391       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
4392 
4393       // Ra = *++Pa;
4394       // Rb = *--Pb;
4395       // Rm = *++Pm;
4396       // Rn = *--Pn;
4397       ldr(Ra, pre(Pa, wordSize));
4398       ldr(Rb, pre(Pb, -wordSize));
4399       ldr(Rm, pre(Pm, wordSize));
4400       ldr(Rn, pre(Pn, -wordSize));
4401 
4402       mov(Rhi_mn, zr);
4403       mov(Rlo_mn, zr);
4404     }
4405 
4406     void post2(RegisterOrConstant i, RegisterOrConstant len) {
4407       block_comment("post2");
4408       if (i.is_constant()) {
4409         mov(Rj, i.as_constant()-len.as_constant());
4410       } else {
4411         sub(Rj, i.as_register(), len);
4412       }
4413 
4414       adds(t0, t0, Rlo_mn); // The pending m*n, low part
4415 
4416       // As soon as we know the least significant digit of our result,
4417       // store it.
4418       // Pm_base[i-len] = t0;
4419       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4420 
4421       // t0 = t1; t1 = t2; t2 = 0;
4422       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
4423       adc(t1, t2, zr);
4424       mov(t2, zr);
4425     }
4426 
4427     // A carry in t0 after Montgomery multiplication means that we
4428     // should subtract multiples of n from our result in m.  We'll
4429     // keep doing that until there is no carry.
4430     void normalize(RegisterOrConstant len) {
4431       block_comment("normalize");
4432       // while (t0)
4433       //   t0 = sub(Pm_base, Pn_base, t0, len);
4434       Label loop, post, again;
4435       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
4436       cbz(t0, post); {
4437         bind(again); {
4438           mov(i, zr);
4439           mov(cnt, len);
4440           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4441           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4442           subs(zr, zr, zr); // set carry flag, i.e. no borrow
4443           align(16);
4444           bind(loop); {
4445             sbcs(Rm, Rm, Rn);
4446             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4447             add(i, i, 1);
4448             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4449             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4450             sub(cnt, cnt, 1);
4451           } cbnz(cnt, loop);
4452           sbc(t0, t0, zr);
4453         } cbnz(t0, again);
4454       } bind(post);
4455     }
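         // In C, approximately, the sub() referenced in the comment above
         // (a sketch of the loop emitted here, not a separate routine):
         //
         //   unsigned long sub(unsigned long Pm[], unsigned long Pn[],
         //                     unsigned long t0, int len) {
         //     unsigned long borrow = 0;
         //     for (int i = 0; i < len; i++) {
         //       unsigned long x = Pm[i], y = Pn[i];
         //       Pm[i] = x - y - borrow;
         //       borrow = (x < y) || (x == y && borrow);
         //     }
         //     return t0 - borrow;
         //   }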
4456 
4457     // Move memory at s to d, reversing the order of the 64-bit words
4458     //    and swapping the 32-bit halves within each word (see reverse1)
4459     //    Increments d to the end of the copied memory
4460     //    Destroys tmp1, tmp2; preserves len
4461     //    Leaves s pointing to the address which was in d at the start
4462     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
4463       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
4464 
4465       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
4466       mov(tmp1, len);
4467       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
4468       sub(s, d, len, ext::uxtw, LogBytesPerWord);
4469     }
4470     // where
4471     void reverse1(Register d, Register s, Register tmp) {
4472       ldr(tmp, pre(s, -wordSize));
4473       ror(tmp, tmp, 32);
4474       str(tmp, post(d, wordSize));
4475     }
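         // reverse1 copies a single 64-bit word while swapping its two
         // 32-bit halves; together with the word reversal in reverse() this
         // in effect converts between the caller's most-significant-first
         // array of 32-bit ints and the least-significant-first 64-bit
         // words used by the multiply loops, and back again on exit.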
4476 
4477     void step_squaring() {
4478       // An extra acc: the Ra*Rb product is accumulated twice, doubling
           // the cross term (MACC2 in the C sketch below).
4479       step();
4480       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4481     }
4482 
4483     void last_squaring(RegisterOrConstant i) {
4484       Label dont;
4485       // if ((i & 1) == 0) {
4486       tbnz(i.as_register(), 0, dont); {
4487         // MACC(Ra, Rb, t0, t1, t2);
4488         // Ra = *++Pa;
4489         // Rb = *--Pb;
4490         umulh(Rhi_ab, Ra, Rb);
4491         mul(Rlo_ab, Ra, Rb);
4492         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4493       } bind(dont);
4494     }
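         // When i is even the middle element pairs with itself, so its
         // square is accumulated here exactly once rather than doubled
         // (the `if ((i & 1) == 0)' branch in the C sketch below).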
4495 
4496     void extra_step_squaring() {
4497       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4498 
4499       // MACC(Rm, Rn, t0, t1, t2);
4500       // Rm = *++Pm;
4501       // Rn = *--Pn;
4502       umulh(Rhi_mn, Rm, Rn);
4503       mul(Rlo_mn, Rm, Rn);
4504       ldr(Rm, pre(Pm, wordSize));
4505       ldr(Rn, pre(Pn, -wordSize));
4506     }
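         // Once the doubled a*b cross products are exhausted, only the m*n
         // reduction products remain, so this step performs just the
         // MACC(Rm, Rn) half of step() (the second inner loop in the C
         // sketch below).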
4507 
4508     void post1_squaring() {
4509       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4510 
4511       // *Pm = Rm = t0 * inv;
4512       mul(Rm, t0, inv);
4513       str(Rm, Address(Pm));
4514 
4515       // MACC(Rm, Rn, t0, t1, t2);
4516       // t0 = t1; t1 = t2; t2 = 0;
4517       umulh(Rhi_mn, Rm, Rn);
4518 
4519 #ifndef PRODUCT
4520       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4521       {
4522         mul(Rlo_mn, Rm, Rn);
4523         add(Rlo_mn, t0, Rlo_mn);
4524         Label ok;
4525         cbz(Rlo_mn, ok); {
4526           stop("broken Montgomery multiply");
4527         } bind(ok);
4528       }
4529 #endif
4530       // We have very carefully set things up so that
4531       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4532       // the lower half of Rm * Rn because we know the result already:
4533       // it must be -t0.  t0 + (-t0) must generate a carry iff
4534       // t0 != 0.  So, rather than do a mul and an adds we just set
4535       // the carry flag iff t0 is nonzero.
4536       //
4537       // mul(Rlo_mn, Rm, Rn);
4538       // adds(zr, t0, Rlo_mn);
4539       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4540       adcs(t0, t1, Rhi_mn);
4541       adc(t1, t2, zr);
4542       mov(t2, zr);
4543     }
4544 
4545     void acc(Register Rhi, Register Rlo,
4546              Register t0, Register t1, Register t2) {
4547       adds(t0, t0, Rlo);
4548       adcs(t1, t1, Rhi);
4549       adc(t2, t2, zr);
4550     }
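         // In C, approximately: acc(hi, lo, t0, t1, t2) adds the 128-bit
         // value hi:lo into the 192-bit accumulator t2:t1:t0, and the
         // MACC(A, B, t0, t1, t2) of the comments above stands for
         // acc(high64(A*B), low64(A*B), t0, t1, t2):
         //
         //   unsigned __int128 s = (unsigned __int128)t0 + lo;
         //   t0 = (unsigned long)s;
         //   s = (s >> 64) + t1 + hi;  t1 = (unsigned long)s;
         //   t2 += (unsigned long)(s >> 64);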
4551 
4552   public:
4553     /**
4554      * Fast Montgomery multiplication.  The derivation of the
4555      * algorithm is in A Cryptographic Library for the Motorola
4556      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
4557      *
4558      * Arguments:
4559      *
4560      * Inputs for multiplication:
4561      *   c_rarg0   - int array elements a
4562      *   c_rarg1   - int array elements b
4563      *   c_rarg2   - int array elements n (the modulus)
4564      *   c_rarg3   - int length
4565      *   c_rarg4   - int inv
4566      *   c_rarg5   - int array elements m (the result)
4567      *
4568      * Inputs for squaring:
4569      *   c_rarg0   - int array elements a
4570      *   c_rarg1   - int array elements n (the modulus)
4571      *   c_rarg2   - int length
4572      *   c_rarg3   - int inv
4573      *   c_rarg4   - int array elements m (the result)
4574      *
4575      */
4576     address generate_multiply() {
4577       Label argh, nothing;
4578       bind(argh);
4579       stop("MontgomeryMultiply total_allocation must be <= 8192");
4580 
4581       align(CodeEntryAlignment);
4582       address entry = pc();
4583 
4584       cbzw(Rlen, nothing);
4585 
4586       enter();
4587 
4588       // Make room on the stack: up to four working arrays of len/2
           // longwords each, i.e. at most len*16 bytes (hence the 8192-byte
           // limit checked below), and re-align sp to 16 bytes.
4589       cmpw(Rlen, 512);
4590       br(Assembler::HI, argh);
4591       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4592       andr(sp, Ra, -2 * wordSize);
4593 
4594       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4595 
4596       {
4597         // Copy input args, reversing as we go.  We use Ra as a
4598         // temporary variable.
4599         reverse(Ra, Pa_base, Rlen, t0, t1);
4600         if (!_squaring)
4601           reverse(Ra, Pb_base, Rlen, t0, t1);
4602         reverse(Ra, Pn_base, Rlen, t0, t1);
4603       }
4604 
4605       // Push all call-saved registers and also Pm_base which we'll need
4606       // at the end.
4607       save_regs();
4608 
4609 #ifndef PRODUCT
4610       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
4611       {
4612         ldr(Rn, Address(Pn_base, 0));
4613         mul(Rlo_mn, Rn, inv);
4614         cmp(Rlo_mn, -1);
4615         Label ok;
4616         br(EQ, ok); {
4617           stop("broken inverse in Montgomery multiply");
4618         } bind(ok);
4619       }
4620 #endif
4621 
4622       mov(Pm_base, Ra);
4623 
4624       mov(t0, zr);
4625       mov(t1, zr);
4626       mov(t2, zr);
4627 
4628       block_comment("for (int i = 0; i < len; i++) {");
4629       mov(Ri, zr); {
4630         Label loop, end;
4631         cmpw(Ri, Rlen);
4632         br(Assembler::GE, end);
4633 
4634         bind(loop);
4635         pre1(Ri);
4636 
4637         block_comment("  for (j = i; j; j--) {"); {
4638           movw(Rj, Ri);
4639           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4640         } block_comment("  } // j");
4641 
4642         post1();
4643         addw(Ri, Ri, 1);
4644         cmpw(Ri, Rlen);
4645         br(Assembler::LT, loop);
4646         bind(end);
4647         block_comment("} // i");
4648       }
4649 
4650       block_comment("for (int i = len; i < 2*len; i++) {");
4651       mov(Ri, Rlen); {
4652         Label loop, end;
4653         cmpw(Ri, Rlen, Assembler::LSL, 1);
4654         br(Assembler::GE, end);
4655 
4656         bind(loop);
4657         pre2(Ri, Rlen);
4658 
4659         block_comment("  for (j = len*2-i-1; j; j--) {"); {
4660           lslw(Rj, Rlen, 1);
4661           subw(Rj, Rj, Ri);
4662           subw(Rj, Rj, 1);
4663           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4664         } block_comment("  } // j");
4665 
4666         post2(Ri, Rlen);
4667         addw(Ri, Ri, 1);
4668         cmpw(Ri, Rlen, Assembler::LSL, 1);
4669         br(Assembler::LT, loop);
4670         bind(end);
4671       }
4672       block_comment("} // i");
4673 
4674       normalize(Rlen);
4675 
4676       mov(Ra, Pm_base);  // Save Pm_base in Ra
4677       restore_regs();  // Restore caller's Pm_base
4678 
4679       // Copy our result into caller's Pm_base
4680       reverse(Pm_base, Ra, Rlen, t0, t1);
4681 
4682       leave();
4683       bind(nothing);
4684       ret(lr);
4685 
4686       return entry;
4687     }
4688     // In C, approximately:
4689 
4690     // void
4691     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
4692     //                     unsigned long Pn_base[], unsigned long Pm_base[],
4693     //                     unsigned long inv, int len) {
4694     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4695     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4696     //   unsigned long Ra, Rb, Rn, Rm;
4697 
4698     //   int i;
4699 
4700     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4701 
4702     //   for (i = 0; i < len; i++) {
4703     //     int j;
4704 
4705     //     Pa = Pa_base;
4706     //     Pb = Pb_base + i;
4707     //     Pm = Pm_base;
4708     //     Pn = Pn_base + i;
4709 
4710     //     Ra = *Pa;
4711     //     Rb = *Pb;
4712     //     Rm = *Pm;
4713     //     Rn = *Pn;
4714 
4715     //     int iters = i;
4716     //     for (j = 0; iters--; j++) {
4717     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4718     //       MACC(Ra, Rb, t0, t1, t2);
4719     //       Ra = *++Pa;
4720     //       Rb = *--Pb;
4721     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4722     //       MACC(Rm, Rn, t0, t1, t2);
4723     //       Rm = *++Pm;
4724     //       Rn = *--Pn;
4725     //     }
4726 
4727     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
4728     //     MACC(Ra, Rb, t0, t1, t2);
4729     //     *Pm = Rm = t0 * inv;
4730     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4731     //     MACC(Rm, Rn, t0, t1, t2);
4732 
4733     //     assert(t0 == 0, "broken Montgomery multiply");
4734 
4735     //     t0 = t1; t1 = t2; t2 = 0;
4736     //   }
4737 
4738     //   for (i = len; i < 2*len; i++) {
4739     //     int j;
4740 
4741     //     Pa = Pa_base + i-len;
4742     //     Pb = Pb_base + len;
4743     //     Pm = Pm_base + i-len;
4744     //     Pn = Pn_base + len;
4745 
4746     //     Ra = *++Pa;
4747     //     Rb = *--Pb;
4748     //     Rm = *++Pm;
4749     //     Rn = *--Pn;
4750 
4751     //     int iters = len*2-i-1;
4752     //     for (j = i-len+1; iters--; j++) {
4753     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4754     //       MACC(Ra, Rb, t0, t1, t2);
4755     //       Ra = *++Pa;
4756     //       Rb = *--Pb;
4757     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4758     //       MACC(Rm, Rn, t0, t1, t2);
4759     //       Rm = *++Pm;
4760     //       Rn = *--Pn;
4761     //     }
4762 
4763     //     Pm_base[i-len] = t0;
4764     //     t0 = t1; t1 = t2; t2 = 0;
4765     //   }
4766 
4767     //   while (t0)
4768     //     t0 = sub(Pm_base, Pn_base, t0, len);
4769     // }
4770 
4771     /**
4772      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
4773      * multiplies than Montgomery multiplication (each a[i]*a[j] cross
4774      * product is computed once and added twice rather than computed
4775      * twice), so it should be up to 25% faster.  However, its loop
          * control is more complex and it may actually run slower on some
          * machines.
4776      *
4777      * Arguments:
4778      *
4779      * Inputs:
4780      *   c_rarg0   - int array elements a
4781      *   c_rarg1   - int array elements n (the modulus)
4782      *   c_rarg2   - int length
4783      *   c_rarg3   - int inv
4784      *   c_rarg4   - int array elements m (the result)
4785      *
4786      */
4787     address generate_square() {
4788       Label argh;
4789       bind(argh);
4790       stop("MontgomeryMultiply total_allocation must be <= 8192");
4791 
4792       align(CodeEntryAlignment);
4793       address entry = pc();
4794 
4795       enter();
4796 
4797       // Make room.
4798       cmpw(Rlen, 512);
4799       br(Assembler::HI, argh);
4800       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4801       andr(sp, Ra, -2 * wordSize);
4802 
4803       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4804 
4805       {
4806         // Copy input args, reversing as we go.  We use Ra as a
4807         // temporary variable.
4808         reverse(Ra, Pa_base, Rlen, t0, t1);
4809         reverse(Ra, Pn_base, Rlen, t0, t1);
4810       }
4811 
4812       // Push all call-saved registers and also Pm_base which we'll need
4813       // at the end.
4814       save_regs();
4815 
4816       mov(Pm_base, Ra);
4817 
4818       mov(t0, zr);
4819       mov(t1, zr);
4820       mov(t2, zr);
4821 
4822       block_comment("for (int i = 0; i < len; i++) {");
4823       mov(Ri, zr); {
4824         Label loop, end;
4825         bind(loop);
4826         cmp(Ri, Rlen);
4827         br(Assembler::GE, end);
4828 
4829         pre1(Ri);
4830 
4831         block_comment("for (j = (i+1)/2; j; j--) {"); {
4832           add(Rj, Ri, 1);
4833           lsr(Rj, Rj, 1);
4834           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4835         } block_comment("  } // j");
4836 
4837         last_squaring(Ri);
4838 
4839         block_comment("  for (j = i/2; j; j--) {"); {
4840           lsr(Rj, Ri, 1);
4841           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4842         } block_comment("  } // j");
4843 
4844         post1_squaring();
4845         add(Ri, Ri, 1);
4846         cmp(Ri, Rlen);
4847         br(Assembler::LT, loop);
4848 
4849         bind(end);
4850         block_comment("} // i");
4851       }
4852 
4853       block_comment("for (int i = len; i < 2*len; i++) {");
4854       mov(Ri, Rlen); {
4855         Label loop, end;
4856         bind(loop);
4857         cmp(Ri, Rlen, Assembler::LSL, 1);
4858         br(Assembler::GE, end);
4859 
4860         pre2(Ri, Rlen);
4861 
4862         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4863           lsl(Rj, Rlen, 1);
4864           sub(Rj, Rj, Ri);
4865           sub(Rj, Rj, 1);
4866           lsr(Rj, Rj, 1);
4867           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4868         } block_comment("  } // j");
4869 
4870         last_squaring(Ri);
4871 
4872         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4873           lsl(Rj, Rlen, 1);
4874           sub(Rj, Rj, Ri);
4875           lsr(Rj, Rj, 1);
4876           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4877         } block_comment("  } // j");
4878 
4879         post2(Ri, Rlen);
4880         add(Ri, Ri, 1);
4881         cmp(Ri, Rlen, Assembler::LSL, 1);
4882 
4883         br(Assembler::LT, loop);
4884         bind(end);
4885         block_comment("} // i");
4886       }
4887 
4888       normalize(Rlen);
4889 
4890       mov(Ra, Pm_base);  // Save Pm_base in Ra
4891       restore_regs();  // Restore caller's Pm_base
4892 
4893       // Copy our result into caller's Pm_base
4894       reverse(Pm_base, Ra, Rlen, t0, t1);
4895 
4896       leave();
4897       ret(lr);
4898 
4899       return entry;
4900     }
4901     // In C, approximately:
4902 
4903     // void
4904     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4905     //                   unsigned long Pm_base[], unsigned long inv, int len) {
4906     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4907     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4908     //   unsigned long Ra, Rb, Rn, Rm;
4909 
4910     //   int i;
4911 
4912     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4913 
4914     //   for (i = 0; i < len; i++) {
4915     //     int j;
4916 
4917     //     Pa = Pa_base;
4918     //     Pb = Pa_base + i;
4919     //     Pm = Pm_base;
4920     //     Pn = Pn_base + i;
4921 
4922     //     Ra = *Pa;
4923     //     Rb = *Pb;
4924     //     Rm = *Pm;
4925     //     Rn = *Pn;
4926 
4927     //     int iters = (i+1)/2;
4928     //     for (j = 0; iters--; j++) {
4929     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4930     //       MACC2(Ra, Rb, t0, t1, t2);
4931     //       Ra = *++Pa;
4932     //       Rb = *--Pb;
4933     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4934     //       MACC(Rm, Rn, t0, t1, t2);
4935     //       Rm = *++Pm;
4936     //       Rn = *--Pn;
4937     //     }
4938     //     if ((i & 1) == 0) {
4939     //       assert(Ra == Pa_base[j], "must be");
4940     //       MACC(Ra, Ra, t0, t1, t2);
4941     //     }
4942     //     iters = i/2;
4943     //     assert(iters == i-j, "must be");
4944     //     for (; iters--; j++) {
4945     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4946     //       MACC(Rm, Rn, t0, t1, t2);
4947     //       Rm = *++Pm;
4948     //       Rn = *--Pn;
4949     //     }
4950 
4951     //     *Pm = Rm = t0 * inv;
4952     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4953     //     MACC(Rm, Rn, t0, t1, t2);
4954 
4955     //     assert(t0 == 0, "broken Montgomery multiply");
4956 
4957     //     t0 = t1; t1 = t2; t2 = 0;
4958     //   }
4959 
4960     //   for (i = len; i < 2*len; i++) {
4961     //     int start = i-len+1;
4962     //     int end = start + (len - start)/2;
4963     //     int j;
4964 
4965     //     Pa = Pa_base + i-len;
4966     //     Pb = Pa_base + len;
4967     //     Pm = Pm_base + i-len;
4968     //     Pn = Pn_base + len;
4969 
4970     //     Ra = *++Pa;
4971     //     Rb = *--Pb;
4972     //     Rm = *++Pm;
4973     //     Rn = *--Pn;
4974 
4975     //     int iters = (2*len-i-1)/2;
4976     //     assert(iters == end-start, "must be");
4977     //     for (j = start; iters--; j++) {
4978     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4979     //       MACC2(Ra, Rb, t0, t1, t2);
4980     //       Ra = *++Pa;
4981     //       Rb = *--Pb;
4982     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4983     //       MACC(Rm, Rn, t0, t1, t2);
4984     //       Rm = *++Pm;
4985     //       Rn = *--Pn;
4986     //     }
4987     //     if ((i & 1) == 0) {
4988     //       assert(Ra == Pa_base[j], "must be");
4989     //       MACC(Ra, Ra, t0, t1, t2);
4990     //     }
4991     //     iters =  (2*len-i)/2;
4992     //     assert(iters == len-j, "must be");
4993     //     for (; iters--; j++) {
4994     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4995     //       MACC(Rm, Rn, t0, t1, t2);
4996     //       Rm = *++Pm;
4997     //       Rn = *--Pn;
4998     //     }
4999     //     Pm_base[i-len] = t0;
5000     //     t0 = t1; t1 = t2; t2 = 0;
5001     //   }
5002 
5003     //   while (t0)
5004     //     t0 = sub(Pm_base, Pn_base, t0, len);
5005     // }
5006   };
5007 
5008 
5009   // Initialization
5010   void generate_initial() {
5011     // Generate the initial stubs and initialize the entry points
5012 
5013     // Entry points that exist on all platforms.  Note: this is code
5014     // that could be shared among different platforms; however, the
5015     // benefit seems to be smaller than the disadvantage of having a
5016     // much more complicated generator structure.  See also the comment
5017     // in stubRoutines.hpp.
5018 
5019     StubRoutines::_forward_exception_entry = generate_forward_exception();
5020 
5021     StubRoutines::_call_stub_entry =
5022       generate_call_stub(StubRoutines::_call_stub_return_address);
5023 
5024     // This entry is referenced by megamorphic calls.
5025     StubRoutines::_catch_exception_entry = generate_catch_exception();
5026 
5027     // Build this early so it's available for the interpreter.
5028     StubRoutines::_throw_StackOverflowError_entry =
5029       generate_throw_exception("StackOverflowError throw_exception",
5030                                CAST_FROM_FN_PTR(address,
5031                                                 SharedRuntime::throw_StackOverflowError));
5032     StubRoutines::_throw_delayed_StackOverflowError_entry =
5033       generate_throw_exception("delayed StackOverflowError throw_exception",
5034                                CAST_FROM_FN_PTR(address,
5035                                                 SharedRuntime::throw_delayed_StackOverflowError));
5036     if (UseCRC32Intrinsics) {
5037       // Set the table address before generating the stubs that use it.
5038       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5039       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5040     }
5041 
5042     if (UseCRC32CIntrinsics) {
5043       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5044     }
5045   }
5046 
5047   void generate_all() {
5048     // support for verify_oop (must happen after universe_init)
5049     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
5050     StubRoutines::_throw_AbstractMethodError_entry =
5051       generate_throw_exception("AbstractMethodError throw_exception",
5052                                CAST_FROM_FN_PTR(address,
5053                                                 SharedRuntime::
5054                                                 throw_AbstractMethodError));
5055 
5056     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5057       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5058                                CAST_FROM_FN_PTR(address,
5059                                                 SharedRuntime::
5060                                                 throw_IncompatibleClassChangeError));
5061 
5062     StubRoutines::_throw_NullPointerException_at_call_entry =
5063       generate_throw_exception("NullPointerException at call throw_exception",
5064                                CAST_FROM_FN_PTR(address,
5065                                                 SharedRuntime::
5066                                                 throw_NullPointerException_at_call));
5067 
5068     // arraycopy stubs used by compilers
5069     generate_arraycopy_stubs();
5070 
5071     // has negatives stub for large arrays.
5072     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5073 
5074     // array equals stub for large arrays.
5075     if (!UseSimpleArrayEquals) {
5076       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5077     }
5078 
5079     if (UseMultiplyToLenIntrinsic) {
5080       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5081     }
5082 
5083     if (UseSquareToLenIntrinsic) {
5084       StubRoutines::_squareToLen = generate_squareToLen();
5085     }
5086 
5087     if (UseMulAddIntrinsic) {
5088       StubRoutines::_mulAdd = generate_mulAdd();
5089     }
5090 
5091     if (UseMontgomeryMultiplyIntrinsic) {
5092       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5093       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5094       StubRoutines::_montgomeryMultiply = g.generate_multiply();
5095     }
5096 
5097     if (UseMontgomerySquareIntrinsic) {
5098       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5099       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5100       // We use generate_multiply() rather than generate_square()
5101       // because it's faster for the sizes of modulus we care about.
5102       StubRoutines::_montgomerySquare = g.generate_multiply();
5103     }
5104 
5105 #ifndef BUILTIN_SIM
5106     // generate GHASH intrinsics code
5107     if (UseGHASHIntrinsics) {
5108       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5109     }
5110 
5111     if (UseAESIntrinsics) {
5112       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5113       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5114       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5115       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
5116     }
5117 
5118     if (UseSHA1Intrinsics) {
5119       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5120       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5121     }
5122     if (UseSHA256Intrinsics) {
5123       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5124       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5125     }
5126 
5127     // generate Adler32 intrinsics code
5128     if (UseAdler32Intrinsics) {
5129       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
5130     }
5131 
5132     // Safefetch stubs.
5133     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5134                                                        &StubRoutines::_safefetch32_fault_pc,
5135                                                        &StubRoutines::_safefetch32_continuation_pc);
5136     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5137                                                        &StubRoutines::_safefetchN_fault_pc,
5138                                                        &StubRoutines::_safefetchN_continuation_pc);
5139 #endif
5140     StubRoutines::aarch64::set_completed();
5141   }
5142 
5143  public:
5144   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5145     if (all) {
5146       generate_all();
5147     } else {
5148       generate_initial();
5149     }
5150   }
5151 }; // end class declaration
5152 
5153 void StubGenerator_generate(CodeBuffer* code, bool all) {
5154   StubGenerator g(code, all);
5155 }