1 /*
   2  * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "nativeInst_aarch64.hpp"
  33 #include "oops/instanceOop.hpp"
  34 #include "oops/method.hpp"
  35 #include "oops/objArrayKlass.hpp"
  36 #include "oops/oop.inline.hpp"
  37 #include "prims/methodHandles.hpp"
  38 #include "runtime/frame.inline.hpp"
  39 #include "runtime/handles.inline.hpp"
  40 #include "runtime/sharedRuntime.hpp"
  41 #include "runtime/stubCodeGenerator.hpp"
  42 #include "runtime/stubRoutines.hpp"
  43 #include "runtime/thread.inline.hpp"
  44 #include "utilities/align.hpp"
  45 #ifdef COMPILER2
  46 #include "opto/runtime.hpp"
  47 #endif
  48 
  49 #ifdef BUILTIN_SIM
  50 #include "../../../../../../simulator/simulator.hpp"
  51 #endif
  52 
  53 // Declaration and definition of StubGenerator (no .hpp file).
  54 // For a more detailed description of the stub routine structure
  55 // see the comment in stubRoutines.hpp
  56 
  57 #undef __
  58 #define __ _masm->
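// TIMES_OOP scales a sign-extended 32-bit array index by the size of an
// oop element: 4 bytes when compressed oops are in use, 8 bytes otherwise.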
  59 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  60 
  61 #ifdef PRODUCT
  62 #define BLOCK_COMMENT(str) /* nothing */
  63 #else
  64 #define BLOCK_COMMENT(str) __ block_comment(str)
  65 #endif
  66 
  67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  68 
  69 // Stub Code definitions
  70 
  71 class StubGenerator: public StubCodeGenerator {
  72  private:
  73 
  74 #ifdef PRODUCT
  75 #define inc_counter_np(counter) ((void)0)
  76 #else
  77   void inc_counter_np_(int& counter) {
  78     __ lea(rscratch2, ExternalAddress((address)&counter));
  79     __ ldrw(rscratch1, Address(rscratch2));
  80     __ addw(rscratch1, rscratch1, 1);
  81     __ strw(rscratch1, Address(rscratch2));
  82   }
  83 #define inc_counter_np(counter) \
  84   BLOCK_COMMENT("inc_counter " #counter); \
  85   inc_counter_np_(counter);
  86 #endif
  87 
  88   // Call stubs are used to call Java from C
  89   //
  90   // Arguments:
  91   //    c_rarg0:   call wrapper address                   address
  92   //    c_rarg1:   result                                 address
  93   //    c_rarg2:   result type                            BasicType
  94   //    c_rarg3:   method                                 Method*
  95   //    c_rarg4:   (interpreter) entry point              address
  96   //    c_rarg5:   parameters                             intptr_t*
  97   //    c_rarg6:   parameter size (in words)              int
  98   //    c_rarg7:   thread                                 Thread*
  99   //
 100   // There is no return from the stub itself as any Java result
 101   // is written to result
 102   //
  // we save r30 (lr) as the return PC at the base of the frame and
  // save r29 (fp) immediately below it, then copy sp (r31) into
  // r29 (fp) so it serves as the frame pointer for this frame.
 106   //
 107   // we save r0-r7, which accounts for all the c arguments.
 108   //
 109   // TODO: strictly do we need to save them all? they are treated as
 110   // volatile by C so could we omit saving the ones we are going to
 111   // place in global registers (thread? method?) or those we only use
 112   // during setup of the Java call?
 113   //
 114   // we don't need to save r8 which C uses as an indirect result location
 115   // return register.
 116   //
 117   // we don't need to save r9-r15 which both C and Java treat as
 118   // volatile
 119   //
 120   // we don't need to save r16-18 because Java does not use them
 121   //
 122   // we save r19-r28 which Java uses as scratch registers and C
 123   // expects to be callee-save
 124   //
 125   // we save the bottom 64 bits of each value stored in v8-v15; it is
 126   // the responsibility of the caller to preserve larger values.
 127   //
 128   // so the stub frame looks like this when we enter Java code
 129   //
 130   //     [ return_from_Java     ] <--- sp
 131   //     [ argument word n      ]
 132   //      ...
 133   // -27 [ argument word 1      ]
 134   // -26 [ saved v15            ] <--- sp_after_call
 135   // -25 [ saved v14            ]
 136   // -24 [ saved v13            ]
 137   // -23 [ saved v12            ]
 138   // -22 [ saved v11            ]
 139   // -21 [ saved v10            ]
 140   // -20 [ saved v9             ]
 141   // -19 [ saved v8             ]
 142   // -18 [ saved r28            ]
 143   // -17 [ saved r27            ]
 144   // -16 [ saved r26            ]
 145   // -15 [ saved r25            ]
 146   // -14 [ saved r24            ]
 147   // -13 [ saved r23            ]
 148   // -12 [ saved r22            ]
 149   // -11 [ saved r21            ]
 150   // -10 [ saved r20            ]
 151   //  -9 [ saved r19            ]
 152   //  -8 [ call wrapper    (r0) ]
 153   //  -7 [ result          (r1) ]
 154   //  -6 [ result type     (r2) ]
 155   //  -5 [ method          (r3) ]
 156   //  -4 [ entry point     (r4) ]
 157   //  -3 [ parameters      (r5) ]
 158   //  -2 [ parameter size  (r6) ]
 159   //  -1 [ thread (r7)          ]
 160   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 161   //   1 [ saved lr       (r30) ]
 162 
 163   // Call stub stack layout word offsets from fp
 164   enum call_stub_layout {
 165     sp_after_call_off = -26,
 166 
 167     d15_off            = -26,
 168     d13_off            = -24,
 169     d11_off            = -22,
 170     d9_off             = -20,
 171 
 172     r28_off            = -18,
 173     r26_off            = -16,
 174     r24_off            = -14,
 175     r22_off            = -12,
 176     r20_off            = -10,
 177     call_wrapper_off   =  -8,
 178     result_off         =  -7,
 179     result_type_off    =  -6,
 180     method_off         =  -5,
 181     entry_point_off    =  -4,
 182     parameter_size_off =  -2,
 183     thread_off         =  -1,
 184     fp_f               =   0,
 185     retaddr_off        =   1,
 186   };
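  // n.b. registers are saved in pairs with stp/stpd below, so only
  // every other save slot needs a named offset; e.g. r19 implicitly
  // lives at r20_off + 1 and d14 at d15_off + 1, as in the diagram.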
 187 
 188   address generate_call_stub(address& return_address) {
 189     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 190            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 191            "adjust this code");
 192 
 193     StubCodeMark mark(this, "StubRoutines", "call_stub");
 194     address start = __ pc();
 195 
 196     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 197 
 198     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 199     const Address result        (rfp, result_off         * wordSize);
 200     const Address result_type   (rfp, result_type_off    * wordSize);
 201     const Address method        (rfp, method_off         * wordSize);
 202     const Address entry_point   (rfp, entry_point_off    * wordSize);
 203     const Address parameter_size(rfp, parameter_size_off * wordSize);
 204 
 205     const Address thread        (rfp, thread_off         * wordSize);
 206 
 207     const Address d15_save      (rfp, d15_off * wordSize);
 208     const Address d13_save      (rfp, d13_off * wordSize);
 209     const Address d11_save      (rfp, d11_off * wordSize);
 210     const Address d9_save       (rfp, d9_off * wordSize);
 211 
 212     const Address r28_save      (rfp, r28_off * wordSize);
 213     const Address r26_save      (rfp, r26_off * wordSize);
 214     const Address r24_save      (rfp, r24_off * wordSize);
 215     const Address r22_save      (rfp, r22_off * wordSize);
 216     const Address r20_save      (rfp, r20_off * wordSize);
 217 
 218     // stub code
 219 
 220     // we need a C prolog to bootstrap the x86 caller into the sim
 221     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 222 
 223     address aarch64_entry = __ pc();
 224 
 225 #ifdef BUILTIN_SIM
 226     // Save sender's SP for stack traces.
 227     __ mov(rscratch1, sp);
 228     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 229 #endif
 230     // set up frame and move sp to end of save area
 231     __ enter();
 232     __ sub(sp, rfp, -sp_after_call_off * wordSize);
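    // sp now sits sp_after_call_off (26) words below fp, so the whole
    // register save area described above lies between sp and fp before
    // we start storing into it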
 233 
 234     // save register parameters and Java scratch/global registers
 235     // n.b. we save thread even though it gets installed in
 236     // rthread because we want to sanity check rthread later
 237     __ str(c_rarg7,  thread);
 238     __ strw(c_rarg6, parameter_size);
 239     __ stp(c_rarg4, c_rarg5,  entry_point);
 240     __ stp(c_rarg2, c_rarg3,  result_type);
 241     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 242 
 243     __ stp(r20, r19,   r20_save);
 244     __ stp(r22, r21,   r22_save);
 245     __ stp(r24, r23,   r24_save);
 246     __ stp(r26, r25,   r26_save);
 247     __ stp(r28, r27,   r28_save);
 248 
 249     __ stpd(v9,  v8,   d9_save);
 250     __ stpd(v11, v10,  d11_save);
 251     __ stpd(v13, v12,  d13_save);
 252     __ stpd(v15, v14,  d15_save);
 253 
 254     // install Java thread in global register now we have saved
 255     // whatever value it held
 256     __ mov(rthread, c_rarg7);
 257     // And method
 258     __ mov(rmethod, c_rarg3);
 259 
 260     // set up the heapbase register
 261     __ reinit_heapbase();
 262 
 263 #ifdef ASSERT
 264     // make sure we have no pending exceptions
 265     {
 266       Label L;
 267       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 268       __ cmp(rscratch1, (u1)NULL_WORD);
 269       __ br(Assembler::EQ, L);
 270       __ stop("StubRoutines::call_stub: entered with pending exception");
 271       __ BIND(L);
 272     }
 273 #endif
 274     // pass parameters if any
 275     __ mov(esp, sp);
 276     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 277     __ andr(sp, rscratch1, -2 * wordSize);
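    // sp must stay 16-byte aligned on AArch64, so after moving it down
    // past the parameter area we round it to a 2-word boundary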
 278 
 279     BLOCK_COMMENT("pass parameters if any");
 280     Label parameters_done;
 281     // parameter count is still in c_rarg6
 282     // and parameter pointer identifying param 1 is in c_rarg5
 283     __ cbzw(c_rarg6, parameters_done);
 284 
 285     address loop = __ pc();
 286     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 287     __ subsw(c_rarg6, c_rarg6, 1);
 288     __ push(rscratch1);
 289     __ br(Assembler::GT, loop);
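    // the loop above walks the parameter array at c_rarg5 (post-
    // incrementing it) and pushes each of the c_rarg6 parameter words
    // onto the stack for the Java call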
 290 
 291     __ BIND(parameters_done);
 292 
    // call Java entry -- passing Method* and current sp
 294     //      rmethod: Method*
 295     //      r13: sender sp
 296     BLOCK_COMMENT("call Java function");
 297     __ mov(r13, sp);
 298     __ blr(c_rarg4);
 299 
 300     // tell the simulator we have returned to the stub
 301 
 302     // we do this here because the notify will already have been done
 303     // if we get to the next instruction via an exception
 304     //
 305     // n.b. adding this instruction here affects the calculation of
 306     // whether or not a routine returns to the call stub (used when
 307     // doing stack walks) since the normal test is to check the return
 308     // pc against the address saved below. so we may need to allow for
 309     // this extra instruction in the check.
 310 
 311     if (NotifySimulator) {
 312       __ notify(Assembler::method_reentry);
 313     }
 314     // save current address for use by exception handling code
 315 
 316     return_address = __ pc();
 317 
 318     // store result depending on type (everything that is not
 319     // T_OBJECT, T_VALUETYPE, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 320     // n.b. this assumes Java returns an integral result in r0
 321     // and a floating result in j_farg0
 322     __ ldr(j_rarg2, result);
 323     Label is_long, is_float, is_double, exit;
 324     __ ldr(j_rarg1, result_type);
 325     __ cmp(j_rarg1, (u1)T_OBJECT);
 326     __ br(Assembler::EQ, is_long);
 327     __ cmp(j_rarg1, (u1)T_VALUETYPE);
 328     __ br(Assembler::EQ, is_long);
 329     __ cmp(j_rarg1, (u1)T_LONG);
 330     __ br(Assembler::EQ, is_long);
 331     __ cmp(j_rarg1, (u1)T_FLOAT);
 332     __ br(Assembler::EQ, is_float);
 333     __ cmp(j_rarg1, (u1)T_DOUBLE);
 334     __ br(Assembler::EQ, is_double);
 335 
 336     // handle T_INT case
 337     __ strw(r0, Address(j_rarg2));
 338 
 339     __ BIND(exit);
 340 
 341     // pop parameters
 342     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 343 
 344 #ifdef ASSERT
 345     // verify that threads correspond
 346     {
 347       Label L, S;
 348       __ ldr(rscratch1, thread);
 349       __ cmp(rthread, rscratch1);
 350       __ br(Assembler::NE, S);
 351       __ get_thread(rscratch1);
 352       __ cmp(rthread, rscratch1);
 353       __ br(Assembler::EQ, L);
 354       __ BIND(S);
 355       __ stop("StubRoutines::call_stub: threads must correspond");
 356       __ BIND(L);
 357     }
 358 #endif
 359 
 360     // restore callee-save registers
 361     __ ldpd(v15, v14,  d15_save);
 362     __ ldpd(v13, v12,  d13_save);
 363     __ ldpd(v11, v10,  d11_save);
 364     __ ldpd(v9,  v8,   d9_save);
 365 
 366     __ ldp(r28, r27,   r28_save);
 367     __ ldp(r26, r25,   r26_save);
 368     __ ldp(r24, r23,   r24_save);
 369     __ ldp(r22, r21,   r22_save);
 370     __ ldp(r20, r19,   r20_save);
 371 
 372     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 373     __ ldrw(c_rarg2, result_type);
 374     __ ldr(c_rarg3,  method);
 375     __ ldp(c_rarg4, c_rarg5,  entry_point);
 376     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 377 
 378 #ifndef PRODUCT
 379     // tell the simulator we are about to end Java execution
 380     if (NotifySimulator) {
 381       __ notify(Assembler::method_exit);
 382     }
 383 #endif
 384     // leave frame and return to caller
 385     __ leave();
 386     __ ret(lr);
 387 
 388     // handle return types different from T_INT
 389 
 390     __ BIND(is_long);
 391     __ str(r0, Address(j_rarg2, 0));
 392     __ br(Assembler::AL, exit);
 393 
 394     __ BIND(is_float);
 395     __ strs(j_farg0, Address(j_rarg2, 0));
 396     __ br(Assembler::AL, exit);
 397 
 398     __ BIND(is_double);
 399     __ strd(j_farg0, Address(j_rarg2, 0));
 400     __ br(Assembler::AL, exit);
 401 
 402     return start;
 403   }
 404 
 405   // Return point for a Java call if there's an exception thrown in
 406   // Java code.  The exception is caught and transformed into a
 407   // pending exception stored in JavaThread that can be tested from
 408   // within the VM.
 409   //
 410   // Note: Usually the parameters are removed by the callee. In case
 411   // of an exception crossing an activation frame boundary, that is
 412   // not the case if the callee is compiled code => need to setup the
 413   // rsp.
 414   //
 415   // r0: exception oop
 416 
 417   // NOTE: this is used as a target from the signal handler so it
 418   // needs an x86 prolog which returns into the current simulator
 419   // executing the generated catch_exception code. so the prolog
 420   // needs to install rax in a sim register and adjust the sim's
 421   // restart pc to enter the generated code at the start position
 422   // then return from native to simulated execution.
 423 
 424   address generate_catch_exception() {
 425     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 426     address start = __ pc();
 427 
 428     // same as in generate_call_stub():
 429     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 430     const Address thread        (rfp, thread_off         * wordSize);
 431 
 432 #ifdef ASSERT
 433     // verify that threads correspond
 434     {
 435       Label L, S;
 436       __ ldr(rscratch1, thread);
 437       __ cmp(rthread, rscratch1);
 438       __ br(Assembler::NE, S);
 439       __ get_thread(rscratch1);
 440       __ cmp(rthread, rscratch1);
 441       __ br(Assembler::EQ, L);
 442       __ bind(S);
 443       __ stop("StubRoutines::catch_exception: threads must correspond");
 444       __ bind(L);
 445     }
 446 #endif
 447 
 448     // set pending exception
 449     __ verify_oop(r0);
 450 
 451     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 452     __ mov(rscratch1, (address)__FILE__);
 453     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 454     __ movw(rscratch1, (int)__LINE__);
 455     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 456 
 457     // complete return to VM
 458     assert(StubRoutines::_call_stub_return_address != NULL,
 459            "_call_stub_return_address must have been generated before");
 460     __ b(StubRoutines::_call_stub_return_address);
 461 
 462     return start;
 463   }
 464 
 465   // Continuation point for runtime calls returning with a pending
 466   // exception.  The pending exception check happened in the runtime
 467   // or native call stub.  The pending exception in Thread is
 468   // converted into a Java-level exception.
 469   //
 470   // Contract with Java-level exception handlers:
 471   // r0: exception
 472   // r3: throwing pc
 473   //
 474   // NOTE: At entry of this stub, exception-pc must be in LR !!
 475 
 476   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog
 478 
 479   address generate_forward_exception() {
 480     StubCodeMark mark(this, "StubRoutines", "forward exception");
 481     address start = __ pc();
 482 
 483     // Upon entry, LR points to the return address returning into
 484     // Java (interpreted or compiled) code; i.e., the return address
 485     // becomes the throwing pc.
 486     //
 487     // Arguments pushed before the runtime call are still on the stack
 488     // but the exception handler will reset the stack pointer ->
 489     // ignore them.  A potential result in registers can be ignored as
 490     // well.
 491 
 492 #ifdef ASSERT
 493     // make sure this code is only executed if there is a pending exception
 494     {
 495       Label L;
 496       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 497       __ cbnz(rscratch1, L);
 498       __ stop("StubRoutines::forward exception: no pending exception (1)");
 499       __ bind(L);
 500     }
 501 #endif
 502 
 503     // compute exception handler into r19
 504 
 505     // call the VM to find the handler address associated with the
 506     // caller address. pass thread in r0 and caller pc (ret address)
 507     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 508     // the stack.
 509     __ mov(c_rarg1, lr);
 510     // lr will be trashed by the VM call so we move it to R19
 511     // (callee-saved) because we also need to pass it to the handler
 512     // returned by this call.
 513     __ mov(r19, lr);
 514     BLOCK_COMMENT("call exception_handler_for_return_address");
 515     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 516                          SharedRuntime::exception_handler_for_return_address),
 517                     rthread, c_rarg1);
 518     // we should not really care that lr is no longer the callee
 519     // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
 522     // the PC for the frame above the handler belongs to a compiled
 523     // Java method. So, we restore lr here to satisfy that assert.
 524     __ mov(lr, r19);
 525     // setup r0 & r3 & clear pending exception
 526     __ mov(r3, r19);
 527     __ mov(r19, r0);
 528     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 529     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 530 
 531 #ifdef ASSERT
 532     // make sure exception is set
 533     {
 534       Label L;
 535       __ cbnz(r0, L);
 536       __ stop("StubRoutines::forward exception: no pending exception (2)");
 537       __ bind(L);
 538     }
 539 #endif
 540 
 541     // continue at exception handler
 542     // r0: exception
 543     // r3: throwing pc
 544     // r19: exception handler
 545     __ verify_oop(r0);
 546     __ br(r19);
 547 
 548     return start;
 549   }
 550 
 551   // Non-destructive plausibility checks for oops
 552   //
 553   // Arguments:
 554   //    r0: oop to verify
 555   //    rscratch1: error message
 556   //
 557   // Stack after saving c_rarg3:
 558   //    [tos + 0]: saved c_rarg3
 559   //    [tos + 1]: saved c_rarg2
 560   //    [tos + 2]: saved lr
 561   //    [tos + 3]: saved rscratch2
 562   //    [tos + 4]: saved r0
 563   //    [tos + 5]: saved rscratch1
 564   address generate_verify_oop() {
 565 
 566     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 567     address start = __ pc();
 568 
 569     Label exit, error;
 570 
 571     // save c_rarg2 and c_rarg3
 572     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 573 
 574     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 575     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 576     __ ldr(c_rarg3, Address(c_rarg2));
 577     __ add(c_rarg3, c_rarg3, 1);
 578     __ str(c_rarg3, Address(c_rarg2));
 579 
 580     // object is in r0
 581     // make sure object is 'reasonable'
 582     __ cbz(r0, exit); // if obj is NULL it is OK
 583 
 584     // Check if the oop is in the right area of memory
 585     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 586     __ andr(c_rarg2, r0, c_rarg3);
 587     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 588 
 589     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 590     // instruction here because the flags register is live.
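    // instead the two values are xor-ed and the flag-preserving cbnz
    // tests the result for zero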
 591     __ eor(c_rarg2, c_rarg2, c_rarg3);
 592     __ cbnz(c_rarg2, error);
 593 
    // make sure klass is 'reasonable', i.e. not zero.
 595     __ load_klass(r0, r0);  // get klass
 596     __ cbz(r0, error);      // if klass is NULL it is broken
 597 
 598     // return if everything seems ok
 599     __ bind(exit);
 600 
 601     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 602     __ ret(lr);
 603 
 604     // handle errors
 605     __ bind(error);
 606     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 607 
 608     __ push(RegSet::range(r0, r29), sp);
 609     // debug(char* msg, int64_t pc, int64_t regs[])
 610     __ mov(c_rarg0, rscratch1);      // pass address of error message
 611     __ mov(c_rarg1, lr);             // pass return address
 612     __ mov(c_rarg2, sp);             // pass address of regs on stack
 613 #ifndef PRODUCT
 614     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 615 #endif
 616     BLOCK_COMMENT("call MacroAssembler::debug");
 617     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 618     __ blrt(rscratch1, 3, 0, 1);
 619 
 620     return start;
 621   }
 622 
 623   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 624 
 625   // The inner part of zero_words().  This is the bulk operation,
 626   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 627   // caller is responsible for zeroing the last few words.
 628   //
 629   // Inputs:
 630   // r10: the HeapWord-aligned base address of an array to zero.
 631   // r11: the count in HeapWords, r11 > 0.
 632   //
 633   // Returns r10 and r11, adjusted for the caller to clear.
 634   // r10: the base address of the tail of words left to clear.
 635   // r11: the number of words in the tail.
 636   //      r11 < MacroAssembler::zero_words_block_size.
 637 
 638   address generate_zero_blocks() {
 639     Label done;
 640     Label base_aligned;
 641 
 642     Register base = r10, cnt = r11;
 643 
 644     __ align(CodeEntryAlignment);
 645     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 646     address start = __ pc();
 647 
 648     if (UseBlockZeroing) {
 649       int zva_length = VM_Version::zva_length();
 650 
 651       // Ensure ZVA length can be divided by 16. This is required by
 652       // the subsequent operations.
 653       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 654 
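      // align base to 16 bytes: if bit 3 is set the address is only
      // 8-byte aligned, so zero one word by hand and drop it from the
      // count before letting zero_dcache_blocks use DC ZVA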
 655       __ tbz(base, 3, base_aligned);
 656       __ str(zr, Address(__ post(base, 8)));
 657       __ sub(cnt, cnt, 1);
 658       __ bind(base_aligned);
 659 
 660       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 661       // alignment.
 662       Label small;
 663       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
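      // low_limit is in bytes but cnt is in words, hence the >> 3
      // (log2 of the 8-byte word size) in the comparison below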
 664       __ subs(rscratch1, cnt, low_limit >> 3);
 665       __ br(Assembler::LT, small);
 666       __ zero_dcache_blocks(base, cnt);
 667       __ bind(small);
 668     }
 669 
 670     {
 671       // Number of stp instructions we'll unroll
 672       const int unroll =
 673         MacroAssembler::zero_words_block_size / 2;
 674       // Clear the remaining blocks.
 675       Label loop;
 676       __ subs(cnt, cnt, unroll * 2);
 677       __ br(Assembler::LT, done);
 678       __ bind(loop);
 679       for (int i = 0; i < unroll; i++)
 680         __ stp(zr, zr, __ post(base, 16));
 681       __ subs(cnt, cnt, unroll * 2);
 682       __ br(Assembler::GE, loop);
 683       __ bind(done);
 684       __ add(cnt, cnt, unroll * 2);
 685     }
 686 
 687     __ ret(lr);
 688 
 689     return start;
 690   }
 691 
 692 
 693   typedef enum {
 694     copy_forwards = 1,
 695     copy_backwards = -1
 696   } copy_direction;
 697 
 698   // Bulk copy of blocks of 8 words.
 699   //
 700   // count is a count of words.
 701   //
 702   // Precondition: count >= 8
 703   //
 704   // Postconditions:
 705   //
 706   // The least significant bit of count contains the remaining count
 707   // of words to copy.  The rest of count is trash.
 708   //
 709   // s and d are adjusted to point to the remaining words to copy
 710   //
 711   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 712                            copy_direction direction) {
 713     int unit = wordSize * direction;
 714     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
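    // bias is the amount by which s and d are pre-decremented on the
    // forwards path so that the positive offsets used below address the
    // current 64 byte block and the final pre-indexed access leaves the
    // pointer set up for the next iteration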
 715 
 716     int offset;
 717     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 718       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 719     const Register stride = r13;
 720 
 721     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 722     assert_different_registers(s, d, count, rscratch1);
 723 
 724     Label again, drain;
 725     const char *stub_name;
 726     if (direction == copy_forwards)
 727       stub_name = "forward_copy_longs";
 728     else
 729       stub_name = "backward_copy_longs";
 730 
 731     __ align(CodeEntryAlignment);
 732 
 733     StubCodeMark mark(this, "StubRoutines", stub_name);
 734 
 735     __ bind(start);
 736 
 737     Label unaligned_copy_long;
 738     if (AvoidUnalignedAccesses) {
 739       __ tbnz(d, 3, unaligned_copy_long);
 740     }
 741 
 742     if (direction == copy_forwards) {
 743       __ sub(s, s, bias);
 744       __ sub(d, d, bias);
 745     }
 746 
 747 #ifdef ASSERT
 748     // Make sure we are never given < 8 words
 749     {
 750       Label L;
 751       __ cmp(count, (u1)8);
 752       __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
 754       __ bind(L);
 755     }
 756 #endif
 757 
 758     // Fill 8 registers
 759     if (UseSIMDForMemoryOps) {
 760       __ ldpq(v0, v1, Address(s, 4 * unit));
 761       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 762     } else {
 763       __ ldp(t0, t1, Address(s, 2 * unit));
 764       __ ldp(t2, t3, Address(s, 4 * unit));
 765       __ ldp(t4, t5, Address(s, 6 * unit));
 766       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 767     }
 768 
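    // count is reduced by 16 up front: 8 for the words just loaded and
    // 8 for the look-ahead load each loop iteration performs; with
    // fewer than 16 words in total we skip the loop and just drain the
    // registers already loaded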
 769     __ subs(count, count, 16);
 770     __ br(Assembler::LO, drain);
 771 
 772     int prefetch = PrefetchCopyIntervalInBytes;
 773     bool use_stride = false;
 774     if (direction == copy_backwards) {
 775        use_stride = prefetch > 256;
 776        prefetch = -prefetch;
 777        if (use_stride) __ mov(stride, prefetch);
 778     }
 779 
 780     __ bind(again);
 781 
 782     if (PrefetchCopyIntervalInBytes > 0)
 783       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 784 
 785     if (UseSIMDForMemoryOps) {
 786       __ stpq(v0, v1, Address(d, 4 * unit));
 787       __ ldpq(v0, v1, Address(s, 4 * unit));
 788       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 789       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 790     } else {
 791       __ stp(t0, t1, Address(d, 2 * unit));
 792       __ ldp(t0, t1, Address(s, 2 * unit));
 793       __ stp(t2, t3, Address(d, 4 * unit));
 794       __ ldp(t2, t3, Address(s, 4 * unit));
 795       __ stp(t4, t5, Address(d, 6 * unit));
 796       __ ldp(t4, t5, Address(s, 6 * unit));
 797       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 798       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 799     }
 800 
 801     __ subs(count, count, 8);
 802     __ br(Assembler::HS, again);
 803 
 804     // Drain
 805     __ bind(drain);
 806     if (UseSIMDForMemoryOps) {
 807       __ stpq(v0, v1, Address(d, 4 * unit));
 808       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 809     } else {
 810       __ stp(t0, t1, Address(d, 2 * unit));
 811       __ stp(t2, t3, Address(d, 4 * unit));
 812       __ stp(t4, t5, Address(d, 6 * unit));
 813       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 814     }
 815 
 816     {
 817       Label L1, L2;
 818       __ tbz(count, exact_log2(4), L1);
 819       if (UseSIMDForMemoryOps) {
 820         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 821         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 822       } else {
 823         __ ldp(t0, t1, Address(s, 2 * unit));
 824         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 825         __ stp(t0, t1, Address(d, 2 * unit));
 826         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 827       }
 828       __ bind(L1);
 829 
 830       if (direction == copy_forwards) {
 831         __ add(s, s, bias);
 832         __ add(d, d, bias);
 833       }
 834 
 835       __ tbz(count, 1, L2);
 836       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 837       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 838       __ bind(L2);
 839     }
 840 
 841     __ ret(lr);
 842 
 843     if (AvoidUnalignedAccesses) {
 844       Label drain, again;
 845       // Register order for storing. Order is different for backward copy.
 846 
 847       __ bind(unaligned_copy_long);
 848 
 849       // source address is even aligned, target odd aligned
 850       //
 851       // when forward copying word pairs we read long pairs at offsets
 852       // {0, 2, 4, 6} (in long words). when backwards copying we read
 853       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 854       // address by -2 in the forwards case so we can compute the
 855       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 856       // or -1.
 857       //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1, which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // when backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8}, i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
 866 
 867       if (direction == copy_forwards) {
 868         __ sub(s, s, 16);
 869         __ sub(d, d, 8);
 870       }
 871 
 872       // Fill 8 registers
 873       //
 874       // for forwards copy s was offset by -16 from the original input
 875       // value of s so the register contents are at these offsets
      // relative to the 64 byte block addressed by that original input
 877       // and so on for each successive 64 byte block when s is updated
 878       //
 879       // t0 at offset 0,  t1 at offset 8
 880       // t2 at offset 16, t3 at offset 24
 881       // t4 at offset 32, t5 at offset 40
 882       // t6 at offset 48, t7 at offset 56
 883 
 884       // for backwards copy s was not offset so the register contents
 885       // are at these offsets into the preceding 64 byte block
 886       // relative to that original input and so on for each successive
 887       // preceding 64 byte block when s is updated. this explains the
 888       // slightly counter-intuitive looking pattern of register usage
 889       // in the stp instructions for backwards copy.
 890       //
 891       // t0 at offset -16, t1 at offset -8
 892       // t2 at offset -32, t3 at offset -24
 893       // t4 at offset -48, t5 at offset -40
 894       // t6 at offset -64, t7 at offset -56
 895 
 896       __ ldp(t0, t1, Address(s, 2 * unit));
 897       __ ldp(t2, t3, Address(s, 4 * unit));
 898       __ ldp(t4, t5, Address(s, 6 * unit));
 899       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 900 
 901       __ subs(count, count, 16);
 902       __ br(Assembler::LO, drain);
 903 
 904       int prefetch = PrefetchCopyIntervalInBytes;
 905       bool use_stride = false;
 906       if (direction == copy_backwards) {
 907          use_stride = prefetch > 256;
 908          prefetch = -prefetch;
 909          if (use_stride) __ mov(stride, prefetch);
 910       }
 911 
 912       __ bind(again);
 913 
 914       if (PrefetchCopyIntervalInBytes > 0)
 915         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 916 
 917       if (direction == copy_forwards) {
 918        // allowing for the offset of -8 the store instructions place
       // registers into the target 64 byte block at the following
 920        // offsets
 921        //
 922        // t0 at offset 0
 923        // t1 at offset 8,  t2 at offset 16
 924        // t3 at offset 24, t4 at offset 32
 925        // t5 at offset 40, t6 at offset 48
 926        // t7 at offset 56
 927 
 928         __ str(t0, Address(d, 1 * unit));
 929         __ stp(t1, t2, Address(d, 2 * unit));
 930         __ ldp(t0, t1, Address(s, 2 * unit));
 931         __ stp(t3, t4, Address(d, 4 * unit));
 932         __ ldp(t2, t3, Address(s, 4 * unit));
 933         __ stp(t5, t6, Address(d, 6 * unit));
 934         __ ldp(t4, t5, Address(s, 6 * unit));
 935         __ str(t7, Address(__ pre(d, 8 * unit)));
 936         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 937       } else {
       // d was not offset when we started so the registers are
       // written into the 64 byte block preceding d with the following
       // offsets
       //
       // t1 at offset -8
       // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
       // t7 at offset -56, t4 at offset -48
       //                   t6 at offset -64
 947        //
 948        // note that this matches the offsets previously noted for the
 949        // loads
 950 
 951         __ str(t1, Address(d, 1 * unit));
 952         __ stp(t3, t0, Address(d, 3 * unit));
 953         __ ldp(t0, t1, Address(s, 2 * unit));
 954         __ stp(t5, t2, Address(d, 5 * unit));
 955         __ ldp(t2, t3, Address(s, 4 * unit));
 956         __ stp(t7, t4, Address(d, 7 * unit));
 957         __ ldp(t4, t5, Address(s, 6 * unit));
 958         __ str(t6, Address(__ pre(d, 8 * unit)));
 959         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 960       }
 961 
 962       __ subs(count, count, 8);
 963       __ br(Assembler::HS, again);
 964 
 965       // Drain
 966       //
 967       // this uses the same pattern of offsets and register arguments
 968       // as above
 969       __ bind(drain);
 970       if (direction == copy_forwards) {
 971         __ str(t0, Address(d, 1 * unit));
 972         __ stp(t1, t2, Address(d, 2 * unit));
 973         __ stp(t3, t4, Address(d, 4 * unit));
 974         __ stp(t5, t6, Address(d, 6 * unit));
 975         __ str(t7, Address(__ pre(d, 8 * unit)));
 976       } else {
 977         __ str(t1, Address(d, 1 * unit));
 978         __ stp(t3, t0, Address(d, 3 * unit));
 979         __ stp(t5, t2, Address(d, 5 * unit));
 980         __ stp(t7, t4, Address(d, 7 * unit));
 981         __ str(t6, Address(__ pre(d, 8 * unit)));
 982       }
      // now we need to copy any remaining part block, which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count tell us whether we
      // have each such subblock
 987       {
 988         Label L1, L2;
 989         __ tbz(count, exact_log2(4), L1);
 990        // this is the same as above but copying only 4 longs hence
       // with only one intervening stp between the str instructions
 992        // but note that the offsets and registers still follow the
 993        // same pattern
 994         __ ldp(t0, t1, Address(s, 2 * unit));
 995         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 996         if (direction == copy_forwards) {
 997           __ str(t0, Address(d, 1 * unit));
 998           __ stp(t1, t2, Address(d, 2 * unit));
 999           __ str(t3, Address(__ pre(d, 4 * unit)));
1000         } else {
1001           __ str(t1, Address(d, 1 * unit));
1002           __ stp(t3, t0, Address(d, 3 * unit));
1003           __ str(t2, Address(__ pre(d, 4 * unit)));
1004         }
1005         __ bind(L1);
1006 
1007         __ tbz(count, 1, L2);
1008        // this is the same as above but copying only 2 longs hence
1009        // there is no intervening stp between the str instructions
1010        // but note that the offset and register patterns are still
1011        // the same
1012         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1013         if (direction == copy_forwards) {
1014           __ str(t0, Address(d, 1 * unit));
1015           __ str(t1, Address(__ pre(d, 2 * unit)));
1016         } else {
1017           __ str(t1, Address(d, 1 * unit));
1018           __ str(t0, Address(__ pre(d, 2 * unit)));
1019         }
1020         __ bind(L2);
1021 
1022        // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written
1024 
1025        if (direction == copy_forwards) {
1026          __ add(s, s, 16);
1027          __ add(d, d, 8);
1028        }
1029 
1030       }
1031 
1032       __ ret(lr);
1033       }
1034   }
1035 
1036   // Small copy: less than 16 bytes.
1037   //
1038   // NB: Ignores all of the bits of count which represent more than 15
1039   // bytes, so a caller doesn't have to mask them.
1040 
1041   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1042     bool is_backwards = step < 0;
1043     size_t granularity = uabs(step);
1044     int direction = is_backwards ? -1 : 1;
1045     int unit = wordSize * direction;
1046 
1047     Label Lword, Lint, Lshort, Lbyte;
1048 
1049     assert(granularity
1050            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1051 
1052     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1053 
1054     // ??? I don't know if this bit-test-and-branch is the right thing
1055     // to do.  It does a lot of jumping, resulting in several
1056     // mispredicted branches.  It might make more sense to do this
1057     // with something like Duff's device with a single computed branch.
1058 
1059     __ tbz(count, 3 - exact_log2(granularity), Lword);
1060     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1061     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1062     __ bind(Lword);
1063 
1064     if (granularity <= sizeof (jint)) {
1065       __ tbz(count, 2 - exact_log2(granularity), Lint);
1066       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1067       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1068       __ bind(Lint);
1069     }
1070 
1071     if (granularity <= sizeof (jshort)) {
1072       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1073       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1074       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1075       __ bind(Lshort);
1076     }
1077 
1078     if (granularity <= sizeof (jbyte)) {
1079       __ tbz(count, 0, Lbyte);
1080       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1081       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1082       __ bind(Lbyte);
1083     }
1084   }
1085 
1086   Label copy_f, copy_b;
1087 
1088   // All-singing all-dancing memory copy.
1089   //
1090   // Copy count units of memory from s to d.  The size of a unit is
1091   // step, which can be positive or negative depending on the direction
1092   // of copy.  If is_aligned is false, we align the source address.
1093   //
1094 
1095   void copy_memory(bool is_aligned, Register s, Register d,
1096                    Register count, Register tmp, int step) {
1097     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1098     bool is_backwards = step < 0;
1099     int granularity = uabs(step);
1100     const Register t0 = r3, t1 = r4;
1101 
    // <= 96 bytes are copied inline. Direction doesn't matter because we always
1103     // load all the data before writing anything
1104     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1105     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1106     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1107     const Register send = r17, dend = r18;
1108 
1109     if (PrefetchCopyIntervalInBytes > 0)
1110       __ prfm(Address(s, 0), PLDL1KEEP);
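    // the inline-copy cutoff is 80 bytes (96 when SIMD is used); count
    // is in elements, so the threshold is scaled down by granularity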
1111     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1112     __ br(Assembler::HI, copy_big);
1113 
1114     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1115     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
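    // send and dend point one element past the end of the source and
    // destination; the cases below address the tail with negative
    // offsets from these so the (possibly overlapping) loads and stores
    // cover the range exactly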
1116 
1117     __ cmp(count, u1(16/granularity));
1118     __ br(Assembler::LS, copy16);
1119 
1120     __ cmp(count, u1(64/granularity));
1121     __ br(Assembler::HI, copy80);
1122 
1123     __ cmp(count, u1(32/granularity));
1124     __ br(Assembler::LS, copy32);
1125 
1126     // 33..64 bytes
1127     if (UseSIMDForMemoryOps) {
1128       __ ldpq(v0, v1, Address(s, 0));
1129       __ ldpq(v2, v3, Address(send, -32));
1130       __ stpq(v0, v1, Address(d, 0));
1131       __ stpq(v2, v3, Address(dend, -32));
1132     } else {
1133       __ ldp(t0, t1, Address(s, 0));
1134       __ ldp(t2, t3, Address(s, 16));
1135       __ ldp(t4, t5, Address(send, -32));
1136       __ ldp(t6, t7, Address(send, -16));
1137 
1138       __ stp(t0, t1, Address(d, 0));
1139       __ stp(t2, t3, Address(d, 16));
1140       __ stp(t4, t5, Address(dend, -32));
1141       __ stp(t6, t7, Address(dend, -16));
1142     }
1143     __ b(finish);
1144 
1145     // 17..32 bytes
1146     __ bind(copy32);
1147     __ ldp(t0, t1, Address(s, 0));
1148     __ ldp(t2, t3, Address(send, -16));
1149     __ stp(t0, t1, Address(d, 0));
1150     __ stp(t2, t3, Address(dend, -16));
1151     __ b(finish);
1152 
1153     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
1155     __ bind(copy80);
1156     if (UseSIMDForMemoryOps) {
1157       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1158       __ ldpq(v4, v5, Address(send, -32));
1159       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1160       __ stpq(v4, v5, Address(dend, -32));
1161     } else {
1162       __ ldp(t0, t1, Address(s, 0));
1163       __ ldp(t2, t3, Address(s, 16));
1164       __ ldp(t4, t5, Address(s, 32));
1165       __ ldp(t6, t7, Address(s, 48));
1166       __ ldp(t8, t9, Address(send, -16));
1167 
1168       __ stp(t0, t1, Address(d, 0));
1169       __ stp(t2, t3, Address(d, 16));
1170       __ stp(t4, t5, Address(d, 32));
1171       __ stp(t6, t7, Address(d, 48));
1172       __ stp(t8, t9, Address(dend, -16));
1173     }
1174     __ b(finish);
1175 
1176     // 0..16 bytes
1177     __ bind(copy16);
1178     __ cmp(count, u1(8/granularity));
1179     __ br(Assembler::LO, copy8);
1180 
1181     // 8..16 bytes
1182     __ ldr(t0, Address(s, 0));
1183     __ ldr(t1, Address(send, -8));
1184     __ str(t0, Address(d, 0));
1185     __ str(t1, Address(dend, -8));
1186     __ b(finish);
1187 
1188     if (granularity < 8) {
1189       // 4..7 bytes
1190       __ bind(copy8);
1191       __ tbz(count, 2 - exact_log2(granularity), copy4);
1192       __ ldrw(t0, Address(s, 0));
1193       __ ldrw(t1, Address(send, -4));
1194       __ strw(t0, Address(d, 0));
1195       __ strw(t1, Address(dend, -4));
1196       __ b(finish);
1197       if (granularity < 4) {
1198         // 0..3 bytes
1199         __ bind(copy4);
1200         __ cbz(count, finish); // get rid of 0 case
1201         if (granularity == 2) {
1202           __ ldrh(t0, Address(s, 0));
1203           __ strh(t0, Address(d, 0));
1204         } else { // granularity == 1
1205           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1206           // the first and last byte.
1207           // Handle the 3 byte case by loading and storing base + count/2
1208           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean that in the 1 byte case we load/store the same
1210           // byte 3 times.
1211           __ lsr(count, count, 1);
1212           __ ldrb(t0, Address(s, 0));
1213           __ ldrb(t1, Address(send, -1));
1214           __ ldrb(t2, Address(s, count));
1215           __ strb(t0, Address(d, 0));
1216           __ strb(t1, Address(dend, -1));
1217           __ strb(t2, Address(d, count));
1218         }
1219         __ b(finish);
1220       }
1221     }
1222 
1223     __ bind(copy_big);
1224     if (is_backwards) {
1225       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1226       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1227     }
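    // for a backwards copy s and d now point one element past the end
    // of their arrays, matching the pre-decrementing accesses used on
    // the backwards path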
1228 
    // Now that we've got the small case out of the way, we can align the
1230     // source address on a 2-word boundary.
1231 
1232     Label aligned;
1233 
1234     if (is_aligned) {
1235       // We may have to adjust by 1 word to get s 2-word-aligned.
1236       __ tbz(s, exact_log2(wordSize), aligned);
1237       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1238       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1239       __ sub(count, count, wordSize/granularity);
1240     } else {
1241       if (is_backwards) {
1242         __ andr(rscratch2, s, 2 * wordSize - 1);
1243       } else {
1244         __ neg(rscratch2, s);
1245         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1246       }
1247       // rscratch2 is the byte adjustment needed to align s.
1248       __ cbz(rscratch2, aligned);
1249       int shift = exact_log2(granularity);
1250       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1251       __ sub(count, count, rscratch2);
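      // rscratch2 started out as a byte adjustment; after the shift (if
      // any) it is an element count, which we subtract from count and
      // copy separately with copy_memory_small below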
1252 
1253 #if 0
1254       // ?? This code is only correct for a disjoint copy.  It may or
1255       // may not make sense to use it in that case.
1256 
1257       // Copy the first pair; s and d may not be aligned.
1258       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1259       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1260 
1261       // Align s and d, adjust count
1262       if (is_backwards) {
1263         __ sub(s, s, rscratch2);
1264         __ sub(d, d, rscratch2);
1265       } else {
1266         __ add(s, s, rscratch2);
1267         __ add(d, d, rscratch2);
1268       }
1269 #else
1270       copy_memory_small(s, d, rscratch2, rscratch1, step);
1271 #endif
1272     }
1273 
1274     __ bind(aligned);
1275 
1276     // s is now 2-word-aligned.
1277 
1278     // We have a count of units and some trailing bytes.  Adjust the
1279     // count and do a bulk copy of words.
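    // n.b. the bulk copy routines bound to copy_f and copy_b expect
    // their word count in rscratch2 (that is how they are instantiated
    // elsewhere in this file); count itself keeps the element count,
    // whose low bits describe the tail copied afterwards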
1280     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1281     if (direction == copy_forwards)
1282       __ bl(copy_f);
1283     else
1284       __ bl(copy_b);
1285 
1286     // And the tail.
1287     copy_memory_small(s, d, count, tmp, step);
1288 
1289     if (granularity >= 8) __ bind(copy8);
1290     if (granularity >= 4) __ bind(copy4);
1291     __ bind(finish);
1292   }
1293 
1294 
1295   void clobber_registers() {
1296 #ifdef ASSERT
1297     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1298     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
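    // rscratch1 now holds the 64-bit pattern 0xdeadbeefdeadbeef, which
    // the loop below copies into r3..r18 to help catch stale uses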
1299     for (Register r = r3; r <= r18; r++)
1300       if (r != rscratch1) __ mov(r, rscratch1);
1301 #endif
1302   }
1303 
1304   // Scan over array at a for count oops, verifying each one.
1305   // Preserves a and count, clobbers rscratch1 and rscratch2.
1306   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1307     Label loop, end;
1308     __ mov(rscratch1, a);
1309     __ mov(rscratch2, zr);
1310     __ bind(loop);
1311     __ cmp(rscratch2, count);
1312     __ br(Assembler::HS, end);
1313     if (size == (size_t)wordSize) {
1314       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1315       __ verify_oop(temp);
1316     } else {
1317       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1318       __ decode_heap_oop(temp); // calls verify_oop
1319     }
1320     __ add(rscratch2, rscratch2, size);
1321     __ b(loop);
1322     __ bind(end);
1323   }
1324 
1325   // Arguments:
1326   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1327   //             ignored
1328   //   is_oop  - true => oop array, so generate store check code
1329   //   name    - stub name string
1330   //
1331   // Inputs:
1332   //   c_rarg0   - source array address
1333   //   c_rarg1   - destination array address
1334   //   c_rarg2   - element count, treated as ssize_t, can be zero
1335   //
1336   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1337   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1339   //
1340   // Side Effects:
1341   //   disjoint_int_copy_entry is set to the no-overlap entry point
1342   //   used by generate_conjoint_int_oop_copy().
1343   //
1344   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1345                                   const char *name, bool dest_uninitialized = false) {
1346     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1347     RegSet saved_reg = RegSet::of(s, d, count);
1348     __ align(CodeEntryAlignment);
1349     StubCodeMark mark(this, "StubRoutines", name);
1350     address start = __ pc();
1351     __ enter();
1352 
1353     if (entry != NULL) {
1354       *entry = __ pc();
1355       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1356       BLOCK_COMMENT("Entry:");
1357     }
1358 
1359     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1360     if (dest_uninitialized) {
1361       decorators |= IS_DEST_UNINITIALIZED;
1362     }
1363     if (aligned) {
1364       decorators |= ARRAYCOPY_ALIGNED;
1365     }
1366 
1367     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1368     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg);
1369 
1370     if (is_oop) {
1371       // save regs before copy_memory
1372       __ push(RegSet::of(d, count), sp);
1373     }
1374     copy_memory(aligned, s, d, count, rscratch1, size);
1375 
1376     if (is_oop) {
1377       __ pop(RegSet::of(d, count), sp);
1378       if (VerifyOops)
1379         verify_oop_array(size, d, count, r16);
1380       __ sub(count, count, 1); // make an inclusive end pointer
1381       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1382     }
1383 
1384     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1385 
1386     __ leave();
1387     __ mov(r0, zr); // return 0
1388     __ ret(lr);
1389 #ifdef BUILTIN_SIM
1390     {
1391       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1392       sim->notifyCompile(const_cast<char*>(name), start);
1393     }
1394 #endif
1395     return start;
1396   }
1397 
1398   // Arguments:
1399   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1400   //             ignored
1401   //   is_oop  - true => oop array, so generate store check code
1402   //   name    - stub name string
1403   //
1404   // Inputs:
1405   //   c_rarg0   - source array address
1406   //   c_rarg1   - destination array address
1407   //   c_rarg2   - element count, treated as ssize_t, can be zero
1408   //
1409   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1410   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1412   //
1413   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1414                                  address *entry, const char *name,
1415                                  bool dest_uninitialized = false) {
1416     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1417     RegSet saved_regs = RegSet::of(s, d, count);
1418     StubCodeMark mark(this, "StubRoutines", name);
1419     address start = __ pc();
1420     __ enter();
1421 
1422     if (entry != NULL) {
1423       *entry = __ pc();
1424       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1425       BLOCK_COMMENT("Entry:");
1426     }
1427 
1428     // use fwd copy when (d-s) above_equal (count*size)
1429     __ sub(rscratch1, d, s);
1430     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1431     __ br(Assembler::HS, nooverlap_target);
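    // the unsigned comparison also covers d below s: the subtraction
    // wraps to a large unsigned value, so we take the forward copy
    // whenever the destination starts before the source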
1432 
1433     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1434     if (dest_uninitialized) {
1435       decorators |= IS_DEST_UNINITIALIZED;
1436     }
1437     if (aligned) {
1438       decorators |= ARRAYCOPY_ALIGNED;
1439     }
1440 
1441     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1442     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs);
1443 
1444     if (is_oop) {
1445       // save regs before copy_memory
1446       __ push(RegSet::of(d, count), sp);
1447     }
1448     copy_memory(aligned, s, d, count, rscratch1, -size);
1449     if (is_oop) {
1450       __ pop(RegSet::of(d, count), sp);
1451       if (VerifyOops)
1452         verify_oop_array(size, d, count, r16);
1453       __ sub(count, count, 1); // make an inclusive end pointer
1454       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1455     }
1456     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1457     __ leave();
1458     __ mov(r0, zr); // return 0
1459     __ ret(lr);
1460 #ifdef BUILTIN_SIM
1461     {
1462       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1463       sim->notifyCompile(const_cast<char*>(name), start);
1464     }
1465 #endif
1466     return start;
1467 }
1468 
1469   // Arguments:
1470   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1471   //             ignored
1472   //   name    - stub name string
1473   //
1474   // Inputs:
1475   //   c_rarg0   - source array address
1476   //   c_rarg1   - destination array address
1477   //   c_rarg2   - element count, treated as ssize_t, can be zero
1478   //
1479   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1480   // we let the hardware handle it.  The one to eight bytes within words,
1481   // dwords or qwords that span cache line boundaries will still be loaded
1482   // and stored atomically.
1483   //
1484   // Side Effects:
1492   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1493   //   used by generate_conjoint_byte_copy().
1494   //
1495   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1496     const bool not_oop = false;
1497     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1498   }
1499 
1500   // Arguments:
1501   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1502   //             ignored
1503   //   name    - stub name string
1504   //
1505   // Inputs:
1506   //   c_rarg0   - source array address
1507   //   c_rarg1   - destination array address
1508   //   c_rarg2   - element count, treated as ssize_t, can be zero
1509   //
1510   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1511   // we let the hardware handle it.  The one to eight bytes within words,
1512   // dwords or qwords that span cache line boundaries will still be loaded
1513   // and stored atomically.
1514   //
1515   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1516                                       address* entry, const char *name) {
1517     const bool not_oop = false;
1518     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1519   }
1520 
1521   // Arguments:
1522   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1523   //             ignored
1524   //   name    - stub name string
1525   //
1526   // Inputs:
1527   //   c_rarg0   - source array address
1528   //   c_rarg1   - destination array address
1529   //   c_rarg2   - element count, treated as ssize_t, can be zero
1530   //
1531   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1532   // let the hardware handle it.  The two or four words within dwords
1533   // or qwords that span cache line boundaries will still be loaded
1534   // and stored atomically.
1535   //
1536   // Side Effects:
1537   //   disjoint_short_copy_entry is set to the no-overlap entry point
1538   //   used by generate_conjoint_short_copy().
1539   //
1540   address generate_disjoint_short_copy(bool aligned,
1541                                        address* entry, const char *name) {
1542     const bool not_oop = false;
1543     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1544   }
1545 
1546   // Arguments:
1547   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1548   //             ignored
1549   //   name    - stub name string
1550   //
1551   // Inputs:
1552   //   c_rarg0   - source array address
1553   //   c_rarg1   - destination array address
1554   //   c_rarg2   - element count, treated as ssize_t, can be zero
1555   //
1556   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1557   // let the hardware handle it.  The two or four words within dwords
1558   // or qwords that span cache line boundaries will still be loaded
1559   // and stored atomically.
1560   //
1561   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1562                                        address *entry, const char *name) {
1563     const bool not_oop = false;
1564     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1565   }
1566 
1567   // Arguments:
1568   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1569   //             ignored
1570   //   name    - stub name string
1571   //
1572   // Inputs:
1573   //   c_rarg0   - source array address
1574   //   c_rarg1   - destination array address
1575   //   c_rarg2   - element count, treated as ssize_t, can be zero
1576   //
1577   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1578   // the hardware handle it.  The two dwords within qwords that span
1579   // cache line boundaries will still be loaded and stored atomically.
1580   //
1581   // Side Effects:
1582   //   disjoint_int_copy_entry is set to the no-overlap entry point
1583   //   used by generate_conjoint_int_oop_copy().
1584   //
1585   address generate_disjoint_int_copy(bool aligned, address *entry,
1586                                          const char *name, bool dest_uninitialized = false) {
1587     const bool not_oop = false;
1588     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1589   }
1590 
1591   // Arguments:
1592   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1593   //             ignored
1594   //   name    - stub name string
1595   //
1596   // Inputs:
1597   //   c_rarg0   - source array address
1598   //   c_rarg1   - destination array address
1599   //   c_rarg2   - element count, treated as ssize_t, can be zero
1600   //
1601   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1602   // the hardware handle it.  The two dwords within qwords that span
1603   // cache line boundaries will still be loaded and stored atomically.
1604   //
1605   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1606                                      address *entry, const char *name,
1607                                      bool dest_uninitialized = false) {
1608     const bool not_oop = false;
1609     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1610   }
1611 
1612 
1613   // Arguments:
1614   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1615   //             ignored
1616   //   name    - stub name string
1617   //
1618   // Inputs:
1619   //   c_rarg0   - source array address
1620   //   c_rarg1   - destination array address
1621   //   c_rarg2   - element count, treated as size_t, can be zero
1622   //
1623   // Side Effects:
1624   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1625   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1626   //
1627   address generate_disjoint_long_copy(bool aligned, address *entry,
1628                                           const char *name, bool dest_uninitialized = false) {
1629     const bool not_oop = false;
1630     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1631   }
1632 
1633   // Arguments:
1634   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1635   //             ignored
1636   //   name    - stub name string
1637   //
1638   // Inputs:
1639   //   c_rarg0   - source array address
1640   //   c_rarg1   - destination array address
1641   //   c_rarg2   - element count, treated as size_t, can be zero
1642   //
1643   address generate_conjoint_long_copy(bool aligned,
1644                                       address nooverlap_target, address *entry,
1645                                       const char *name, bool dest_uninitialized = false) {
1646     const bool not_oop = false;
1647     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1648   }
1649 
1650   // Arguments:
1651   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1652   //             ignored
1653   //   name    - stub name string
1654   //
1655   // Inputs:
1656   //   c_rarg0   - source array address
1657   //   c_rarg1   - destination array address
1658   //   c_rarg2   - element count, treated as size_t, can be zero
1659   //
1660   // Side Effects:
1661   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1662   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1663   //
1664   address generate_disjoint_oop_copy(bool aligned, address *entry,
1665                                      const char *name, bool dest_uninitialized) {
1666     const bool is_oop = true;
1667     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1668     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1669   }
1670 
1671   // Arguments:
1672   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1673   //             ignored
1674   //   name    - stub name string
1675   //
1676   // Inputs:
1677   //   c_rarg0   - source array address
1678   //   c_rarg1   - destination array address
1679   //   c_rarg2   - element count, treated as size_t, can be zero
1680   //
1681   address generate_conjoint_oop_copy(bool aligned,
1682                                      address nooverlap_target, address *entry,
1683                                      const char *name, bool dest_uninitialized) {
1684     const bool is_oop = true;
1685     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1686     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1687                                   name, dest_uninitialized);
1688   }
1689 
1690 
1691   // Helper for generating a dynamic type check.
1692   // Smashes rscratch1, rscratch2.
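       // Roughly (a sketch; the exact protocol lives in
       // MacroAssembler::check_klass_subtype_*):
       //
       //   if (*(Klass**)((address)sub_klass + super_check_offset) == super_klass)
       //     goto L_success;                    // primary/cached supertype hit
       //   // otherwise scan sub_klass->secondary_supers() for super_klass,
       //   // branching to L_success on a hit and falling through on a miss.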
1693   void generate_type_check(Register sub_klass,
1694                            Register super_check_offset,
1695                            Register super_klass,
1696                            Label& L_success) {
1697     assert_different_registers(sub_klass, super_check_offset, super_klass);
1698 
1699     BLOCK_COMMENT("type_check:");
1700 
1701     Label L_miss;
1702 
1703     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1704                                      super_check_offset);
1705     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1706 
1707     // Fall through on failure!
1708     __ BIND(L_miss);
1709   }
1710 
1711   //
1712   //  Generate checkcasting array copy stub
1713   //
1714   //  Input:
1715   //    c_rarg0   - source array address
1716   //    c_rarg1   - destination array address
1717   //    c_rarg2   - element count, treated as ssize_t, can be zero
1718   //    c_rarg3   - size_t ckoff (super_check_offset)
1719   //    c_rarg4   - oop ckval (super_klass)
1720   //
1721   //  Output:
1722   //    r0 ==  0  -  success
1723   //    r0 == -1^K - failure, where K is partial transfer count
1724   //
1725   address generate_checkcast_copy(const char *name, address *entry,
1726                                   bool dest_uninitialized = false) {
1727 
1728     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1729 
1730     // Input registers (after setup_arg_regs)
1731     const Register from        = c_rarg0;   // source array address
1732     const Register to          = c_rarg1;   // destination array address
1733     const Register count       = c_rarg2;   // elements count
1734     const Register ckoff       = c_rarg3;   // super_check_offset
1735     const Register ckval       = c_rarg4;   // super_klass
1736 
1737     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1738     RegSet wb_post_saved_regs = RegSet::of(count);
1739 
1740     // Registers used as temps (r18, r19, r20 are save-on-entry)
1741     const Register count_save  = r21;       // orig elements count
1742     const Register start_to    = r20;       // destination array start address
1743     const Register copied_oop  = r18;       // actual oop copied
1744     const Register r19_klass   = r19;       // oop._klass
1745 
1746     //---------------------------------------------------------------
1747     // Assembler stub will be used for this call to arraycopy
1748     // if the two arrays are subtypes of Object[] but the
1749     // destination array type is not equal to or a supertype
1750     // of the source type.  Each element must be separately
1751     // checked.
1752 
1753     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1754                                copied_oop, r19_klass, count_save);
1755 
1756     __ align(CodeEntryAlignment);
1757     StubCodeMark mark(this, "StubRoutines", name);
1758     address start = __ pc();
1759 
1760     __ enter(); // required for proper stackwalking of RuntimeStub frame
1761 
1762 #ifdef ASSERT
1763     // caller guarantees that the arrays really are different
1764     // otherwise, we would have to make conjoint checks
1765     { Label L;
1766       array_overlap_test(L, TIMES_OOP);
1767       __ stop("checkcast_copy within a single array");
1768       __ bind(L);
1769     }
1770 #endif //ASSERT
1771 
1772     // Caller of this entry point must set up the argument registers.
1773     if (entry != NULL) {
1774       *entry = __ pc();
1775       BLOCK_COMMENT("Entry:");
1776     }
1777 
1778      // Empty array:  Nothing to do.
1779     __ cbz(count, L_done);
1780 
1781     __ push(RegSet::of(r18, r19, r20, r21), sp);
1782 
1783 #ifdef ASSERT
1784     BLOCK_COMMENT("assert consistent ckoff/ckval");
1785     // The ckoff and ckval must be mutually consistent,
1786     // even though caller generates both.
1787     { Label L;
1788       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1789       __ ldrw(start_to, Address(ckval, sco_offset));
1790       __ cmpw(ckoff, start_to);
1791       __ br(Assembler::EQ, L);
1792       __ stop("super_check_offset inconsistent");
1793       __ bind(L);
1794     }
1795 #endif //ASSERT
1796 
1797     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
1798     bool is_oop = true;
1799     if (dest_uninitialized) {
1800       decorators |= IS_DEST_UNINITIALIZED;
1801     }
1802 
1803     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1804     bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs);
1805 
1806     // save the original count
1807     __ mov(count_save, count);
1808 
1809     // Copy from low to high addresses
1810     __ mov(start_to, to);              // Save destination array start address
1811     __ b(L_load_element);
1812 
1813     // ======== begin loop ========
1814     // (Loop is rotated; its entry is L_load_element.)
1815     // Loop control:
1816     //   for (; count != 0; count--) {
1817     //     copied_oop = load_heap_oop(from++);
1818     //     ... generate_type_check ...;
1819     //     store_heap_oop(to++, copied_oop);
1820     //   }
1821     __ align(OptoLoopAlignment);
1822 
1823     __ BIND(L_store_element);
1824     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1825     __ sub(count, count, 1);
1826     __ cbz(count, L_do_card_marks);
1827 
1828     // ======== loop entry is here ========
1829     __ BIND(L_load_element);
1830     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1831     __ cbz(copied_oop, L_store_element);
1832 
1833     __ load_klass(r19_klass, copied_oop);// query the object klass
1834     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1835     // ======== end loop ========
1836 
1837     // It was a real error; we must depend on the caller to finish the job.
1838     // Register count = remaining oops, count_save = total oops.
1839     // Emit GC store barriers for the oops we have copied and report
1840     // their number to the caller.
1841 
1842     __ subs(count, count_save, count);     // K = partially copied oop count
1843     __ eon(count, count, zr);                   // report (-1^K) to caller
1844     __ br(Assembler::EQ, L_done_pop);
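         // e.g. if 7 oops were stored before the failing element, K == 7 and
         // the stub returns -1 ^ 7 == ~7 == -8; the caller recovers K as ~r0.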
1845 
1846     __ BIND(L_do_card_marks);
1847     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1848     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1, wb_post_saved_regs);
1849 
1850     __ bind(L_done_pop);
1851     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1852     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1853 
1854     __ bind(L_done);
1855     __ mov(r0, count);
1856     __ leave();
1857     __ ret(lr);
1858 
1859     return start;
1860   }
1861 
1862   // Perform range checks on the proposed arraycopy.
1863   // Kills temp, but nothing else.
1864   // Also, clean the sign bits of src_pos and dst_pos.
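  // In effect:
  //   if ((uint32_t)(src_pos + length) > (uint32_t)src->length())  goto L_failed;
  //   if ((uint32_t)(dst_pos + length) > (uint32_t)dst->length())  goto L_failed;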
1865   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1866                               Register src_pos, // source position (c_rarg1)
1867                               Register dst,     // destination array oop (c_rarg2)
1868                               Register dst_pos, // destination position (c_rarg3)
1869                               Register length,
1870                               Register temp,
1871                               Label& L_failed) {
1872     BLOCK_COMMENT("arraycopy_range_checks:");
1873 
1874     assert_different_registers(rscratch1, temp);
1875 
1876     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1877     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1878     __ addw(temp, length, src_pos);
1879     __ cmpw(temp, rscratch1);
1880     __ br(Assembler::HI, L_failed);
1881 
1882     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1883     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1884     __ addw(temp, length, dst_pos);
1885     __ cmpw(temp, rscratch1);
1886     __ br(Assembler::HI, L_failed);
1887 
1888     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1889     __ movw(src_pos, src_pos);
1890     __ movw(dst_pos, dst_pos);
1891 
1892     BLOCK_COMMENT("arraycopy_range_checks done");
1893   }
1894 
1895   // These stubs get called from some dumb test routine.
1896   // I'll write them properly when they're called from
1897   // something that's actually doing something.
1898   static void fake_arraycopy_stub(address src, address dst, int count) {
1899     assert(count == 0, "huh?");
1900   }
1901 
1902 
1903   //
1904   //  Generate 'unsafe' array copy stub
1905   //  Though just as safe as the other stubs, it takes an unscaled
1906   //  size_t argument instead of an element count.
1907   //
1908   //  Input:
1909   //    c_rarg0   - source array address
1910   //    c_rarg1   - destination array address
1911   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1912   //
1913   // Examines the alignment of the operands and dispatches
1914   // to a long, int, short, or byte copy loop.
1915   //
1916   address generate_unsafe_copy(const char *name,
1917                                address byte_copy_entry,
1918                                address short_copy_entry,
1919                                address int_copy_entry,
1920                                address long_copy_entry) {
1921     Label L_long_aligned, L_int_aligned, L_short_aligned;
1922     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1923 
1924     __ align(CodeEntryAlignment);
1925     StubCodeMark mark(this, "StubRoutines", name);
1926     address start = __ pc();
1927     __ enter(); // required for proper stackwalking of RuntimeStub frame
1928 
1929     // bump this on entry, not on exit:
1930     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1931 
1932     __ orr(rscratch1, s, d);
1933     __ orr(rscratch1, rscratch1, count);
1934 
1935     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1936     __ cbz(rscratch1, L_long_aligned);
1937     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1938     __ cbz(rscratch1, L_int_aligned);
1939     __ tbz(rscratch1, 0, L_short_aligned);
1940     __ b(RuntimeAddress(byte_copy_entry));
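         // In effect:  bits = s | d | count;
         //   (bits & 7) == 0  -> jump to the long copy
         //   (bits & 3) == 0  -> jump to the int copy
         //   (bits & 1) == 0  -> jump to the short copy
         //   otherwise        -> take the branch above into the byte copy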
1941 
1942     __ BIND(L_short_aligned);
1943     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1944     __ b(RuntimeAddress(short_copy_entry));
1945     __ BIND(L_int_aligned);
1946     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1947     __ b(RuntimeAddress(int_copy_entry));
1948     __ BIND(L_long_aligned);
1949     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1950     __ b(RuntimeAddress(long_copy_entry));
1951 
1952     return start;
1953   }
1954 
1955   //
1956   //  Generate generic array copy stubs
1957   //
1958   //  Input:
1959   //    c_rarg0    -  src oop
1960   //    c_rarg1    -  src_pos (32-bits)
1961   //    c_rarg2    -  dst oop
1962   //    c_rarg3    -  dst_pos (32-bits)
1963   //    c_rarg4    -  element count (32-bits)
1964   //
1965   //  Output:
1966   //    r0 ==  0  -  success
1967   //    r0 == -1^K - failure, where K is partial transfer count
1968   //
1969   address generate_generic_copy(const char *name,
1970                                 address byte_copy_entry, address short_copy_entry,
1971                                 address int_copy_entry, address oop_copy_entry,
1972                                 address long_copy_entry, address checkcast_copy_entry) {
1973 
1974     Label L_failed, L_objArray;
1975     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1976 
1977     // Input registers
1978     const Register src        = c_rarg0;  // source array oop
1979     const Register src_pos    = c_rarg1;  // source position
1980     const Register dst        = c_rarg2;  // destination array oop
1981     const Register dst_pos    = c_rarg3;  // destination position
1982     const Register length     = c_rarg4;
1983 
1984 
1985     // Registers used as temps
1986     const Register dst_klass  = c_rarg5;
1987 
1988     __ align(CodeEntryAlignment);
1989 
1990     StubCodeMark mark(this, "StubRoutines", name);
1991 
1992     address start = __ pc();
1993 
1994     __ enter(); // required for proper stackwalking of RuntimeStub frame
1995 
1996     // bump this on entry, not on exit:
1997     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1998 
1999     //-----------------------------------------------------------------------
2000     // Assembler stub will be used for this call to arraycopy
2001     // if the following conditions are met:
2002     //
2003     // (1) src and dst must not be null.
2004     // (2) src_pos must not be negative.
2005     // (3) dst_pos must not be negative.
2006     // (4) length  must not be negative.
2007     // (5) src klass and dst klass should be the same and not NULL.
2008     // (6) src and dst should be arrays.
2009     // (7) src_pos + length must not exceed length of src.
2010     // (8) dst_pos + length must not exceed length of dst.
2011     //
2012 
2013     //  if (src == NULL) return -1;
2014     __ cbz(src, L_failed);
2015 
2016     //  if (src_pos < 0) return -1;
2017     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2018 
2019     //  if (dst == NULL) return -1;
2020     __ cbz(dst, L_failed);
2021 
2022     //  if (dst_pos < 0) return -1;
2023     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2024 
2025     // registers used as temp
2026     const Register scratch_length    = r16; // elements count to copy
2027     const Register scratch_src_klass = r17; // array klass
2028     const Register lh                = r18; // layout helper
2029 
2030     //  if (length < 0) return -1;
2031     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2032     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2033 
2034     __ load_klass(scratch_src_klass, src);
2035 #ifdef ASSERT
2036     //  assert(src->klass() != NULL);
2037     {
2038       BLOCK_COMMENT("assert klasses not null {");
2039       Label L1, L2;
2040       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2041       __ bind(L1);
2042       __ stop("broken null klass");
2043       __ bind(L2);
2044       __ load_klass(rscratch1, dst);
2045       __ cbz(rscratch1, L1);     // this would be broken also
2046       BLOCK_COMMENT("} assert klasses not null done");
2047     }
2048 #endif
2049 
2050     // Load layout helper (32-bits)
2051     //
2052     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2053     // 32        30    24            16              8     2                 0
2054     //
2055     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2056     //
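         // Decoded in C terms (a sketch; the shifts and masks are the
         // Klass::_lh_* constants used below):
         //
         //   int tag        = lh >> Klass::_lh_array_tag_shift;       // 0x3 typeArray, 0x2 objArray
         //   int hdr_size   = (lh >> Klass::_lh_header_size_shift)
         //                    & Klass::_lh_header_size_mask;          // array header size in bytes
         //   int log2_esize = lh & Klass::_lh_log2_element_size_mask; // 0..3 for primitive arrays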
2057 
2058     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2059 
2060     // Handle objArrays completely differently...
2061     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2062     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2063     __ movw(rscratch1, objArray_lh);
2064     __ eorw(rscratch2, lh, rscratch1);
2065     __ cbzw(rscratch2, L_objArray);
2066 
2067     //  if (src->klass() != dst->klass()) return -1;
2068     __ load_klass(rscratch2, dst);
2069     __ eor(rscratch2, rscratch2, scratch_src_klass);
2070     __ cbnz(rscratch2, L_failed);
2071 
2072     //  if (!src->is_Array()) return -1;
2073     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2074 
2075     // At this point, it is known to be a typeArray (array_tag 0x3).
2076 #ifdef ASSERT
2077     {
2078       BLOCK_COMMENT("assert primitive array {");
2079       Label L;
2080       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2081       __ cmpw(lh, rscratch2);
2082       __ br(Assembler::GE, L);
2083       __ stop("must be a primitive array");
2084       __ bind(L);
2085       BLOCK_COMMENT("} assert primitive array done");
2086     }
2087 #endif
2088 
2089     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2090                            rscratch2, L_failed);
2091 
2092     // TypeArrayKlass
2093     //
2094     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2095     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2096     //
2097 
2098     const Register rscratch1_offset = rscratch1;    // array offset
2099     const Register r18_elsize = lh; // element size
2100 
2101     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2102            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2103     __ add(src, src, rscratch1_offset);           // src array offset
2104     __ add(dst, dst, rscratch1_offset);           // dst array offset
2105     BLOCK_COMMENT("choose copy loop based on element size");
2106 
2107     // next registers should be set before the jump to corresponding stub
2108     const Register from     = c_rarg0;  // source array address
2109     const Register to       = c_rarg1;  // destination array address
2110     const Register count    = c_rarg2;  // elements count
2111 
2112     // 'from', 'to' and 'count' must be set in this order, since they are
2113     // the same registers as 'src', 'src_pos' and 'dst' respectively.
2114 
2115     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2116 
2117     // The possible values of elsize are 0-3, i.e. exact_log2(element
2118     // size in bytes).  We do a simple bitwise binary search.
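         // i.e. keyed on the two low bits of r18_elsize (the layout helper;
         // _lh_log2_element_size_shift is 0, see the assert above):
         //
         //   bit1 bit0   elsize  element  target
         //    0    0       0     byte     byte_copy_entry
         //    0    1       1     short    short_copy_entry
         //    1    0       2     int      int_copy_entry
         //    1    1       3     long     long_copy_entry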
2119   __ BIND(L_copy_bytes);
2120     __ tbnz(r18_elsize, 1, L_copy_ints);
2121     __ tbnz(r18_elsize, 0, L_copy_shorts);
2122     __ lea(from, Address(src, src_pos));// src_addr
2123     __ lea(to,   Address(dst, dst_pos));// dst_addr
2124     __ movw(count, scratch_length); // length
2125     __ b(RuntimeAddress(byte_copy_entry));
2126 
2127   __ BIND(L_copy_shorts);
2128     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2129     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2130     __ movw(count, scratch_length); // length
2131     __ b(RuntimeAddress(short_copy_entry));
2132 
2133   __ BIND(L_copy_ints);
2134     __ tbnz(r18_elsize, 0, L_copy_longs);
2135     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2136     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2137     __ movw(count, scratch_length); // length
2138     __ b(RuntimeAddress(int_copy_entry));
2139 
2140   __ BIND(L_copy_longs);
2141 #ifdef ASSERT
2142     {
2143       BLOCK_COMMENT("assert long copy {");
2144       Label L;
2145       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2146       __ cmpw(r18_elsize, LogBytesPerLong);
2147       __ br(Assembler::EQ, L);
2148       __ stop("must be long copy, but elsize is wrong");
2149       __ bind(L);
2150       BLOCK_COMMENT("} assert long copy done");
2151     }
2152 #endif
2153     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2154     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2155     __ movw(count, scratch_length); // length
2156     __ b(RuntimeAddress(long_copy_entry));
2157 
2158     // ObjArrayKlass
2159   __ BIND(L_objArray);
2160     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2161 
2162     Label L_plain_copy, L_checkcast_copy;
2163     //  test array classes for subtyping
2164     __ load_klass(r18, dst);
2165     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2166     __ br(Assembler::NE, L_checkcast_copy);
2167 
2168     // Identically typed arrays can be copied without element-wise checks.
2169     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2170                            rscratch2, L_failed);
2171 
2172     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2173     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2174     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2175     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2176     __ movw(count, scratch_length); // length
2177   __ BIND(L_plain_copy);
2178     __ b(RuntimeAddress(oop_copy_entry));
2179 
2180   __ BIND(L_checkcast_copy);
2181     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2182     {
2183       // Before looking at dst.length, make sure dst is also an objArray.
2184       __ ldrw(rscratch1, Address(r18, lh_offset));
2185       __ movw(rscratch2, objArray_lh);
2186       __ eorw(rscratch1, rscratch1, rscratch2);
2187       __ cbnzw(rscratch1, L_failed);
2188 
2189       // It is safe to examine both src.length and dst.length.
2190       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2191                              r18, L_failed);
2192 
2193       __ load_klass(dst_klass, dst); // reload
2194 
2195       // Marshal the base address arguments now, freeing registers.
2196       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2197       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2198       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2199       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2200       __ movw(count, length);           // length (reloaded)
2201       Register sco_temp = c_rarg3;      // this register is free now
2202       assert_different_registers(from, to, count, sco_temp,
2203                                  dst_klass, scratch_src_klass);
2204       // assert_clean_int(count, sco_temp);
2205 
2206       // Generate the type check.
2207       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2208       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2209 
2210       // Smashes rscratch1, rscratch2
2211       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2212 
2213       // Fetch destination element klass from the ObjArrayKlass header.
2214       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2215       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2216       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2217 
2218       // the checkcast_copy loop needs two extra arguments:
2219       assert(c_rarg3 == sco_temp, "#3 already in place");
2220       // Set up arguments for checkcast_copy_entry.
2221       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2222       __ b(RuntimeAddress(checkcast_copy_entry));
2223     }
2224 
2225   __ BIND(L_failed);
2226     __ mov(r0, -1);
2227     __ leave();   // required for proper stackwalking of RuntimeStub frame
2228     __ ret(lr);
2229 
2230     return start;
2231   }
2232 
2233   //
2234   // Generate stub for array fill. If "aligned" is true, the
2235   // "to" address is assumed to be heapword aligned.
2236   //
2237   // Arguments for generated stub:
2238   //   to:    c_rarg0
2239   //   value: c_rarg1
2240   //   count: c_rarg2 treated as signed
2241   //
2242   address generate_fill(BasicType t, bool aligned, const char *name) {
2243     __ align(CodeEntryAlignment);
2244     StubCodeMark mark(this, "StubRoutines", name);
2245     address start = __ pc();
2246 
2247     BLOCK_COMMENT("Entry:");
2248 
2249     const Register to        = c_rarg0;  // destination array address
2250     const Register value     = c_rarg1;  // value
2251     const Register count     = c_rarg2;  // elements count
2252 
2253     const Register bz_base = r10;        // base for block_zero routine
2254     const Register cnt_words = r11;      // temp register
2255 
2256     __ enter();
2257 
2258     Label L_fill_elements, L_exit1;
2259 
2260     int shift = -1;
2261     switch (t) {
2262       case T_BYTE:
2263         shift = 0;
2264         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2265         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2266         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2267         __ br(Assembler::LO, L_fill_elements);
2268         break;
2269       case T_SHORT:
2270         shift = 1;
2271         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2272         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2273         __ br(Assembler::LO, L_fill_elements);
2274         break;
2275       case T_INT:
2276         shift = 2;
2277         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2278         __ br(Assembler::LO, L_fill_elements);
2279         break;
2280       default: ShouldNotReachHere();
2281     }
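         // The bfi sequences above splat the fill value: for T_BYTE the low 8
         // bits are copied into bits 8..15 and then the low 16 bits into bits
         // 16..31; for T_SHORT only the second step is needed; for T_INT the
         // 32-bit value is already the pattern.  A later
         // bfi(value, value, 32, 32) widens this to a full 64-bit pattern so
         // whole words can be stored at a time.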
2282 
2283     // Align the destination address to an 8-byte boundary.
2284     Label L_skip_align1, L_skip_align2, L_skip_align4;
2285     if (!aligned) {
2286       switch (t) {
2287         case T_BYTE:
2288           // One byte misalignment happens only for byte arrays.
2289           __ tbz(to, 0, L_skip_align1);
2290           __ strb(value, Address(__ post(to, 1)));
2291           __ subw(count, count, 1);
2292           __ bind(L_skip_align1);
2293           // Fallthrough
2294         case T_SHORT:
2295           // Two bytes misalignment happens only for byte and short (char) arrays.
2296           __ tbz(to, 1, L_skip_align2);
2297           __ strh(value, Address(__ post(to, 2)));
2298           __ subw(count, count, 2 >> shift);
2299           __ bind(L_skip_align2);
2300           // Fallthrough
2301         case T_INT:
2302           // Align to 8 bytes; at this point we know the address is 4-byte aligned.
2303           __ tbz(to, 2, L_skip_align4);
2304           __ strw(value, Address(__ post(to, 4)));
2305           __ subw(count, count, 4 >> shift);
2306           __ bind(L_skip_align4);
2307           break;
2308         default: ShouldNotReachHere();
2309       }
2310     }
2311 
2312     //
2313     //  Fill large chunks
2314     //
2315     __ lsrw(cnt_words, count, 3 - shift); // number of words
2316     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2317     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
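         // cnt_words = count >> (3 - shift) is the number of 8-byte words to
         // fill; count now keeps only the leftover elements (less than one
         // word's worth).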
2318     if (UseBlockZeroing) {
2319       Label non_block_zeroing, rest;
2320       // If the fill value is zero we can use the fast zero_words().
2321       __ cbnz(value, non_block_zeroing);
2322       __ mov(bz_base, to);
2323       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2324       __ zero_words(bz_base, cnt_words);
2325       __ b(rest);
2326       __ bind(non_block_zeroing);
2327       __ fill_words(to, cnt_words, value);
2328       __ bind(rest);
2329     } else {
2330       __ fill_words(to, cnt_words, value);
2331     }
2332 
2333     // Remaining count is less than 8 bytes. Fill it by a single store.
2334     // Note that the total length is no less than 8 bytes.
2335     if (t == T_BYTE || t == T_SHORT) {
2336       Label L_exit1;
2337       __ cbzw(count, L_exit1);
2338       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2339       __ str(value, Address(to, -8));    // overwrite some elements
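           // This unaligned 8-byte store ends exactly at the new 'to' (the end
           // of the fill region).  It may overlap bytes already written, which
           // is harmless since they are rewritten with the same value, and it
           // cannot underrun the array because the total length is >= 8 bytes.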
2340       __ bind(L_exit1);
2341       __ leave();
2342       __ ret(lr);
2343     }
2344 
2345     // Handle copies less than 8 bytes.
2346     Label L_fill_2, L_fill_4, L_exit2;
2347     __ bind(L_fill_elements);
2348     switch (t) {
2349       case T_BYTE:
2350         __ tbz(count, 0, L_fill_2);
2351         __ strb(value, Address(__ post(to, 1)));
2352         __ bind(L_fill_2);
2353         __ tbz(count, 1, L_fill_4);
2354         __ strh(value, Address(__ post(to, 2)));
2355         __ bind(L_fill_4);
2356         __ tbz(count, 2, L_exit2);
2357         __ strw(value, Address(to));
2358         break;
2359       case T_SHORT:
2360         __ tbz(count, 0, L_fill_4);
2361         __ strh(value, Address(__ post(to, 2)));
2362         __ bind(L_fill_4);
2363         __ tbz(count, 1, L_exit2);
2364         __ strw(value, Address(to));
2365         break;
2366       case T_INT:
2367         __ cbzw(count, L_exit2);
2368         __ strw(value, Address(to));
2369         break;
2370       default: ShouldNotReachHere();
2371     }
2372     __ bind(L_exit2);
2373     __ leave();
2374     __ ret(lr);
2375     return start;
2376   }
2377 
2378   void generate_arraycopy_stubs() {
2379     address entry;
2380     address entry_jbyte_arraycopy;
2381     address entry_jshort_arraycopy;
2382     address entry_jint_arraycopy;
2383     address entry_oop_arraycopy;
2384     address entry_jlong_arraycopy;
2385     address entry_checkcast_arraycopy;
2386 
2387     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2388     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2389 
2390     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2391 
2392     //*** jbyte
2393     // Always need aligned and unaligned versions
2394     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2395                                                                                   "jbyte_disjoint_arraycopy");
2396     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2397                                                                                   &entry_jbyte_arraycopy,
2398                                                                                   "jbyte_arraycopy");
2399     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2400                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2401     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2402                                                                                   "arrayof_jbyte_arraycopy");
2403 
2404     //*** jshort
2405     // Always need aligned and unaligned versions
2406     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2407                                                                                     "jshort_disjoint_arraycopy");
2408     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2409                                                                                     &entry_jshort_arraycopy,
2410                                                                                     "jshort_arraycopy");
2411     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2412                                                                                     "arrayof_jshort_disjoint_arraycopy");
2413     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2414                                                                                     "arrayof_jshort_arraycopy");
2415 
2416     //*** jint
2417     // Aligned versions
2418     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2419                                                                                 "arrayof_jint_disjoint_arraycopy");
2420     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2421                                                                                 "arrayof_jint_arraycopy");
2422     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2423     // entry_jint_arraycopy always points to the unaligned version
2424     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2425                                                                                 "jint_disjoint_arraycopy");
2426     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2427                                                                                 &entry_jint_arraycopy,
2428                                                                                 "jint_arraycopy");
2429 
2430     //*** jlong
2431     // It is always aligned
2432     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2433                                                                                   "arrayof_jlong_disjoint_arraycopy");
2434     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2435                                                                                   "arrayof_jlong_arraycopy");
2436     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2437     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2438 
2439     //*** oops
2440     {
2441       // With compressed oops we need unaligned versions; notice that
2442       // we overwrite entry_oop_arraycopy.
2443       bool aligned = !UseCompressedOops;
2444 
2445       StubRoutines::_arrayof_oop_disjoint_arraycopy
2446         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2447                                      /*dest_uninitialized*/false);
2448       StubRoutines::_arrayof_oop_arraycopy
2449         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2450                                      /*dest_uninitialized*/false);
2451       // Aligned versions without pre-barriers
2452       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2453         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2454                                      /*dest_uninitialized*/true);
2455       StubRoutines::_arrayof_oop_arraycopy_uninit
2456         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2457                                      /*dest_uninitialized*/true);
2458     }
2459 
2460     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2461     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2462     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2463     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2464 
2465     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2466     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2467                                                                         /*dest_uninitialized*/true);
2468 
2469     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2470                                                               entry_jbyte_arraycopy,
2471                                                               entry_jshort_arraycopy,
2472                                                               entry_jint_arraycopy,
2473                                                               entry_jlong_arraycopy);
2474 
2475     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2476                                                                entry_jbyte_arraycopy,
2477                                                                entry_jshort_arraycopy,
2478                                                                entry_jint_arraycopy,
2479                                                                entry_oop_arraycopy,
2480                                                                entry_jlong_arraycopy,
2481                                                                entry_checkcast_arraycopy);
2482 
2483     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2484     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2485     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2486     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2487     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2488     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2489   }
2490 
2491   void generate_math_stubs() { Unimplemented(); }
2492 
2493   // Arguments:
2494   //
2495   // Inputs:
2496   //   c_rarg0   - source byte array address
2497   //   c_rarg1   - destination byte array address
2498   //   c_rarg2   - K (key) in little endian int array
2499   //
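       // The generated code performs one AES block encryption using the
       // ARMv8 AESE/AESMC instructions: NR - 1 rounds of AESE+AESMC, a final
       // AESE, and an EOR with the last round key, where NR is 10/12/14 for
       // AES-128/192/256 (an expanded key of 44/52/60 32-bit words, which is
       // what 'keylen' below is compared against).
       //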
2500   address generate_aescrypt_encryptBlock() {
2501     __ align(CodeEntryAlignment);
2502     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2503 
2504     Label L_doLast;
2505 
2506     const Register from        = c_rarg0;  // source array address
2507     const Register to          = c_rarg1;  // destination array address
2508     const Register key         = c_rarg2;  // key array address
2509     const Register keylen      = rscratch1;
2510 
2511     address start = __ pc();
2512     __ enter();
2513 
2514     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2515 
2516     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2517 
2518     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2519     __ rev32(v1, __ T16B, v1);
2520     __ rev32(v2, __ T16B, v2);
2521     __ rev32(v3, __ T16B, v3);
2522     __ rev32(v4, __ T16B, v4);
2523     __ aese(v0, v1);
2524     __ aesmc(v0, v0);
2525     __ aese(v0, v2);
2526     __ aesmc(v0, v0);
2527     __ aese(v0, v3);
2528     __ aesmc(v0, v0);
2529     __ aese(v0, v4);
2530     __ aesmc(v0, v0);
2531 
2532     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2533     __ rev32(v1, __ T16B, v1);
2534     __ rev32(v2, __ T16B, v2);
2535     __ rev32(v3, __ T16B, v3);
2536     __ rev32(v4, __ T16B, v4);
2537     __ aese(v0, v1);
2538     __ aesmc(v0, v0);
2539     __ aese(v0, v2);
2540     __ aesmc(v0, v0);
2541     __ aese(v0, v3);
2542     __ aesmc(v0, v0);
2543     __ aese(v0, v4);
2544     __ aesmc(v0, v0);
2545 
2546     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2547     __ rev32(v1, __ T16B, v1);
2548     __ rev32(v2, __ T16B, v2);
2549 
2550     __ cmpw(keylen, 44);
2551     __ br(Assembler::EQ, L_doLast);
2552 
2553     __ aese(v0, v1);
2554     __ aesmc(v0, v0);
2555     __ aese(v0, v2);
2556     __ aesmc(v0, v0);
2557 
2558     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2559     __ rev32(v1, __ T16B, v1);
2560     __ rev32(v2, __ T16B, v2);
2561 
2562     __ cmpw(keylen, 52);
2563     __ br(Assembler::EQ, L_doLast);
2564 
2565     __ aese(v0, v1);
2566     __ aesmc(v0, v0);
2567     __ aese(v0, v2);
2568     __ aesmc(v0, v0);
2569 
2570     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2571     __ rev32(v1, __ T16B, v1);
2572     __ rev32(v2, __ T16B, v2);
2573 
2574     __ BIND(L_doLast);
2575 
2576     __ aese(v0, v1);
2577     __ aesmc(v0, v0);
2578     __ aese(v0, v2);
2579 
2580     __ ld1(v1, __ T16B, key);
2581     __ rev32(v1, __ T16B, v1);
2582     __ eor(v0, __ T16B, v0, v1);
2583 
2584     __ st1(v0, __ T16B, to);
2585 
2586     __ mov(r0, 0);
2587 
2588     __ leave();
2589     __ ret(lr);
2590 
2591     return start;
2592   }
2593 
2594   // Arguments:
2595   //
2596   // Inputs:
2597   //   c_rarg0   - source byte array address
2598   //   c_rarg1   - destination byte array address
2599   //   c_rarg2   - K (key) in little endian int array
2600   //
2601   address generate_aescrypt_decryptBlock() {
2602     assert(UseAES, "need AES instructions");
2603     __ align(CodeEntryAlignment);
2604     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2605     Label L_doLast;
2606 
2607     const Register from        = c_rarg0;  // source array address
2608     const Register to          = c_rarg1;  // destination array address
2609     const Register key         = c_rarg2;  // key array address
2610     const Register keylen      = rscratch1;
2611 
2612     address start = __ pc();
2613     __ enter(); // required for proper stackwalking of RuntimeStub frame
2614 
2615     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2616 
2617     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2618 
2619     __ ld1(v5, __ T16B, __ post(key, 16));
2620     __ rev32(v5, __ T16B, v5);
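         // v5 holds the first 16 key bytes; it is applied last, as the final
         // AddRoundKey, by the EOR after the last AESD below.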
2621 
2622     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2623     __ rev32(v1, __ T16B, v1);
2624     __ rev32(v2, __ T16B, v2);
2625     __ rev32(v3, __ T16B, v3);
2626     __ rev32(v4, __ T16B, v4);
2627     __ aesd(v0, v1);
2628     __ aesimc(v0, v0);
2629     __ aesd(v0, v2);
2630     __ aesimc(v0, v0);
2631     __ aesd(v0, v3);
2632     __ aesimc(v0, v0);
2633     __ aesd(v0, v4);
2634     __ aesimc(v0, v0);
2635 
2636     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2637     __ rev32(v1, __ T16B, v1);
2638     __ rev32(v2, __ T16B, v2);
2639     __ rev32(v3, __ T16B, v3);
2640     __ rev32(v4, __ T16B, v4);
2641     __ aesd(v0, v1);
2642     __ aesimc(v0, v0);
2643     __ aesd(v0, v2);
2644     __ aesimc(v0, v0);
2645     __ aesd(v0, v3);
2646     __ aesimc(v0, v0);
2647     __ aesd(v0, v4);
2648     __ aesimc(v0, v0);
2649 
2650     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2651     __ rev32(v1, __ T16B, v1);
2652     __ rev32(v2, __ T16B, v2);
2653 
2654     __ cmpw(keylen, 44);
2655     __ br(Assembler::EQ, L_doLast);
2656 
2657     __ aesd(v0, v1);
2658     __ aesimc(v0, v0);
2659     __ aesd(v0, v2);
2660     __ aesimc(v0, v0);
2661 
2662     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2663     __ rev32(v1, __ T16B, v1);
2664     __ rev32(v2, __ T16B, v2);
2665 
2666     __ cmpw(keylen, 52);
2667     __ br(Assembler::EQ, L_doLast);
2668 
2669     __ aesd(v0, v1);
2670     __ aesimc(v0, v0);
2671     __ aesd(v0, v2);
2672     __ aesimc(v0, v0);
2673 
2674     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2675     __ rev32(v1, __ T16B, v1);
2676     __ rev32(v2, __ T16B, v2);
2677 
2678     __ BIND(L_doLast);
2679 
2680     __ aesd(v0, v1);
2681     __ aesimc(v0, v0);
2682     __ aesd(v0, v2);
2683 
2684     __ eor(v0, __ T16B, v0, v5);
2685 
2686     __ st1(v0, __ T16B, to);
2687 
2688     __ mov(r0, 0);
2689 
2690     __ leave();
2691     __ ret(lr);
2692 
2693     return start;
2694   }
2695 
2696   // Arguments:
2697   //
2698   // Inputs:
2699   //   c_rarg0   - source byte array address
2700   //   c_rarg1   - destination byte array address
2701   //   c_rarg2   - K (key) in little endian int array
2702   //   c_rarg3   - r vector byte array address
2703   //   c_rarg4   - input length
2704   //
2705   // Output:
2706   //   r0        - input length
2707   //
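       // In outline (rvec carries c[-1], i.e. the IV on the first call):
       //
       //   for (i = 0; i < len/16; i++)
       //     c[i] = AES_encrypt(p[i] ^ c[i-1], key);
       //   rvec = c[len/16 - 1];    // chained into the next call
       //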
2708   address generate_cipherBlockChaining_encryptAESCrypt() {
2709     assert(UseAES, "need AES instructions");
2710     __ align(CodeEntryAlignment);
2711     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2712 
2713     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2714 
2715     const Register from        = c_rarg0;  // source array address
2716     const Register to          = c_rarg1;  // destination array address
2717     const Register key         = c_rarg2;  // key array address
2718     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2719                                            // and left with the results of the last encryption block
2720     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2721     const Register keylen      = rscratch1;
2722 
2723     address start = __ pc();
2724 
2725       __ enter();
2726 
2727       __ movw(rscratch2, len_reg);
2728 
2729       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2730 
2731       __ ld1(v0, __ T16B, rvec);
2732 
2733       __ cmpw(keylen, 52);
2734       __ br(Assembler::CC, L_loadkeys_44);
2735       __ br(Assembler::EQ, L_loadkeys_52);
2736 
2737       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2738       __ rev32(v17, __ T16B, v17);
2739       __ rev32(v18, __ T16B, v18);
2740     __ BIND(L_loadkeys_52);
2741       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2742       __ rev32(v19, __ T16B, v19);
2743       __ rev32(v20, __ T16B, v20);
2744     __ BIND(L_loadkeys_44);
2745       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2746       __ rev32(v21, __ T16B, v21);
2747       __ rev32(v22, __ T16B, v22);
2748       __ rev32(v23, __ T16B, v23);
2749       __ rev32(v24, __ T16B, v24);
2750       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2751       __ rev32(v25, __ T16B, v25);
2752       __ rev32(v26, __ T16B, v26);
2753       __ rev32(v27, __ T16B, v27);
2754       __ rev32(v28, __ T16B, v28);
2755       __ ld1(v29, v30, v31, __ T16B, key);
2756       __ rev32(v29, __ T16B, v29);
2757       __ rev32(v30, __ T16B, v30);
2758       __ rev32(v31, __ T16B, v31);
2759 
2760     __ BIND(L_aes_loop);
2761       __ ld1(v1, __ T16B, __ post(from, 16));
2762       __ eor(v0, __ T16B, v0, v1);
2763 
2764       __ br(Assembler::CC, L_rounds_44);
2765       __ br(Assembler::EQ, L_rounds_52);
2766 
2767       __ aese(v0, v17); __ aesmc(v0, v0);
2768       __ aese(v0, v18); __ aesmc(v0, v0);
2769     __ BIND(L_rounds_52);
2770       __ aese(v0, v19); __ aesmc(v0, v0);
2771       __ aese(v0, v20); __ aesmc(v0, v0);
2772     __ BIND(L_rounds_44);
2773       __ aese(v0, v21); __ aesmc(v0, v0);
2774       __ aese(v0, v22); __ aesmc(v0, v0);
2775       __ aese(v0, v23); __ aesmc(v0, v0);
2776       __ aese(v0, v24); __ aesmc(v0, v0);
2777       __ aese(v0, v25); __ aesmc(v0, v0);
2778       __ aese(v0, v26); __ aesmc(v0, v0);
2779       __ aese(v0, v27); __ aesmc(v0, v0);
2780       __ aese(v0, v28); __ aesmc(v0, v0);
2781       __ aese(v0, v29); __ aesmc(v0, v0);
2782       __ aese(v0, v30);
2783       __ eor(v0, __ T16B, v0, v31);
2784 
2785       __ st1(v0, __ T16B, __ post(to, 16));
2786 
2787       __ subw(len_reg, len_reg, 16);
2788       __ cbnzw(len_reg, L_aes_loop);
2789 
2790       __ st1(v0, __ T16B, rvec);
2791 
2792       __ mov(r0, rscratch2);
2793 
2794       __ leave();
2795       __ ret(lr);
2796 
2797       return start;
2798   }
2799 
2800   // Arguments:
2801   //
2802   // Inputs:
2803   //   c_rarg0   - source byte array address
2804   //   c_rarg1   - destination byte array address
2805   //   c_rarg2   - K (key) in little endian int array
2806   //   c_rarg3   - r vector byte array address
2807   //   c_rarg4   - input length
2808   //
2809   // Output:
2810   //   r0        - input length
2811   //
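       // CBC decryption reverses the chaining (illustrative scalar form, not
       // generated code):
       //   P[i] = AES_decrypt(C[i], key) ^ C[i-1],  with C[-1] = IV
       // so each ciphertext block is saved (copied into v1 below) before being
       // decrypted, to serve as the chaining value for the next block.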
2812   address generate_cipherBlockChaining_decryptAESCrypt() {
2813     assert(UseAES, "need AES instructions");
2814     __ align(CodeEntryAlignment);
2815     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2816 
2817     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2818 
2819     const Register from        = c_rarg0;  // source array address
2820     const Register to          = c_rarg1;  // destination array address
2821     const Register key         = c_rarg2;  // key array address
2822     const Register rvec        = c_rarg3;  // r byte array, initialized from the init-vector address,
2823                                            // left holding the last ciphertext (input) block on exit
2824     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2825     const Register keylen      = rscratch1;
2826 
2827     address start = __ pc();
2828 
2829       __ enter();
2830 
2831       __ movw(rscratch2, len_reg);
2832 
2833       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2834 
2835       __ ld1(v2, __ T16B, rvec);
2836 
2837       __ ld1(v31, __ T16B, __ post(key, 16));
2838       __ rev32(v31, __ T16B, v31);
2839 
2840       __ cmpw(keylen, 52);
2841       __ br(Assembler::CC, L_loadkeys_44);
2842       __ br(Assembler::EQ, L_loadkeys_52);
2843 
2844       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2845       __ rev32(v17, __ T16B, v17);
2846       __ rev32(v18, __ T16B, v18);
2847     __ BIND(L_loadkeys_52);
2848       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2849       __ rev32(v19, __ T16B, v19);
2850       __ rev32(v20, __ T16B, v20);
2851     __ BIND(L_loadkeys_44);
2852       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2853       __ rev32(v21, __ T16B, v21);
2854       __ rev32(v22, __ T16B, v22);
2855       __ rev32(v23, __ T16B, v23);
2856       __ rev32(v24, __ T16B, v24);
2857       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2858       __ rev32(v25, __ T16B, v25);
2859       __ rev32(v26, __ T16B, v26);
2860       __ rev32(v27, __ T16B, v27);
2861       __ rev32(v28, __ T16B, v28);
2862       __ ld1(v29, v30, __ T16B, key);
2863       __ rev32(v29, __ T16B, v29);
2864       __ rev32(v30, __ T16B, v30);
2865 
2866     __ BIND(L_aes_loop);
2867       __ ld1(v0, __ T16B, __ post(from, 16));
2868       __ orr(v1, __ T16B, v0, v0);
2869 
2870       __ br(Assembler::CC, L_rounds_44);
2871       __ br(Assembler::EQ, L_rounds_52);
2872 
2873       __ aesd(v0, v17); __ aesimc(v0, v0);
2874       __ aesd(v0, v18); __ aesimc(v0, v0);
2875     __ BIND(L_rounds_52);
2876       __ aesd(v0, v19); __ aesimc(v0, v0);
2877       __ aesd(v0, v20); __ aesimc(v0, v0);
2878     __ BIND(L_rounds_44);
2879       __ aesd(v0, v21); __ aesimc(v0, v0);
2880       __ aesd(v0, v22); __ aesimc(v0, v0);
2881       __ aesd(v0, v23); __ aesimc(v0, v0);
2882       __ aesd(v0, v24); __ aesimc(v0, v0);
2883       __ aesd(v0, v25); __ aesimc(v0, v0);
2884       __ aesd(v0, v26); __ aesimc(v0, v0);
2885       __ aesd(v0, v27); __ aesimc(v0, v0);
2886       __ aesd(v0, v28); __ aesimc(v0, v0);
2887       __ aesd(v0, v29); __ aesimc(v0, v0);
2888       __ aesd(v0, v30);
2889       __ eor(v0, __ T16B, v0, v31);
2890       __ eor(v0, __ T16B, v0, v2);
2891 
2892       __ st1(v0, __ T16B, __ post(to, 16));
2893       __ orr(v2, __ T16B, v1, v1);
2894 
2895       __ subw(len_reg, len_reg, 16);
2896       __ cbnzw(len_reg, L_aes_loop);
2897 
2898       __ st1(v2, __ T16B, rvec);
2899 
2900       __ mov(r0, rscratch2);
2901 
2902       __ leave();
2903       __ ret(lr);
2904 
2905     return start;
2906   }
2907 
2908   // Arguments:
2909   //
2910   // Inputs:
2911   //   c_rarg0   - byte[]  source+offset
2912   //   c_rarg1   - int[]   SHA.state
2913   //   c_rarg2   - int     offset
2914   //   c_rarg3   - int     limit
2915   //
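       // Rough outline of the per-block computation, using the SHA1C/SHA1P/SHA1M
       // state-update and SHA1SU0/SHA1SU1 schedule instructions (illustrative
       // only, not generated code):
       //   for (int t = 0; t < 80; t += 4) {
       //     {e, abcd} = four_rounds(abcd, e, W[t..t+3] + K[t/20]); // sha1c/p/m
       //     if (t < 64) W[t+16..t+19] = schedule(W[t..t+15]);      // sha1su0/su1
       //   }
       // Each of the 20 iterations of the loop below covers 4 of the 80 rounds.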
2916   address generate_sha1_implCompress(bool multi_block, const char *name) {
2917     __ align(CodeEntryAlignment);
2918     StubCodeMark mark(this, "StubRoutines", name);
2919     address start = __ pc();
2920 
2921     Register buf   = c_rarg0;
2922     Register state = c_rarg1;
2923     Register ofs   = c_rarg2;
2924     Register limit = c_rarg3;
2925 
2926     Label keys;
2927     Label sha1_loop;
2928 
2929     // load the keys into v0..v3
2930     __ adr(rscratch1, keys);
2931     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2932     // load the 5-word state into v6, v7
2933     __ ldrq(v6, Address(state, 0));
2934     __ ldrs(v7, Address(state, 16));
2935 
2936 
2937     __ BIND(sha1_loop);
2938     // load 64 bytes of data into v16..v19
2939     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2940     __ rev32(v16, __ T16B, v16);
2941     __ rev32(v17, __ T16B, v17);
2942     __ rev32(v18, __ T16B, v18);
2943     __ rev32(v19, __ T16B, v19);
2944 
2945     // do the sha1
2946     __ addv(v4, __ T4S, v16, v0);
2947     __ orr(v20, __ T16B, v6, v6);
2948 
2949     FloatRegister d0 = v16;
2950     FloatRegister d1 = v17;
2951     FloatRegister d2 = v18;
2952     FloatRegister d3 = v19;
2953 
2954     for (int round = 0; round < 20; round++) {
2955       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2956       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2957       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2958       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2959       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2960 
2961       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2962       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2963       __ sha1h(tmp2, __ T4S, v20);
2964       if (round < 5)
2965         __ sha1c(v20, __ T4S, tmp3, tmp4);
2966       else if (round < 10 || round >= 15)
2967         __ sha1p(v20, __ T4S, tmp3, tmp4);
2968       else
2969         __ sha1m(v20, __ T4S, tmp3, tmp4);
2970       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2971 
2972       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2973     }
2974 
2975     __ addv(v7, __ T2S, v7, v21);
2976     __ addv(v6, __ T4S, v6, v20);
2977 
2978     if (multi_block) {
2979       __ add(ofs, ofs, 64);
2980       __ cmp(ofs, limit);
2981       __ br(Assembler::LE, sha1_loop);
2982       __ mov(c_rarg0, ofs); // return ofs
2983     }
2984 
2985     __ strq(v6, Address(state, 0));
2986     __ strs(v7, Address(state, 16));
2987 
2988     __ ret(lr);
2989 
2990     __ bind(keys);
2991     __ emit_int32(0x5a827999);
2992     __ emit_int32(0x6ed9eba1);
2993     __ emit_int32(0x8f1bbcdc);
2994     __ emit_int32(0xca62c1d6);
2995 
2996     return start;
2997   }
2998 
2999 
3000   // Arguments:
3001   //
3002   // Inputs:
3003   //   c_rarg0   - byte[]  source+offset
3004   //   c_rarg1   - int[]   SHA.state
3005   //   c_rarg2   - int     offset
3006   //   c_rarg3   - int     limit
3007   //
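       // Rough outline of the per-block computation, using the SHA256H/SHA256H2
       // state-update and SHA256SU0/SHA256SU1 schedule instructions (illustrative
       // only, not generated code):
       //   for (int t = 0; t < 64; t += 4) {
       //     tmp = W[t..t+3] + K[t..t+3];                      // addv with v16..v31
       //     {dg0, dg1} = four_rounds(dg0, dg1, tmp);          // sha256h/sha256h2
       //     if (t < 48) W[t+16..t+19] = schedule(W[t..t+15]); // sha256su0/su1
       //   }
       // Each of the 16 iterations of the loop below covers 4 of the 64 rounds.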
3008   address generate_sha256_implCompress(bool multi_block, const char *name) {
3009     static const uint32_t round_consts[64] = {
3010       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3011       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3012       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3013       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3014       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3015       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3016       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3017       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3018       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3019       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3020       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3021       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3022       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3023       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3024       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3025       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3026     };
3027     __ align(CodeEntryAlignment);
3028     StubCodeMark mark(this, "StubRoutines", name);
3029     address start = __ pc();
3030 
3031     Register buf   = c_rarg0;
3032     Register state = c_rarg1;
3033     Register ofs   = c_rarg2;
3034     Register limit = c_rarg3;
3035 
3036     Label sha256_loop;
3037 
3038     __ stpd(v8, v9, __ pre(sp, -32));
3039     __ stpd(v10, v11, Address(sp, 16));
3040 
3041 // dga == v0
3042 // dgb == v1
3043 // dg0 == v2
3044 // dg1 == v3
3045 // dg2 == v4
3046 // t0 == v6
3047 // t1 == v7
3048 
3049     // load the 16 round-constant vectors into v16..v31
3050     __ lea(rscratch1, ExternalAddress((address)round_consts));
3051     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3052     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3053     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3054     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3055 
3056     // load the 8-word (256-bit) state
3057     __ ldpq(v0, v1, state);
3058 
3059     __ BIND(sha256_loop);
3060     // load 64 bytes of data into v8..v11
3061     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3062     __ rev32(v8, __ T16B, v8);
3063     __ rev32(v9, __ T16B, v9);
3064     __ rev32(v10, __ T16B, v10);
3065     __ rev32(v11, __ T16B, v11);
3066 
3067     __ addv(v6, __ T4S, v8, v16);
3068     __ orr(v2, __ T16B, v0, v0);
3069     __ orr(v3, __ T16B, v1, v1);
3070 
3071     FloatRegister d0 = v8;
3072     FloatRegister d1 = v9;
3073     FloatRegister d2 = v10;
3074     FloatRegister d3 = v11;
3075 
3076 
3077     for (int round = 0; round < 16; round++) {
3078       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3079       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3080       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3081       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3082 
3083       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3084        __ orr(v4, __ T16B, v2, v2);
3085       if (round < 15)
3086         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3087       __ sha256h(v2, __ T4S, v3, tmp2);
3088       __ sha256h2(v3, __ T4S, v4, tmp2);
3089       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3090 
3091       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3092     }
3093 
3094     __ addv(v0, __ T4S, v0, v2);
3095     __ addv(v1, __ T4S, v1, v3);
3096 
3097     if (multi_block) {
3098       __ add(ofs, ofs, 64);
3099       __ cmp(ofs, limit);
3100       __ br(Assembler::LE, sha256_loop);
3101       __ mov(c_rarg0, ofs); // return ofs
3102     }
3103 
3104     __ ldpd(v10, v11, Address(sp, 16));
3105     __ ldpd(v8, v9, __ post(sp, 32));
3106 
3107     __ stpq(v0, v1, state);
3108 
3109     __ ret(lr);
3110 
3111     return start;
3112   }
3113 
3114 #ifndef BUILTIN_SIM
3115   // Safefetch stubs.
3116   void generate_safefetch(const char* name, int size, address* entry,
3117                           address* fault_pc, address* continuation_pc) {
3118     // safefetch signatures:
3119     //   int      SafeFetch32(int*      adr, int      errValue);
3120     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3121     //
3122     // arguments:
3123     //   c_rarg0 = adr
3124     //   c_rarg1 = errValue
3125     //
3126     // result:
3127     //   r0       = *adr or errValue
3128 
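         // If the load at *fault_pc faults, the signal handler is expected to
         // resume execution at *continuation_pc with c_rarg1 still holding
         // errValue, so the stub returns errValue instead of crashing.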
3129     StubCodeMark mark(this, "StubRoutines", name);
3130 
3131     // Entry point, pc or function descriptor.
3132     *entry = __ pc();
3133 
3134     // Load *adr into c_rarg1, may fault.
3135     *fault_pc = __ pc();
3136     switch (size) {
3137       case 4:
3138         // int32_t
3139         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3140         break;
3141       case 8:
3142         // int64_t
3143         __ ldr(c_rarg1, Address(c_rarg0, 0));
3144         break;
3145       default:
3146         ShouldNotReachHere();
3147     }
3148 
3149     // return errValue or *adr
3150     *continuation_pc = __ pc();
3151     __ mov(r0, c_rarg1);
3152     __ ret(lr);
3153   }
3154 #endif
3155 
3156   /**
3157    *  Arguments:
3158    *
3159    * Inputs:
3160    *   c_rarg0   - int crc
3161    *   c_rarg1   - byte* buf
3162    *   c_rarg2   - int length
3163    *
3164    * Output:
3165    *       r0    - int crc result
3166    */
3167   address generate_updateBytesCRC32() {
3168     assert(UseCRC32Intrinsics, "what are we doing here?");
3169 
3170     __ align(CodeEntryAlignment);
3171     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3172 
3173     address start = __ pc();
3174 
3175     const Register crc   = c_rarg0;  // crc
3176     const Register buf   = c_rarg1;  // source java byte array address
3177     const Register len   = c_rarg2;  // length
3178     const Register table0 = c_rarg3; // crc_table address
3179     const Register table1 = c_rarg4;
3180     const Register table2 = c_rarg5;
3181     const Register table3 = c_rarg6;
3182     const Register tmp3 = c_rarg7;
3183 
3184     BLOCK_COMMENT("Entry:");
3185     __ enter(); // required for proper stackwalking of RuntimeStub frame
3186 
3187     __ kernel_crc32(crc, buf, len,
3188               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3189 
3190     __ leave(); // required for proper stackwalking of RuntimeStub frame
3191     __ ret(lr);
3192 
3193     return start;
3194   }
3195 
3196   /**
3197    *  Arguments:
3198    *
3199    * Inputs:
3200    *   c_rarg0   - int crc
3201    *   c_rarg1   - byte* buf
3202    *   c_rarg2   - int length
3203    *   c_rarg3   - int* table
3204    *
3205    * Output:
3206    *       r0   - int crc result
3207    */
3208   address generate_updateBytesCRC32C() {
3209     assert(UseCRC32CIntrinsics, "what are we doing here?");
3210 
3211     __ align(CodeEntryAlignment);
3212     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3213 
3214     address start = __ pc();
3215 
3216     const Register crc   = c_rarg0;  // crc
3217     const Register buf   = c_rarg1;  // source java byte array address
3218     const Register len   = c_rarg2;  // length
3219     const Register table0 = c_rarg3; // crc_table address
3220     const Register table1 = c_rarg4;
3221     const Register table2 = c_rarg5;
3222     const Register table3 = c_rarg6;
3223     const Register tmp3 = c_rarg7;
3224 
3225     BLOCK_COMMENT("Entry:");
3226     __ enter(); // required for proper stackwalking of RuntimeStub frame
3227 
3228     __ kernel_crc32c(crc, buf, len,
3229               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3230 
3231     __ leave(); // required for proper stackwalking of RuntimeStub frame
3232     __ ret(lr);
3233 
3234     return start;
3235   }
3236 
3237   /***
3238    *  Arguments:
3239    *
3240    *  Inputs:
3241    *   c_rarg0   - int   adler
3242    *   c_rarg1   - byte* buff
3243    *   c_rarg2   - int   len
3244    *
3245    * Output:
3246    *   c_rarg0   - int adler result
3247    */
3248   address generate_updateBytesAdler32() {
3249     __ align(CodeEntryAlignment);
3250     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3251     address start = __ pc();
3252 
3253     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3254 
3255     // Aliases
3256     Register adler  = c_rarg0;
3257     Register s1     = c_rarg0;
3258     Register s2     = c_rarg3;
3259     Register buff   = c_rarg1;
3260     Register len    = c_rarg2;
3261     Register nmax  = r4;
3262     Register base  = r5;
3263     Register count = r6;
3264     Register temp0 = rscratch1;
3265     Register temp1 = rscratch2;
3266     FloatRegister vbytes = v0;
3267     FloatRegister vs1acc = v1;
3268     FloatRegister vs2acc = v2;
3269     FloatRegister vtable = v3;
3270 
3271     // Max number of bytes we can process before having to take the mod
3272     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3273     unsigned long BASE = 0xfff1;
3274     unsigned long NMAX = 0x15B0;
3275 
3276     __ mov(base, BASE);
3277     __ mov(nmax, NMAX);
3278 
3279     // Load accumulation coefficients for the upper 16 bits
3280     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3281     __ ld1(vtable, __ T16B, Address(temp0));
3282 
3283     // s1 is initialized to the lower 16 bits of adler
3284     // s2 is initialized to the upper 16 bits of adler
3285     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3286     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3287 
3288     // The pipelined loop needs at least 16 elements per iteration.
3289     // It checks this itself, but for short inputs it is more effective to skip straight to the cleanup loop.
3290     __ cmp(len, (u1)16);
3291     __ br(Assembler::HS, L_nmax);
3292     __ cbz(len, L_combine);
3293 
3294     __ bind(L_simple_by1_loop);
3295     __ ldrb(temp0, Address(__ post(buff, 1)));
3296     __ add(s1, s1, temp0);
3297     __ add(s2, s2, s1);
3298     __ subs(len, len, 1);
3299     __ br(Assembler::HI, L_simple_by1_loop);
3300 
3301     // s1 = s1 % BASE
3302     __ subs(temp0, s1, base);
3303     __ csel(s1, temp0, s1, Assembler::HS);
3304 
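         // The reduction below avoids a division: since 2^16 == 15 (mod BASE),
         // x % BASE can be computed by folding (illustrative scalar form):
         //   x = (x >> 16) * 15 + (x & 0xffff);   // brings x below 2 * BASE here
         //   if (x >= BASE) x -= BASE;            // conditional subtract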
3305     // s2 = s2 % BASE
3306     __ lsr(temp0, s2, 16);
3307     __ lsl(temp1, temp0, 4);
3308     __ sub(temp1, temp1, temp0);
3309     __ add(s2, temp1, s2, ext::uxth);
3310 
3311     __ subs(temp0, s2, base);
3312     __ csel(s2, temp0, s2, Assembler::HS);
3313 
3314     __ b(L_combine);
3315 
3316     __ bind(L_nmax);
3317     __ subs(len, len, nmax);
3318     __ sub(count, nmax, 16);
3319     __ br(Assembler::LO, L_by16);
3320 
3321     __ bind(L_nmax_loop);
3322 
3323     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3324                                       vbytes, vs1acc, vs2acc, vtable);
3325 
3326     __ subs(count, count, 16);
3327     __ br(Assembler::HS, L_nmax_loop);
3328 
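         // After an NMAX-byte chunk s1 and s2 can be far above 2^16, so the
         // 2^16 == 15 (mod BASE) folding step is applied twice before the final
         // conditional subtract of BASE.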
3329     // s1 = s1 % BASE
3330     __ lsr(temp0, s1, 16);
3331     __ lsl(temp1, temp0, 4);
3332     __ sub(temp1, temp1, temp0);
3333     __ add(temp1, temp1, s1, ext::uxth);
3334 
3335     __ lsr(temp0, temp1, 16);
3336     __ lsl(s1, temp0, 4);
3337     __ sub(s1, s1, temp0);
3338     __ add(s1, s1, temp1, ext::uxth);
3339 
3340     __ subs(temp0, s1, base);
3341     __ csel(s1, temp0, s1, Assembler::HS);
3342 
3343     // s2 = s2 % BASE
3344     __ lsr(temp0, s2, 16);
3345     __ lsl(temp1, temp0, 4);
3346     __ sub(temp1, temp1, temp0);
3347     __ add(temp1, temp1, s2, ext::uxth);
3348 
3349     __ lsr(temp0, temp1, 16);
3350     __ lsl(s2, temp0, 4);
3351     __ sub(s2, s2, temp0);
3352     __ add(s2, s2, temp1, ext::uxth);
3353 
3354     __ subs(temp0, s2, base);
3355     __ csel(s2, temp0, s2, Assembler::HS);
3356 
3357     __ subs(len, len, nmax);
3358     __ sub(count, nmax, 16);
3359     __ br(Assembler::HS, L_nmax_loop);
3360 
3361     __ bind(L_by16);
3362     __ adds(len, len, count);
3363     __ br(Assembler::LO, L_by1);
3364 
3365     __ bind(L_by16_loop);
3366 
3367     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3368                                       vbytes, vs1acc, vs2acc, vtable);
3369 
3370     __ subs(len, len, 16);
3371     __ br(Assembler::HS, L_by16_loop);
3372 
3373     __ bind(L_by1);
3374     __ adds(len, len, 15);
3375     __ br(Assembler::LO, L_do_mod);
3376 
3377     __ bind(L_by1_loop);
3378     __ ldrb(temp0, Address(__ post(buff, 1)));
3379     __ add(s1, temp0, s1);
3380     __ add(s2, s2, s1);
3381     __ subs(len, len, 1);
3382     __ br(Assembler::HS, L_by1_loop);
3383 
3384     __ bind(L_do_mod);
3385     // s1 = s1 % BASE
3386     __ lsr(temp0, s1, 16);
3387     __ lsl(temp1, temp0, 4);
3388     __ sub(temp1, temp1, temp0);
3389     __ add(temp1, temp1, s1, ext::uxth);
3390 
3391     __ lsr(temp0, temp1, 16);
3392     __ lsl(s1, temp0, 4);
3393     __ sub(s1, s1, temp0);
3394     __ add(s1, s1, temp1, ext::uxth);
3395 
3396     __ subs(temp0, s1, base);
3397     __ csel(s1, temp0, s1, Assembler::HS);
3398 
3399     // s2 = s2 % BASE
3400     __ lsr(temp0, s2, 16);
3401     __ lsl(temp1, temp0, 4);
3402     __ sub(temp1, temp1, temp0);
3403     __ add(temp1, temp1, s2, ext::uxth);
3404 
3405     __ lsr(temp0, temp1, 16);
3406     __ lsl(s2, temp0, 4);
3407     __ sub(s2, s2, temp0);
3408     __ add(s2, s2, temp1, ext::uxth);
3409 
3410     __ subs(temp0, s2, base);
3411     __ csel(s2, temp0, s2, Assembler::HS);
3412 
3413     // Combine lower bits and higher bits
3414     __ bind(L_combine);
3415     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3416 
3417     __ ret(lr);
3418 
3419     return start;
3420   }
3421 
3422   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
3423           Register temp0, Register temp1, FloatRegister vbytes,
3424           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
3425     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
3426     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
3427     // In non-vectorized code, we update s1 and s2 as:
3428     //   s1 <- s1 + b1
3429     //   s2 <- s2 + s1
3430     //   s1 <- s1 + b2
3431     //   s2 <- s2 + s1
3432     //   ...
3433     //   s1 <- s1 + b16
3434     //   s2 <- s2 + s1
3435     // Putting above assignments together, we have:
3436     //   s1_new = s1 + b1 + b2 + ... + b16
3437     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
3438     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
3439     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
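         // Equivalent scalar form of one 16-byte step (illustrative only):
         //   s2 += 16 * s1;
         //   for (int i = 0; i < 16; i++) { s1 += b[i]; s2 += (16 - i) * b[i]; }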
3440     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
3441 
3442     // s2 = s2 + s1 * 16
3443     __ add(s2, s2, s1, Assembler::LSL, 4);
3444 
3445     // vs1acc = b1 + b2 + b3 + ... + b16
3446     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
3447     __ umullv(vs2acc, __ T8B, vtable, vbytes);
3448     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
3449     __ uaddlv(vs1acc, __ T16B, vbytes);
3450     __ uaddlv(vs2acc, __ T8H, vs2acc);
3451 
3452     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
3453     __ fmovd(temp0, vs1acc);
3454     __ fmovd(temp1, vs2acc);
3455     __ add(s1, s1, temp0);
3456     __ add(s2, s2, temp1);
3457   }
3458 
3459   /**
3460    *  Arguments:
3461    *
3462    *  Input:
3463    *    c_rarg0   - x address
3464    *    c_rarg1   - x length
3465    *    c_rarg2   - y address
3466    *    c_rarg3   - y length
3467    *    c_rarg4   - z address
3468    *    c_rarg5   - z length
3469    */
3470   address generate_multiplyToLen() {
3471     __ align(CodeEntryAlignment);
3472     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3473 
3474     address start = __ pc();
3475     const Register x     = r0;
3476     const Register xlen  = r1;
3477     const Register y     = r2;
3478     const Register ylen  = r3;
3479     const Register z     = r4;
3480     const Register zlen  = r5;
3481 
3482     const Register tmp1  = r10;
3483     const Register tmp2  = r11;
3484     const Register tmp3  = r12;
3485     const Register tmp4  = r13;
3486     const Register tmp5  = r14;
3487     const Register tmp6  = r15;
3488     const Register tmp7  = r16;
3489 
3490     BLOCK_COMMENT("Entry:");
3491     __ enter(); // required for proper stackwalking of RuntimeStub frame
3492     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3493     __ leave(); // required for proper stackwalking of RuntimeStub frame
3494     __ ret(lr);
3495 
3496     return start;
3497   }
3498 
3499   address generate_squareToLen() {
3500     // The squareToLen algorithm for sizes 1..127, as implemented in the Java code,
3501     // runs faster than multiply_to_len on some CPUs and slower on others, but
3502     // multiply_to_len shows slightly better results overall, so it is used here.
3503     __ align(CodeEntryAlignment);
3504     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3505     address start = __ pc();
3506 
3507     const Register x     = r0;
3508     const Register xlen  = r1;
3509     const Register z     = r2;
3510     const Register zlen  = r3;
3511     const Register y     = r4; // == x
3512     const Register ylen  = r5; // == xlen
3513 
3514     const Register tmp1  = r10;
3515     const Register tmp2  = r11;
3516     const Register tmp3  = r12;
3517     const Register tmp4  = r13;
3518     const Register tmp5  = r14;
3519     const Register tmp6  = r15;
3520     const Register tmp7  = r16;
3521 
3522     RegSet spilled_regs = RegSet::of(y, ylen);
3523     BLOCK_COMMENT("Entry:");
3524     __ enter();
3525     __ push(spilled_regs, sp);
3526     __ mov(y, x);
3527     __ mov(ylen, xlen);
3528     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3529     __ pop(spilled_regs, sp);
3530     __ leave();
3531     __ ret(lr);
3532     return start;
3533   }
3534 
3535   address generate_mulAdd() {
3536     __ align(CodeEntryAlignment);
3537     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3538 
3539     address start = __ pc();
3540 
3541     const Register out     = r0;
3542     const Register in      = r1;
3543     const Register offset  = r2;
3544     const Register len     = r3;
3545     const Register k       = r4;
3546 
3547     BLOCK_COMMENT("Entry:");
3548     __ enter();
3549     __ mul_add(out, in, offset, len, k);
3550     __ leave();
3551     __ ret(lr);
3552 
3553     return start;
3554   }
3555 
3556   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3557                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3558                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3559     // Karatsuba multiplication performs a 128*128 -> 256-bit
3560     // multiplication in three 128-bit multiplications and a few
3561     // additions.
3562     //
3563     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3564     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3565     //
3566     // Inputs:
3567     //
3568     // A0 in a.d[0]     (subkey)
3569     // A1 in a.d[1]
3570     // (A1+A0) in a1_xor_a0.d[0]
3571     //
3572     // B0 in b.d[0]     (state)
3573     // B1 in b.d[1]
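         //
         // The cross term A0*B1 + A1*B0 equals E + C + D in carry-less arithmetic
         // (every "+" is an XOR), so the middle 128 bits of the product are built
         // below as tmp2 = E ^ (C ^ D) ^ ext(lo, hi), where the ext term aligns
         // C0 and D1 into the word positions they overlap.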
3574 
3575     __ ext(tmp1, __ T16B, b, b, 0x08);
3576     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3577     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3578     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3579     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3580 
3581     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3582     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3583     __ eor(tmp2, __ T16B, tmp2, tmp4);
3584     __ eor(tmp2, __ T16B, tmp2, tmp3);
3585 
3586     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3587     __ ins(result_hi, __ D, tmp2, 0, 1);
3588     __ ins(result_lo, __ D, tmp2, 1, 0);
3589   }
3590 
3591   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3592                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3593     const FloatRegister t0 = result;
3594 
3595     // The GCM field polynomial f is z^128 + p(z), where p =
3596     // z^7+z^2+z+1.
3597     //
3598     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3599     //
3600     // so, given that the product we're reducing is
3601     //    a == lo + hi * z^128
3602     // substituting,
3603     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3604     //
3605     // we reduce by multiplying hi by p(z) and subtracting the result
3606     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3607     // bits we can do this with two 64-bit multiplications, lo*p and
3608     // hi*p.
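         //
         // Outline of the folding below (illustrative only): first hi.d[1]*p is
         // folded into hi.d[0] and lo, then the remaining hi.d[0]*p is folded
         // into lo, leaving the fully reduced 128-bit value in 'result'.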
3609 
3610     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3611     __ ext(t1, __ T16B, t0, z, 8);
3612     __ eor(hi, __ T16B, hi, t1);
3613     __ ext(t1, __ T16B, z, t0, 8);
3614     __ eor(lo, __ T16B, lo, t1);
3615     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3616     __ eor(result, __ T16B, lo, t0);
3617   }
3618 
3619   address generate_has_negatives(address &has_negatives_long) {
3620     const u1 large_loop_size = 64;
3621     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3622     int dcache_line = VM_Version::dcache_line_size();
3623 
3624     Register ary1 = r1, len = r2, result = r0;
3625 
3626     __ align(CodeEntryAlignment);
3627 
3628     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3629 
3630     address entry = __ pc();
3631 
3632     __ enter();
3633 
3634   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3635         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3636 
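       // The stub answers whether any byte in the array has its sign bit set
       // (i.e. is a negative Java byte). Conceptually (illustrative only):
       //   for each 8-byte chunk c: if (c & 0x8080808080808080) return true;
       // The code below differs only in how it batches loads and handles tails.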
3637   __ cmp(len, (u1)15);
3638   __ br(Assembler::GT, LEN_OVER_15);
3639   // Execution only falls into this code when the pointer is near the end of a
3640   // memory page and we have to avoid reading into the next page
3641   __ add(ary1, ary1, len);
3642   __ subs(len, len, 8);
3643   __ br(Assembler::GT, LEN_OVER_8);
3644   __ ldr(rscratch2, Address(ary1, -8));
3645   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3646   __ lsrv(rscratch2, rscratch2, rscratch1);
3647   __ tst(rscratch2, UPPER_BIT_MASK);
3648   __ cset(result, Assembler::NE);
3649   __ leave();
3650   __ ret(lr);
3651   __ bind(LEN_OVER_8);
3652   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3653   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3654   __ tst(rscratch2, UPPER_BIT_MASK);
3655   __ br(Assembler::NE, RET_TRUE_NO_POP);
3656   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3657   __ lsrv(rscratch1, rscratch1, rscratch2);
3658   __ tst(rscratch1, UPPER_BIT_MASK);
3659   __ cset(result, Assembler::NE);
3660   __ leave();
3661   __ ret(lr);
3662 
3663   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3664   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3665 
3666   has_negatives_long = __ pc(); // 2nd entry point
3667 
3668   __ enter();
3669 
3670   __ bind(LEN_OVER_15);
3671     __ push(spilled_regs, sp);
3672     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3673     __ cbz(rscratch2, ALIGNED);
3674     __ ldp(tmp6, tmp1, Address(ary1));
3675     __ mov(tmp5, 16);
3676     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3677     __ add(ary1, ary1, rscratch1);
3678     __ sub(len, len, rscratch1);
3679     __ orr(tmp6, tmp6, tmp1);
3680     __ tst(tmp6, UPPER_BIT_MASK);
3681     __ br(Assembler::NE, RET_TRUE);
3682 
3683   __ bind(ALIGNED);
3684     __ cmp(len, large_loop_size);
3685     __ br(Assembler::LT, CHECK_16);
3686     // Perform a 16-byte load in the pre-loop as an early return, to handle the
3687     // case where an already-aligned large array has negative values in its first
3688     // bytes; otherwise LARGE_LOOP would do up to 4 reads instead of 1 in the worst
3689     // case, which is slower. Arrays with negative bytes further ahead are barely
3690     // affected; in fact they get faster thanks to the early loads and the fewer
3691     // instructions and branches in LARGE_LOOP.
3692     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3693     __ sub(len, len, 16);
3694     __ orr(tmp6, tmp6, tmp1);
3695     __ tst(tmp6, UPPER_BIT_MASK);
3696     __ br(Assembler::NE, RET_TRUE);
3697     __ cmp(len, large_loop_size);
3698     __ br(Assembler::LT, CHECK_16);
3699 
3700     if (SoftwarePrefetchHintDistance >= 0
3701         && SoftwarePrefetchHintDistance >= dcache_line) {
3702       // initial prefetch
3703       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3704     }
3705   __ bind(LARGE_LOOP);
3706     if (SoftwarePrefetchHintDistance >= 0) {
3707       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3708     }
3709     // Issue the load instructions first, since that can save a few CPU/memory
3710     // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one
3711     // per ldp), it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...),
3712     // which saves 3 instructions and has fewer branches; the trade-off is that
3713     // early return is disabled, so all 64 bytes are loaded and checked every time.
3714     __ ldp(tmp2, tmp3, Address(ary1));
3715     __ ldp(tmp4, tmp5, Address(ary1, 16));
3716     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3717     __ ldp(tmp6, tmp1, Address(ary1, 48));
3718     __ add(ary1, ary1, large_loop_size);
3719     __ sub(len, len, large_loop_size);
3720     __ orr(tmp2, tmp2, tmp3);
3721     __ orr(tmp4, tmp4, tmp5);
3722     __ orr(rscratch1, rscratch1, rscratch2);
3723     __ orr(tmp6, tmp6, tmp1);
3724     __ orr(tmp2, tmp2, tmp4);
3725     __ orr(rscratch1, rscratch1, tmp6);
3726     __ orr(tmp2, tmp2, rscratch1);
3727     __ tst(tmp2, UPPER_BIT_MASK);
3728     __ br(Assembler::NE, RET_TRUE);
3729     __ cmp(len, large_loop_size);
3730     __ br(Assembler::GE, LARGE_LOOP);
3731 
3732   __ bind(CHECK_16); // small 16-byte load pre-loop
3733     __ cmp(len, (u1)16);
3734     __ br(Assembler::LT, POST_LOOP16);
3735 
3736   __ bind(LOOP16); // small 16-byte load loop
3737     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3738     __ sub(len, len, 16);
3739     __ orr(tmp2, tmp2, tmp3);
3740     __ tst(tmp2, UPPER_BIT_MASK);
3741     __ br(Assembler::NE, RET_TRUE);
3742     __ cmp(len, (u1)16);
3743     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3744 
3745   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3746     __ cmp(len, (u1)8);
3747     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3748     __ ldr(tmp3, Address(__ post(ary1, 8)));
3749     __ sub(len, len, 8);
3750     __ tst(tmp3, UPPER_BIT_MASK);
3751     __ br(Assembler::NE, RET_TRUE);
3752 
3753   __ bind(POST_LOOP16_LOAD_TAIL);
3754     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3755     __ ldr(tmp1, Address(ary1));
3756     __ mov(tmp2, 64);
3757     __ sub(tmp4, tmp2, len, __ LSL, 3);
3758     __ lslv(tmp1, tmp1, tmp4);
3759     __ tst(tmp1, UPPER_BIT_MASK);
3760     __ br(Assembler::NE, RET_TRUE);
3761     // Fallthrough
3762 
3763   __ bind(RET_FALSE);
3764     __ pop(spilled_regs, sp);
3765     __ leave();
3766     __ mov(result, zr);
3767     __ ret(lr);
3768 
3769   __ bind(RET_TRUE);
3770     __ pop(spilled_regs, sp);
3771   __ bind(RET_TRUE_NO_POP);
3772     __ leave();
3773     __ mov(result, 1);
3774     __ ret(lr);
3775 
3776   __ bind(DONE);
3777     __ pop(spilled_regs, sp);
3778     __ leave();
3779     __ ret(lr);
3780     return entry;
3781   }
3782 
3783   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3784         bool usePrefetch, Label &NOT_EQUAL) {
3785     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3786         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3787         tmp7 = r12, tmp8 = r13;
3788     Label LOOP;
3789 
3790     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3791     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3792     __ bind(LOOP);
3793     if (usePrefetch) {
3794       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3795       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3796     }
3797     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3798     __ eor(tmp1, tmp1, tmp2);
3799     __ eor(tmp3, tmp3, tmp4);
3800     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3801     __ orr(tmp1, tmp1, tmp3);
3802     __ cbnz(tmp1, NOT_EQUAL);
3803     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3804     __ eor(tmp5, tmp5, tmp6);
3805     __ eor(tmp7, tmp7, tmp8);
3806     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3807     __ orr(tmp5, tmp5, tmp7);
3808     __ cbnz(tmp5, NOT_EQUAL);
3809     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3810     __ eor(tmp1, tmp1, tmp2);
3811     __ eor(tmp3, tmp3, tmp4);
3812     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3813     __ orr(tmp1, tmp1, tmp3);
3814     __ cbnz(tmp1, NOT_EQUAL);
3815     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3816     __ eor(tmp5, tmp5, tmp6);
3817     __ sub(cnt1, cnt1, 8 * wordSize);
3818     __ eor(tmp7, tmp7, tmp8);
3819     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3820     // tmp6 is not used. MacroAssembler::subs is used here (rather than
3821     // cmp) because subs allows an unlimited range of immediate operands.
3822     __ subs(tmp6, cnt1, loopThreshold);
3823     __ orr(tmp5, tmp5, tmp7);
3824     __ cbnz(tmp5, NOT_EQUAL);
3825     __ br(__ GE, LOOP);
3826     // post-loop
3827     __ eor(tmp1, tmp1, tmp2);
3828     __ eor(tmp3, tmp3, tmp4);
3829     __ orr(tmp1, tmp1, tmp3);
3830     __ sub(cnt1, cnt1, 2 * wordSize);
3831     __ cbnz(tmp1, NOT_EQUAL);
3832   }
3833 
3834   void generate_large_array_equals_loop_simd(int loopThreshold,
3835         bool usePrefetch, Label &NOT_EQUAL) {
3836     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3837         tmp2 = rscratch2;
3838     Label LOOP;
3839 
3840     __ bind(LOOP);
3841     if (usePrefetch) {
3842       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3843       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3844     }
3845     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3846     __ sub(cnt1, cnt1, 8 * wordSize);
3847     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3848     __ subs(tmp1, cnt1, loopThreshold);
3849     __ eor(v0, __ T16B, v0, v4);
3850     __ eor(v1, __ T16B, v1, v5);
3851     __ eor(v2, __ T16B, v2, v6);
3852     __ eor(v3, __ T16B, v3, v7);
3853     __ orr(v0, __ T16B, v0, v1);
3854     __ orr(v1, __ T16B, v2, v3);
3855     __ orr(v0, __ T16B, v0, v1);
3856     __ umov(tmp1, v0, __ D, 0);
3857     __ umov(tmp2, v0, __ D, 1);
3858     __ orr(tmp1, tmp1, tmp2);
3859     __ cbnz(tmp1, NOT_EQUAL);
3860     __ br(__ GE, LOOP);
3861   }
3862 
3863   // a1 = r1 - array1 address
3864   // a2 = r2 - array2 address
3865   // result = r0 - return value. Already contains "false"
3866   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
3867   // r3-r5 are reserved temporary registers
3868   address generate_large_array_equals() {
3869     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3870         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3871         tmp7 = r12, tmp8 = r13;
3872     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3873         SMALL_LOOP, POST_LOOP;
3874     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3875     // loop exit threshold: ensures at least 32 of the prefetched bytes are actually used
3876     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3877     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3878     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3879     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3880         tmp5, tmp6, tmp7, tmp8);
3881 
3882     __ align(CodeEntryAlignment);
3883 
3884     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3885 
3886     address entry = __ pc();
3887     __ enter();
3888     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
3889     // also advance pointers to use post-increment instead of pre-increment
3890     __ add(a1, a1, wordSize);
3891     __ add(a2, a2, wordSize);
3892     if (AvoidUnalignedAccesses) {
3893       // Both implementations (SIMD and non-SIMD) use relatively wide load
3894       // instructions (ld1/ldp), which carry a large penalty (up to 2x execution
3895       // time) on some CPUs when the address is not at least 16-byte aligned.
3896       // Arrays are currently 8-byte aligned, so, if needed, we do an extra 8-byte
3897       // load at least for the 1st address to make it 16-byte aligned.
3898       Label ALIGNED16;
3899       __ tbz(a1, 3, ALIGNED16);
3900       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3901       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3902       __ sub(cnt1, cnt1, wordSize);
3903       __ eor(tmp1, tmp1, tmp2);
3904       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3905       __ bind(ALIGNED16);
3906     }
3907     if (UseSIMDForArrayEquals) {
3908       if (SoftwarePrefetchHintDistance >= 0) {
3909         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3910         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3911         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3912             /* prfm = */ true, NOT_EQUAL);
3913         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3914         __ br(__ LT, TAIL);
3915       }
3916       __ bind(NO_PREFETCH_LARGE_LOOP);
3917       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3918           /* prfm = */ false, NOT_EQUAL);
3919     } else {
3920       __ push(spilled_regs, sp);
3921       if (SoftwarePrefetchHintDistance >= 0) {
3922         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3923         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3924         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3925             /* prfm = */ true, NOT_EQUAL);
3926         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3927         __ br(__ LT, TAIL);
3928       }
3929       __ bind(NO_PREFETCH_LARGE_LOOP);
3930       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3931           /* prfm = */ false, NOT_EQUAL);
3932     }
3933     __ bind(TAIL);
3934       __ cbz(cnt1, EQUAL);
3935       __ subs(cnt1, cnt1, wordSize);
3936       __ br(__ LE, POST_LOOP);
3937     __ bind(SMALL_LOOP);
3938       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3939       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3940       __ subs(cnt1, cnt1, wordSize);
3941       __ eor(tmp1, tmp1, tmp2);
3942       __ cbnz(tmp1, NOT_EQUAL);
3943       __ br(__ GT, SMALL_LOOP);
3944     __ bind(POST_LOOP);
3945       __ ldr(tmp1, Address(a1, cnt1));
3946       __ ldr(tmp2, Address(a2, cnt1));
3947       __ eor(tmp1, tmp1, tmp2);
3948       __ cbnz(tmp1, NOT_EQUAL);
3949     __ bind(EQUAL);
3950       __ mov(result, true);
3951     __ bind(NOT_EQUAL);
3952       if (!UseSIMDForArrayEquals) {
3953         __ pop(spilled_regs, sp);
3954       }
3955     __ bind(NOT_EQUAL_NO_POP);
3956     __ leave();
3957     __ ret(lr);
3958     return entry;
3959   }
3960 
3961   address generate_dsin_dcos(bool isCos) {
3962     __ align(CodeEntryAlignment);
3963     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
3964     address start = __ pc();
3965     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
3966         (address)StubRoutines::aarch64::_two_over_pi,
3967         (address)StubRoutines::aarch64::_pio2,
3968         (address)StubRoutines::aarch64::_dsin_coef,
3969         (address)StubRoutines::aarch64::_dcos_coef);
3970     return start;
3971   }
3972 
3973   address generate_dlog() {
3974     __ align(CodeEntryAlignment);
3975     StubCodeMark mark(this, "StubRoutines", "dlog");
3976     address entry = __ pc();
3977     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
3978         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
3979     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
3980     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
3981         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
3982     return entry;
3983   }
3984 
3985   // code for comparing 16 bytes of strings with same encoding
3986   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
3987     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
3988     __ ldr(rscratch1, Address(__ post(str1, 8)));
3989     __ eor(rscratch2, tmp1, tmp2);
3990     __ ldr(cnt1, Address(__ post(str2, 8)));
3991     __ cbnz(rscratch2, DIFF1);
3992     __ ldr(tmp1, Address(__ post(str1, 8)));
3993     __ eor(rscratch2, rscratch1, cnt1);
3994     __ ldr(tmp2, Address(__ post(str2, 8)));
3995     __ cbnz(rscratch2, DIFF2);
3996   }
3997 
3998   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
3999   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4000       Label &DIFF2) {
4001     Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12;
4002     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
4003 
4004     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4005     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4006     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4007     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4008 
4009     __ fmovd(tmpL, vtmp3);
4010     __ eor(rscratch2, tmp3, tmpL);
4011     __ cbnz(rscratch2, DIFF2);
4012 
4013     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4014     __ umov(tmpL, vtmp3, __ D, 1);
4015     __ eor(rscratch2, tmpU, tmpL);
4016     __ cbnz(rscratch2, DIFF1);
4017 
4018     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4019     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4020     __ fmovd(tmpL, vtmp);
4021     __ eor(rscratch2, tmp3, tmpL);
4022     __ cbnz(rscratch2, DIFF2);
4023 
4024     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4025     __ umov(tmpL, vtmp, __ D, 1);
4026     __ eor(rscratch2, tmpU, tmpL);
4027     __ cbnz(rscratch2, DIFF1);
4028   }
4029 
4030   // r0  = result
4031   // r1  = str1
4032   // r2  = cnt1
4033   // r3  = str2
4034   // r4  = cnt2
4035   // r10 = tmp1
4036   // r11 = tmp2
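       // isLU means str1 is Latin1 and str2 is UTF-16 (isUL is the reverse).
       // Latin1 bytes are widened to 16-bit characters by interleaving them with
       // zeroes (zip1/zip2 against the zero vector vtmpZ) before comparison, e.g.
       // bytes {a, b, c, d, ...} become chars {a, 0, b, 0, c, 0, d, 0, ...} in the
       // little-endian lane layout.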
4037   address generate_compare_long_string_different_encoding(bool isLU) {
4038     __ align(CodeEntryAlignment);
4039     StubCodeMark mark(this, "StubRoutines", isLU
4040         ? "compare_long_string_different_encoding LU"
4041         : "compare_long_string_different_encoding UL");
4042     address entry = __ pc();
4043     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4044         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, SMALL_LOOP_ENTER,
4045         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4046     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4047         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4048     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4049     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4050 
4051     int prefetchLoopExitCondition = MAX(32, SoftwarePrefetchHintDistance/2);
4052 
4053     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4054     // cnt2 == number of characters left to compare
4055     // Check the first 4 symbols already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
4056     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4057     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4058     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4059     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4060     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4061     __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1);
4062     __ eor(rscratch2, tmp1, tmp2);
4063     __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0);
4064     __ mov(rscratch1, tmp2);
4065     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4066     Register strU = isLU ? str2 : str1,
4067              strL = isLU ? str1 : str2,
4068              tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4069              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4070     __ push(spilled_regs, sp);
4071     __ sub(tmp2, strL, cnt2); // strL pointer to load from
4072     __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from
4073 
4074     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4075 
4076     if (SoftwarePrefetchHintDistance >= 0) {
4077       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4078       __ br(__ LT, SMALL_LOOP);
4079       __ bind(LARGE_LOOP_PREFETCH);
4080         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4081         __ mov(tmp4, 2);
4082         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4083         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4084           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4085           __ subs(tmp4, tmp4, 1);
4086           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4087           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4088           __ mov(tmp4, 2);
4089         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4090           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4091           __ subs(tmp4, tmp4, 1);
4092           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4093           __ sub(cnt2, cnt2, 64);
4094           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4095           __ br(__ GE, LARGE_LOOP_PREFETCH);
4096     }
4097     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4098     __ subs(cnt2, cnt2, 16);
4099     __ br(__ LT, TAIL);
4100     __ b(SMALL_LOOP_ENTER);
4101     __ bind(SMALL_LOOP); // smaller loop
4102       __ subs(cnt2, cnt2, 16);
4103     __ bind(SMALL_LOOP_ENTER);
4104       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4105       __ br(__ GE, SMALL_LOOP);
4106       __ cbz(cnt2, LOAD_LAST);
4107     __ bind(TAIL); // 1..15 characters left
4108       __ subs(zr, cnt2, -8);
4109       __ br(__ GT, TAIL_LOAD_16);
4110       __ ldrd(vtmp, Address(tmp2));
4111       __ zip1(vtmp3, __ T8B, vtmp, vtmpZ);
4112 
4113       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4114       __ fmovd(tmpL, vtmp3);
4115       __ eor(rscratch2, tmp3, tmpL);
4116       __ cbnz(rscratch2, DIFF2);
4117       __ umov(tmpL, vtmp3, __ D, 1);
4118       __ eor(rscratch2, tmpU, tmpL);
4119       __ cbnz(rscratch2, DIFF1);
4120       __ b(LOAD_LAST);
4121     __ bind(TAIL_LOAD_16);
4122       __ ldrq(vtmp, Address(tmp2));
4123       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4124       __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4125       __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4126       __ fmovd(tmpL, vtmp3);
4127       __ eor(rscratch2, tmp3, tmpL);
4128       __ cbnz(rscratch2, DIFF2);
4129 
4130       __ ldr(tmp3, Address(__ post(cnt1, 8)));
4131       __ umov(tmpL, vtmp3, __ D, 1);
4132       __ eor(rscratch2, tmpU, tmpL);
4133       __ cbnz(rscratch2, DIFF1);
4134 
4135       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4136       __ fmovd(tmpL, vtmp);
4137       __ eor(rscratch2, tmp3, tmpL);
4138       __ cbnz(rscratch2, DIFF2);
4139 
4140       __ umov(tmpL, vtmp, __ D, 1);
4141       __ eor(rscratch2, tmpU, tmpL);
4142       __ cbnz(rscratch2, DIFF1);
4143       __ b(LOAD_LAST);
4144     __ bind(DIFF2);
4145       __ mov(tmpU, tmp3);
4146     __ bind(DIFF1);
4147       __ pop(spilled_regs, sp);
4148       __ b(CALCULATE_DIFFERENCE);
4149     __ bind(LOAD_LAST);
4150       __ pop(spilled_regs, sp);
4151 
4152       __ ldrs(vtmp, Address(strL));
4153       __ ldr(tmpU, Address(strU));
4154       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4155       __ fmovd(tmpL, vtmp);
4156 
4157       __ eor(rscratch2, tmpU, tmpL);
4158       __ cbz(rscratch2, DONE);
4159 
4160     // Find the first different characters in the longwords and
4161     // compute their difference.
4162     __ bind(CALCULATE_DIFFERENCE);
4163       __ rev(rscratch2, rscratch2);
4164       __ clz(rscratch2, rscratch2);
4165       __ andr(rscratch2, rscratch2, -16);
4166       __ lsrv(tmp1, tmp1, rscratch2);
4167       __ uxthw(tmp1, tmp1);
4168       __ lsrv(rscratch1, rscratch1, rscratch2);
4169       __ uxthw(rscratch1, rscratch1);
4170       __ subw(result, tmp1, rscratch1);
4171     __ bind(DONE);
4172       __ ret(lr);
4173     return entry;
4174   }
4175 
4176   // r0  = result
4177   // r1  = str1
4178   // r2  = cnt1
4179   // r3  = str2
4180   // r4  = cnt2
4181   // r10 = tmp1
4182   // r11 = tmp2
4183   address generate_compare_long_string_same_encoding(bool isLL) {
4184     __ align(CodeEntryAlignment);
4185     StubCodeMark mark(this, "StubRoutines", isLL
4186         ? "compare_long_string_same_encoding LL"
4187         : "compare_long_string_same_encoding UU");
4188     address entry = __ pc();
4189     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4190         tmp1 = r10, tmp2 = r11;
4191     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4192         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4193         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4194     // exit the large loop when fewer than 64 bytes are left to read or we're
4195     // about to prefetch memory beyond the array bounds
4196     int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4197     // cnt1/cnt2 contain the number of characters to compare; cnt1 can be re-used
4198     // update the cnt2 counter to account for the 8 bytes already loaded
4199     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4200     // update pointers, because of previous read
4201     __ add(str1, str1, wordSize);
4202     __ add(str2, str2, wordSize);
4203     if (SoftwarePrefetchHintDistance >= 0) {
4204       __ bind(LARGE_LOOP_PREFETCH);
4205         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4206         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4207         compare_string_16_bytes_same(DIFF, DIFF2);
4208         compare_string_16_bytes_same(DIFF, DIFF2);
4209         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4210         compare_string_16_bytes_same(DIFF, DIFF2);
4211         __ subs(rscratch2, cnt2, largeLoopExitCondition);
4212         compare_string_16_bytes_same(DIFF, DIFF2);
4213         __ br(__ GT, LARGE_LOOP_PREFETCH);
4214         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4215         // less than 16 bytes left?
4216         __ subs(cnt2, cnt2, isLL ? 16 : 8);
4217         __ br(__ LT, TAIL);
4218     }
4219     __ bind(SMALL_LOOP);
4220       compare_string_16_bytes_same(DIFF, DIFF2);
4221       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4222       __ br(__ GE, SMALL_LOOP);
4223     __ bind(TAIL);
4224       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4225       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4226       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4227       __ br(__ LE, CHECK_LAST);
4228       __ eor(rscratch2, tmp1, tmp2);
4229       __ cbnz(rscratch2, DIFF);
4230       __ ldr(tmp1, Address(__ post(str1, 8)));
4231       __ ldr(tmp2, Address(__ post(str2, 8)));
4232       __ sub(cnt2, cnt2, isLL ? 8 : 4);
4233     __ bind(CHECK_LAST);
4234       if (!isLL) {
4235         __ add(cnt2, cnt2, cnt2); // now in bytes
4236       }
4237       __ eor(rscratch2, tmp1, tmp2);
4238       __ cbnz(rscratch2, DIFF);
4239       __ ldr(rscratch1, Address(str1, cnt2));
4240       __ ldr(cnt1, Address(str2, cnt2));
4241       __ eor(rscratch2, rscratch1, cnt1);
4242       __ cbz(rscratch2, LENGTH_DIFF);
4243       // Find the first different characters in the longwords and
4244       // compute their difference.
4245     __ bind(DIFF2);
4246       __ rev(rscratch2, rscratch2);
4247       __ clz(rscratch2, rscratch2);
4248       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4249       __ lsrv(rscratch1, rscratch1, rscratch2);
4250       if (isLL) {
4251         __ lsrv(cnt1, cnt1, rscratch2);
4252         __ uxtbw(rscratch1, rscratch1);
4253         __ uxtbw(cnt1, cnt1);
4254       } else {
4255         __ lsrv(cnt1, cnt1, rscratch2);
4256         __ uxthw(rscratch1, rscratch1);
4257         __ uxthw(cnt1, cnt1);
4258       }
4259       __ subw(result, rscratch1, cnt1);
4260       __ b(LENGTH_DIFF);
4261     __ bind(DIFF);
4262       __ rev(rscratch2, rscratch2);
4263       __ clz(rscratch2, rscratch2);
4264       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4265       __ lsrv(tmp1, tmp1, rscratch2);
4266       if (isLL) {
4267         __ lsrv(tmp2, tmp2, rscratch2);
4268         __ uxtbw(tmp1, tmp1);
4269         __ uxtbw(tmp2, tmp2);
4270       } else {
4271         __ lsrv(tmp2, tmp2, rscratch2);
4272         __ uxthw(tmp1, tmp1);
4273         __ uxthw(tmp2, tmp2);
4274       }
4275       __ subw(result, tmp1, tmp2);
4276       __ b(LENGTH_DIFF);
4277     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4278       __ eor(rscratch2, tmp1, tmp2);
4279       __ cbnz(rscratch2, DIFF);
4280     __ bind(LENGTH_DIFF);
4281       __ ret(lr);
4282     return entry;
4283   }
4284 
4285   void generate_compare_long_strings() {
4286       StubRoutines::aarch64::_compare_long_string_LL
4287           = generate_compare_long_string_same_encoding(true);
4288       StubRoutines::aarch64::_compare_long_string_UU
4289           = generate_compare_long_string_same_encoding(false);
4290       StubRoutines::aarch64::_compare_long_string_LU
4291           = generate_compare_long_string_different_encoding(true);
4292       StubRoutines::aarch64::_compare_long_string_UL
4293           = generate_compare_long_string_different_encoding(false);
4294   }
4295 
4296   // R0 = result
4297   // R1 = str2
4298   // R2 = cnt1
4299   // R3 = str1
4300   // R4 = cnt2
  // This generic linear code uses a few additional ideas which make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since
  // length >= 8) in order to skip its initial load (helps on systems with a
  // single load pipeline)
  // 2) we can use a "fast" algorithm to find the first occurrence of a single
  // character with fewer branches (1 branch per loaded register instead of a
  // branch per symbol); this is where constants like 0x0101...01,
  // 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from (see the C
  // sketch below)
  // 3) after loading and analyzing the 1st register of the source string, it
  // can be reused to search for every occurrence of the 1st character, saving
  // a few loads compared with a simpler-but-slower implementation
  // 4) in order to avoid lots of push/pop operations, the code below heavily
  // re-uses/re-initializes/compresses register values, which makes the code
  // larger and a bit less readable; however, most of the extra operations are
  // issued during loads or branches, so the penalty is minimal
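  //
  // In C, approximately (a hedged sketch of the zero/character-detection trick
  // behind those constants; zero_bytes/has_char are illustrative names only,
  // not part of this stub):
  //
  //   // A byte of v is flagged (0x80 in the result) iff that byte is zero.
  //   uint64_t zero_bytes(uint64_t v) {
  //     return (v - 0x0101010101010101ULL) & ~(v | 0x7f7f7f7f7f7f7f7fULL);
  //   }
  //   // A byte of x equals c iff the corresponding byte of
  //   // x ^ (c * 0x0101...01) is zero; 0x0001...0001 and 0x7fff...7fff are
  //   // the 16-bit (UTF-16) analogues of these constants.
  //   uint64_t has_char(uint64_t x, uint8_t c) {
  //     return zero_bytes(x ^ (c * 0x0101010101010101ULL));
  //   }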
4315   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4316     const char* stubName = str1_isL
4317         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4318         : "indexof_linear_uu";
4319     __ align(CodeEntryAlignment);
4320     StubCodeMark mark(this, "StubRoutines", stubName);
4321     address entry = __ pc();
4322 
4323     int str1_chr_size = str1_isL ? 1 : 2;
4324     int str2_chr_size = str2_isL ? 1 : 2;
4325     int str1_chr_shift = str1_isL ? 0 : 1;
4326     int str2_chr_shift = str2_isL ? 0 : 1;
4327     bool isL = str1_isL && str2_isL;
    // parameters
4329     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4330     // temporary registers
4331     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4332     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4333     // redefinitions
4334     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4335 
4336     __ push(spilled_regs, sp);
4337     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
4338         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4339         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4340         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4341         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4342         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
4343     // Read whole register from str1. It is safe, because length >=8 here
4344     __ ldr(ch1, Address(str1));
4345     // Read whole register from str2. It is safe, because length >=8 here
4346     __ ldr(ch2, Address(str2));
4347     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4348     if (str1_isL != str2_isL) {
4349       __ eor(v0, __ T16B, v0, v0);
4350     }
4351     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4352     __ mul(first, first, tmp1);
4353     // check if we have less than 1 register to check
4354     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4355     if (str1_isL != str2_isL) {
4356       __ fmovd(v1, ch1);
4357     }
4358     __ br(__ LE, L_SMALL);
4359     __ eor(ch2, first, ch2);
4360     if (str1_isL != str2_isL) {
4361       __ zip1(v1, __ T16B, v1, v0);
4362     }
4363     __ sub(tmp2, ch2, tmp1);
4364     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4365     __ bics(tmp2, tmp2, ch2);
4366     if (str1_isL != str2_isL) {
4367       __ fmovd(ch1, v1);
4368     }
4369     __ br(__ NE, L_HAS_ZERO);
4370     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4371     __ add(result, result, wordSize/str2_chr_size);
4372     __ add(str2, str2, wordSize);
4373     __ br(__ LT, L_POST_LOOP);
4374     __ BIND(L_LOOP);
4375       __ ldr(ch2, Address(str2));
4376       __ eor(ch2, first, ch2);
4377       __ sub(tmp2, ch2, tmp1);
4378       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4379       __ bics(tmp2, tmp2, ch2);
4380       __ br(__ NE, L_HAS_ZERO);
4381     __ BIND(L_LOOP_PROCEED);
4382       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4383       __ add(str2, str2, wordSize);
4384       __ add(result, result, wordSize/str2_chr_size);
4385       __ br(__ GE, L_LOOP);
4386     __ BIND(L_POST_LOOP);
4387       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
4388       __ br(__ LE, NOMATCH);
4389       __ ldr(ch2, Address(str2));
4390       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4391       __ eor(ch2, first, ch2);
4392       __ sub(tmp2, ch2, tmp1);
4393       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4394       __ mov(tmp4, -1); // all bits set
4395       __ b(L_SMALL_PROCEED);
4396     __ align(OptoLoopAlignment);
4397     __ BIND(L_SMALL);
4398       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4399       __ eor(ch2, first, ch2);
4400       if (str1_isL != str2_isL) {
4401         __ zip1(v1, __ T16B, v1, v0);
4402       }
4403       __ sub(tmp2, ch2, tmp1);
4404       __ mov(tmp4, -1); // all bits set
4405       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4406       if (str1_isL != str2_isL) {
4407         __ fmovd(ch1, v1); // move converted 4 symbols
4408       }
4409     __ BIND(L_SMALL_PROCEED);
4410       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
4411       __ bic(tmp2, tmp2, ch2);
4412       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
4413       __ rbit(tmp2, tmp2);
4414       __ br(__ EQ, NOMATCH);
4415     __ BIND(L_SMALL_HAS_ZERO_LOOP);
4416       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
4417       __ cmp(cnt1, u1(wordSize/str2_chr_size));
4418       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4419       if (str2_isL) { // LL
4420         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4421         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4422         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4423         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4424         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4425       } else {
4426         __ mov(ch2, 0xE); // all bits in byte set except last one
4427         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4428         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4429         __ lslv(tmp2, tmp2, tmp4);
4430         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4431         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4432         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4433         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4434       }
4435       __ cmp(ch1, ch2);
4436       __ mov(tmp4, wordSize/str2_chr_size);
4437       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4438     __ BIND(L_SMALL_CMP_LOOP);
4439       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4440                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4441       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4442                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4443       __ add(tmp4, tmp4, 1);
4444       __ cmp(tmp4, cnt1);
4445       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4446       __ cmp(first, ch2);
4447       __ br(__ EQ, L_SMALL_CMP_LOOP);
4448     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4449       __ cbz(tmp2, NOMATCH); // no more matches. exit
4450       __ clz(tmp4, tmp2);
4451       __ add(result, result, 1); // advance index
4452       __ add(str2, str2, str2_chr_size); // advance pointer
4453       __ b(L_SMALL_HAS_ZERO_LOOP);
4454     __ align(OptoLoopAlignment);
4455     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4456       __ cmp(first, ch2);
4457       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4458       __ b(DONE);
4459     __ align(OptoLoopAlignment);
4460     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4461       if (str2_isL) { // LL
4462         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4463         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4464         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4465         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4466         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4467       } else {
4468         __ mov(ch2, 0xE); // all bits in byte set except last one
4469         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4470         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4471         __ lslv(tmp2, tmp2, tmp4);
4472         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4473         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4474         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4475         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4476       }
4477       __ cmp(ch1, ch2);
4478       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4479       __ b(DONE);
4480     __ align(OptoLoopAlignment);
4481     __ BIND(L_HAS_ZERO);
4482       __ rbit(tmp2, tmp2);
4483       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
      // Now, pack both counters (cnt2 and cnt1) into one register. This is
      // fine because both counters are 32-bit and are not changed in this
      // loop; just restore them on exit. So, cnt1 can be re-used in this loop.
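      // In C, approximately (a hedged sketch of this packing; note that
      // BitsPerByte * wordSize / 2 == 32):
      //   packed = cnt2 | ((uint64_t)cnt1 << 32);   // orr below
      //   ...
      //   cnt1 = packed >> 32;                      // lsr in L_HAS_ZERO_LOOP_NOMATCH
      //   cnt2 = (uint32_t)packed;                  // movw in L_HAS_ZERO_LOOP_NOMATCH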
4487       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
4488       __ sub(result, result, 1);
4489     __ BIND(L_HAS_ZERO_LOOP);
4490       __ mov(cnt1, wordSize/str2_chr_size);
4491       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4492       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4493       if (str2_isL) {
4494         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4495         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4496         __ lslv(tmp2, tmp2, tmp4);
4497         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4498         __ add(tmp4, tmp4, 1);
4499         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4500         __ lsl(tmp2, tmp2, 1);
4501         __ mov(tmp4, wordSize/str2_chr_size);
4502       } else {
4503         __ mov(ch2, 0xE);
4504         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4505         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4506         __ lslv(tmp2, tmp2, tmp4);
4507         __ add(tmp4, tmp4, 1);
4508         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4509         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4510         __ lsl(tmp2, tmp2, 1);
4511         __ mov(tmp4, wordSize/str2_chr_size);
4512         __ sub(str2, str2, str2_chr_size);
4513       }
4514       __ cmp(ch1, ch2);
4515       __ mov(tmp4, wordSize/str2_chr_size);
4516       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4517     __ BIND(L_CMP_LOOP);
4518       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4519                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4520       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4521                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4522       __ add(tmp4, tmp4, 1);
4523       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4524       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4525       __ cmp(cnt1, ch2);
4526       __ br(__ EQ, L_CMP_LOOP);
4527     __ BIND(L_CMP_LOOP_NOMATCH);
4528       // here we're not matched
4529       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4530       __ clz(tmp4, tmp2);
4531       __ add(str2, str2, str2_chr_size); // advance pointer
4532       __ b(L_HAS_ZERO_LOOP);
4533     __ align(OptoLoopAlignment);
4534     __ BIND(L_CMP_LOOP_LAST_CMP);
4535       __ cmp(cnt1, ch2);
4536       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4537       __ b(DONE);
4538     __ align(OptoLoopAlignment);
4539     __ BIND(L_CMP_LOOP_LAST_CMP2);
4540       if (str2_isL) {
4541         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4542         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4543         __ lslv(tmp2, tmp2, tmp4);
4544         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4545         __ add(tmp4, tmp4, 1);
4546         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4547         __ lsl(tmp2, tmp2, 1);
4548       } else {
4549         __ mov(ch2, 0xE);
4550         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4551         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4552         __ lslv(tmp2, tmp2, tmp4);
4553         __ add(tmp4, tmp4, 1);
4554         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4555         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4556         __ lsl(tmp2, tmp2, 1);
4557         __ sub(str2, str2, str2_chr_size);
4558       }
4559       __ cmp(ch1, ch2);
4560       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4561       __ b(DONE);
4562     __ align(OptoLoopAlignment);
4563     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. The index was a multiple of
      // wordSize/str2_chr_size until the L_HAS_ZERO block. The byte octet was
      // analyzed in L_HAS_ZERO_LOOP, so result was increased by at most
      // wordSize/str2_chr_size - 1 and the respective high bits weren't
      // changed. L_LOOP_PROCEED will increase result by the number of analyzed
      // characters, so we can just reset the lower bits of result here: clear
      // the 2 lower bits for UU/UL and the 3 lower bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (LL) or result & 3
      // (UU/UL) is the index of the last analyzed substring inside the current
      // octet, so str2 is at the respective start address and we need to
      // advance it to the next octet.
4574       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4575       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4576       __ bfm(result, zr, 0, 2 - str2_chr_shift);
4577       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4578       __ movw(cnt2, cnt2);
4579       __ b(L_LOOP_PROCEED);
4580     __ align(OptoLoopAlignment);
4581     __ BIND(NOMATCH);
4582       __ mov(result, -1);
4583     __ BIND(DONE);
4584       __ pop(spilled_regs, sp);
4585       __ ret(lr);
4586     return entry;
4587   }
4588 
4589   void generate_string_indexof_stubs() {
4590     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
4591     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
4592     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
4593   }
4594 
4595   void inflate_and_store_2_fp_registers(bool generatePrfm,
4596       FloatRegister src1, FloatRegister src2) {
4597     Register dst = r1;
4598     __ zip1(v1, __ T16B, src1, v0);
4599     __ zip2(v2, __ T16B, src1, v0);
4600     if (generatePrfm) {
4601       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4602     }
4603     __ zip1(v3, __ T16B, src2, v0);
4604     __ zip2(v4, __ T16B, src2, v0);
4605     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4606   }
4607 
4608   // R0 = src
4609   // R1 = dst
4610   // R2 = len
4611   // R3 = len >> 3
4612   // V0 = 0
4613   // v1 = loaded 8 bytes
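  //
  // In C, approximately (a hedged sketch of what "inflate" means here:
  // zero-extend each Latin-1 byte of src into a 16-bit char in dst):
  //   for (int i = 0; i < len; i++) {
  //     dst[i] = (jchar)(src[i] & 0xff);
  //   }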
4614   address generate_large_byte_array_inflate() {
4615     __ align(CodeEntryAlignment);
4616     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4617     address entry = __ pc();
4618     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4619     Register src = r0, dst = r1, len = r2, octetCounter = r3;
4620     const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
4621 
    // Do one more 8-byte read so the address is 16-byte aligned in most
    // cases; this also lets us use a single store instruction.
4624     __ ldrd(v2, __ post(src, 8));
4625     __ sub(octetCounter, octetCounter, 2);
4626     __ zip1(v1, __ T16B, v1, v0);
4627     __ zip1(v2, __ T16B, v2, v0);
4628     __ st1(v1, v2, __ T16B, __ post(dst, 32));
4629     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4630     __ subs(rscratch1, octetCounter, large_loop_threshold);
4631     __ br(__ LE, LOOP_START);
4632     __ b(LOOP_PRFM_START);
4633     __ bind(LOOP_PRFM);
4634       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4635     __ bind(LOOP_PRFM_START);
4636       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4637       __ sub(octetCounter, octetCounter, 8);
4638       __ subs(rscratch1, octetCounter, large_loop_threshold);
4639       inflate_and_store_2_fp_registers(true, v3, v4);
4640       inflate_and_store_2_fp_registers(true, v5, v6);
4641       __ br(__ GT, LOOP_PRFM);
4642       __ cmp(octetCounter, (u1)8);
4643       __ br(__ LT, DONE);
4644     __ bind(LOOP);
4645       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4646       __ bind(LOOP_START);
4647       __ sub(octetCounter, octetCounter, 8);
4648       __ cmp(octetCounter, (u1)8);
4649       inflate_and_store_2_fp_registers(false, v3, v4);
4650       inflate_and_store_2_fp_registers(false, v5, v6);
4651       __ br(__ GE, LOOP);
4652     __ bind(DONE);
4653       __ ret(lr);
4654     return entry;
4655   }
4656 
4657   /**
4658    *  Arguments:
4659    *
4660    *  Input:
4661    *  c_rarg0   - current state address
4662    *  c_rarg1   - H key address
4663    *  c_rarg2   - data address
4664    *  c_rarg3   - number of blocks
4665    *
4666    *  Output:
4667    *  Updated state at c_rarg0
4668    */
4669   address generate_ghash_processBlocks() {
4670     // Bafflingly, GCM uses little-endian for the byte order, but
4671     // big-endian for the bit order.  For example, the polynomial 1 is
4672     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4673     //
4674     // So, we must either reverse the bytes in each word and do
4675     // everything big-endian or reverse the bits in each byte and do
4676     // it little-endian.  On AArch64 it's more idiomatic to reverse
4677     // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order throughout the
4679     // calculation, bit-reversing the inputs and outputs.
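    //
    // In C, approximately (a hedged illustrative sketch, not part of the stub:
    // reversing the bits of each byte maps GCM's big-endian bit order onto the
    // little-endian bit order used below; e.g. the GCM encoding of the
    // polynomial 1, 0x80 in its first byte, becomes 0x01):
    //   uint8_t rbit8(uint8_t b) {
    //     b = (uint8_t)((b & 0xF0) >> 4 | (b & 0x0F) << 4);
    //     b = (uint8_t)((b & 0xCC) >> 2 | (b & 0x33) << 2);
    //     b = (uint8_t)((b & 0xAA) >> 1 | (b & 0x55) << 1);
    //     return b;
    //   }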
4680 
4681     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4682     __ align(wordSize * 2);
4683     address p = __ pc();
4684     __ emit_int64(0x87);  // The low-order bits of the field
4685                           // polynomial (i.e. p = z^7+z^2+z+1)
4686                           // repeated in the low and high parts of a
4687                           // 128-bit vector
4688     __ emit_int64(0x87);
4689 
4690     __ align(CodeEntryAlignment);
4691     address start = __ pc();
4692 
4693     Register state   = c_rarg0;
4694     Register subkeyH = c_rarg1;
4695     Register data    = c_rarg2;
4696     Register blocks  = c_rarg3;
4697 
4698     FloatRegister vzr = v30;
4699     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4700 
4701     __ ldrq(v0, Address(state));
4702     __ ldrq(v1, Address(subkeyH));
4703 
4704     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4705     __ rbit(v0, __ T16B, v0);
4706     __ rev64(v1, __ T16B, v1);
4707     __ rbit(v1, __ T16B, v1);
4708 
4709     __ ldrq(v26, p);
4710 
4711     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4712     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4713 
4714     {
4715       Label L_ghash_loop;
4716       __ bind(L_ghash_loop);
4717 
4718       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4719                                                  // reversing each byte
4720       __ rbit(v2, __ T16B, v2);
4721       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4722 
4723       // Multiply state in v2 by subkey in v1
4724       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4725                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4726                      /*temps*/v6, v20, v18, v21);
4727       // Reduce v7:v5 by the field polynomial
4728       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4729 
4730       __ sub(blocks, blocks, 1);
4731       __ cbnz(blocks, L_ghash_loop);
4732     }
4733 
4734     // The bit-reversed result is at this point in v0
4735     __ rev64(v1, __ T16B, v0);
4736     __ rbit(v1, __ T16B, v1);
4737 
4738     __ st1(v1, __ T16B, state);
4739     __ ret(lr);
4740 
4741     return start;
4742   }
4743 
4744   // Continuation point for throwing of implicit exceptions that are
4745   // not handled in the current activation. Fabricates an exception
4746   // oop and initiates normal exception dispatching in this
4747   // frame. Since we need to preserve callee-saved values (currently
4748   // only for C2, but done for C1 as well) we need a callee-saved oop
4749   // map and therefore have to make these stubs into RuntimeStubs
4750   // rather than BufferBlobs.  If the compiler needs all registers to
4751   // be preserved between the fault point and the exception handler
4752   // then it must assume responsibility for that in
4753   // AbstractCompiler::continuation_for_implicit_null_exception or
4754   // continuation_for_implicit_division_by_zero_exception. All other
4755   // implicit exceptions (e.g., NullPointerException or
4756   // AbstractMethodError on entry) are either at call sites or
4757   // otherwise assume that stack unwinding will be initiated, so
  // caller-saved registers were assumed volatile in the compiler.
4759 
4760 #undef __
4761 #define __ masm->
4762 
4763   address generate_throw_exception(const char* name,
4764                                    address runtime_entry,
4765                                    Register arg1 = noreg,
4766                                    Register arg2 = noreg) {
4767     // Information about frame layout at time of blocking runtime call.
4768     // Note that we only have to preserve callee-saved registers since
4769     // the compilers are responsible for supplying a continuation point
4770     // if they expect all registers to be preserved.
4771     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4772     enum layout {
4773       rfp_off = 0,
4774       rfp_off2,
4775       return_off,
4776       return_off2,
4777       framesize // inclusive of return address
4778     };
4779 
4780     int insts_size = 512;
4781     int locs_size  = 64;
4782 
4783     CodeBuffer code(name, insts_size, locs_size);
4784     OopMapSet* oop_maps  = new OopMapSet();
4785     MacroAssembler* masm = new MacroAssembler(&code);
4786 
4787     address start = __ pc();
4788 
4789     // This is an inlined and slightly modified version of call_VM
4790     // which has the ability to fetch the return PC out of
4791     // thread-local storage and also sets up last_Java_sp slightly
4792     // differently than the real call_VM
4793 
4794     __ enter(); // Save FP and LR before call
4795 
4796     assert(is_even(framesize/2), "sp not 16-byte aligned");
4797 
4798     // lr and fp are already in place
4799     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4800 
4801     int frame_complete = __ pc() - start;
4802 
4803     // Set up last_Java_sp and last_Java_fp
4804     address the_pc = __ pc();
4805     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
4806 
4807     // Call runtime
4808     if (arg1 != noreg) {
4809       assert(arg2 != c_rarg1, "clobbered");
4810       __ mov(c_rarg1, arg1);
4811     }
4812     if (arg2 != noreg) {
4813       __ mov(c_rarg2, arg2);
4814     }
4815     __ mov(c_rarg0, rthread);
4816     BLOCK_COMMENT("call runtime_entry");
4817     __ mov(rscratch1, runtime_entry);
4818     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
4819 
4820     // Generate oop map
4821     OopMap* map = new OopMap(framesize, 0);
4822 
4823     oop_maps->add_gc_map(the_pc - start, map);
4824 
4825     __ reset_last_Java_frame(true);
4826     __ maybe_isb();
4827 
4828     __ leave();
4829 
4830     // check for pending exceptions
4831 #ifdef ASSERT
4832     Label L;
4833     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4834     __ cbnz(rscratch1, L);
4835     __ should_not_reach_here();
4836     __ bind(L);
4837 #endif // ASSERT
4838     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4839 
4840 
4841     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4842     RuntimeStub* stub =
4843       RuntimeStub::new_runtime_stub(name,
4844                                     &code,
4845                                     frame_complete,
4846                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4847                                     oop_maps, false);
4848     return stub->entry_point();
4849   }
4850 
4851   class MontgomeryMultiplyGenerator : public MacroAssembler {
4852 
4853     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4854       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4855 
4856     RegSet _toSave;
4857     bool _squaring;
4858 
4859   public:
4860     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4861       : MacroAssembler(as->code()), _squaring(squaring) {
4862 
4863       // Register allocation
4864 
4865       Register reg = c_rarg0;
4866       Pa_base = reg;       // Argument registers
4867       if (squaring)
4868         Pb_base = Pa_base;
4869       else
4870         Pb_base = ++reg;
4871       Pn_base = ++reg;
4872       Rlen= ++reg;
4873       inv = ++reg;
4874       Pm_base = ++reg;
4875 
4876                           // Working registers:
4877       Ra =  ++reg;        // The current digit of a, b, n, and m.
4878       Rb =  ++reg;
4879       Rm =  ++reg;
4880       Rn =  ++reg;
4881 
4882       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4883       Pb =  ++reg;
4884       Pm =  ++reg;
4885       Pn =  ++reg;
4886 
4887       t0 =  ++reg;        // Three registers which form a
      t1 =  ++reg;        // triple-precision accumulator.
4889       t2 =  ++reg;
4890 
4891       Ri =  ++reg;        // Inner and outer loop indexes.
4892       Rj =  ++reg;
4893 
4894       Rhi_ab = ++reg;     // Product registers: low and high parts
4895       Rlo_ab = ++reg;     // of a*b and m*n.
4896       Rhi_mn = ++reg;
4897       Rlo_mn = ++reg;
4898 
4899       // r19 and up are callee-saved.
4900       _toSave = RegSet::range(r19, reg) + Pm_base;
4901     }
4902 
4903   private:
4904     void save_regs() {
4905       push(_toSave, sp);
4906     }
4907 
4908     void restore_regs() {
4909       pop(_toSave, sp);
4910     }
4911 
4912     template <typename T>
4913     void unroll_2(Register count, T block) {
4914       Label loop, end, odd;
4915       tbnz(count, 0, odd);
4916       cbz(count, end);
4917       align(16);
4918       bind(loop);
4919       (this->*block)();
4920       bind(odd);
4921       (this->*block)();
4922       subs(count, count, 2);
4923       br(Assembler::GT, loop);
4924       bind(end);
4925     }
4926 
4927     template <typename T>
4928     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4929       Label loop, end, odd;
4930       tbnz(count, 0, odd);
4931       cbz(count, end);
4932       align(16);
4933       bind(loop);
4934       (this->*block)(d, s, tmp);
4935       bind(odd);
4936       (this->*block)(d, s, tmp);
4937       subs(count, count, 2);
4938       br(Assembler::GT, loop);
4939       bind(end);
4940     }
4941 
4942     void pre1(RegisterOrConstant i) {
4943       block_comment("pre1");
4944       // Pa = Pa_base;
4945       // Pb = Pb_base + i;
4946       // Pm = Pm_base;
4947       // Pn = Pn_base + i;
4948       // Ra = *Pa;
4949       // Rb = *Pb;
4950       // Rm = *Pm;
4951       // Rn = *Pn;
4952       ldr(Ra, Address(Pa_base));
4953       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4954       ldr(Rm, Address(Pm_base));
4955       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4956       lea(Pa, Address(Pa_base));
4957       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4958       lea(Pm, Address(Pm_base));
4959       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4960 
4961       // Zero the m*n result.
4962       mov(Rhi_mn, zr);
4963       mov(Rlo_mn, zr);
4964     }
4965 
4966     // The core multiply-accumulate step of a Montgomery
4967     // multiplication.  The idea is to schedule operations as a
4968     // pipeline so that instructions with long latencies (loads and
4969     // multiplies) have time to complete before their results are
4970     // used.  This most benefits in-order implementations of the
4971     // architecture but out-of-order ones also benefit.
4972     void step() {
4973       block_comment("step");
4974       // MACC(Ra, Rb, t0, t1, t2);
4975       // Ra = *++Pa;
4976       // Rb = *--Pb;
4977       umulh(Rhi_ab, Ra, Rb);
4978       mul(Rlo_ab, Ra, Rb);
4979       ldr(Ra, pre(Pa, wordSize));
4980       ldr(Rb, pre(Pb, -wordSize));
4981       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
4982                                        // previous iteration.
4983       // MACC(Rm, Rn, t0, t1, t2);
4984       // Rm = *++Pm;
4985       // Rn = *--Pn;
4986       umulh(Rhi_mn, Rm, Rn);
4987       mul(Rlo_mn, Rm, Rn);
4988       ldr(Rm, pre(Pm, wordSize));
4989       ldr(Rn, pre(Pn, -wordSize));
4990       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4991     }
4992 
4993     void post1() {
4994       block_comment("post1");
4995 
4996       // MACC(Ra, Rb, t0, t1, t2);
4997       // Ra = *++Pa;
4998       // Rb = *--Pb;
4999       umulh(Rhi_ab, Ra, Rb);
5000       mul(Rlo_ab, Ra, Rb);
5001       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5002       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5003 
5004       // *Pm = Rm = t0 * inv;
5005       mul(Rm, t0, inv);
5006       str(Rm, Address(Pm));
5007 
5008       // MACC(Rm, Rn, t0, t1, t2);
5009       // t0 = t1; t1 = t2; t2 = 0;
5010       umulh(Rhi_mn, Rm, Rn);
5011 
5012 #ifndef PRODUCT
5013       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5014       {
5015         mul(Rlo_mn, Rm, Rn);
5016         add(Rlo_mn, t0, Rlo_mn);
5017         Label ok;
5018         cbz(Rlo_mn, ok); {
5019           stop("broken Montgomery multiply");
5020         } bind(ok);
5021       }
5022 #endif
5023       // We have very carefully set things up so that
5024       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5025       // the lower half of Rm * Rn because we know the result already:
5026       // it must be -t0.  t0 + (-t0) must generate a carry iff
5027       // t0 != 0.  So, rather than do a mul and an adds we just set
5028       // the carry flag iff t0 is nonzero.
5029       //
5030       // mul(Rlo_mn, Rm, Rn);
5031       // adds(zr, t0, Rlo_mn);
5032       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5033       adcs(t0, t1, Rhi_mn);
5034       adc(t1, t2, zr);
5035       mov(t2, zr);
5036     }
5037 
5038     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
5039       block_comment("pre2");
5040       // Pa = Pa_base + i-len;
5041       // Pb = Pb_base + len;
5042       // Pm = Pm_base + i-len;
5043       // Pn = Pn_base + len;
5044 
5045       if (i.is_register()) {
5046         sub(Rj, i.as_register(), len);
5047       } else {
5048         mov(Rj, i.as_constant());
5049         sub(Rj, Rj, len);
5050       }
5051       // Rj == i-len
5052 
5053       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5054       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5055       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5056       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5057 
5058       // Ra = *++Pa;
5059       // Rb = *--Pb;
5060       // Rm = *++Pm;
5061       // Rn = *--Pn;
5062       ldr(Ra, pre(Pa, wordSize));
5063       ldr(Rb, pre(Pb, -wordSize));
5064       ldr(Rm, pre(Pm, wordSize));
5065       ldr(Rn, pre(Pn, -wordSize));
5066 
5067       mov(Rhi_mn, zr);
5068       mov(Rlo_mn, zr);
5069     }
5070 
5071     void post2(RegisterOrConstant i, RegisterOrConstant len) {
5072       block_comment("post2");
5073       if (i.is_constant()) {
5074         mov(Rj, i.as_constant()-len.as_constant());
5075       } else {
5076         sub(Rj, i.as_register(), len);
5077       }
5078 
5079       adds(t0, t0, Rlo_mn); // The pending m*n, low part
5080 
5081       // As soon as we know the least significant digit of our result,
5082       // store it.
5083       // Pm_base[i-len] = t0;
5084       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5085 
5086       // t0 = t1; t1 = t2; t2 = 0;
5087       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5088       adc(t1, t2, zr);
5089       mov(t2, zr);
5090     }
5091 
5092     // A carry in t0 after Montgomery multiplication means that we
5093     // should subtract multiples of n from our result in m.  We'll
5094     // keep doing that until there is no carry.
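    //
    // In C, approximately (a hedged sketch of the sub() helper referenced in
    // the "while (t0) t0 = sub(...)" pseudocode: subtract the modulus n from m
    // across len words and fold the final borrow into t0; illustrative only):
    //   unsigned long sub(unsigned long Pm[], unsigned long Pn[],
    //                     unsigned long t0, int len) {
    //     unsigned long borrow = 0;
    //     for (int i = 0; i < len; i++) {
    //       unsigned long m = Pm[i], n = Pn[i];
    //       Pm[i] = m - n - borrow;
    //       borrow = (m < n) || (borrow && m == n);
    //     }
    //     return t0 - borrow;
    //   }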
5095     void normalize(RegisterOrConstant len) {
5096       block_comment("normalize");
5097       // while (t0)
5098       //   t0 = sub(Pm_base, Pn_base, t0, len);
5099       Label loop, post, again;
5100       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5101       cbz(t0, post); {
5102         bind(again); {
5103           mov(i, zr);
5104           mov(cnt, len);
5105           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5106           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5107           subs(zr, zr, zr); // set carry flag, i.e. no borrow
5108           align(16);
5109           bind(loop); {
5110             sbcs(Rm, Rm, Rn);
5111             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5112             add(i, i, 1);
5113             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5114             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5115             sub(cnt, cnt, 1);
5116           } cbnz(cnt, loop);
5117           sbc(t0, t0, zr);
5118         } cbnz(t0, again);
5119       } bind(post);
5120     }
5121 
5122     // Move memory at s to d, reversing words.
5123     //    Increments d to end of copied memory
5124     //    Destroys tmp1, tmp2
5125     //    Preserves len
5126     //    Leaves s pointing to the address which was in d at start
5127     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
5128       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5129 
5130       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
5131       mov(tmp1, len);
5132       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
5133       sub(s, d, len, ext::uxtw, LogBytesPerWord);
5134     }
5135     // where
5136     void reverse1(Register d, Register s, Register tmp) {
5137       ldr(tmp, pre(s, -wordSize));
5138       ror(tmp, tmp, 32);
5139       str(tmp, post(d, wordSize));
5140     }
5141 
5142     void step_squaring() {
5143       // An extra ACC
5144       step();
5145       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5146     }
5147 
5148     void last_squaring(RegisterOrConstant i) {
5149       Label dont;
5150       // if ((i & 1) == 0) {
5151       tbnz(i.as_register(), 0, dont); {
5152         // MACC(Ra, Rb, t0, t1, t2);
5153         // Ra = *++Pa;
5154         // Rb = *--Pb;
5155         umulh(Rhi_ab, Ra, Rb);
5156         mul(Rlo_ab, Ra, Rb);
5157         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5158       } bind(dont);
5159     }
5160 
5161     void extra_step_squaring() {
5162       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5163 
5164       // MACC(Rm, Rn, t0, t1, t2);
5165       // Rm = *++Pm;
5166       // Rn = *--Pn;
5167       umulh(Rhi_mn, Rm, Rn);
5168       mul(Rlo_mn, Rm, Rn);
5169       ldr(Rm, pre(Pm, wordSize));
5170       ldr(Rn, pre(Pn, -wordSize));
5171     }
5172 
5173     void post1_squaring() {
5174       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5175 
5176       // *Pm = Rm = t0 * inv;
5177       mul(Rm, t0, inv);
5178       str(Rm, Address(Pm));
5179 
5180       // MACC(Rm, Rn, t0, t1, t2);
5181       // t0 = t1; t1 = t2; t2 = 0;
5182       umulh(Rhi_mn, Rm, Rn);
5183 
5184 #ifndef PRODUCT
5185       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5186       {
5187         mul(Rlo_mn, Rm, Rn);
5188         add(Rlo_mn, t0, Rlo_mn);
5189         Label ok;
5190         cbz(Rlo_mn, ok); {
5191           stop("broken Montgomery multiply");
5192         } bind(ok);
5193       }
5194 #endif
5195       // We have very carefully set things up so that
5196       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5197       // the lower half of Rm * Rn because we know the result already:
5198       // it must be -t0.  t0 + (-t0) must generate a carry iff
5199       // t0 != 0.  So, rather than do a mul and an adds we just set
5200       // the carry flag iff t0 is nonzero.
5201       //
5202       // mul(Rlo_mn, Rm, Rn);
5203       // adds(zr, t0, Rlo_mn);
5204       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5205       adcs(t0, t1, Rhi_mn);
5206       adc(t1, t2, zr);
5207       mov(t2, zr);
5208     }
5209 
5210     void acc(Register Rhi, Register Rlo,
5211              Register t0, Register t1, Register t2) {
5212       adds(t0, t0, Rlo);
5213       adcs(t1, t1, Rhi);
5214       adc(t2, t2, zr);
5215     }
5216 
5217   public:
5218     /**
5219      * Fast Montgomery multiplication.  The derivation of the
5220      * algorithm is in A Cryptographic Library for the Motorola
5221      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5222      *
5223      * Arguments:
5224      *
5225      * Inputs for multiplication:
5226      *   c_rarg0   - int array elements a
5227      *   c_rarg1   - int array elements b
5228      *   c_rarg2   - int array elements n (the modulus)
5229      *   c_rarg3   - int length
5230      *   c_rarg4   - int inv
5231      *   c_rarg5   - int array elements m (the result)
5232      *
5233      * Inputs for squaring:
5234      *   c_rarg0   - int array elements a
5235      *   c_rarg1   - int array elements n (the modulus)
5236      *   c_rarg2   - int length
5237      *   c_rarg3   - int inv
5238      *   c_rarg4   - int array elements m (the result)
5239      *
5240      */
5241     address generate_multiply() {
5242       Label argh, nothing;
5243       bind(argh);
5244       stop("MontgomeryMultiply total_allocation must be <= 8192");
5245 
5246       align(CodeEntryAlignment);
5247       address entry = pc();
5248 
5249       cbzw(Rlen, nothing);
5250 
5251       enter();
5252 
5253       // Make room.
5254       cmpw(Rlen, 512);
5255       br(Assembler::HI, argh);
5256       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5257       andr(sp, Ra, -2 * wordSize);
5258 
5259       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5260 
5261       {
5262         // Copy input args, reversing as we go.  We use Ra as a
5263         // temporary variable.
5264         reverse(Ra, Pa_base, Rlen, t0, t1);
5265         if (!_squaring)
5266           reverse(Ra, Pb_base, Rlen, t0, t1);
5267         reverse(Ra, Pn_base, Rlen, t0, t1);
5268       }
5269 
5270       // Push all call-saved registers and also Pm_base which we'll need
5271       // at the end.
5272       save_regs();
5273 
5274 #ifndef PRODUCT
5275       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
5276       {
5277         ldr(Rn, Address(Pn_base, 0));
5278         mul(Rlo_mn, Rn, inv);
5279         subs(zr, Rlo_mn, -1);
5280         Label ok;
5281         br(EQ, ok); {
5282           stop("broken inverse in Montgomery multiply");
5283         } bind(ok);
5284       }
5285 #endif
5286 
5287       mov(Pm_base, Ra);
5288 
5289       mov(t0, zr);
5290       mov(t1, zr);
5291       mov(t2, zr);
5292 
5293       block_comment("for (int i = 0; i < len; i++) {");
5294       mov(Ri, zr); {
5295         Label loop, end;
5296         cmpw(Ri, Rlen);
5297         br(Assembler::GE, end);
5298 
5299         bind(loop);
5300         pre1(Ri);
5301 
5302         block_comment("  for (j = i; j; j--) {"); {
5303           movw(Rj, Ri);
5304           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5305         } block_comment("  } // j");
5306 
5307         post1();
5308         addw(Ri, Ri, 1);
5309         cmpw(Ri, Rlen);
5310         br(Assembler::LT, loop);
5311         bind(end);
5312         block_comment("} // i");
5313       }
5314 
5315       block_comment("for (int i = len; i < 2*len; i++) {");
5316       mov(Ri, Rlen); {
5317         Label loop, end;
5318         cmpw(Ri, Rlen, Assembler::LSL, 1);
5319         br(Assembler::GE, end);
5320 
5321         bind(loop);
5322         pre2(Ri, Rlen);
5323 
5324         block_comment("  for (j = len*2-i-1; j; j--) {"); {
5325           lslw(Rj, Rlen, 1);
5326           subw(Rj, Rj, Ri);
5327           subw(Rj, Rj, 1);
5328           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5329         } block_comment("  } // j");
5330 
5331         post2(Ri, Rlen);
5332         addw(Ri, Ri, 1);
5333         cmpw(Ri, Rlen, Assembler::LSL, 1);
5334         br(Assembler::LT, loop);
5335         bind(end);
5336       }
5337       block_comment("} // i");
5338 
5339       normalize(Rlen);
5340 
5341       mov(Ra, Pm_base);  // Save Pm_base in Ra
5342       restore_regs();  // Restore caller's Pm_base
5343 
5344       // Copy our result into caller's Pm_base
5345       reverse(Pm_base, Ra, Rlen, t0, t1);
5346 
5347       leave();
5348       bind(nothing);
5349       ret(lr);
5350 
5351       return entry;
5352     }
5353     // In C, approximately:
5354 
5355     // void
5356     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
5357     //                     unsigned long Pn_base[], unsigned long Pm_base[],
5358     //                     unsigned long inv, int len) {
5359     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5360     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5361     //   unsigned long Ra, Rb, Rn, Rm;
5362 
5363     //   int i;
5364 
5365     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5366 
5367     //   for (i = 0; i < len; i++) {
5368     //     int j;
5369 
5370     //     Pa = Pa_base;
5371     //     Pb = Pb_base + i;
5372     //     Pm = Pm_base;
5373     //     Pn = Pn_base + i;
5374 
5375     //     Ra = *Pa;
5376     //     Rb = *Pb;
5377     //     Rm = *Pm;
5378     //     Rn = *Pn;
5379 
5380     //     int iters = i;
5381     //     for (j = 0; iters--; j++) {
5382     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5383     //       MACC(Ra, Rb, t0, t1, t2);
5384     //       Ra = *++Pa;
5385     //       Rb = *--Pb;
5386     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5387     //       MACC(Rm, Rn, t0, t1, t2);
5388     //       Rm = *++Pm;
5389     //       Rn = *--Pn;
5390     //     }
5391 
5392     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
5393     //     MACC(Ra, Rb, t0, t1, t2);
5394     //     *Pm = Rm = t0 * inv;
5395     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5396     //     MACC(Rm, Rn, t0, t1, t2);
5397 
5398     //     assert(t0 == 0, "broken Montgomery multiply");
5399 
5400     //     t0 = t1; t1 = t2; t2 = 0;
5401     //   }
5402 
5403     //   for (i = len; i < 2*len; i++) {
5404     //     int j;
5405 
5406     //     Pa = Pa_base + i-len;
5407     //     Pb = Pb_base + len;
5408     //     Pm = Pm_base + i-len;
5409     //     Pn = Pn_base + len;
5410 
5411     //     Ra = *++Pa;
5412     //     Rb = *--Pb;
5413     //     Rm = *++Pm;
5414     //     Rn = *--Pn;
5415 
5416     //     int iters = len*2-i-1;
5417     //     for (j = i-len+1; iters--; j++) {
5418     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5419     //       MACC(Ra, Rb, t0, t1, t2);
5420     //       Ra = *++Pa;
5421     //       Rb = *--Pb;
5422     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5423     //       MACC(Rm, Rn, t0, t1, t2);
5424     //       Rm = *++Pm;
5425     //       Rn = *--Pn;
5426     //     }
5427 
5428     //     Pm_base[i-len] = t0;
5429     //     t0 = t1; t1 = t2; t2 = 0;
5430     //   }
5431 
5432     //   while (t0)
5433     //     t0 = sub(Pm_base, Pn_base, t0, len);
5434     // }
5435 
5436     /**
     * Fast Montgomery squaring.  This uses asymptotically 25% fewer
     * multiplies than Montgomery multiplication: a full multiply costs
     * roughly 2*len^2 word multiplies (len^2 for a*b plus len^2 for m*n),
     * whereas squaring computes each cross product a_i*a_j only once and
     * doubles it, cutting the a*b part to about len^2/2 and the total to
     * about 1.5*len^2.  So it should be up to 25% faster.  However, its
     * loop control is more complex and it may actually run slower on some
     * machines.
5441      *
5442      * Arguments:
5443      *
5444      * Inputs:
5445      *   c_rarg0   - int array elements a
5446      *   c_rarg1   - int array elements n (the modulus)
5447      *   c_rarg2   - int length
5448      *   c_rarg3   - int inv
5449      *   c_rarg4   - int array elements m (the result)
5450      *
5451      */
5452     address generate_square() {
5453       Label argh;
5454       bind(argh);
5455       stop("MontgomeryMultiply total_allocation must be <= 8192");
5456 
5457       align(CodeEntryAlignment);
5458       address entry = pc();
5459 
5460       enter();
5461 
5462       // Make room.
5463       cmpw(Rlen, 512);
5464       br(Assembler::HI, argh);
5465       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5466       andr(sp, Ra, -2 * wordSize);
5467 
5468       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5469 
5470       {
5471         // Copy input args, reversing as we go.  We use Ra as a
5472         // temporary variable.
5473         reverse(Ra, Pa_base, Rlen, t0, t1);
5474         reverse(Ra, Pn_base, Rlen, t0, t1);
5475       }
5476 
5477       // Push all call-saved registers and also Pm_base which we'll need
5478       // at the end.
5479       save_regs();
5480 
5481       mov(Pm_base, Ra);
5482 
5483       mov(t0, zr);
5484       mov(t1, zr);
5485       mov(t2, zr);
5486 
5487       block_comment("for (int i = 0; i < len; i++) {");
5488       mov(Ri, zr); {
5489         Label loop, end;
5490         bind(loop);
5491         cmp(Ri, Rlen);
5492         br(Assembler::GE, end);
5493 
5494         pre1(Ri);
5495 
5496         block_comment("for (j = (i+1)/2; j; j--) {"); {
5497           add(Rj, Ri, 1);
5498           lsr(Rj, Rj, 1);
5499           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5500         } block_comment("  } // j");
5501 
5502         last_squaring(Ri);
5503 
5504         block_comment("  for (j = i/2; j; j--) {"); {
5505           lsr(Rj, Ri, 1);
5506           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5507         } block_comment("  } // j");
5508 
5509         post1_squaring();
5510         add(Ri, Ri, 1);
5511         cmp(Ri, Rlen);
5512         br(Assembler::LT, loop);
5513 
5514         bind(end);
5515         block_comment("} // i");
5516       }
5517 
5518       block_comment("for (int i = len; i < 2*len; i++) {");
5519       mov(Ri, Rlen); {
5520         Label loop, end;
5521         bind(loop);
5522         cmp(Ri, Rlen, Assembler::LSL, 1);
5523         br(Assembler::GE, end);
5524 
5525         pre2(Ri, Rlen);
5526 
5527         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
5528           lsl(Rj, Rlen, 1);
5529           sub(Rj, Rj, Ri);
5530           sub(Rj, Rj, 1);
5531           lsr(Rj, Rj, 1);
5532           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5533         } block_comment("  } // j");
5534 
5535         last_squaring(Ri);
5536 
5537         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
5538           lsl(Rj, Rlen, 1);
5539           sub(Rj, Rj, Ri);
5540           lsr(Rj, Rj, 1);
5541           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5542         } block_comment("  } // j");
5543 
5544         post2(Ri, Rlen);
5545         add(Ri, Ri, 1);
5546         cmp(Ri, Rlen, Assembler::LSL, 1);
5547 
5548         br(Assembler::LT, loop);
5549         bind(end);
5550         block_comment("} // i");
5551       }
5552 
5553       normalize(Rlen);
5554 
5555       mov(Ra, Pm_base);  // Save Pm_base in Ra
5556       restore_regs();  // Restore caller's Pm_base
5557 
5558       // Copy our result into caller's Pm_base
5559       reverse(Pm_base, Ra, Rlen, t0, t1);
5560 
5561       leave();
5562       ret(lr);
5563 
5564       return entry;
5565     }
5566     // In C, approximately:
5567 
5568     // void
5569     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
5570     //                   unsigned long Pm_base[], unsigned long inv, int len) {
5571     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5572     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5573     //   unsigned long Ra, Rb, Rn, Rm;
5574 
5575     //   int i;
5576 
5577     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5578 
5579     //   for (i = 0; i < len; i++) {
5580     //     int j;
5581 
5582     //     Pa = Pa_base;
5583     //     Pb = Pa_base + i;
5584     //     Pm = Pm_base;
5585     //     Pn = Pn_base + i;
5586 
5587     //     Ra = *Pa;
5588     //     Rb = *Pb;
5589     //     Rm = *Pm;
5590     //     Rn = *Pn;
5591 
5592     //     int iters = (i+1)/2;
5593     //     for (j = 0; iters--; j++) {
5594     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5595     //       MACC2(Ra, Rb, t0, t1, t2);
5596     //       Ra = *++Pa;
5597     //       Rb = *--Pb;
5598     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5599     //       MACC(Rm, Rn, t0, t1, t2);
5600     //       Rm = *++Pm;
5601     //       Rn = *--Pn;
5602     //     }
5603     //     if ((i & 1) == 0) {
5604     //       assert(Ra == Pa_base[j], "must be");
5605     //       MACC(Ra, Ra, t0, t1, t2);
5606     //     }
5607     //     iters = i/2;
5608     //     assert(iters == i-j, "must be");
5609     //     for (; iters--; j++) {
5610     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5611     //       MACC(Rm, Rn, t0, t1, t2);
5612     //       Rm = *++Pm;
5613     //       Rn = *--Pn;
5614     //     }
5615 
5616     //     *Pm = Rm = t0 * inv;
5617     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5618     //     MACC(Rm, Rn, t0, t1, t2);
5619 
5620     //     assert(t0 == 0, "broken Montgomery multiply");
5621 
5622     //     t0 = t1; t1 = t2; t2 = 0;
5623     //   }
5624 
5625     //   for (i = len; i < 2*len; i++) {
5626     //     int start = i-len+1;
5627     //     int end = start + (len - start)/2;
5628     //     int j;
5629 
5630     //     Pa = Pa_base + i-len;
5631     //     Pb = Pa_base + len;
5632     //     Pm = Pm_base + i-len;
5633     //     Pn = Pn_base + len;
5634 
5635     //     Ra = *++Pa;
5636     //     Rb = *--Pb;
5637     //     Rm = *++Pm;
5638     //     Rn = *--Pn;
5639 
5640     //     int iters = (2*len-i-1)/2;
5641     //     assert(iters == end-start, "must be");
5642     //     for (j = start; iters--; j++) {
5643     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5644     //       MACC2(Ra, Rb, t0, t1, t2);
5645     //       Ra = *++Pa;
5646     //       Rb = *--Pb;
5647     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5648     //       MACC(Rm, Rn, t0, t1, t2);
5649     //       Rm = *++Pm;
5650     //       Rn = *--Pn;
5651     //     }
5652     //     if ((i & 1) == 0) {
5653     //       assert(Ra == Pa_base[j], "must be");
5654     //       MACC(Ra, Ra, t0, t1, t2);
5655     //     }
5656     //     iters =  (2*len-i)/2;
5657     //     assert(iters == len-j, "must be");
5658     //     for (; iters--; j++) {
5659     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5660     //       MACC(Rm, Rn, t0, t1, t2);
5661     //       Rm = *++Pm;
5662     //       Rn = *--Pn;
5663     //     }
5664     //     Pm_base[i-len] = t0;
5665     //     t0 = t1; t1 = t2; t2 = 0;
5666     //   }
5667 
5668     //   while (t0)
5669     //     t0 = sub(Pm_base, Pn_base, t0, len);
5670     // }
5671   };
5672 
5673 
  // Call here from the interpreter or compiled code either to load the
  // multiple returned values from the value type instance being returned
  // into registers, or to store the returned values into a newly allocated
  // value type instance.
5678   address generate_return_value_stub(address destination, const char* name, bool has_res) {
5679 
5680     // Information about frame layout at time of blocking runtime call.
5681     // Note that we only have to preserve callee-saved registers since
5682     // the compilers are responsible for supplying a continuation point
5683     // if they expect all registers to be preserved.
5684     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
5685     enum layout {
5686       rfp_off = 0, rfp_off2,
5687 
5688       j_rarg7_off, j_rarg7_2,
5689       j_rarg6_off, j_rarg6_2,
5690       j_rarg5_off, j_rarg5_2,
5691       j_rarg4_off, j_rarg4_2,
5692       j_rarg3_off, j_rarg3_2,
5693       j_rarg2_off, j_rarg2_2,
5694       j_rarg1_off, j_rarg1_2,
5695       j_rarg0_off, j_rarg0_2,
5696 
5697       j_farg0_off, j_farg0_2,
5698       j_farg1_off, j_farg1_2,
5699       j_farg2_off, j_farg2_2,
5700       j_farg3_off, j_farg3_2,
5701       j_farg4_off, j_farg4_2,
5702       j_farg5_off, j_farg5_2,
5703       j_farg6_off, j_farg6_2,
5704       j_farg7_off, j_farg7_2,
5705  
5706       return_off, return_off2,
5707       framesize // inclusive of return address
5708     };
5709 
5710     int insts_size = 512;
5711     int locs_size  = 64;
5712 
5713     CodeBuffer code(name, insts_size, locs_size);
5714     OopMapSet* oop_maps  = new OopMapSet();
5715     MacroAssembler* masm = new MacroAssembler(&code);
5716 
5717     address start = __ pc();
5718 
5719     const Address f7_save       (rfp, j_farg7_off * wordSize);
5720     const Address f6_save       (rfp, j_farg6_off * wordSize);
5721     const Address f5_save       (rfp, j_farg5_off * wordSize);
5722     const Address f4_save       (rfp, j_farg4_off * wordSize);
5723     const Address f3_save       (rfp, j_farg3_off * wordSize);
5724     const Address f2_save       (rfp, j_farg2_off * wordSize);
5725     const Address f1_save       (rfp, j_farg1_off * wordSize);
5726     const Address f0_save       (rfp, j_farg0_off * wordSize);
5727 
5728     const Address r0_save      (rfp, j_rarg0_off * wordSize);
5729     const Address r1_save      (rfp, j_rarg1_off * wordSize);
5730     const Address r2_save      (rfp, j_rarg2_off * wordSize);
5731     const Address r3_save      (rfp, j_rarg3_off * wordSize);
5732     const Address r4_save      (rfp, j_rarg4_off * wordSize);
5733     const Address r5_save      (rfp, j_rarg5_off * wordSize);
5734     const Address r6_save      (rfp, j_rarg6_off * wordSize);
5735     const Address r7_save      (rfp, j_rarg7_off * wordSize);
5736 
5737     // Generate oop map
5738     OopMap* map = new OopMap(framesize, 0);
5739 
5740     map->set_callee_saved(VMRegImpl::stack2reg(rfp_off), rfp->as_VMReg());
5741     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
5742     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
5743     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
5744     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
5745     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
5746     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
5747     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
5748     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
5749 
5750     map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
5751     map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
5752     map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
5753     map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
5754     map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
5755     map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
5756     map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
5757     map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
5758 
5759     // This is an inlined and slightly modified version of call_VM
5760     // which has the ability to fetch the return PC out of
5761     // thread-local storage and also sets up last_Java_sp slightly
5762     // differently than the real call_VM
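    //
    // As a rough sketch (the names below simply mirror the code that
    // follows), the stub body is:
    //
    //   set_last_Java_frame(sp, rfp, pc);
    //   destination(thread, r0);               // runtime entry passed in
    //   reset_last_Java_frame();
    //   if (thread->pending_exception() != NULL) {
    //     r0 = thread->pending_exception();
    //     goto StubRoutines::forward_exception_entry();
    //   }
    //   if (has_res) r0 = thread->vm_result();
    //   return;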
5763 
5764     __ enter(); // Save FP and LR before call
5765 
5766     assert(is_even(framesize/2), "sp not 16-byte aligned");
5767 
5768     // lr and fp are already in place
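    // framesize - 4 skips the four 32-bit slots already occupied by the
    // saved rfp and lr that enter() pushed at the top of the frame.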
5769     __ sub(sp, rfp, ((unsigned)framesize - 4) << LogBytesPerInt); // prolog
5770 
    __ strd(j_farg7, f7_save);
    __ strd(j_farg6, f6_save);
    __ strd(j_farg5, f5_save);
    __ strd(j_farg4, f4_save);
    __ strd(j_farg3, f3_save);
    __ strd(j_farg2, f2_save);
    __ strd(j_farg1, f1_save);
    __ strd(j_farg0, f0_save);

    __ str(j_rarg0, r0_save);
    __ str(j_rarg1, r1_save);
    __ str(j_rarg2, r2_save);
    __ str(j_rarg3, r3_save);
    __ str(j_rarg4, r4_save);
    __ str(j_rarg5, r5_save);
    __ str(j_rarg6, r6_save);
    __ str(j_rarg7, r7_save);
5788 
5789     int frame_complete = __ pc() - start;
5790 
5791     // Set up last_Java_sp and last_Java_fp
5792     address the_pc = __ pc();
5793     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
5794 
5795     // Call runtime
5796     __ mov(c_rarg0, rthread);
5797     __ mov(c_rarg1, r0);
5798 
5799     BLOCK_COMMENT("call runtime_entry");
5800     __ mov(rscratch1, destination);
5801     __ blrt(rscratch1, 2 /* number_of_arguments */, 0, 1);
5802 
5803     oop_maps->add_gc_map(the_pc - start, map);
5804 
    __ reset_last_Java_frame(false);
    __ maybe_isb();

    __ ldrd(j_farg7, f7_save);
    __ ldrd(j_farg6, f6_save);
    __ ldrd(j_farg5, f5_save);
    __ ldrd(j_farg4, f4_save);
    __ ldrd(j_farg3, f3_save);
    __ ldrd(j_farg2, f2_save);
    __ ldrd(j_farg1, f1_save);
    __ ldrd(j_farg0, f0_save);

    __ ldr(j_rarg0, r0_save);
    __ ldr(j_rarg1, r1_save);
    __ ldr(j_rarg2, r2_save);
    __ ldr(j_rarg3, r3_save);
    __ ldr(j_rarg4, r4_save);
    __ ldr(j_rarg5, r5_save);
    __ ldr(j_rarg6, r6_save);
    __ ldr(j_rarg7, r7_save);
5825 
5826     __ leave();
5827 
5828     // check for pending exceptions
5829     Label pending;
5830     __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
5831     __ cmp(rscratch1, (u1)NULL_WORD);
5832     __ br(Assembler::NE, pending);
5833 
5834     if (has_res) {
5835       __ get_vm_result(r0, rthread);
5836     }
5837     __ ret(lr);
5838 
5839     __ bind(pending);
5840     __ ldr(r0, Address(rthread, in_bytes(Thread::pending_exception_offset())));
5841     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
5842 
5843 
5844     // codeBlob framesize is in words (not VMRegImpl::slot_size)
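    // (framesize above is counted in 32-bit slots, so shifting right by
    // LogBytesPerWord - LogBytesPerInt == 1 halves it into 64-bit words.)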
5845     int frame_size_in_words = (framesize >> (LogBytesPerWord - LogBytesPerInt));
5846     RuntimeStub* stub =
5847       RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
5848 
5849     return stub->entry_point();
5850   }
5851 
5852   // Initialization
5853   void generate_initial() {
    // Generates the initial stubs and initializes the entry points
5855 
    // Entry points that exist on all platforms. Note: this is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.
5861 
5862     StubRoutines::_forward_exception_entry = generate_forward_exception();
5863 
5864     StubRoutines::_call_stub_entry =
5865       generate_call_stub(StubRoutines::_call_stub_return_address);
5866 
    // This entry is referenced by megamorphic calls.
5868     StubRoutines::_catch_exception_entry = generate_catch_exception();
5869 
5870     // Build this early so it's available for the interpreter.
5871     StubRoutines::_throw_StackOverflowError_entry =
5872       generate_throw_exception("StackOverflowError throw_exception",
5873                                CAST_FROM_FN_PTR(address,
5874                                                 SharedRuntime::throw_StackOverflowError));
5875     StubRoutines::_throw_delayed_StackOverflowError_entry =
5876       generate_throw_exception("delayed StackOverflowError throw_exception",
5877                                CAST_FROM_FN_PTR(address,
5878                                                 SharedRuntime::throw_delayed_StackOverflowError));
5879     if (UseCRC32Intrinsics) {
      // Set the table address before generating the stubs that use it.
5881       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5882       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5883     }
5884 
5885     if (UseCRC32CIntrinsics) {
5886       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5887     }
5888 
5889     // Disabled until JDK-8210858 is fixed
5890     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5891     //   StubRoutines::_dlog = generate_dlog();
5892     // }
5893 
5894     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5895       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5896     }
5897 
5898     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5899       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5900     }
5901 
5902 
    StubRoutines::_load_value_type_fields_in_regs =
      generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_value_type_fields_in_regs),
                                 "load_value_type_fields_in_regs", false);
    StubRoutines::_store_value_type_fields_to_buf =
      generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_value_type_fields_to_buf),
                                 "store_value_type_fields_to_buf", true);
5907   }
5908 
5909   void generate_all() {
5910     // support for verify_oop (must happen after universe_init)
5911     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
5912     StubRoutines::_throw_AbstractMethodError_entry =
5913       generate_throw_exception("AbstractMethodError throw_exception",
5914                                CAST_FROM_FN_PTR(address,
5915                                                 SharedRuntime::
5916                                                 throw_AbstractMethodError));
5917 
5918     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5919       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5920                                CAST_FROM_FN_PTR(address,
5921                                                 SharedRuntime::
5922                                                 throw_IncompatibleClassChangeError));
5923 
5924     StubRoutines::_throw_NullPointerException_at_call_entry =
5925       generate_throw_exception("NullPointerException at call throw_exception",
5926                                CAST_FROM_FN_PTR(address,
5927                                                 SharedRuntime::
5928                                                 throw_NullPointerException_at_call));
5929 
5930     // arraycopy stubs used by compilers
5931     generate_arraycopy_stubs();
5932 
5933     // has negatives stub for large arrays.
5934     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5935 
5936     // array equals stub for large arrays.
5937     if (!UseSimpleArrayEquals) {
5938       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5939     }
5940 
5941     generate_compare_long_strings();
5942 
5943     generate_string_indexof_stubs();
5944 
5945     // byte_array_inflate stub for large arrays.
5946     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
5947 
5948 #ifdef COMPILER2
5949     if (UseMultiplyToLenIntrinsic) {
5950       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5951     }
5952 
5953     if (UseSquareToLenIntrinsic) {
5954       StubRoutines::_squareToLen = generate_squareToLen();
5955     }
5956 
5957     if (UseMulAddIntrinsic) {
5958       StubRoutines::_mulAdd = generate_mulAdd();
5959     }
5960 
5961     if (UseMontgomeryMultiplyIntrinsic) {
5962       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5963       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5964       StubRoutines::_montgomeryMultiply = g.generate_multiply();
5965     }
5966 
5967     if (UseMontgomerySquareIntrinsic) {
5968       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5969       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5970       // We use generate_multiply() rather than generate_square()
5971       // because it's faster for the sizes of modulus we care about.
5972       StubRoutines::_montgomerySquare = g.generate_multiply();
5973     }
5974 #endif // COMPILER2
5975 
5976 #ifndef BUILTIN_SIM
5977     // generate GHASH intrinsics code
5978     if (UseGHASHIntrinsics) {
5979       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5980     }
5981 
5982     if (UseAESIntrinsics) {
5983       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5984       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5985       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5986       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
5987     }
5988 
5989     if (UseSHA1Intrinsics) {
5990       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5991       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5992     }
5993     if (UseSHA256Intrinsics) {
5994       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5995       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5996     }
5997 
5998     // generate Adler32 intrinsics code
5999     if (UseAdler32Intrinsics) {
6000       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
6001     }
6002 
6003     // Safefetch stubs.
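    // SafeFetch32/SafeFetchN load a value from an address that may be
    // unmapped; if the access faults, the signal handler resumes at the
    // recorded continuation PC and the caller-supplied default value is
    // returned instead.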
6004     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
6005                                                        &StubRoutines::_safefetch32_fault_pc,
6006                                                        &StubRoutines::_safefetch32_continuation_pc);
6007     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
6008                                                        &StubRoutines::_safefetchN_fault_pc,
6009                                                        &StubRoutines::_safefetchN_continuation_pc);
6010 #endif
6011     StubRoutines::aarch64::set_completed();
6012   }
6013 
6014  public:
6015   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
6016     if (all) {
6017       generate_all();
6018     } else {
6019       generate_initial();
6020     }
6021   }
6022 }; // end class declaration
6023 
6024 void StubGenerator_generate(CodeBuffer* code, bool all) {
6025   StubGenerator g(code, all);
6026 }