1 /*
   2  * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "nativeInst_aarch64.hpp"
  33 #include "oops/instanceOop.hpp"
  34 #include "oops/method.hpp"
  35 #include "oops/objArrayKlass.hpp"
  36 #include "oops/oop.inline.hpp"
  37 #include "prims/methodHandles.hpp"
  38 #include "runtime/frame.inline.hpp"
  39 #include "runtime/handles.inline.hpp"
  40 #include "runtime/sharedRuntime.hpp"
  41 #include "runtime/stubCodeGenerator.hpp"
  42 #include "runtime/stubRoutines.hpp"
  43 #include "runtime/thread.inline.hpp"
  44 #include "utilities/align.hpp"
  45 #ifdef COMPILER2
  46 #include "opto/runtime.hpp"
  47 #endif
  48 
  49 #ifdef BUILTIN_SIM
  50 #include "../../../../../../simulator/simulator.hpp"
  51 #endif
  52 
  53 // Declaration and definition of StubGenerator (no .hpp file).
  54 // For a more detailed description of the stub routine structure
  55 // see the comment in stubRoutines.hpp
  56 
  57 #undef __
  58 #define __ _masm->
  59 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  60 
  61 #ifdef PRODUCT
  62 #define BLOCK_COMMENT(str) /* nothing */
  63 #else
  64 #define BLOCK_COMMENT(str) __ block_comment(str)
  65 #endif
  66 
  67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  68 
  69 // Stub Code definitions
  70 
  71 class StubGenerator: public StubCodeGenerator {
  72  private:
  73 
  74 #ifdef PRODUCT
  75 #define inc_counter_np(counter) ((void)0)
  76 #else
  77   void inc_counter_np_(int& counter) {
  78     __ lea(rscratch2, ExternalAddress((address)&counter));
  79     __ ldrw(rscratch1, Address(rscratch2));
  80     __ addw(rscratch1, rscratch1, 1);
  81     __ strw(rscratch1, Address(rscratch2));
  82   }
  83 #define inc_counter_np(counter) \
  84   BLOCK_COMMENT("inc_counter " #counter); \
  85   inc_counter_np_(counter);
  86 #endif
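
       // Illustrative use of the macro above (non-product builds only);
       // the counter named here is one of the SharedRuntime arraycopy
       // counters and serves only as an example of the intended usage:
       //
       //   inc_counter_np(SharedRuntime::_jint_array_copy_ctr);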
  87 
  88   // Call stubs are used to call Java from C
  89   //
  90   // Arguments:
  91   //    c_rarg0:   call wrapper address                   address
  92   //    c_rarg1:   result                                 address
  93   //    c_rarg2:   result type                            BasicType
  94   //    c_rarg3:   method                                 Method*
  95   //    c_rarg4:   (interpreter) entry point              address
  96   //    c_rarg5:   parameters                             intptr_t*
  97   //    c_rarg6:   parameter size (in words)              int
  98   //    c_rarg7:   thread                                 Thread*
  99   //
 100   // There is no return from the stub itself as any Java result
 101   // is written to result
 102   //
 103   // we save r30 (lr) as the return PC at the base of the frame and
 104   // link r29 (fp) below it as the frame pointer installing sp (r31)
 105   // into fp.
 106   //
 107   // we save r0-r7, which accounts for all the c arguments.
 108   //
 109   // TODO: strictly do we need to save them all? they are treated as
 110   // volatile by C so could we omit saving the ones we are going to
 111   // place in global registers (thread? method?) or those we only use
 112   // during setup of the Java call?
 113   //
 114   // we don't need to save r8 which C uses as an indirect result location
 115   // return register.
 116   //
 117   // we don't need to save r9-r15 which both C and Java treat as
 118   // volatile
 119   //
 120   // we don't need to save r16-18 because Java does not use them
 121   //
 122   // we save r19-r28 which Java uses as scratch registers and C
 123   // expects to be callee-save
 124   //
 125   // we save the bottom 64 bits of each value stored in v8-v15; it is
 126   // the responsibility of the caller to preserve larger values.
 127   //
 128   // so the stub frame looks like this when we enter Java code
 129   //
 130   //     [ return_from_Java     ] <--- sp
 131   //     [ argument word n      ]
 132   //      ...
 133   // -27 [ argument word 1      ]
 134   // -26 [ saved v15            ] <--- sp_after_call
 135   // -25 [ saved v14            ]
 136   // -24 [ saved v13            ]
 137   // -23 [ saved v12            ]
 138   // -22 [ saved v11            ]
 139   // -21 [ saved v10            ]
 140   // -20 [ saved v9             ]
 141   // -19 [ saved v8             ]
 142   // -18 [ saved r28            ]
 143   // -17 [ saved r27            ]
 144   // -16 [ saved r26            ]
 145   // -15 [ saved r25            ]
 146   // -14 [ saved r24            ]
 147   // -13 [ saved r23            ]
 148   // -12 [ saved r22            ]
 149   // -11 [ saved r21            ]
 150   // -10 [ saved r20            ]
 151   //  -9 [ saved r19            ]
 152   //  -8 [ call wrapper    (r0) ]
 153   //  -7 [ result          (r1) ]
 154   //  -6 [ result type     (r2) ]
 155   //  -5 [ method          (r3) ]
 156   //  -4 [ entry point     (r4) ]
 157   //  -3 [ parameters      (r5) ]
 158   //  -2 [ parameter size  (r6) ]
 159   //  -1 [ thread (r7)          ]
 160   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 161   //   1 [ saved lr       (r30) ]
 162 
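       // For reference, a rough sketch of the C++ caller side (as in
       // JavaCalls::call_helper); the argument names below are
       // illustrative rather than verbatim, but the order matches the
       // c_rarg0..c_rarg7 list above:
       //
       //   StubRoutines::call_stub()(
       //       (address)&link,        // c_rarg0: call wrapper
       //       result_value_address,  // c_rarg1: result
       //       result_type,           // c_rarg2: BasicType
       //       method(),              // c_rarg3: Method*
       //       entry_point,           // c_rarg4: interpreter entry
       //       parameter_address,     // c_rarg5: intptr_t* parameters
       //       size_of_parameters,    // c_rarg6: count in words
       //       thread);               // c_rarg7: JavaThread*
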
 163   // Call stub stack layout word offsets from fp
 164   enum call_stub_layout {
 165     sp_after_call_off = -26,
 166 
 167     d15_off            = -26,
 168     d13_off            = -24,
 169     d11_off            = -22,
 170     d9_off             = -20,
 171 
 172     r28_off            = -18,
 173     r26_off            = -16,
 174     r24_off            = -14,
 175     r22_off            = -12,
 176     r20_off            = -10,
 177     call_wrapper_off   =  -8,
 178     result_off         =  -7,
 179     result_type_off    =  -6,
 180     method_off         =  -5,
 181     entry_point_off    =  -4,
 182     parameter_size_off =  -2,
 183     thread_off         =  -1,
 184     fp_f               =   0,
 185     retaddr_off        =   1,
 186   };
 187 
 188   address generate_call_stub(address& return_address) {
 189     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 190            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 191            "adjust this code");
 192 
 193     StubCodeMark mark(this, "StubRoutines", "call_stub");
 194     address start = __ pc();
 195 
 196     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 197 
 198     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 199     const Address result        (rfp, result_off         * wordSize);
 200     const Address result_type   (rfp, result_type_off    * wordSize);
 201     const Address method        (rfp, method_off         * wordSize);
 202     const Address entry_point   (rfp, entry_point_off    * wordSize);
 203     const Address parameter_size(rfp, parameter_size_off * wordSize);
 204 
 205     const Address thread        (rfp, thread_off         * wordSize);
 206 
 207     const Address d15_save      (rfp, d15_off * wordSize);
 208     const Address d13_save      (rfp, d13_off * wordSize);
 209     const Address d11_save      (rfp, d11_off * wordSize);
 210     const Address d9_save       (rfp, d9_off * wordSize);
 211 
 212     const Address r28_save      (rfp, r28_off * wordSize);
 213     const Address r26_save      (rfp, r26_off * wordSize);
 214     const Address r24_save      (rfp, r24_off * wordSize);
 215     const Address r22_save      (rfp, r22_off * wordSize);
 216     const Address r20_save      (rfp, r20_off * wordSize);
 217 
 218     // stub code
 219 
 220     // we need a C prolog to bootstrap the x86 caller into the sim
 221     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 222 
 223     address aarch64_entry = __ pc();
 224 
 225 #ifdef BUILTIN_SIM
 226     // Save sender's SP for stack traces.
 227     __ mov(rscratch1, sp);
 228     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 229 #endif
 230     // set up frame and move sp to end of save area
 231     __ enter();
 232     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 233 
 234     // save register parameters and Java scratch/global registers
 235     // n.b. we save thread even though it gets installed in
 236     // rthread because we want to sanity check rthread later
 237     __ str(c_rarg7,  thread);
 238     __ strw(c_rarg6, parameter_size);
 239     __ stp(c_rarg4, c_rarg5,  entry_point);
 240     __ stp(c_rarg2, c_rarg3,  result_type);
 241     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 242 
 243     __ stp(r20, r19,   r20_save);
 244     __ stp(r22, r21,   r22_save);
 245     __ stp(r24, r23,   r24_save);
 246     __ stp(r26, r25,   r26_save);
 247     __ stp(r28, r27,   r28_save);
 248 
 249     __ stpd(v9,  v8,   d9_save);
 250     __ stpd(v11, v10,  d11_save);
 251     __ stpd(v13, v12,  d13_save);
 252     __ stpd(v15, v14,  d15_save);
 253 
 254     // install Java thread in global register now we have saved
 255     // whatever value it held
 256     __ mov(rthread, c_rarg7);
 257     // And method
 258     __ mov(rmethod, c_rarg3);
 259 
 260     // set up the heapbase register
 261     __ reinit_heapbase();
 262 
 263 #ifdef ASSERT
 264     // make sure we have no pending exceptions
 265     {
 266       Label L;
 267       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 268       __ cmp(rscratch1, (u1)NULL_WORD);
 269       __ br(Assembler::EQ, L);
 270       __ stop("StubRoutines::call_stub: entered with pending exception");
 271       __ BIND(L);
 272     }
 273 #endif
 274     // pass parameters if any
 275     __ mov(esp, sp);
 276     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 277     __ andr(sp, rscratch1, -2 * wordSize);
 278 
 279     BLOCK_COMMENT("pass parameters if any");
 280     Label parameters_done;
 281     // parameter count is still in c_rarg6
 282     // and parameter pointer identifying param 1 is in c_rarg5
 283     __ cbzw(c_rarg6, parameters_done);
 284 
 285     address loop = __ pc();
 286     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 287     __ subsw(c_rarg6, c_rarg6, 1);
 288     __ push(rscratch1);
 289     __ br(Assembler::GT, loop);
 290 
 291     __ BIND(parameters_done);
 292 
 293     // call Java entry -- passing Method* and current sp
 294     //      rmethod: Method*
 295     //      r13: sender sp
 296     BLOCK_COMMENT("call Java function");
 297     __ mov(r13, sp);
 298     __ blr(c_rarg4);
 299 
 300     // tell the simulator we have returned to the stub
 301 
 302     // we do this here because the notify will already have been done
 303     // if we get to the next instruction via an exception
 304     //
 305     // n.b. adding this instruction here affects the calculation of
 306     // whether or not a routine returns to the call stub (used when
 307     // doing stack walks) since the normal test is to check the return
 308     // pc against the address saved below. so we may need to allow for
 309     // this extra instruction in the check.
 310 
 311     if (NotifySimulator) {
 312       __ notify(Assembler::method_reentry);
 313     }
 314     // save current address for use by exception handling code
 315 
 316     return_address = __ pc();
 317 
 318     // store result depending on type (everything that is not
 319     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 320     // n.b. this assumes Java returns an integral result in r0
 321     // and a floating result in j_farg0
 322     __ ldr(j_rarg2, result);
 323     Label is_long, is_float, is_double, exit;
 324     __ ldr(j_rarg1, result_type);
 325     __ cmp(j_rarg1, (u1)T_OBJECT);
 326     __ br(Assembler::EQ, is_long);
 327     __ cmp(j_rarg1, (u1)T_LONG);
 328     __ br(Assembler::EQ, is_long);
 329     __ cmp(j_rarg1, (u1)T_FLOAT);
 330     __ br(Assembler::EQ, is_float);
 331     __ cmp(j_rarg1, (u1)T_DOUBLE);
 332     __ br(Assembler::EQ, is_double);
 333 
 334     // handle T_INT case
 335     __ strw(r0, Address(j_rarg2));
 336 
 337     __ BIND(exit);
 338 
 339     // pop parameters
 340     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 341 
 342 #ifdef ASSERT
 343     // verify that threads correspond
 344     {
 345       Label L, S;
 346       __ ldr(rscratch1, thread);
 347       __ cmp(rthread, rscratch1);
 348       __ br(Assembler::NE, S);
 349       __ get_thread(rscratch1);
 350       __ cmp(rthread, rscratch1);
 351       __ br(Assembler::EQ, L);
 352       __ BIND(S);
 353       __ stop("StubRoutines::call_stub: threads must correspond");
 354       __ BIND(L);
 355     }
 356 #endif
 357 
 358     // restore callee-save registers
 359     __ ldpd(v15, v14,  d15_save);
 360     __ ldpd(v13, v12,  d13_save);
 361     __ ldpd(v11, v10,  d11_save);
 362     __ ldpd(v9,  v8,   d9_save);
 363 
 364     __ ldp(r28, r27,   r28_save);
 365     __ ldp(r26, r25,   r26_save);
 366     __ ldp(r24, r23,   r24_save);
 367     __ ldp(r22, r21,   r22_save);
 368     __ ldp(r20, r19,   r20_save);
 369 
 370     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 371     __ ldrw(c_rarg2, result_type);
 372     __ ldr(c_rarg3,  method);
 373     __ ldp(c_rarg4, c_rarg5,  entry_point);
 374     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 375 
 376 #ifndef PRODUCT
 377     // tell the simulator we are about to end Java execution
 378     if (NotifySimulator) {
 379       __ notify(Assembler::method_exit);
 380     }
 381 #endif
 382     // leave frame and return to caller
 383     __ leave();
 384     __ ret(lr);
 385 
 386     // handle return types different from T_INT
 387 
 388     __ BIND(is_long);
 389     __ str(r0, Address(j_rarg2, 0));
 390     __ br(Assembler::AL, exit);
 391 
 392     __ BIND(is_float);
 393     __ strs(j_farg0, Address(j_rarg2, 0));
 394     __ br(Assembler::AL, exit);
 395 
 396     __ BIND(is_double);
 397     __ strd(j_farg0, Address(j_rarg2, 0));
 398     __ br(Assembler::AL, exit);
 399 
 400     return start;
 401   }
 402 
 403   // Return point for a Java call if there's an exception thrown in
 404   // Java code.  The exception is caught and transformed into a
 405   // pending exception stored in JavaThread that can be tested from
 406   // within the VM.
 407   //
 408   // Note: Usually the parameters are removed by the callee. In case
 409   // of an exception crossing an activation frame boundary, that is
 410   // not the case if the callee is compiled code => need to setup the
 411   // rsp.
 412   //
 413   // r0: exception oop
 414 
 415   // NOTE: this is used as a target from the signal handler so it
 416   // needs an x86 prolog which returns into the current simulator
 417   // executing the generated catch_exception code. so the prolog
 418   // needs to install rax in a sim register and adjust the sim's
 419   // restart pc to enter the generated code at the start position
 420   // then return from native to simulated execution.
 421 
 422   address generate_catch_exception() {
 423     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 424     address start = __ pc();
 425 
 426     // same as in generate_call_stub():
 427     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 428     const Address thread        (rfp, thread_off         * wordSize);
 429 
 430 #ifdef ASSERT
 431     // verify that threads correspond
 432     {
 433       Label L, S;
 434       __ ldr(rscratch1, thread);
 435       __ cmp(rthread, rscratch1);
 436       __ br(Assembler::NE, S);
 437       __ get_thread(rscratch1);
 438       __ cmp(rthread, rscratch1);
 439       __ br(Assembler::EQ, L);
 440       __ bind(S);
 441       __ stop("StubRoutines::catch_exception: threads must correspond");
 442       __ bind(L);
 443     }
 444 #endif
 445 
 446     // set pending exception
 447     __ verify_oop(r0);
 448 
 449     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 450     __ mov(rscratch1, (address)__FILE__);
 451     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 452     __ movw(rscratch1, (int)__LINE__);
 453     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 454 
 455     // complete return to VM
 456     assert(StubRoutines::_call_stub_return_address != NULL,
 457            "_call_stub_return_address must have been generated before");
 458     __ b(StubRoutines::_call_stub_return_address);
 459 
 460     return start;
 461   }
 462 
 463   // Continuation point for runtime calls returning with a pending
 464   // exception.  The pending exception check happened in the runtime
 465   // or native call stub.  The pending exception in Thread is
 466   // converted into a Java-level exception.
 467   //
 468   // Contract with Java-level exception handlers:
 469   // r0: exception
 470   // r3: throwing pc
 471   //
 472   // NOTE: At entry of this stub, exception-pc must be in LR !!
 473 
 474   // NOTE: this is always used as a jump target within generated code
 475   // so it just needs to be generated code with no x86 prolog
 476 
 477   address generate_forward_exception() {
 478     StubCodeMark mark(this, "StubRoutines", "forward exception");
 479     address start = __ pc();
 480 
 481     // Upon entry, LR points to the return address returning into
 482     // Java (interpreted or compiled) code; i.e., the return address
 483     // becomes the throwing pc.
 484     //
 485     // Arguments pushed before the runtime call are still on the stack
 486     // but the exception handler will reset the stack pointer ->
 487     // ignore them.  A potential result in registers can be ignored as
 488     // well.
 489 
 490 #ifdef ASSERT
 491     // make sure this code is only executed if there is a pending exception
 492     {
 493       Label L;
 494       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 495       __ cbnz(rscratch1, L);
 496       __ stop("StubRoutines::forward exception: no pending exception (1)");
 497       __ bind(L);
 498     }
 499 #endif
 500 
 501     // compute exception handler into r19
 502 
 503     // call the VM to find the handler address associated with the
 504     // caller address. pass thread in r0 and caller pc (ret address)
 505     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 506     // the stack.
 507     __ mov(c_rarg1, lr);
 508     // lr will be trashed by the VM call so we move it to R19
 509     // (callee-saved) because we also need to pass it to the handler
 510     // returned by this call.
 511     __ mov(r19, lr);
 512     BLOCK_COMMENT("call exception_handler_for_return_address");
 513     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 514                          SharedRuntime::exception_handler_for_return_address),
 515                     rthread, c_rarg1);
 516     // we should not really care that lr is no longer the callee
 517     // address. we saved the value the handler needs in r19 so we can
 518     // just copy it to r3. however, the C2 handler will push its own
 519     // frame and then call into the VM and the VM code asserts that
 520     // the PC for the frame above the handler belongs to a compiled
 521     // Java method. So, we restore lr here to satisfy that assert.
 522     __ mov(lr, r19);
 523     // setup r0 & r3 & clear pending exception
 524     __ mov(r3, r19);
 525     __ mov(r19, r0);
 526     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 527     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 528 
 529 #ifdef ASSERT
 530     // make sure exception is set
 531     {
 532       Label L;
 533       __ cbnz(r0, L);
 534       __ stop("StubRoutines::forward exception: no pending exception (2)");
 535       __ bind(L);
 536     }
 537 #endif
 538 
 539     // continue at exception handler
 540     // r0: exception
 541     // r3: throwing pc
 542     // r19: exception handler
 543     __ verify_oop(r0);
 544     __ br(r19);
 545 
 546     return start;
 547   }
 548 
 549   // Non-destructive plausibility checks for oops
 550   //
 551   // Arguments:
 552   //    r0: oop to verify
 553   //    rscratch1: error message
 554   //
 555   // Stack after saving c_rarg3:
 556   //    [tos + 0]: saved c_rarg3
 557   //    [tos + 1]: saved c_rarg2
 558   //    [tos + 2]: saved lr
 559   //    [tos + 3]: saved rscratch2
 560   //    [tos + 4]: saved r0
 561   //    [tos + 5]: saved rscratch1
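       //
       // This stub is installed as StubRoutines::_verify_oop_subroutine_entry
       // (wired up with the other stubs later in this file) and is reached
       // from MacroAssembler::verify_oop(), which places the oop in r0 and
       // the address of the error message in rscratch1 as described above.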
 562   address generate_verify_oop() {
 563 
 564     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 565     address start = __ pc();
 566 
 567     Label exit, error;
 568 
 569     // save c_rarg2 and c_rarg3
 570     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 571 
 572     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 573     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 574     __ ldr(c_rarg3, Address(c_rarg2));
 575     __ add(c_rarg3, c_rarg3, 1);
 576     __ str(c_rarg3, Address(c_rarg2));
 577 
 578     // object is in r0
 579     // make sure object is 'reasonable'
 580     __ cbz(r0, exit); // if obj is NULL it is OK
 581 
 582     // Check if the oop is in the right area of memory
 583     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 584     __ andr(c_rarg2, r0, c_rarg3);
 585     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 586 
 587     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 588     // instruction here because the flags register is live.
 589     __ eor(c_rarg2, c_rarg2, c_rarg3);
 590     __ cbnz(c_rarg2, error);
 591 
 592     // make sure klass is 'reasonable', which is not zero.
 593     __ load_klass(r0, r0);  // get klass
 594     __ cbz(r0, error);      // if klass is NULL it is broken
 595 
 596     // return if everything seems ok
 597     __ bind(exit);
 598 
 599     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 600     __ ret(lr);
 601 
 602     // handle errors
 603     __ bind(error);
 604     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 605 
 606     __ push(RegSet::range(r0, r29), sp);
 607     // debug(char* msg, int64_t pc, int64_t regs[])
 608     __ mov(c_rarg0, rscratch1);      // pass address of error message
 609     __ mov(c_rarg1, lr);             // pass return address
 610     __ mov(c_rarg2, sp);             // pass address of regs on stack
 611 #ifndef PRODUCT
 612     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 613 #endif
 614     BLOCK_COMMENT("call MacroAssembler::debug");
 615     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 616     __ blrt(rscratch1, 3, 0, 1);
 617 
 618     return start;
 619   }
 620 
 621   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 622 
 623   // The inner part of zero_words().  This is the bulk operation,
 624   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 625   // caller is responsible for zeroing the last few words.
 626   //
 627   // Inputs:
 628   // r10: the HeapWord-aligned base address of an array to zero.
 629   // r11: the count in HeapWords, r11 > 0.
 630   //
 631   // Returns r10 and r11, adjusted for the caller to clear.
 632   // r10: the base address of the tail of words left to clear.
 633   // r11: the number of words in the tail.
 634   //      r11 < MacroAssembler::zero_words_block_size.
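       //
       // A worked example of the contract (illustrative numbers only):
       // entering with r10 == p and r11 == 300, the stub zeroes the bulk
       // of the 300 words (using DC ZVA where UseBlockZeroing permits)
       // and might return with, say, r11 == 3 and r10 == p + 297 * wordSize,
       // leaving those last few words for the caller to clear.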
 635 
 636   address generate_zero_blocks() {
 637     Label done;
 638     Label base_aligned;
 639 
 640     Register base = r10, cnt = r11;
 641 
 642     __ align(CodeEntryAlignment);
 643     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 644     address start = __ pc();
 645 
 646     if (UseBlockZeroing) {
 647       int zva_length = VM_Version::zva_length();
 648 
 649       // Ensure ZVA length can be divided by 16. This is required by
 650       // the subsequent operations.
 651       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 652 
 653       __ tbz(base, 3, base_aligned);
 654       __ str(zr, Address(__ post(base, 8)));
 655       __ sub(cnt, cnt, 1);
 656       __ bind(base_aligned);
 657 
 658       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 659       // alignment.
 660       Label small;
 661       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 662       __ subs(rscratch1, cnt, low_limit >> 3);
 663       __ br(Assembler::LT, small);
 664       __ zero_dcache_blocks(base, cnt);
 665       __ bind(small);
 666     }
 667 
 668     {
 669       // Number of stp instructions we'll unroll
 670       const int unroll =
 671         MacroAssembler::zero_words_block_size / 2;
 672       // Clear the remaining blocks.
 673       Label loop;
 674       __ subs(cnt, cnt, unroll * 2);
 675       __ br(Assembler::LT, done);
 676       __ bind(loop);
 677       for (int i = 0; i < unroll; i++)
 678         __ stp(zr, zr, __ post(base, 16));
 679       __ subs(cnt, cnt, unroll * 2);
 680       __ br(Assembler::GE, loop);
 681       __ bind(done);
 682       __ add(cnt, cnt, unroll * 2);
 683     }
 684 
 685     __ ret(lr);
 686 
 687     return start;
 688   }
 689 
 690 
 691   typedef enum {
 692     copy_forwards = 1,
 693     copy_backwards = -1
 694   } copy_direction;
 695 
 696   // Bulk copy of blocks of 8 words.
 697   //
 698   // count is a count of words.
 699   //
 700   // Precondition: count >= 8
 701   //
 702   // Postconditions:
 703   //
 704   // The least significant bit of count contains the remaining count
 705   // of words to copy.  The rest of count is trash.
 706   //
 707   // s and d are adjusted to point to the remaining words to copy
 708   //
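       // For example (illustrative): with count == 23 the stub copies
       // 16 words via the main loop and drain, then a 4-word and a
       // 2-word sub-block -- 22 words in all -- leaving bit 0 (one word)
       // for the caller, with s and d adjusted to address the remaining
       // word.
       //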
 709   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 710                            copy_direction direction) {
 711     int unit = wordSize * direction;
 712     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 713 
 714     int offset;
 715     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 716       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 717     const Register stride = r13;
 718 
 719     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 720     assert_different_registers(s, d, count, rscratch1);
 721 
 722     Label again, drain;
 723     const char *stub_name;
 724     if (direction == copy_forwards)
 725       stub_name = "forward_copy_longs";
 726     else
 727       stub_name = "backward_copy_longs";
 728 
 729     __ align(CodeEntryAlignment);
 730 
 731     StubCodeMark mark(this, "StubRoutines", stub_name);
 732 
 733     __ bind(start);
 734 
 735     Label unaligned_copy_long;
 736     if (AvoidUnalignedAccesses) {
 737       __ tbnz(d, 3, unaligned_copy_long);
 738     }
 739 
 740     if (direction == copy_forwards) {
 741       __ sub(s, s, bias);
 742       __ sub(d, d, bias);
 743     }
 744 
 745 #ifdef ASSERT
 746     // Make sure we are never given < 8 words
 747     {
 748       Label L;
 749       __ cmp(count, (u1)8);
 750       __ br(Assembler::GE, L);
 751       __ stop("generate_copy_longs called with < 8 words");
 752       __ bind(L);
 753     }
 754 #endif
 755 
 756     // Fill 8 registers
 757     if (UseSIMDForMemoryOps) {
 758       __ ldpq(v0, v1, Address(s, 4 * unit));
 759       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 760     } else {
 761       __ ldp(t0, t1, Address(s, 2 * unit));
 762       __ ldp(t2, t3, Address(s, 4 * unit));
 763       __ ldp(t4, t5, Address(s, 6 * unit));
 764       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 765     }
 766 
 767     __ subs(count, count, 16);
 768     __ br(Assembler::LO, drain);
 769 
 770     int prefetch = PrefetchCopyIntervalInBytes;
 771     bool use_stride = false;
 772     if (direction == copy_backwards) {
 773        use_stride = prefetch > 256;
 774        prefetch = -prefetch;
 775        if (use_stride) __ mov(stride, prefetch);
 776     }
 777 
 778     __ bind(again);
 779 
 780     if (PrefetchCopyIntervalInBytes > 0)
 781       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 782 
 783     if (UseSIMDForMemoryOps) {
 784       __ stpq(v0, v1, Address(d, 4 * unit));
 785       __ ldpq(v0, v1, Address(s, 4 * unit));
 786       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 787       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 788     } else {
 789       __ stp(t0, t1, Address(d, 2 * unit));
 790       __ ldp(t0, t1, Address(s, 2 * unit));
 791       __ stp(t2, t3, Address(d, 4 * unit));
 792       __ ldp(t2, t3, Address(s, 4 * unit));
 793       __ stp(t4, t5, Address(d, 6 * unit));
 794       __ ldp(t4, t5, Address(s, 6 * unit));
 795       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 796       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 797     }
 798 
 799     __ subs(count, count, 8);
 800     __ br(Assembler::HS, again);
 801 
 802     // Drain
 803     __ bind(drain);
 804     if (UseSIMDForMemoryOps) {
 805       __ stpq(v0, v1, Address(d, 4 * unit));
 806       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 807     } else {
 808       __ stp(t0, t1, Address(d, 2 * unit));
 809       __ stp(t2, t3, Address(d, 4 * unit));
 810       __ stp(t4, t5, Address(d, 6 * unit));
 811       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 812     }
 813 
 814     {
 815       Label L1, L2;
 816       __ tbz(count, exact_log2(4), L1);
 817       if (UseSIMDForMemoryOps) {
 818         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 819         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 820       } else {
 821         __ ldp(t0, t1, Address(s, 2 * unit));
 822         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 823         __ stp(t0, t1, Address(d, 2 * unit));
 824         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 825       }
 826       __ bind(L1);
 827 
 828       if (direction == copy_forwards) {
 829         __ add(s, s, bias);
 830         __ add(d, d, bias);
 831       }
 832 
 833       __ tbz(count, 1, L2);
 834       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 835       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 836       __ bind(L2);
 837     }
 838 
 839     __ ret(lr);
 840 
 841     if (AvoidUnalignedAccesses) {
 842       Label drain, again;
 843       // Register order for storing. Order is different for backward copy.
 844 
 845       __ bind(unaligned_copy_long);
 846 
 847       // source address is even aligned, target odd aligned
 848       //
 849       // when forward copying word pairs we read long pairs at offsets
 850       // {0, 2, 4, 6} (in long words). when backwards copying we read
 851       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 852       // address by -2 in the forwards case so we can compute the
 853       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 854       // or -1.
 855       //
 856       // when forward copying we need to store 1 word, 3 pairs and
 857       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 858       // zero offset we adjust the destination by -1, which means we
 859       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 860       //
 861       // When backwards copying we need to store 1 word, 3 pairs and
 862       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 863       // offsets {1, 3, 5, 7, 8} * unit.
 864 
 865       if (direction == copy_forwards) {
 866         __ sub(s, s, 16);
 867         __ sub(d, d, 8);
 868       }
 869 
 870       // Fill 8 registers
 871       //
 872       // for forwards copy s was offset by -16 from the original input
 873       // value of s so the register contents are at these offsets
 874       // relative to the 64 bit block addressed by that original input
 875       // and so on for each successive 64 byte block when s is updated
 876       //
 877       // t0 at offset 0,  t1 at offset 8
 878       // t2 at offset 16, t3 at offset 24
 879       // t4 at offset 32, t5 at offset 40
 880       // t6 at offset 48, t7 at offset 56
 881 
 882       // for backwards copy s was not offset so the register contents
 883       // are at these offsets into the preceding 64 byte block
 884       // relative to that original input and so on for each successive
 885       // preceding 64 byte block when s is updated. this explains the
 886       // slightly counter-intuitive looking pattern of register usage
 887       // in the stp instructions for backwards copy.
 888       //
 889       // t0 at offset -16, t1 at offset -8
 890       // t2 at offset -32, t3 at offset -24
 891       // t4 at offset -48, t5 at offset -40
 892       // t6 at offset -64, t7 at offset -56
 893 
 894       __ ldp(t0, t1, Address(s, 2 * unit));
 895       __ ldp(t2, t3, Address(s, 4 * unit));
 896       __ ldp(t4, t5, Address(s, 6 * unit));
 897       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 898 
 899       __ subs(count, count, 16);
 900       __ br(Assembler::LO, drain);
 901 
 902       int prefetch = PrefetchCopyIntervalInBytes;
 903       bool use_stride = false;
 904       if (direction == copy_backwards) {
 905          use_stride = prefetch > 256;
 906          prefetch = -prefetch;
 907          if (use_stride) __ mov(stride, prefetch);
 908       }
 909 
 910       __ bind(again);
 911 
 912       if (PrefetchCopyIntervalInBytes > 0)
 913         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 914 
 915       if (direction == copy_forwards) {
 916        // allowing for the offset of -8 the store instructions place
 917        // registers into the target 64 bit block at the following
 918        // offsets
 919        //
 920        // t0 at offset 0
 921        // t1 at offset 8,  t2 at offset 16
 922        // t3 at offset 24, t4 at offset 32
 923        // t5 at offset 40, t6 at offset 48
 924        // t7 at offset 56
 925 
 926         __ str(t0, Address(d, 1 * unit));
 927         __ stp(t1, t2, Address(d, 2 * unit));
 928         __ ldp(t0, t1, Address(s, 2 * unit));
 929         __ stp(t3, t4, Address(d, 4 * unit));
 930         __ ldp(t2, t3, Address(s, 4 * unit));
 931         __ stp(t5, t6, Address(d, 6 * unit));
 932         __ ldp(t4, t5, Address(s, 6 * unit));
 933         __ str(t7, Address(__ pre(d, 8 * unit)));
 934         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 935       } else {
 936        // d was not offset when we started so the registers are
 937        // written into the 64 bit block preceding d with the following
 938        // offsets
 939        //
 940        // t1 at offset -8
 941        // t3 at offset -24, t0 at offset -16
 942        // t5 at offset -40, t2 at offset -32
 943        // t7 at offset -56, t4 at offset -48
 944        //                   t6 at offset -64
 945        //
 946        // note that this matches the offsets previously noted for the
 947        // loads
 948 
 949         __ str(t1, Address(d, 1 * unit));
 950         __ stp(t3, t0, Address(d, 3 * unit));
 951         __ ldp(t0, t1, Address(s, 2 * unit));
 952         __ stp(t5, t2, Address(d, 5 * unit));
 953         __ ldp(t2, t3, Address(s, 4 * unit));
 954         __ stp(t7, t4, Address(d, 7 * unit));
 955         __ ldp(t4, t5, Address(s, 6 * unit));
 956         __ str(t6, Address(__ pre(d, 8 * unit)));
 957         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 958       }
 959 
 960       __ subs(count, count, 8);
 961       __ br(Assembler::HS, again);
 962 
 963       // Drain
 964       //
 965       // this uses the same pattern of offsets and register arguments
 966       // as above
 967       __ bind(drain);
 968       if (direction == copy_forwards) {
 969         __ str(t0, Address(d, 1 * unit));
 970         __ stp(t1, t2, Address(d, 2 * unit));
 971         __ stp(t3, t4, Address(d, 4 * unit));
 972         __ stp(t5, t6, Address(d, 6 * unit));
 973         __ str(t7, Address(__ pre(d, 8 * unit)));
 974       } else {
 975         __ str(t1, Address(d, 1 * unit));
 976         __ stp(t3, t0, Address(d, 3 * unit));
 977         __ stp(t5, t2, Address(d, 5 * unit));
 978         __ stp(t7, t4, Address(d, 7 * unit));
 979         __ str(t6, Address(__ pre(d, 8 * unit)));
 980       }
 981       // now we need to copy any remaining part block which may
 982       // include a 4 word subblock and/or a 2 word subblock.
 983       // bits 2 and 1 in the count are the tell-tale for whether we
 984       // have each such subblock
 985       {
 986         Label L1, L2;
 987         __ tbz(count, exact_log2(4), L1);
 988        // this is the same as above but copying only 4 longs hence
 989        // with only one intervening stp between the str instructions
 990        // but note that the offsets and registers still follow the
 991        // same pattern
 992         __ ldp(t0, t1, Address(s, 2 * unit));
 993         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 994         if (direction == copy_forwards) {
 995           __ str(t0, Address(d, 1 * unit));
 996           __ stp(t1, t2, Address(d, 2 * unit));
 997           __ str(t3, Address(__ pre(d, 4 * unit)));
 998         } else {
 999           __ str(t1, Address(d, 1 * unit));
1000           __ stp(t3, t0, Address(d, 3 * unit));
1001           __ str(t2, Address(__ pre(d, 4 * unit)));
1002         }
1003         __ bind(L1);
1004 
1005         __ tbz(count, 1, L2);
1006        // this is the same as above but copying only 2 longs hence
1007        // there is no intervening stp between the str instructions
1008        // but note that the offset and register patterns are still
1009        // the same
1010         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1011         if (direction == copy_forwards) {
1012           __ str(t0, Address(d, 1 * unit));
1013           __ str(t1, Address(__ pre(d, 2 * unit)));
1014         } else {
1015           __ str(t1, Address(d, 1 * unit));
1016           __ str(t0, Address(__ pre(d, 2 * unit)));
1017         }
1018         __ bind(L2);
1019 
1020        // for forwards copy we need to re-adjust the offsets we
1021        // applied so that s and d follow the last words written
1022 
1023        if (direction == copy_forwards) {
1024          __ add(s, s, 16);
1025          __ add(d, d, 8);
1026        }
1027 
1028       }
1029 
1030       __ ret(lr);
1031     }
1032   }
1033 
1034   // Small copy: less than 16 bytes.
1035   //
1036   // NB: Ignores all of the bits of count which represent more than 15
1037   // bytes, so a caller doesn't have to mask them.
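       //
       // For example (illustrative): with step == 1 (a byte copy) and
       // count == 11 (0b1011), bit 3 copies one 8-byte word, bit 2 is
       // clear so the 4-byte step is skipped, bit 1 copies a short and
       // bit 0 copies the final byte -- 11 bytes in all.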
1038 
1039   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1040     bool is_backwards = step < 0;
1041     size_t granularity = uabs(step);
1042     int direction = is_backwards ? -1 : 1;
1043     int unit = wordSize * direction;
1044 
1045     Label Lword, Lint, Lshort, Lbyte;
1046 
1047     assert(granularity
1048            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1049 
1050     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1051 
1052     // ??? I don't know if this bit-test-and-branch is the right thing
1053     // to do.  It does a lot of jumping, resulting in several
1054     // mispredicted branches.  It might make more sense to do this
1055     // with something like Duff's device with a single computed branch.
1056 
1057     __ tbz(count, 3 - exact_log2(granularity), Lword);
1058     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1059     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1060     __ bind(Lword);
1061 
1062     if (granularity <= sizeof (jint)) {
1063       __ tbz(count, 2 - exact_log2(granularity), Lint);
1064       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1065       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1066       __ bind(Lint);
1067     }
1068 
1069     if (granularity <= sizeof (jshort)) {
1070       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1071       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1072       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1073       __ bind(Lshort);
1074     }
1075 
1076     if (granularity <= sizeof (jbyte)) {
1077       __ tbz(count, 0, Lbyte);
1078       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1079       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1080       __ bind(Lbyte);
1081     }
1082   }
1083 
1084   Label copy_f, copy_b;
1085 
1086   // All-singing all-dancing memory copy.
1087   //
1088   // Copy count units of memory from s to d.  The size of a unit is
1089   // step, which can be positive or negative depending on the direction
1090   // of copy.  If is_aligned is false, we align the source address.
1091   //
1092 
1093   void copy_memory(bool is_aligned, Register s, Register d,
1094                    Register count, Register tmp, int step) {
1095     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1096     bool is_backwards = step < 0;
1097     int granularity = uabs(step);
1098     const Register t0 = r3, t1 = r4;
1099 
1100     // <= 96 bytes do inline. Direction doesn't matter because we always
1101     // load all the data before writing anything
1102     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1103     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1104     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1105     const Register send = r17, dend = r18;
1106 
1107     if (PrefetchCopyIntervalInBytes > 0)
1108       __ prfm(Address(s, 0), PLDL1KEEP);
1109     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1110     __ br(Assembler::HI, copy_big);
1111 
1112     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1113     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1114 
1115     __ cmp(count, u1(16/granularity));
1116     __ br(Assembler::LS, copy16);
1117 
1118     __ cmp(count, u1(64/granularity));
1119     __ br(Assembler::HI, copy80);
1120 
1121     __ cmp(count, u1(32/granularity));
1122     __ br(Assembler::LS, copy32);
1123 
1124     // 33..64 bytes
1125     if (UseSIMDForMemoryOps) {
1126       __ ldpq(v0, v1, Address(s, 0));
1127       __ ldpq(v2, v3, Address(send, -32));
1128       __ stpq(v0, v1, Address(d, 0));
1129       __ stpq(v2, v3, Address(dend, -32));
1130     } else {
1131       __ ldp(t0, t1, Address(s, 0));
1132       __ ldp(t2, t3, Address(s, 16));
1133       __ ldp(t4, t5, Address(send, -32));
1134       __ ldp(t6, t7, Address(send, -16));
1135 
1136       __ stp(t0, t1, Address(d, 0));
1137       __ stp(t2, t3, Address(d, 16));
1138       __ stp(t4, t5, Address(dend, -32));
1139       __ stp(t6, t7, Address(dend, -16));
1140     }
1141     __ b(finish);
1142 
1143     // 17..32 bytes
1144     __ bind(copy32);
1145     __ ldp(t0, t1, Address(s, 0));
1146     __ ldp(t2, t3, Address(send, -16));
1147     __ stp(t0, t1, Address(d, 0));
1148     __ stp(t2, t3, Address(dend, -16));
1149     __ b(finish);
1150 
1151     // 65..80/96 bytes
1152     // (96 bytes if SIMD because we do 32 bytes per instruction)
1153     __ bind(copy80);
1154     if (UseSIMDForMemoryOps) {
1155       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1156       __ ldpq(v4, v5, Address(send, -32));
1157       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1158       __ stpq(v4, v5, Address(dend, -32));
1159     } else {
1160       __ ldp(t0, t1, Address(s, 0));
1161       __ ldp(t2, t3, Address(s, 16));
1162       __ ldp(t4, t5, Address(s, 32));
1163       __ ldp(t6, t7, Address(s, 48));
1164       __ ldp(t8, t9, Address(send, -16));
1165 
1166       __ stp(t0, t1, Address(d, 0));
1167       __ stp(t2, t3, Address(d, 16));
1168       __ stp(t4, t5, Address(d, 32));
1169       __ stp(t6, t7, Address(d, 48));
1170       __ stp(t8, t9, Address(dend, -16));
1171     }
1172     __ b(finish);
1173 
1174     // 0..16 bytes
1175     __ bind(copy16);
1176     __ cmp(count, u1(8/granularity));
1177     __ br(Assembler::LO, copy8);
1178 
1179     // 8..16 bytes
1180     __ ldr(t0, Address(s, 0));
1181     __ ldr(t1, Address(send, -8));
1182     __ str(t0, Address(d, 0));
1183     __ str(t1, Address(dend, -8));
1184     __ b(finish);
1185 
1186     if (granularity < 8) {
1187       // 4..7 bytes
1188       __ bind(copy8);
1189       __ tbz(count, 2 - exact_log2(granularity), copy4);
1190       __ ldrw(t0, Address(s, 0));
1191       __ ldrw(t1, Address(send, -4));
1192       __ strw(t0, Address(d, 0));
1193       __ strw(t1, Address(dend, -4));
1194       __ b(finish);
1195       if (granularity < 4) {
1196         // 0..3 bytes
1197         __ bind(copy4);
1198         __ cbz(count, finish); // get rid of 0 case
1199         if (granularity == 2) {
1200           __ ldrh(t0, Address(s, 0));
1201           __ strh(t0, Address(d, 0));
1202         } else { // granularity == 1
1203           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1204           // the first and last byte.
1205           // Handle the 3 byte case by loading and storing base + count/2
1206           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1207           // This does mean that in the 1 byte case we load/store the same
1208           // byte 3 times.
1209           __ lsr(count, count, 1);
1210           __ ldrb(t0, Address(s, 0));
1211           __ ldrb(t1, Address(send, -1));
1212           __ ldrb(t2, Address(s, count));
1213           __ strb(t0, Address(d, 0));
1214           __ strb(t1, Address(dend, -1));
1215           __ strb(t2, Address(d, count));
1216         }
1217         __ b(finish);
1218       }
1219     }
1220 
1221     __ bind(copy_big);
1222     if (is_backwards) {
1223       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1224       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1225     }
1226 
1227     // Now we've got the small case out of the way we can align the
1228     // source address on a 2-word boundary.
1229 
1230     Label aligned;
1231 
1232     if (is_aligned) {
1233       // We may have to adjust by 1 word to get s 2-word-aligned.
1234       __ tbz(s, exact_log2(wordSize), aligned);
1235       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1236       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1237       __ sub(count, count, wordSize/granularity);
1238     } else {
1239       if (is_backwards) {
1240         __ andr(rscratch2, s, 2 * wordSize - 1);
1241       } else {
1242         __ neg(rscratch2, s);
1243         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1244       }
1245       // rscratch2 is the byte adjustment needed to align s.
1246       __ cbz(rscratch2, aligned);
1247       int shift = exact_log2(granularity);
1248       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1249       __ sub(count, count, rscratch2);
1250 
1251 #if 0
1252       // ?? This code is only correct for a disjoint copy.  It may or
1253       // may not make sense to use it in that case.
1254 
1255       // Copy the first pair; s and d may not be aligned.
1256       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1257       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1258 
1259       // Align s and d, adjust count
1260       if (is_backwards) {
1261         __ sub(s, s, rscratch2);
1262         __ sub(d, d, rscratch2);
1263       } else {
1264         __ add(s, s, rscratch2);
1265         __ add(d, d, rscratch2);
1266       }
1267 #else
1268       copy_memory_small(s, d, rscratch2, rscratch1, step);
1269 #endif
1270     }
1271 
1272     __ bind(aligned);
1273 
1274     // s is now 2-word-aligned.
1275 
1276     // We have a count of units and some trailing bytes.  Adjust the
1277     // count and do a bulk copy of words.
1278     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1279     if (direction == copy_forwards)
1280       __ bl(copy_f);
1281     else
1282       __ bl(copy_b);
1283 
1284     // And the tail.
1285     copy_memory_small(s, d, count, tmp, step);
1286 
1287     if (granularity >= 8) __ bind(copy8);
1288     if (granularity >= 4) __ bind(copy4);
1289     __ bind(finish);
1290   }
1291 
1292 
1293   void clobber_registers() {
1294 #ifdef ASSERT
1295     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1296     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1297     for (Register r = r3; r <= r18; r++)
1298       if (r != rscratch1) __ mov(r, rscratch1);
1299 #endif
1300   }
1301 
1302   // Scan over array at a for count oops, verifying each one.
1303   // Preserves a and count, clobbers rscratch1 and rscratch2.
1304   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1305     Label loop, end;
1306     __ mov(rscratch1, a);
1307     __ mov(rscratch2, zr);
1308     __ bind(loop);
1309     __ cmp(rscratch2, count);
1310     __ br(Assembler::HS, end);
1311     if (size == (size_t)wordSize) {
1312       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1313       __ verify_oop(temp);
1314     } else {
1315       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1316       __ decode_heap_oop(temp); // calls verify_oop
1317     }
1318     __ add(rscratch2, rscratch2, size);
1319     __ b(loop);
1320     __ bind(end);
1321   }
1322 
1323   // Arguments:
1324   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1325   //             ignored
1326   //   is_oop  - true => oop array, so generate store check code
1327   //   name    - stub name string
1328   //
1329   // Inputs:
1330   //   c_rarg0   - source array address
1331   //   c_rarg1   - destination array address
1332   //   c_rarg2   - element count, treated as ssize_t, can be zero
1333   //
1334   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1335   // the hardware handle it.  The two dwords within qwords that span
1336   // cache line boundaries will still be loaded and stored atomically.
1337   //
1338   // Side Effects:
1339   //   disjoint_int_copy_entry is set to the no-overlap entry point
1340   //   used by generate_conjoint_int_oop_copy().
1341   //
1342   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1343                                   const char *name, bool dest_uninitialized = false) {
1344     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1345     RegSet saved_reg = RegSet::of(s, d, count);
1346     __ align(CodeEntryAlignment);
1347     StubCodeMark mark(this, "StubRoutines", name);
1348     address start = __ pc();
1349     __ enter();
1350 
1351     if (entry != NULL) {
1352       *entry = __ pc();
1353       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1354       BLOCK_COMMENT("Entry:");
1355     }
1356 
1357     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1358     if (dest_uninitialized) {
1359       decorators |= IS_DEST_UNINITIALIZED;
1360     }
1361     if (aligned) {
1362       decorators |= ARRAYCOPY_ALIGNED;
1363     }
1364 
1365     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1366     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg);
1367 
1368     if (is_oop) {
1369       // save regs before copy_memory
1370       __ push(RegSet::of(d, count), sp);
1371     }
1372     copy_memory(aligned, s, d, count, rscratch1, size);
1373 
1374     if (is_oop) {
1375       __ pop(RegSet::of(d, count), sp);
1376       if (VerifyOops)
1377         verify_oop_array(size, d, count, r16);
1378     }
1379 
1380     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1381 
1382     __ leave();
1383     __ mov(r0, zr); // return 0
1384     __ ret(lr);
1385 #ifdef BUILTIN_SIM
1386     {
1387       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1388       sim->notifyCompile(const_cast<char*>(name), start);
1389     }
1390 #endif
1391     return start;
1392   }
1393 
1394   // Arguments:
1395   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1396   //             ignored
1397   //   is_oop  - true => oop array, so generate store check code
1398   //   name    - stub name string
1399   //
1400   // Inputs:
1401   //   c_rarg0   - source array address
1402   //   c_rarg1   - destination array address
1403   //   c_rarg2   - element count, treated as ssize_t, can be zero
1404   //
1405   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1406   // the hardware handle it.  The two dwords within qwords that span
1407   // cache line boundaries will still be loaded and stored atomically.
1408   //
1409   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1410                                  address *entry, const char *name,
1411                                  bool dest_uninitialized = false) {
1412     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1413     RegSet saved_regs = RegSet::of(s, d, count);
1414     StubCodeMark mark(this, "StubRoutines", name);
1415     address start = __ pc();
1416     __ enter();
1417 
1418     if (entry != NULL) {
1419       *entry = __ pc();
1420       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1421       BLOCK_COMMENT("Entry:");
1422     }
1423 
1424     // use fwd copy when (d-s) above_equal (count*size)
1425     __ sub(rscratch1, d, s);
1426     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1427     __ br(Assembler::HS, nooverlap_target);
1428 
1429     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1430     if (dest_uninitialized) {
1431       decorators |= IS_DEST_UNINITIALIZED;
1432     }
1433     if (aligned) {
1434       decorators |= ARRAYCOPY_ALIGNED;
1435     }
1436 
1437     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1438     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs);
1439 
1440     if (is_oop) {
1441       // save regs before copy_memory
1442       __ push(RegSet::of(d, count), sp);
1443     }
1444     copy_memory(aligned, s, d, count, rscratch1, -size);
1445     if (is_oop) {
1446       __ pop(RegSet::of(d, count), sp);
1447       if (VerifyOops)
1448         verify_oop_array(size, d, count, r16);
1449     }
1450     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1451     __ leave();
1452     __ mov(r0, zr); // return 0
1453     __ ret(lr);
1454 #ifdef BUILTIN_SIM
1455     {
1456       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1457       sim->notifyCompile(const_cast<char*>(name), start);
1458     }
1459 #endif
1460     return start;
1461   }
1462 
1463   // Arguments:
1464   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1465   //             ignored
1466   //   name    - stub name string
1467   //
1468   // Inputs:
1469   //   c_rarg0   - source array address
1470   //   c_rarg1   - destination array address
1471   //   c_rarg2   - element count, treated as ssize_t, can be zero
1472   //
1473   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1474   // we let the hardware handle it.  The one to eight bytes within words,
1475   // dwords or qwords that span cache line boundaries will still be loaded
1476   // and stored atomically.
1477   //
1478   // Side Effects:
1479   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1487   //   used by generate_conjoint_byte_copy().
1488   //
1489   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1490     const bool not_oop = false;
1491     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1492   }
1493 
1494   // Arguments:
1495   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1496   //             ignored
1497   //   name    - stub name string
1498   //
1499   // Inputs:
1500   //   c_rarg0   - source array address
1501   //   c_rarg1   - destination array address
1502   //   c_rarg2   - element count, treated as ssize_t, can be zero
1503   //
1504   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1505   // we let the hardware handle it.  The one to eight bytes within words,
1506   // dwords or qwords that span cache line boundaries will still be loaded
1507   // and stored atomically.
1508   //
1509   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1510                                       address* entry, const char *name) {
1511     const bool not_oop = false;
1512     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1513   }
1514 
1515   // Arguments:
1516   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1517   //             ignored
1518   //   name    - stub name string
1519   //
1520   // Inputs:
1521   //   c_rarg0   - source array address
1522   //   c_rarg1   - destination array address
1523   //   c_rarg2   - element count, treated as ssize_t, can be zero
1524   //
1525   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1526   // let the hardware handle it.  The two or four words within dwords
1527   // or qwords that span cache line boundaries will still be loaded
1528   // and stored atomically.
1529   //
1530   // Side Effects:
1531   //   disjoint_short_copy_entry is set to the no-overlap entry point
1532   //   used by generate_conjoint_short_copy().
1533   //
1534   address generate_disjoint_short_copy(bool aligned,
1535                                        address* entry, const char *name) {
1536     const bool not_oop = false;
1537     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1538   }
1539 
1540   // Arguments:
1541   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1542   //             ignored
1543   //   name    - stub name string
1544   //
1545   // Inputs:
1546   //   c_rarg0   - source array address
1547   //   c_rarg1   - destination array address
1548   //   c_rarg2   - element count, treated as ssize_t, can be zero
1549   //
1550   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1551   // let the hardware handle it.  The two or four words within dwords
1552   // or qwords that span cache line boundaries will still be loaded
1553   // and stored atomically.
1554   //
1555   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1556                                        address *entry, const char *name) {
1557     const bool not_oop = false;
1558     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1559   }
1560 
1561   // Arguments:
1562   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1563   //             ignored
1564   //   name    - stub name string
1565   //
1566   // Inputs:
1567   //   c_rarg0   - source array address
1568   //   c_rarg1   - destination array address
1569   //   c_rarg2   - element count, treated as ssize_t, can be zero
1570   //
1571   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1572   // the hardware handle it.  The two dwords within qwords that span
1573   // cache line boundaries will still be loaded and stored atomically.
1574   //
1575   // Side Effects:
1576   //   disjoint_int_copy_entry is set to the no-overlap entry point
1577   //   used by generate_conjoint_int_oop_copy().
1578   //
1579   address generate_disjoint_int_copy(bool aligned, address *entry,
1580                                          const char *name, bool dest_uninitialized = false) {
1581     const bool not_oop = false;
1582     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1583   }
1584 
1585   // Arguments:
1586   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1587   //             ignored
1588   //   name    - stub name string
1589   //
1590   // Inputs:
1591   //   c_rarg0   - source array address
1592   //   c_rarg1   - destination array address
1593   //   c_rarg2   - element count, treated as ssize_t, can be zero
1594   //
1595   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1596   // the hardware handle it.  The two dwords within qwords that span
1597   // cache line boundaries will still be loaded and stored atomically.
1598   //
1599   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1600                                      address *entry, const char *name,
1601                                      bool dest_uninitialized = false) {
1602     const bool not_oop = false;
1603     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1604   }
1605 
1606 
1607   // Arguments:
1608   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1609   //             ignored
1610   //   name    - stub name string
1611   //
1612   // Inputs:
1613   //   c_rarg0   - source array address
1614   //   c_rarg1   - destination array address
1615   //   c_rarg2   - element count, treated as size_t, can be zero
1616   //
1617   // Side Effects:
1618   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1619   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1620   //
1621   address generate_disjoint_long_copy(bool aligned, address *entry,
1622                                           const char *name, bool dest_uninitialized = false) {
1623     const bool not_oop = false;
1624     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1625   }
1626 
1627   // Arguments:
1628   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1629   //             ignored
1630   //   name    - stub name string
1631   //
1632   // Inputs:
1633   //   c_rarg0   - source array address
1634   //   c_rarg1   - destination array address
1635   //   c_rarg2   - element count, treated as size_t, can be zero
1636   //
1637   address generate_conjoint_long_copy(bool aligned,
1638                                       address nooverlap_target, address *entry,
1639                                       const char *name, bool dest_uninitialized = false) {
1640     const bool not_oop = false;
1641     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1642   }
1643 
1644   // Arguments:
1645   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1646   //             ignored
1647   //   name    - stub name string
1648   //
1649   // Inputs:
1650   //   c_rarg0   - source array address
1651   //   c_rarg1   - destination array address
1652   //   c_rarg2   - element count, treated as size_t, can be zero
1653   //
1654   // Side Effects:
1655   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1656   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1657   //
1658   address generate_disjoint_oop_copy(bool aligned, address *entry,
1659                                      const char *name, bool dest_uninitialized) {
1660     const bool is_oop = true;
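         // With compressed oops each element is a 4-byte narrowOop, otherwise a
         // full 8-byte oop, so the element size passed to the generic copy differs.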
1661     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1662     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1663   }
1664 
1665   // Arguments:
1666   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1667   //             ignored
1668   //   name    - stub name string
1669   //
1670   // Inputs:
1671   //   c_rarg0   - source array address
1672   //   c_rarg1   - destination array address
1673   //   c_rarg2   - element count, treated as size_t, can be zero
1674   //
1675   address generate_conjoint_oop_copy(bool aligned,
1676                                      address nooverlap_target, address *entry,
1677                                      const char *name, bool dest_uninitialized) {
1678     const bool is_oop = true;
1679     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1680     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1681                                   name, dest_uninitialized);
1682   }
1683 
1684 
1685   // Helper for generating a dynamic type check.
1686   // Smashes rscratch1, rscratch2.
1687   void generate_type_check(Register sub_klass,
1688                            Register super_check_offset,
1689                            Register super_klass,
1690                            Label& L_success) {
1691     assert_different_registers(sub_klass, super_check_offset, super_klass);
1692 
1693     BLOCK_COMMENT("type_check:");
1694 
1695     Label L_miss;
1696 
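         // The fast path decides the common cases (exact klass match and a hit
         // in the cached super_check_offset slot); when it cannot decide, the
         // slow path scans the secondary supers list.  Either path branches to
         // L_success on a hit; otherwise control falls through to L_miss below.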
1697     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1698                                      super_check_offset);
1699     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1700 
1701     // Fall through on failure!
1702     __ BIND(L_miss);
1703   }
1704 
1705   //
1706   //  Generate checkcasting array copy stub
1707   //
1708   //  Input:
1709   //    c_rarg0   - source array address
1710   //    c_rarg1   - destination array address
1711   //    c_rarg2   - element count, treated as ssize_t, can be zero
1712   //    c_rarg3   - size_t ckoff (super_check_offset)
1713   //    c_rarg4   - oop ckval (super_klass)
1714   //
1715   //  Output:
1716   //    r0 ==  0  -  success
1717   //    r0 == -1^K - failure, where K is partial transfer count
1718   //
1719   address generate_checkcast_copy(const char *name, address *entry,
1720                                   bool dest_uninitialized = false) {
1721 
1722     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1723 
1724     // Input registers (after setup_arg_regs)
1725     const Register from        = c_rarg0;   // source array address
1726     const Register to          = c_rarg1;   // destination array address
1727     const Register count       = c_rarg2;   // elements count
1728     const Register ckoff       = c_rarg3;   // super_check_offset
1729     const Register ckval       = c_rarg4;   // super_klass
1730 
1731     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1732     RegSet wb_post_saved_regs = RegSet::of(count);
1733 
1734     // Registers used as temps (r18, r19, r20 are save-on-entry)
1735     const Register count_save  = r21;       // orig elements count
1736     const Register start_to    = r20;       // destination array start address
1737     const Register copied_oop  = r18;       // actual oop copied
1738     const Register r19_klass   = r19;       // oop._klass
1739 
1740     //---------------------------------------------------------------
1741     // Assembler stub will be used for this call to arraycopy
1742     // if the two arrays are subtypes of Object[] but the
1743     // destination array type is not equal to or a supertype
1744     // of the source type.  Each element must be separately
1745     // checked.
1746 
1747     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1748                                copied_oop, r19_klass, count_save);
1749 
1750     __ align(CodeEntryAlignment);
1751     StubCodeMark mark(this, "StubRoutines", name);
1752     address start = __ pc();
1753 
1754     __ enter(); // required for proper stackwalking of RuntimeStub frame
1755 
1756 #ifdef ASSERT
1757     // caller guarantees that the arrays really are different
1758     // otherwise, we would have to make conjoint checks
1759     { Label L;
1760       array_overlap_test(L, TIMES_OOP);
1761       __ stop("checkcast_copy within a single array");
1762       __ bind(L);
1763     }
1764 #endif //ASSERT
1765 
1766     // Caller of this entry point must set up the argument registers.
1767     if (entry != NULL) {
1768       *entry = __ pc();
1769       BLOCK_COMMENT("Entry:");
1770     }
1771 
1772     // Empty array:  Nothing to do.
1773     __ cbz(count, L_done);
1774 
1775     __ push(RegSet::of(r18, r19, r20, r21), sp);
1776 
1777 #ifdef ASSERT
1778     BLOCK_COMMENT("assert consistent ckoff/ckval");
1779     // The ckoff and ckval must be mutually consistent,
1780     // even though caller generates both.
1781     { Label L;
1782       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1783       __ ldrw(start_to, Address(ckval, sco_offset));
1784       __ cmpw(ckoff, start_to);
1785       __ br(Assembler::EQ, L);
1786       __ stop("super_check_offset inconsistent");
1787       __ bind(L);
1788     }
1789 #endif //ASSERT
1790 
1791     // Note: checkcast arraycopy is always disjoint. If it were not, then we wouldn't
1792     // need to checkcast.
1793     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1794     bool is_oop = true;
1795     if (dest_uninitialized) {
1796       decorators |= IS_DEST_UNINITIALIZED;
1797     }
1798 
1799     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1800     bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs);
1801 
1802     // save the original count
1803     __ mov(count_save, count);
1804 
1805     // Copy from low to high addresses
1806     __ mov(start_to, to);              // Save destination array start address
1807     __ b(L_load_element);
1808 
1809     // ======== begin loop ========
1810     // (Loop is rotated; its entry is L_load_element.)
1811     // Loop control:
1812     //   for (; count != 0; count--) {
1813     //     copied_oop = load_heap_oop(from++);
1814     //     ... generate_type_check ...;
1815     //     store_heap_oop(to++, copied_oop);
1816     //   }
1817     __ align(OptoLoopAlignment);
1818 
1819     __ BIND(L_store_element);
1820     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1821     __ sub(count, count, 1);
1822     __ cbz(count, L_do_card_marks);
1823 
1824     // ======== loop entry is here ========
1825     __ BIND(L_load_element);
1826     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1827     __ cbz(copied_oop, L_store_element);
1828 
1829     __ load_klass(r19_klass, copied_oop);// query the object klass
1830     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1831     // ======== end loop ========
1832 
1833     // It was a real error; we must depend on the caller to finish the job.
1834     // Register count = remaining oops, count_orig = total oops.
1835     // Emit GC store barriers for the oops we have copied and report
1836     // their number to the caller.
1837 
1838     __ subs(count, count_save, count);     // K = partially copied oop count
1839     __ eon(count, count, zr);                   // report (-1^K) to caller
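         // eon(count, count, zr) computes ~K (i.e. -1 - K), the "-1^K" failure
         // encoding documented above; the caller recovers the partial transfer
         // count as ~r0.  The flags tested by the EQ branch below come from the
         // subs above, not from eon.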
1840     __ br(Assembler::EQ, L_done_pop);
1841 
1842     __ BIND(L_do_card_marks);
1843     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1844 
1845     __ bind(L_done_pop);
1846     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1847     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1848 
1849     __ bind(L_done);
1850     __ mov(r0, count);
1851     __ leave();
1852     __ ret(lr);
1853 
1854     return start;
1855   }
1856 
1857   // Perform range checks on the proposed arraycopy.
1858   // Kills temp, but nothing else.
1859   // Also, clean the sign bits of src_pos and dst_pos.
1860   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1861                               Register src_pos, // source position (c_rarg1)
1862                               Register dst,     // destination array oop (c_rarg2)
1863                               Register dst_pos, // destination position (c_rarg3)
1864                               Register length,
1865                               Register temp,
1866                               Label& L_failed) {
1867     BLOCK_COMMENT("arraycopy_range_checks:");
1868 
1869     assert_different_registers(rscratch1, temp);
1870 
1871     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1872     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1873     __ addw(temp, length, src_pos);
1874     __ cmpw(temp, rscratch1);
1875     __ br(Assembler::HI, L_failed);
1876 
1877     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1878     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1879     __ addw(temp, length, dst_pos);
1880     __ cmpw(temp, rscratch1);
1881     __ br(Assembler::HI, L_failed);
1882 
1883     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1884     __ movw(src_pos, src_pos);
1885     __ movw(dst_pos, dst_pos);
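         // A 32-bit register write zero-extends into the full 64-bit register,
         // so "movw wN, wN" is a cheap way to clear bits 63:32.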
1886 
1887     BLOCK_COMMENT("arraycopy_range_checks done");
1888   }
1889 
1890   // These stubs get called from some dumb test routine.
1891   // I'll write them properly when they're called from
1892   // something that's actually doing something.
1893   static void fake_arraycopy_stub(address src, address dst, int count) {
1894     assert(count == 0, "huh?");
1895   }
1896 
1897 
1898   //
1899   //  Generate 'unsafe' array copy stub
1900   //  Though just as safe as the other stubs, it takes an unscaled
1901   //  size_t argument instead of an element count.
1902   //
1903   //  Input:
1904   //    c_rarg0   - source array address
1905   //    c_rarg1   - destination array address
1906   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1907   //
1908   // Examines the alignment of the operands and dispatches
1909   // to a long, int, short, or byte copy loop.
1910   //
1911   address generate_unsafe_copy(const char *name,
1912                                address byte_copy_entry,
1913                                address short_copy_entry,
1914                                address int_copy_entry,
1915                                address long_copy_entry) {
1916     Label L_long_aligned, L_int_aligned, L_short_aligned;
1917     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1918 
1919     __ align(CodeEntryAlignment);
1920     StubCodeMark mark(this, "StubRoutines", name);
1921     address start = __ pc();
1922     __ enter(); // required for proper stackwalking of RuntimeStub frame
1923 
1924     // bump this on entry, not on exit:
1925     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1926 
1927     __ orr(rscratch1, s, d);
1928     __ orr(rscratch1, rscratch1, count);
1929 
1930     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1931     __ cbz(rscratch1, L_long_aligned);
1932     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1933     __ cbz(rscratch1, L_int_aligned);
1934     __ tbz(rscratch1, 0, L_short_aligned);
1935     __ b(RuntimeAddress(byte_copy_entry));
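         // In effect (a sketch of the dispatch above; BytesPerLong == 8,
         // BytesPerInt == 4):
         //   int a = (s | d | count) & 7;
         //   if (a == 0)            -> long copy  (count >>= 3 at the label)
         //   else if ((a & 3) == 0) -> int copy   (count >>= 2)
         //   else if ((a & 1) == 0) -> short copy (count >>= 1)
         //   else                   -> byte copy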
1936 
1937     __ BIND(L_short_aligned);
1938     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1939     __ b(RuntimeAddress(short_copy_entry));
1940     __ BIND(L_int_aligned);
1941     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1942     __ b(RuntimeAddress(int_copy_entry));
1943     __ BIND(L_long_aligned);
1944     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1945     __ b(RuntimeAddress(long_copy_entry));
1946 
1947     return start;
1948   }
1949 
1950   //
1951   //  Generate generic array copy stubs
1952   //
1953   //  Input:
1954   //    c_rarg0    -  src oop
1955   //    c_rarg1    -  src_pos (32-bits)
1956   //    c_rarg2    -  dst oop
1957   //    c_rarg3    -  dst_pos (32-bits)
1958   //    c_rarg4    -  element count (32-bits)
1959   //
1960   //  Output:
1961   //    r0 ==  0  -  success
1962   //    r0 == -1^K - failure, where K is partial transfer count
1963   //
1964   address generate_generic_copy(const char *name,
1965                                 address byte_copy_entry, address short_copy_entry,
1966                                 address int_copy_entry, address oop_copy_entry,
1967                                 address long_copy_entry, address checkcast_copy_entry) {
1968 
1969     Label L_failed, L_objArray;
1970     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1971 
1972     // Input registers
1973     const Register src        = c_rarg0;  // source array oop
1974     const Register src_pos    = c_rarg1;  // source position
1975     const Register dst        = c_rarg2;  // destination array oop
1976     const Register dst_pos    = c_rarg3;  // destination position
1977     const Register length     = c_rarg4;
1978 
1979 
1980     // Registers used as temps
1981     const Register dst_klass  = c_rarg5;
1982 
1983     __ align(CodeEntryAlignment);
1984 
1985     StubCodeMark mark(this, "StubRoutines", name);
1986 
1987     address start = __ pc();
1988 
1989     __ enter(); // required for proper stackwalking of RuntimeStub frame
1990 
1991     // bump this on entry, not on exit:
1992     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1993 
1994     //-----------------------------------------------------------------------
1995     // Assembler stub will be used for this call to arraycopy
1996     // if the following conditions are met:
1997     //
1998     // (1) src and dst must not be null.
1999     // (2) src_pos must not be negative.
2000     // (3) dst_pos must not be negative.
2001     // (4) length  must not be negative.
2002     // (5) src klass and dst klass should be the same and not NULL.
2003     // (6) src and dst should be arrays.
2004     // (7) src_pos + length must not exceed length of src.
2005     // (8) dst_pos + length must not exceed length of dst.
2006     //
2007 
2008     //  if (src == NULL) return -1;
2009     __ cbz(src, L_failed);
2010 
2011     //  if (src_pos < 0) return -1;
2012     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2013 
2014     //  if (dst == NULL) return -1;
2015     __ cbz(dst, L_failed);
2016 
2017     //  if (dst_pos < 0) return -1;
2018     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2019 
2020     // registers used as temp
2021     const Register scratch_length    = r16; // elements count to copy
2022     const Register scratch_src_klass = r17; // array klass
2023     const Register lh                = r18; // layout helper
2024 
2025     //  if (length < 0) return -1;
2026     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2027     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2028 
2029     __ load_klass(scratch_src_klass, src);
2030 #ifdef ASSERT
2031     //  assert(src->klass() != NULL);
2032     {
2033       BLOCK_COMMENT("assert klasses not null {");
2034       Label L1, L2;
2035       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2036       __ bind(L1);
2037       __ stop("broken null klass");
2038       __ bind(L2);
2039       __ load_klass(rscratch1, dst);
2040       __ cbz(rscratch1, L1);     // this would be broken also
2041       BLOCK_COMMENT("} assert klasses not null done");
2042     }
2043 #endif
2044 
2045     // Load layout helper (32-bits)
2046     //
2047     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2048     // 32        30    24            16              8     2                 0
2049     //
2050     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
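         //   Decoded below, roughly, using the Klass::_lh_* constants:
         //     log2_element_size = lh & _lh_log2_element_size_mask;
         //     header_size       = (lh >> _lh_header_size_shift) & _lh_header_size_mask;
         //     array_tag         = lh >> _lh_array_tag_shift;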
2051     //
2052 
2053     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2054 
2055     // Handle objArrays completely differently...
2056     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2057     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2058     __ movw(rscratch1, objArray_lh);
2059     __ eorw(rscratch2, lh, rscratch1);
2060     __ cbzw(rscratch2, L_objArray);
2061 
2062     //  if (src->klass() != dst->klass()) return -1;
2063     __ load_klass(rscratch2, dst);
2064     __ eor(rscratch2, rscratch2, scratch_src_klass);
2065     __ cbnz(rscratch2, L_failed);
2066 
2067     //  if (!src->is_Array()) return -1;
2068     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2069 
2070     // At this point, it is known to be a typeArray (array_tag 0x3).
2071 #ifdef ASSERT
2072     {
2073       BLOCK_COMMENT("assert primitive array {");
2074       Label L;
2075       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2076       __ cmpw(lh, rscratch2);
2077       __ br(Assembler::GE, L);
2078       __ stop("must be a primitive array");
2079       __ bind(L);
2080       BLOCK_COMMENT("} assert primitive array done");
2081     }
2082 #endif
2083 
2084     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2085                            rscratch2, L_failed);
2086 
2087     // TypeArrayKlass
2088     //
2089     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2090     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2091     //
2092 
2093     const Register rscratch1_offset = rscratch1;    // array offset
2094     const Register r18_elsize = lh; // element size
2095 
2096     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2097            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2098     __ add(src, src, rscratch1_offset);           // src array offset
2099     __ add(dst, dst, rscratch1_offset);           // dst array offset
2100     BLOCK_COMMENT("choose copy loop based on element size");
2101 
2102     // next registers should be set before the jump to corresponding stub
2103     const Register from     = c_rarg0;  // source array address
2104     const Register to       = c_rarg1;  // destination array address
2105     const Register count    = c_rarg2;  // elements count
2106 
2107     // 'from', 'to' and 'count' must be set in this order, since they alias
2108     // 'src', 'src_pos' and 'dst'; setting them early would clobber inputs.
2109 
2110     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2111 
2112     // The possible values of elsize are 0-3, i.e. exact_log2(element
2113     // size in bytes).  We do a simple bitwise binary search.
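         // That is: elsize 0 -> byte copy, 1 -> short copy, 2 -> int copy,
         // 3 -> long copy, decided by testing bit 1 and then bit 0 of r18_elsize.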
2114   __ BIND(L_copy_bytes);
2115     __ tbnz(r18_elsize, 1, L_copy_ints);
2116     __ tbnz(r18_elsize, 0, L_copy_shorts);
2117     __ lea(from, Address(src, src_pos));// src_addr
2118     __ lea(to,   Address(dst, dst_pos));// dst_addr
2119     __ movw(count, scratch_length); // length
2120     __ b(RuntimeAddress(byte_copy_entry));
2121 
2122   __ BIND(L_copy_shorts);
2123     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2124     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2125     __ movw(count, scratch_length); // length
2126     __ b(RuntimeAddress(short_copy_entry));
2127 
2128   __ BIND(L_copy_ints);
2129     __ tbnz(r18_elsize, 0, L_copy_longs);
2130     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2131     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2132     __ movw(count, scratch_length); // length
2133     __ b(RuntimeAddress(int_copy_entry));
2134 
2135   __ BIND(L_copy_longs);
2136 #ifdef ASSERT
2137     {
2138       BLOCK_COMMENT("assert long copy {");
2139       Label L;
2140       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2141       __ cmpw(r18_elsize, LogBytesPerLong);
2142       __ br(Assembler::EQ, L);
2143       __ stop("must be long copy, but elsize is wrong");
2144       __ bind(L);
2145       BLOCK_COMMENT("} assert long copy done");
2146     }
2147 #endif
2148     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2149     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2150     __ movw(count, scratch_length); // length
2151     __ b(RuntimeAddress(long_copy_entry));
2152 
2153     // ObjArrayKlass
2154   __ BIND(L_objArray);
2155     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2156 
2157     Label L_plain_copy, L_checkcast_copy;
2158     //  test array classes for subtyping
2159     __ load_klass(r18, dst);
2160     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2161     __ br(Assembler::NE, L_checkcast_copy);
2162 
2163     // Identically typed arrays can be copied without element-wise checks.
2164     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2165                            rscratch2, L_failed);
2166 
2167     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2168     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2169     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2170     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2171     __ movw(count, scratch_length); // length
2172   __ BIND(L_plain_copy);
2173     __ b(RuntimeAddress(oop_copy_entry));
2174 
2175   __ BIND(L_checkcast_copy);
2176     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2177     {
2178       // Before looking at dst.length, make sure dst is also an objArray.
2179       __ ldrw(rscratch1, Address(r18, lh_offset));
2180       __ movw(rscratch2, objArray_lh);
2181       __ eorw(rscratch1, rscratch1, rscratch2);
2182       __ cbnzw(rscratch1, L_failed);
2183 
2184       // It is safe to examine both src.length and dst.length.
2185       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2186                              r18, L_failed);
2187 
2188       __ load_klass(dst_klass, dst); // reload
2189 
2190       // Marshal the base address arguments now, freeing registers.
2191       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2192       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2193       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2194       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2195       __ movw(count, length);           // length (reloaded)
2196       Register sco_temp = c_rarg3;      // this register is free now
2197       assert_different_registers(from, to, count, sco_temp,
2198                                  dst_klass, scratch_src_klass);
2199       // assert_clean_int(count, sco_temp);
2200 
2201       // Generate the type check.
2202       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2203       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2204 
2205       // Smashes rscratch1, rscratch2
2206       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2207 
2208       // Fetch destination element klass from the ObjArrayKlass header.
2209       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2210       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2211       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2212 
2213       // the checkcast_copy loop needs two extra arguments:
2214       assert(c_rarg3 == sco_temp, "#3 already in place");
2215       // Set up arguments for checkcast_copy_entry.
2216       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2217       __ b(RuntimeAddress(checkcast_copy_entry));
2218     }
2219 
2220   __ BIND(L_failed);
2221     __ mov(r0, -1);
2222     __ leave();   // required for proper stackwalking of RuntimeStub frame
2223     __ ret(lr);
2224 
2225     return start;
2226   }
2227 
2228   //
2229   // Generate stub for array fill. If "aligned" is true, the
2230   // "to" address is assumed to be heapword aligned.
2231   //
2232   // Arguments for generated stub:
2233   //   to:    c_rarg0
2234   //   value: c_rarg1
2235   //   count: c_rarg2 treated as signed
2236   //
2237   address generate_fill(BasicType t, bool aligned, const char *name) {
2238     __ align(CodeEntryAlignment);
2239     StubCodeMark mark(this, "StubRoutines", name);
2240     address start = __ pc();
2241 
2242     BLOCK_COMMENT("Entry:");
2243 
2244     const Register to        = c_rarg0;  // destination array address
2245     const Register value     = c_rarg1;  // value
2246     const Register count     = c_rarg2;  // elements count
2247 
2248     const Register bz_base = r10;        // base for block_zero routine
2249     const Register cnt_words = r11;      // temp register
2250 
2251     __ enter();
2252 
2253     Label L_fill_elements, L_exit1;
2254 
2255     int shift = -1;
2256     switch (t) {
2257       case T_BYTE:
2258         shift = 0;
2259         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2260         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2261         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2262         __ br(Assembler::LO, L_fill_elements);
2263         break;
2264       case T_SHORT:
2265         shift = 1;
2266         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2267         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2268         __ br(Assembler::LO, L_fill_elements);
2269         break;
2270       case T_INT:
2271         shift = 2;
2272         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2273         __ br(Assembler::LO, L_fill_elements);
2274         break;
2275       default: ShouldNotReachHere();
2276     }
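         // At this point 'value' holds the fill pattern replicated across the low
         // 32 bits (e.g. for T_BYTE: v | v<<8 | v<<16 | v<<24); it is widened to
         // 64 bits further down with bfi(value, value, 32, 32) before the word fill.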
2277 
2278     // Align source address at 8 bytes address boundary.
2279     Label L_skip_align1, L_skip_align2, L_skip_align4;
2280     if (!aligned) {
2281       switch (t) {
2282         case T_BYTE:
2283           // One byte misalignment happens only for byte arrays.
2284           __ tbz(to, 0, L_skip_align1);
2285           __ strb(value, Address(__ post(to, 1)));
2286           __ subw(count, count, 1);
2287           __ bind(L_skip_align1);
2288           // Fallthrough
2289         case T_SHORT:
2290           // Two bytes misalignment happens only for byte and short (char) arrays.
2291           __ tbz(to, 1, L_skip_align2);
2292           __ strh(value, Address(__ post(to, 2)));
2293           __ subw(count, count, 2 >> shift);
2294           __ bind(L_skip_align2);
2295           // Fallthrough
2296         case T_INT:
2297           // Align to 8 bytes, we know we are 4 byte aligned to start.
2298           __ tbz(to, 2, L_skip_align4);
2299           __ strw(value, Address(__ post(to, 4)));
2300           __ subw(count, count, 4 >> shift);
2301           __ bind(L_skip_align4);
2302           break;
2303         default: ShouldNotReachHere();
2304       }
2305     }
2306 
2307     //
2308     //  Fill large chunks
2309     //
2310     __ lsrw(cnt_words, count, 3 - shift); // number of words
2311     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2312     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2313     if (UseBlockZeroing) {
2314       Label non_block_zeroing, rest;
2315       // If the fill value is zero we can use the fast zero_words().
2316       __ cbnz(value, non_block_zeroing);
2317       __ mov(bz_base, to);
2318       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2319       __ zero_words(bz_base, cnt_words);
2320       __ b(rest);
2321       __ bind(non_block_zeroing);
2322       __ fill_words(to, cnt_words, value);
2323       __ bind(rest);
2324     } else {
2325       __ fill_words(to, cnt_words, value);
2326     }
2327 
2328     // Remaining count is less than 8 bytes. Fill it by a single store.
2329     // Note that the total length is no less than 8 bytes.
2330     if (t == T_BYTE || t == T_SHORT) {
2331       Label L_exit1;
2332       __ cbzw(count, L_exit1);
2333       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2334       __ str(value, Address(to, -8));    // overwrite some elements
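           // This trailing 8-byte store is safe: the total length is at least 8
           // bytes (shorter requests took the L_fill_elements path), so the store
           // stays inside the array, and the bytes it rewrites already contain
           // the same fill pattern.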
2335       __ bind(L_exit1);
2336       __ leave();
2337       __ ret(lr);
2338     }
2339 
2340     // Handle copies less than 8 bytes.
2341     Label L_fill_2, L_fill_4, L_exit2;
2342     __ bind(L_fill_elements);
2343     switch (t) {
2344       case T_BYTE:
2345         __ tbz(count, 0, L_fill_2);
2346         __ strb(value, Address(__ post(to, 1)));
2347         __ bind(L_fill_2);
2348         __ tbz(count, 1, L_fill_4);
2349         __ strh(value, Address(__ post(to, 2)));
2350         __ bind(L_fill_4);
2351         __ tbz(count, 2, L_exit2);
2352         __ strw(value, Address(to));
2353         break;
2354       case T_SHORT:
2355         __ tbz(count, 0, L_fill_4);
2356         __ strh(value, Address(__ post(to, 2)));
2357         __ bind(L_fill_4);
2358         __ tbz(count, 1, L_exit2);
2359         __ strw(value, Address(to));
2360         break;
2361       case T_INT:
2362         __ cbzw(count, L_exit2);
2363         __ strw(value, Address(to));
2364         break;
2365       default: ShouldNotReachHere();
2366     }
2367     __ bind(L_exit2);
2368     __ leave();
2369     __ ret(lr);
2370     return start;
2371   }
2372 
2373   void generate_arraycopy_stubs() {
2374     address entry;
2375     address entry_jbyte_arraycopy;
2376     address entry_jshort_arraycopy;
2377     address entry_jint_arraycopy;
2378     address entry_oop_arraycopy;
2379     address entry_jlong_arraycopy;
2380     address entry_checkcast_arraycopy;
2381 
2382     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2383     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2384 
2385     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2386 
2387     //*** jbyte
2388     // Always need aligned and unaligned versions
2389     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2390                                                                                   "jbyte_disjoint_arraycopy");
2391     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2392                                                                                   &entry_jbyte_arraycopy,
2393                                                                                   "jbyte_arraycopy");
2394     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2395                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2396     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2397                                                                                   "arrayof_jbyte_arraycopy");
2398 
2399     //*** jshort
2400     // Always need aligned and unaligned versions
2401     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2402                                                                                     "jshort_disjoint_arraycopy");
2403     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2404                                                                                     &entry_jshort_arraycopy,
2405                                                                                     "jshort_arraycopy");
2406     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2407                                                                                     "arrayof_jshort_disjoint_arraycopy");
2408     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2409                                                                                     "arrayof_jshort_arraycopy");
2410 
2411     //*** jint
2412     // Aligned versions
2413     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2414                                                                                 "arrayof_jint_disjoint_arraycopy");
2415     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2416                                                                                 "arrayof_jint_arraycopy");
2417     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2418     // entry_jint_arraycopy always points to the unaligned version
2419     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2420                                                                                 "jint_disjoint_arraycopy");
2421     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2422                                                                                 &entry_jint_arraycopy,
2423                                                                                 "jint_arraycopy");
2424 
2425     //*** jlong
2426     // It is always aligned
2427     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2428                                                                                   "arrayof_jlong_disjoint_arraycopy");
2429     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2430                                                                                   "arrayof_jlong_arraycopy");
2431     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2432     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2433 
2434     //*** oops
2435     {
2436       // With compressed oops we need unaligned versions; notice that
2437       // we overwrite entry_oop_arraycopy.
2438       bool aligned = !UseCompressedOops;
2439 
2440       StubRoutines::_arrayof_oop_disjoint_arraycopy
2441         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2442                                      /*dest_uninitialized*/false);
2443       StubRoutines::_arrayof_oop_arraycopy
2444         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2445                                      /*dest_uninitialized*/false);
2446       // Aligned versions without pre-barriers
2447       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2448         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2449                                      /*dest_uninitialized*/true);
2450       StubRoutines::_arrayof_oop_arraycopy_uninit
2451         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2452                                      /*dest_uninitialized*/true);
2453     }
2454 
2455     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2456     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2457     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2458     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2459 
2460     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2461     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2462                                                                         /*dest_uninitialized*/true);
2463 
2464     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2465                                                               entry_jbyte_arraycopy,
2466                                                               entry_jshort_arraycopy,
2467                                                               entry_jint_arraycopy,
2468                                                               entry_jlong_arraycopy);
2469 
2470     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2471                                                                entry_jbyte_arraycopy,
2472                                                                entry_jshort_arraycopy,
2473                                                                entry_jint_arraycopy,
2474                                                                entry_oop_arraycopy,
2475                                                                entry_jlong_arraycopy,
2476                                                                entry_checkcast_arraycopy);
2477 
2478     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2479     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2480     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2481     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2482     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2483     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2484   }
2485 
2486   void generate_math_stubs() { Unimplemented(); }
2487 
2488   // Arguments:
2489   //
2490   // Inputs:
2491   //   c_rarg0   - source byte array address
2492   //   c_rarg1   - destination byte array address
2493   //   c_rarg2   - K (key) in little endian int array
2494   //
2495   address generate_aescrypt_encryptBlock() {
2496     __ align(CodeEntryAlignment);
2497     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2498 
2499     Label L_doLast;
2500 
2501     const Register from        = c_rarg0;  // source array address
2502     const Register to          = c_rarg1;  // destination array address
2503     const Register key         = c_rarg2;  // key array address
2504     const Register keylen      = rscratch1;
2505 
2506     address start = __ pc();
2507     __ enter();
2508 
2509     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
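         // keylen is the expanded key length in ints: 44, 52 or 60 for AES-128,
         // AES-192 and AES-256 (10, 12 or 14 rounds); the comparisons against 44
         // and 52 below skip the extra rounds for the shorter key sizes.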
2510 
2511     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2512 
2513     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2514     __ rev32(v1, __ T16B, v1);
2515     __ rev32(v2, __ T16B, v2);
2516     __ rev32(v3, __ T16B, v3);
2517     __ rev32(v4, __ T16B, v4);
2518     __ aese(v0, v1);
2519     __ aesmc(v0, v0);
2520     __ aese(v0, v2);
2521     __ aesmc(v0, v0);
2522     __ aese(v0, v3);
2523     __ aesmc(v0, v0);
2524     __ aese(v0, v4);
2525     __ aesmc(v0, v0);
2526 
2527     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2528     __ rev32(v1, __ T16B, v1);
2529     __ rev32(v2, __ T16B, v2);
2530     __ rev32(v3, __ T16B, v3);
2531     __ rev32(v4, __ T16B, v4);
2532     __ aese(v0, v1);
2533     __ aesmc(v0, v0);
2534     __ aese(v0, v2);
2535     __ aesmc(v0, v0);
2536     __ aese(v0, v3);
2537     __ aesmc(v0, v0);
2538     __ aese(v0, v4);
2539     __ aesmc(v0, v0);
2540 
2541     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2542     __ rev32(v1, __ T16B, v1);
2543     __ rev32(v2, __ T16B, v2);
2544 
2545     __ cmpw(keylen, 44);
2546     __ br(Assembler::EQ, L_doLast);
2547 
2548     __ aese(v0, v1);
2549     __ aesmc(v0, v0);
2550     __ aese(v0, v2);
2551     __ aesmc(v0, v0);
2552 
2553     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2554     __ rev32(v1, __ T16B, v1);
2555     __ rev32(v2, __ T16B, v2);
2556 
2557     __ cmpw(keylen, 52);
2558     __ br(Assembler::EQ, L_doLast);
2559 
2560     __ aese(v0, v1);
2561     __ aesmc(v0, v0);
2562     __ aese(v0, v2);
2563     __ aesmc(v0, v0);
2564 
2565     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2566     __ rev32(v1, __ T16B, v1);
2567     __ rev32(v2, __ T16B, v2);
2568 
2569     __ BIND(L_doLast);
2570 
2571     __ aese(v0, v1);
2572     __ aesmc(v0, v0);
2573     __ aese(v0, v2);
2574 
2575     __ ld1(v1, __ T16B, key);
2576     __ rev32(v1, __ T16B, v1);
2577     __ eor(v0, __ T16B, v0, v1);
2578 
2579     __ st1(v0, __ T16B, to);
2580 
2581     __ mov(r0, 0);
2582 
2583     __ leave();
2584     __ ret(lr);
2585 
2586     return start;
2587   }
2588 
2589   // Arguments:
2590   //
2591   // Inputs:
2592   //   c_rarg0   - source byte array address
2593   //   c_rarg1   - destination byte array address
2594   //   c_rarg2   - K (key) in little endian int array
2595   //
2596   address generate_aescrypt_decryptBlock() {
2597     assert(UseAES, "need AES instructions");
2598     __ align(CodeEntryAlignment);
2599     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2600     Label L_doLast;
2601 
2602     const Register from        = c_rarg0;  // source array address
2603     const Register to          = c_rarg1;  // destination array address
2604     const Register key         = c_rarg2;  // key array address
2605     const Register keylen      = rscratch1;
2606 
2607     address start = __ pc();
2608     __ enter(); // required for proper stackwalking of RuntimeStub frame
2609 
2610     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2611 
2612     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2613 
2614     __ ld1(v5, __ T16B, __ post(key, 16));
2615     __ rev32(v5, __ T16B, v5);
2616 
2617     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2618     __ rev32(v1, __ T16B, v1);
2619     __ rev32(v2, __ T16B, v2);
2620     __ rev32(v3, __ T16B, v3);
2621     __ rev32(v4, __ T16B, v4);
2622     __ aesd(v0, v1);
2623     __ aesimc(v0, v0);
2624     __ aesd(v0, v2);
2625     __ aesimc(v0, v0);
2626     __ aesd(v0, v3);
2627     __ aesimc(v0, v0);
2628     __ aesd(v0, v4);
2629     __ aesimc(v0, v0);
2630 
2631     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2632     __ rev32(v1, __ T16B, v1);
2633     __ rev32(v2, __ T16B, v2);
2634     __ rev32(v3, __ T16B, v3);
2635     __ rev32(v4, __ T16B, v4);
2636     __ aesd(v0, v1);
2637     __ aesimc(v0, v0);
2638     __ aesd(v0, v2);
2639     __ aesimc(v0, v0);
2640     __ aesd(v0, v3);
2641     __ aesimc(v0, v0);
2642     __ aesd(v0, v4);
2643     __ aesimc(v0, v0);
2644 
2645     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2646     __ rev32(v1, __ T16B, v1);
2647     __ rev32(v2, __ T16B, v2);
2648 
2649     __ cmpw(keylen, 44);
2650     __ br(Assembler::EQ, L_doLast);
2651 
2652     __ aesd(v0, v1);
2653     __ aesimc(v0, v0);
2654     __ aesd(v0, v2);
2655     __ aesimc(v0, v0);
2656 
2657     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2658     __ rev32(v1, __ T16B, v1);
2659     __ rev32(v2, __ T16B, v2);
2660 
2661     __ cmpw(keylen, 52);
2662     __ br(Assembler::EQ, L_doLast);
2663 
2664     __ aesd(v0, v1);
2665     __ aesimc(v0, v0);
2666     __ aesd(v0, v2);
2667     __ aesimc(v0, v0);
2668 
2669     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2670     __ rev32(v1, __ T16B, v1);
2671     __ rev32(v2, __ T16B, v2);
2672 
2673     __ BIND(L_doLast);
2674 
2675     __ aesd(v0, v1);
2676     __ aesimc(v0, v0);
2677     __ aesd(v0, v2);
2678 
2679     __ eor(v0, __ T16B, v0, v5);
2680 
2681     __ st1(v0, __ T16B, to);
2682 
2683     __ mov(r0, 0);
2684 
2685     __ leave();
2686     __ ret(lr);
2687 
2688     return start;
2689   }
2690 
2691   // Arguments:
2692   //
2693   // Inputs:
2694   //   c_rarg0   - source byte array address
2695   //   c_rarg1   - destination byte array address
2696   //   c_rarg2   - K (key) in little endian int array
2697   //   c_rarg3   - r vector byte array address
2698   //   c_rarg4   - input length
2699   //
2700   // Output:
2701   //   r0        - input length
2702   //
2703   address generate_cipherBlockChaining_encryptAESCrypt() {
2704     assert(UseAES, "need AES instructions");
2705     __ align(CodeEntryAlignment);
2706     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2707 
2708     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2709 
2710     const Register from        = c_rarg0;  // source array address
2711     const Register to          = c_rarg1;  // destination array address
2712     const Register key         = c_rarg2;  // key array address
2713     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2714                                            // and left with the results of the last encryption block
2715     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2716     const Register keylen      = rscratch1;
2717 
2718     address start = __ pc();
2719 
2720       __ enter();
2721 
2722       __ movw(rscratch2, len_reg);
2723 
2724       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2725 
2726       __ ld1(v0, __ T16B, rvec);
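           // CBC encryption computes c_i = E_K(p_i ^ c_{i-1}).  v0 carries the
           // chaining value c_{i-1}, initialised here from rvec (the IV), updated
           // with each encrypted block and written back to rvec after the loop.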
2727 
2728       __ cmpw(keylen, 52);
2729       __ br(Assembler::CC, L_loadkeys_44);
2730       __ br(Assembler::EQ, L_loadkeys_52);
2731 
2732       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2733       __ rev32(v17, __ T16B, v17);
2734       __ rev32(v18, __ T16B, v18);
2735     __ BIND(L_loadkeys_52);
2736       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2737       __ rev32(v19, __ T16B, v19);
2738       __ rev32(v20, __ T16B, v20);
2739     __ BIND(L_loadkeys_44);
2740       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2741       __ rev32(v21, __ T16B, v21);
2742       __ rev32(v22, __ T16B, v22);
2743       __ rev32(v23, __ T16B, v23);
2744       __ rev32(v24, __ T16B, v24);
2745       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2746       __ rev32(v25, __ T16B, v25);
2747       __ rev32(v26, __ T16B, v26);
2748       __ rev32(v27, __ T16B, v27);
2749       __ rev32(v28, __ T16B, v28);
2750       __ ld1(v29, v30, v31, __ T16B, key);
2751       __ rev32(v29, __ T16B, v29);
2752       __ rev32(v30, __ T16B, v30);
2753       __ rev32(v31, __ T16B, v31);
2754 
2755     __ BIND(L_aes_loop);
2756       __ ld1(v1, __ T16B, __ post(from, 16));
2757       __ eor(v0, __ T16B, v0, v1);
2758 
2759       __ br(Assembler::CC, L_rounds_44);
2760       __ br(Assembler::EQ, L_rounds_52);
2761 
2762       __ aese(v0, v17); __ aesmc(v0, v0);
2763       __ aese(v0, v18); __ aesmc(v0, v0);
2764     __ BIND(L_rounds_52);
2765       __ aese(v0, v19); __ aesmc(v0, v0);
2766       __ aese(v0, v20); __ aesmc(v0, v0);
2767     __ BIND(L_rounds_44);
2768       __ aese(v0, v21); __ aesmc(v0, v0);
2769       __ aese(v0, v22); __ aesmc(v0, v0);
2770       __ aese(v0, v23); __ aesmc(v0, v0);
2771       __ aese(v0, v24); __ aesmc(v0, v0);
2772       __ aese(v0, v25); __ aesmc(v0, v0);
2773       __ aese(v0, v26); __ aesmc(v0, v0);
2774       __ aese(v0, v27); __ aesmc(v0, v0);
2775       __ aese(v0, v28); __ aesmc(v0, v0);
2776       __ aese(v0, v29); __ aesmc(v0, v0);
2777       __ aese(v0, v30);
2778       __ eor(v0, __ T16B, v0, v31);
2779 
2780       __ st1(v0, __ T16B, __ post(to, 16));
2781 
2782       __ subw(len_reg, len_reg, 16);
2783       __ cbnzw(len_reg, L_aes_loop);
2784 
2785       __ st1(v0, __ T16B, rvec);
2786 
2787       __ mov(r0, rscratch2);
2788 
2789       __ leave();
2790       __ ret(lr);
2791 
2792       return start;
2793   }
2794 
2795   // Arguments:
2796   //
2797   // Inputs:
2798   //   c_rarg0   - source byte array address
2799   //   c_rarg1   - destination byte array address
2800   //   c_rarg2   - K (key) in little endian int array
2801   //   c_rarg3   - r vector byte array address
2802   //   c_rarg4   - input length
2803   //
2804   // Output:
2805   //   r0        - input length
2806   //
2807   address generate_cipherBlockChaining_decryptAESCrypt() {
2808     assert(UseAES, "need AES instructions");
2809     __ align(CodeEntryAlignment);
2810     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2811 
2812     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2813 
2814     const Register from        = c_rarg0;  // source array address
2815     const Register to          = c_rarg1;  // destination array address
2816     const Register key         = c_rarg2;  // key array address
2817     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2818                                            // and left holding the last input (ciphertext) block
2819     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2820     const Register keylen      = rscratch1;
2821 
2822     address start = __ pc();
2823 
2824       __ enter();
2825 
2826       __ movw(rscratch2, len_reg);
2827 
2828       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2829 
2830       __ ld1(v2, __ T16B, rvec);
2831 
2832       __ ld1(v31, __ T16B, __ post(key, 16));
2833       __ rev32(v31, __ T16B, v31);
2834 
2835       __ cmpw(keylen, 52);
2836       __ br(Assembler::CC, L_loadkeys_44);
2837       __ br(Assembler::EQ, L_loadkeys_52);
2838 
2839       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2840       __ rev32(v17, __ T16B, v17);
2841       __ rev32(v18, __ T16B, v18);
2842     __ BIND(L_loadkeys_52);
2843       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2844       __ rev32(v19, __ T16B, v19);
2845       __ rev32(v20, __ T16B, v20);
2846     __ BIND(L_loadkeys_44);
2847       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2848       __ rev32(v21, __ T16B, v21);
2849       __ rev32(v22, __ T16B, v22);
2850       __ rev32(v23, __ T16B, v23);
2851       __ rev32(v24, __ T16B, v24);
2852       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2853       __ rev32(v25, __ T16B, v25);
2854       __ rev32(v26, __ T16B, v26);
2855       __ rev32(v27, __ T16B, v27);
2856       __ rev32(v28, __ T16B, v28);
2857       __ ld1(v29, v30, __ T16B, key);
2858       __ rev32(v29, __ T16B, v29);
2859       __ rev32(v30, __ T16B, v30);
2860 
2861     __ BIND(L_aes_loop);
2862       __ ld1(v0, __ T16B, __ post(from, 16));
2863       __ orr(v1, __ T16B, v0, v0);
2864 
2865       __ br(Assembler::CC, L_rounds_44);
2866       __ br(Assembler::EQ, L_rounds_52);
2867 
2868       __ aesd(v0, v17); __ aesimc(v0, v0);
2869       __ aesd(v0, v18); __ aesimc(v0, v0);
2870     __ BIND(L_rounds_52);
2871       __ aesd(v0, v19); __ aesimc(v0, v0);
2872       __ aesd(v0, v20); __ aesimc(v0, v0);
2873     __ BIND(L_rounds_44);
2874       __ aesd(v0, v21); __ aesimc(v0, v0);
2875       __ aesd(v0, v22); __ aesimc(v0, v0);
2876       __ aesd(v0, v23); __ aesimc(v0, v0);
2877       __ aesd(v0, v24); __ aesimc(v0, v0);
2878       __ aesd(v0, v25); __ aesimc(v0, v0);
2879       __ aesd(v0, v26); __ aesimc(v0, v0);
2880       __ aesd(v0, v27); __ aesimc(v0, v0);
2881       __ aesd(v0, v28); __ aesimc(v0, v0);
2882       __ aesd(v0, v29); __ aesimc(v0, v0);
2883       __ aesd(v0, v30);
2884       __ eor(v0, __ T16B, v0, v31);
2885       __ eor(v0, __ T16B, v0, v2);
2886 
2887       __ st1(v0, __ T16B, __ post(to, 16));
2888       __ orr(v2, __ T16B, v1, v1);
2889 
2890       __ subw(len_reg, len_reg, 16);
2891       __ cbnzw(len_reg, L_aes_loop);
2892 
2893       __ st1(v2, __ T16B, rvec);
2894 
2895       __ mov(r0, rscratch2);
2896 
2897       __ leave();
2898       __ ret(lr);
2899 
2900     return start;
2901   }
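       // For reference only: a C-style sketch of the CBC decryption loop above
       // (illustrative names; v1/v2 keep the ciphertext for chaining):
       //
       //   for (int i = 0; i < len; i += 16) {
       //     copy_block(c, from + i);       // save the ciphertext block (v1 above)
       //     aes_decrypt_block(t, c, key);
       //     xor_block(t, r);               // XOR with previous ciphertext / IV (v2 above)
       //     copy_block(to + i, t);         // emit plaintext
       //     copy_block(r, c);              // this ciphertext chains into the next block
       //   }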
2902 
2903   // Arguments:
2904   //
2905   // Inputs:
2906   //   c_rarg0   - byte[]  source+offset
2907   //   c_rarg1   - int[]   SHA.state
2908   //   c_rarg2   - int     offset
2909   //   c_rarg3   - int     limit
2910   //
2911   address generate_sha1_implCompress(bool multi_block, const char *name) {
2912     __ align(CodeEntryAlignment);
2913     StubCodeMark mark(this, "StubRoutines", name);
2914     address start = __ pc();
2915 
2916     Register buf   = c_rarg0;
2917     Register state = c_rarg1;
2918     Register ofs   = c_rarg2;
2919     Register limit = c_rarg3;
2920 
2921     Label keys;
2922     Label sha1_loop;
2923 
2924     // load the keys into v0..v3
2925     __ adr(rscratch1, keys);
2926     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2927     // load 5 words state into v6, v7
2928     __ ldrq(v6, Address(state, 0));
2929     __ ldrs(v7, Address(state, 16));
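         // v6 = {a, b, c, d} (state[0..3]), v7 = e (state[4])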
2930 
2931 
2932     __ BIND(sha1_loop);
2933     // load 64 bytes of data into v16..v19
2934     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2935     __ rev32(v16, __ T16B, v16);
2936     __ rev32(v17, __ T16B, v17);
2937     __ rev32(v18, __ T16B, v18);
2938     __ rev32(v19, __ T16B, v19);
2939 
2940     // do the sha1
2941     __ addv(v4, __ T4S, v16, v0);
2942     __ orr(v20, __ T16B, v6, v6);
2943 
2944     FloatRegister d0 = v16;
2945     FloatRegister d1 = v17;
2946     FloatRegister d2 = v18;
2947     FloatRegister d3 = v19;
2948 
2949     for (int round = 0; round < 20; round++) {
2950       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2951       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2952       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2953       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2954       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2955 
2956       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2957       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2958       __ sha1h(tmp2, __ T4S, v20);
2959       if (round < 5)
2960         __ sha1c(v20, __ T4S, tmp3, tmp4);
2961       else if (round < 10 || round >= 15)
2962         __ sha1p(v20, __ T4S, tmp3, tmp4);
2963       else
2964         __ sha1m(v20, __ T4S, tmp3, tmp4);
2965       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2966 
2967       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2968     }
2969 
2970     __ addv(v7, __ T2S, v7, v21);
2971     __ addv(v6, __ T4S, v6, v20);
2972 
2973     if (multi_block) {
2974       __ add(ofs, ofs, 64);
2975       __ cmp(ofs, limit);
2976       __ br(Assembler::LE, sha1_loop);
2977       __ mov(c_rarg0, ofs); // return ofs
2978     }
2979 
2980     __ strq(v6, Address(state, 0));
2981     __ strs(v7, Address(state, 16));
2982 
2983     __ ret(lr);
2984 
2985     __ bind(keys);
2986     __ emit_int32(0x5a827999);
2987     __ emit_int32(0x6ed9eba1);
2988     __ emit_int32(0x8f1bbcdc);
2989     __ emit_int32(0xca62c1d6);
2990 
2991     return start;
2992   }
2993 
2994 
2995   // Arguments:
2996   //
2997   // Inputs:
2998   //   c_rarg0   - byte[]  source+offset
2999   //   c_rarg1   - int[]   SHA.state
3000   //   c_rarg2   - int     offset
3001   //   c_rarg3   - int     limit
3002   //
3003   address generate_sha256_implCompress(bool multi_block, const char *name) {
3004     static const uint32_t round_consts[64] = {
3005       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3006       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3007       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3008       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3009       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3010       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3011       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3012       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3013       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3014       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3015       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3016       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3017       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3018       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3019       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3020       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3021     };
3022     __ align(CodeEntryAlignment);
3023     StubCodeMark mark(this, "StubRoutines", name);
3024     address start = __ pc();
3025 
3026     Register buf   = c_rarg0;
3027     Register state = c_rarg1;
3028     Register ofs   = c_rarg2;
3029     Register limit = c_rarg3;
3030 
3031     Label sha1_loop;
3032 
3033     __ stpd(v8, v9, __ pre(sp, -32));
3034     __ stpd(v10, v11, Address(sp, 16));
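         // The low halves of v8..v15 are callee-saved under AAPCS64; v8..v11 are
         // used as data registers below, so save them here and restore them on exit.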
3035 
3036 // dga == v0
3037 // dgb == v1
3038 // dg0 == v2
3039 // dg1 == v3
3040 // dg2 == v4
3041 // t0 == v6
3042 // t1 == v7
3043 
3044     // load 16 keys to v16..v31
3045     __ lea(rscratch1, ExternalAddress((address)round_consts));
3046     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3047     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3048     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3049     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3050 
3051     // load 8 words (256 bits) state
3052     __ ldpq(v0, v1, state);
3053 
3054     __ BIND(sha1_loop);
3055     // load 64 bytes of data into v8..v11
3056     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3057     __ rev32(v8, __ T16B, v8);
3058     __ rev32(v9, __ T16B, v9);
3059     __ rev32(v10, __ T16B, v10);
3060     __ rev32(v11, __ T16B, v11);
3061 
3062     __ addv(v6, __ T4S, v8, v16);
3063     __ orr(v2, __ T16B, v0, v0);
3064     __ orr(v3, __ T16B, v1, v1);
3065 
3066     FloatRegister d0 = v8;
3067     FloatRegister d1 = v9;
3068     FloatRegister d2 = v10;
3069     FloatRegister d3 = v11;
3070 
3071 
3072     for (int round = 0; round < 16; round++) {
3073       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3074       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3075       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3076       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3077 
3078       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3079        __ orr(v4, __ T16B, v2, v2);
3080       if (round < 15)
3081         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3082       __ sha256h(v2, __ T4S, v3, tmp2);
3083       __ sha256h2(v3, __ T4S, v4, tmp2);
3084       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3085 
3086       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3087     }
3088 
3089     __ addv(v0, __ T4S, v0, v2);
3090     __ addv(v1, __ T4S, v1, v3);
3091 
3092     if (multi_block) {
3093       __ add(ofs, ofs, 64);
3094       __ cmp(ofs, limit);
3095       __ br(Assembler::LE, sha1_loop);
3096       __ mov(c_rarg0, ofs); // return ofs
3097     }
3098 
3099     __ ldpd(v10, v11, Address(sp, 16));
3100     __ ldpd(v8, v9, __ post(sp, 32));
3101 
3102     __ stpq(v0, v1, state);
3103 
3104     __ ret(lr);
3105 
3106     return start;
3107   }
3108 
3109 #ifndef BUILTIN_SIM
3110   // Safefetch stubs.
3111   void generate_safefetch(const char* name, int size, address* entry,
3112                           address* fault_pc, address* continuation_pc) {
3113     // safefetch signatures:
3114     //   int      SafeFetch32(int*      adr, int      errValue);
3115     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3116     //
3117     // arguments:
3118     //   c_rarg0 = adr
3119     //   c_rarg1 = errValue
3120     //
3121     // result:
3122     //   r0 = *adr or errValue
3123 
3124     StubCodeMark mark(this, "StubRoutines", name);
3125 
3126     // Entry point, pc or function descriptor.
3127     *entry = __ pc();
3128 
3129     // Load *adr into c_rarg1, may fault.
3130     *fault_pc = __ pc();
3131     switch (size) {
3132       case 4:
3133         // int32_t
3134         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3135         break;
3136       case 8:
3137         // int64_t
3138         __ ldr(c_rarg1, Address(c_rarg0, 0));
3139         break;
3140       default:
3141         ShouldNotReachHere();
3142     }
3143 
3144     // return errValue or *adr
3145     *continuation_pc = __ pc();
3146     __ mov(r0, c_rarg1);
3147     __ ret(lr);
3148   }
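       // Illustrative use from VM code (a hedged sketch, not a definitive caller):
       //
       //   int v = SafeFetch32((int*) addr, -1);
       //
       // If the load at *fault_pc faults, the signal handler resumes execution at
       // *continuation_pc, so v becomes the errValue (-1) still held in c_rarg1.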
3149 #endif
3150 
3151   /**
3152    *  Arguments:
3153    *
3154    * Inputs:
3155    *   c_rarg0   - int crc
3156    *   c_rarg1   - byte* buf
3157    *   c_rarg2   - int length
3158    *
3159    * Output:
3160    *       r0   - int crc result
3161    */
3162   address generate_updateBytesCRC32() {
3163     assert(UseCRC32Intrinsics, "what are we doing here?");
3164 
3165     __ align(CodeEntryAlignment);
3166     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3167 
3168     address start = __ pc();
3169 
3170     const Register crc   = c_rarg0;  // crc
3171     const Register buf   = c_rarg1;  // source java byte array address
3172     const Register len   = c_rarg2;  // length
3173     const Register table0 = c_rarg3; // crc_table address
3174     const Register table1 = c_rarg4;
3175     const Register table2 = c_rarg5;
3176     const Register table3 = c_rarg6;
3177     const Register tmp3 = c_rarg7;
3178 
3179     BLOCK_COMMENT("Entry:");
3180     __ enter(); // required for proper stackwalking of RuntimeStub frame
3181 
3182     __ kernel_crc32(crc, buf, len,
3183               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3184 
3185     __ leave(); // required for proper stackwalking of RuntimeStub frame
3186     __ ret(lr);
3187 
3188     return start;
3189   }
3190 
3191   /**
3192    *  Arguments:
3193    *
3194    * Inputs:
3195    *   c_rarg0   - int crc
3196    *   c_rarg1   - byte* buf
3197    *   c_rarg2   - int length
3198    *   c_rarg3   - int* table
3199    *
3200    * Output:
3201    *       r0   - int crc result
3202    */
3203   address generate_updateBytesCRC32C() {
3204     assert(UseCRC32CIntrinsics, "what are we doing here?");
3205 
3206     __ align(CodeEntryAlignment);
3207     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3208 
3209     address start = __ pc();
3210 
3211     const Register crc   = c_rarg0;  // crc
3212     const Register buf   = c_rarg1;  // source java byte array address
3213     const Register len   = c_rarg2;  // length
3214     const Register table0 = c_rarg3; // crc_table address
3215     const Register table1 = c_rarg4;
3216     const Register table2 = c_rarg5;
3217     const Register table3 = c_rarg6;
3218     const Register tmp3 = c_rarg7;
3219 
3220     BLOCK_COMMENT("Entry:");
3221     __ enter(); // required for proper stackwalking of RuntimeStub frame
3222 
3223     __ kernel_crc32c(crc, buf, len,
3224               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3225 
3226     __ leave(); // required for proper stackwalking of RuntimeStub frame
3227     __ ret(lr);
3228 
3229     return start;
3230   }
3231 
3232   /***
3233    *  Arguments:
3234    *
3235    *  Inputs:
3236    *   c_rarg0   - int   adler
3237    *   c_rarg1   - byte* buff
3238    *   c_rarg2   - int   len
3239    *
3240    * Output:
3241    *   c_rarg0   - int adler result
3242    */
3243   address generate_updateBytesAdler32() {
3244     __ align(CodeEntryAlignment);
3245     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3246     address start = __ pc();
3247 
3248     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3249 
3250     // Aliases
3251     Register adler  = c_rarg0;
3252     Register s1     = c_rarg0;
3253     Register s2     = c_rarg3;
3254     Register buff   = c_rarg1;
3255     Register len    = c_rarg2;
3256     Register nmax  = r4;
3257     Register base  = r5;
3258     Register count = r6;
3259     Register temp0 = rscratch1;
3260     Register temp1 = rscratch2;
3261     FloatRegister vbytes = v0;
3262     FloatRegister vs1acc = v1;
3263     FloatRegister vs2acc = v2;
3264     FloatRegister vtable = v3;
3265 
3266     // Max number of bytes we can process before having to take the mod
3267     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3268     unsigned long BASE = 0xfff1;
3269     unsigned long NMAX = 0x15B0;
3270 
3271     __ mov(base, BASE);
3272     __ mov(nmax, NMAX);
3273 
3274     // Load accumulation coefficients for the upper 16 bits
3275     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3276     __ ld1(vtable, __ T16B, Address(temp0));
3277 
3278     // s1 is initialized to the lower 16 bits of adler
3279     // s2 is initialized to the upper 16 bits of adler
3280     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3281     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3282 
3283     // The pipelined loop needs at least 16 elements for one iteration.
3284     // It does check this, but it is more efficient to skip straight to the cleanup loop.
3285     __ cmp(len, (u1)16);
3286     __ br(Assembler::HS, L_nmax);
3287     __ cbz(len, L_combine);
3288 
3289     __ bind(L_simple_by1_loop);
3290     __ ldrb(temp0, Address(__ post(buff, 1)));
3291     __ add(s1, s1, temp0);
3292     __ add(s2, s2, s1);
3293     __ subs(len, len, 1);
3294     __ br(Assembler::HI, L_simple_by1_loop);
3295 
3296     // s1 = s1 % BASE
3297     __ subs(temp0, s1, base);
3298     __ csel(s1, temp0, s1, Assembler::HS);
3299 
3300     // s2 = s2 % BASE
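         // The reductions here use 2^16 == 15 (mod BASE): x is folded to
         // (x >> 16) * 15 + (x & 0xffff) by the lsr/lsl/sub/add sequence below.
         // For the small values reachable at this point one fold plus a conditional
         // subtraction suffices; the larger reductions further down fold twice.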
3301     __ lsr(temp0, s2, 16);
3302     __ lsl(temp1, temp0, 4);
3303     __ sub(temp1, temp1, temp0);
3304     __ add(s2, temp1, s2, ext::uxth);
3305 
3306     __ subs(temp0, s2, base);
3307     __ csel(s2, temp0, s2, Assembler::HS);
3308 
3309     __ b(L_combine);
3310 
3311     __ bind(L_nmax);
3312     __ subs(len, len, nmax);
3313     __ sub(count, nmax, 16);
3314     __ br(Assembler::LO, L_by16);
3315 
3316     __ bind(L_nmax_loop);
3317 
3318     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3319                                       vbytes, vs1acc, vs2acc, vtable);
3320 
3321     __ subs(count, count, 16);
3322     __ br(Assembler::HS, L_nmax_loop);
3323 
3324     // s1 = s1 % BASE
3325     __ lsr(temp0, s1, 16);
3326     __ lsl(temp1, temp0, 4);
3327     __ sub(temp1, temp1, temp0);
3328     __ add(temp1, temp1, s1, ext::uxth);
3329 
3330     __ lsr(temp0, temp1, 16);
3331     __ lsl(s1, temp0, 4);
3332     __ sub(s1, s1, temp0);
3333     __ add(s1, s1, temp1, ext::uxth);
3334 
3335     __ subs(temp0, s1, base);
3336     __ csel(s1, temp0, s1, Assembler::HS);
3337 
3338     // s2 = s2 % BASE
3339     __ lsr(temp0, s2, 16);
3340     __ lsl(temp1, temp0, 4);
3341     __ sub(temp1, temp1, temp0);
3342     __ add(temp1, temp1, s2, ext::uxth);
3343 
3344     __ lsr(temp0, temp1, 16);
3345     __ lsl(s2, temp0, 4);
3346     __ sub(s2, s2, temp0);
3347     __ add(s2, s2, temp1, ext::uxth);
3348 
3349     __ subs(temp0, s2, base);
3350     __ csel(s2, temp0, s2, Assembler::HS);
3351 
3352     __ subs(len, len, nmax);
3353     __ sub(count, nmax, 16);
3354     __ br(Assembler::HS, L_nmax_loop);
3355 
3356     __ bind(L_by16);
3357     __ adds(len, len, count);
3358     __ br(Assembler::LO, L_by1);
3359 
3360     __ bind(L_by16_loop);
3361 
3362     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3363                                       vbytes, vs1acc, vs2acc, vtable);
3364 
3365     __ subs(len, len, 16);
3366     __ br(Assembler::HS, L_by16_loop);
3367 
3368     __ bind(L_by1);
3369     __ adds(len, len, 15);
3370     __ br(Assembler::LO, L_do_mod);
3371 
3372     __ bind(L_by1_loop);
3373     __ ldrb(temp0, Address(__ post(buff, 1)));
3374     __ add(s1, temp0, s1);
3375     __ add(s2, s2, s1);
3376     __ subs(len, len, 1);
3377     __ br(Assembler::HS, L_by1_loop);
3378 
3379     __ bind(L_do_mod);
3380     // s1 = s1 % BASE
3381     __ lsr(temp0, s1, 16);
3382     __ lsl(temp1, temp0, 4);
3383     __ sub(temp1, temp1, temp0);
3384     __ add(temp1, temp1, s1, ext::uxth);
3385 
3386     __ lsr(temp0, temp1, 16);
3387     __ lsl(s1, temp0, 4);
3388     __ sub(s1, s1, temp0);
3389     __ add(s1, s1, temp1, ext::uxth);
3390 
3391     __ subs(temp0, s1, base);
3392     __ csel(s1, temp0, s1, Assembler::HS);
3393 
3394     // s2 = s2 % BASE
3395     __ lsr(temp0, s2, 16);
3396     __ lsl(temp1, temp0, 4);
3397     __ sub(temp1, temp1, temp0);
3398     __ add(temp1, temp1, s2, ext::uxth);
3399 
3400     __ lsr(temp0, temp1, 16);
3401     __ lsl(s2, temp0, 4);
3402     __ sub(s2, s2, temp0);
3403     __ add(s2, s2, temp1, ext::uxth);
3404 
3405     __ subs(temp0, s2, base);
3406     __ csel(s2, temp0, s2, Assembler::HS);
3407 
3408     // Combine lower bits and higher bits
3409     __ bind(L_combine);
3410     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3411 
3412     __ ret(lr);
3413 
3414     return start;
3415   }
3416 
3417   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
3418           Register temp0, Register temp1, FloatRegister vbytes,
3419           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
3420     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
3421     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
3422     // In non-vectorized code, we update s1 and s2 as:
3423     //   s1 <- s1 + b1
3424     //   s2 <- s2 + s1
3425     //   s1 <- s1 + b2
3426     //   s2 <- s2 + s1
3427     //   ...
3428     //   s1 <- s1 + b16
3429     //   s2 <- s2 + s1
3430     // Putting above assignments together, we have:
3431     //   s1_new = s1 + b1 + b2 + ... + b16
3432     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
3433     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
3434     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
3435     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
3436 
3437     // s2 = s2 + s1 * 16
3438     __ add(s2, s2, s1, Assembler::LSL, 4);
3439 
3440     // vs1acc = b1 + b2 + b3 + ... + b16
3441     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
3442     __ umullv(vs2acc, __ T8B, vtable, vbytes);
3443     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
3444     __ uaddlv(vs1acc, __ T16B, vbytes);
3445     __ uaddlv(vs2acc, __ T8H, vs2acc);
3446 
3447     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
3448     __ fmovd(temp0, vs1acc);
3449     __ fmovd(temp1, vs2acc);
3450     __ add(s1, s1, temp0);
3451     __ add(s2, s2, temp1);
3452   }
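       // For reference only: a scalar sketch of the 16-byte update performed by the
       // helper above (equivalent to the dot-product formulation in its comment):
       //
       //   s2 += 16 * s1;
       //   for (int i = 0; i < 16; i++) {
       //     s1 += b[i];
       //     s2 += (16 - i) * b[i];
       //   }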
3453 
3454   /**
3455    *  Arguments:
3456    *
3457    *  Input:
3458    *    c_rarg0   - x address
3459    *    c_rarg1   - x length
3460    *    c_rarg2   - y address
3461    *    c_rarg3   - y length
3462    *    c_rarg4   - z address
3463    *    c_rarg5   - z length
3464    */
3465   address generate_multiplyToLen() {
3466     __ align(CodeEntryAlignment);
3467     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3468 
3469     address start = __ pc();
3470     const Register x     = r0;
3471     const Register xlen  = r1;
3472     const Register y     = r2;
3473     const Register ylen  = r3;
3474     const Register z     = r4;
3475     const Register zlen  = r5;
3476 
3477     const Register tmp1  = r10;
3478     const Register tmp2  = r11;
3479     const Register tmp3  = r12;
3480     const Register tmp4  = r13;
3481     const Register tmp5  = r14;
3482     const Register tmp6  = r15;
3483     const Register tmp7  = r16;
3484 
3485     BLOCK_COMMENT("Entry:");
3486     __ enter(); // required for proper stackwalking of RuntimeStub frame
3487     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3488     __ leave(); // required for proper stackwalking of RuntimeStub frame
3489     __ ret(lr);
3490 
3491     return start;
3492   }
3493 
3494   address generate_squareToLen() {
3495     // The squareToLen algorithm for sizes 1..127 described in the Java code runs
3496     // faster than multiply_to_len on some CPUs and slower on others, but
3497     // multiply_to_len shows slightly better results overall.
3498     __ align(CodeEntryAlignment);
3499     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3500     address start = __ pc();
3501 
3502     const Register x     = r0;
3503     const Register xlen  = r1;
3504     const Register z     = r2;
3505     const Register zlen  = r3;
3506     const Register y     = r4; // == x
3507     const Register ylen  = r5; // == xlen
3508 
3509     const Register tmp1  = r10;
3510     const Register tmp2  = r11;
3511     const Register tmp3  = r12;
3512     const Register tmp4  = r13;
3513     const Register tmp5  = r14;
3514     const Register tmp6  = r15;
3515     const Register tmp7  = r16;
3516 
3517     RegSet spilled_regs = RegSet::of(y, ylen);
3518     BLOCK_COMMENT("Entry:");
3519     __ enter();
3520     __ push(spilled_regs, sp);
3521     __ mov(y, x);
3522     __ mov(ylen, xlen);
3523     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3524     __ pop(spilled_regs, sp);
3525     __ leave();
3526     __ ret(lr);
3527     return start;
3528   }
3529 
3530   address generate_mulAdd() {
3531     __ align(CodeEntryAlignment);
3532     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3533 
3534     address start = __ pc();
3535 
3536     const Register out     = r0;
3537     const Register in      = r1;
3538     const Register offset  = r2;
3539     const Register len     = r3;
3540     const Register k       = r4;
3541 
3542     BLOCK_COMMENT("Entry:");
3543     __ enter();
3544     __ mul_add(out, in, offset, len, k);
3545     __ leave();
3546     __ ret(lr);
3547 
3548     return start;
3549   }
3550 
3551   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3552                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3553                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3554     // Karatsuba multiplication performs a 128*128 -> 256-bit
3555     // multiplication using three 64x64 -> 128-bit carry-less
3556     // multiplications and a few additions.
3557     //
3558     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3559     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3560     //
3561     // Inputs:
3562     //
3563     // A0 in a.d[0]     (subkey)
3564     // A1 in a.d[1]
3565     // (A1+A0) in a1_xor_a0.d[0]
3566     //
3567     // B0 in b.d[0]     (state)
3568     // B1 in b.d[1]
3569 
3570     __ ext(tmp1, __ T16B, b, b, 0x08);
3571     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3572     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3573     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3574     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3575 
3576     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3577     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3578     __ eor(tmp2, __ T16B, tmp2, tmp4);
3579     __ eor(tmp2, __ T16B, tmp2, tmp3);
3580 
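         // Mapping of the three carry-less products above onto the formula:
         //   result_hi = C1:C0 = A1*B1, result_lo = D1:D0 = A0*B0,
         //   tmp2      = E1:E0 = (A1+A0)*(B1+B0).
         // The ext/eor sequence has folded C, D and the cross term into tmp2, so its
         // two 64-bit lanes are exactly the middle words of the 256-bit product
         // (addition is XOR in GF(2)[z]); the two ins instructions below splice them
         // into result_hi:result_lo.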
3581     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3582     __ ins(result_hi, __ D, tmp2, 0, 1);
3583     __ ins(result_lo, __ D, tmp2, 1, 0);
3584   }
3585 
3586   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3587                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3588     const FloatRegister t0 = result;
3589 
3590     // The GCM field polynomial f is z^128 + p(z), where p =
3591     // z^7+z^2+z+1.
3592     //
3593     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3594     //
3595     // so, given that the product we're reducing is
3596     //    a == lo + hi * z^128
3597     // substituting,
3598     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3599     //
3600     // we reduce by multiplying hi by p(z) and subtracting the result
3601     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3602     // bits we can do this with two 64-bit multiplications, lo*p and
3603     // hi*p.
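         // Concretely, writing the input as lo0 + lo1*z^64 + hi0*z^128 + hi1*z^192:
         //   step 1 (pmull2/ext/eor): hi1*z^192 == hi1*p(z)*z^64, so hi1*p is
         //                            XORed into the (hi0:lo1) words;
         //   step 2 (pmull/eor):      hi0*z^128 == hi0*p(z), which is XORed into lo,
         // leaving the 128-bit remainder in 'result'.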
3604 
3605     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3606     __ ext(t1, __ T16B, t0, z, 8);
3607     __ eor(hi, __ T16B, hi, t1);
3608     __ ext(t1, __ T16B, z, t0, 8);
3609     __ eor(lo, __ T16B, lo, t1);
3610     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3611     __ eor(result, __ T16B, lo, t0);
3612   }
3613 
3614   address generate_has_negatives(address &has_negatives_long) {
3615     const u1 large_loop_size = 64;
3616     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
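         // UPPER_BIT_MASK selects the sign bit of every byte in a 64-bit word, so
         // "has negatives" reduces to "does any loaded byte have its top bit set".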
3617     int dcache_line = VM_Version::dcache_line_size();
3618 
3619     Register ary1 = r1, len = r2, result = r0;
3620 
3621     __ align(CodeEntryAlignment);
3622 
3623     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3624 
3625     address entry = __ pc();
3626 
3627     __ enter();
3628 
3629   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3630         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3631 
3632   __ cmp(len, (u1)15);
3633   __ br(Assembler::GT, LEN_OVER_15);
3634   // The only case when execution falls into this code is when the pointer is near
3635   // the end of a memory page and we have to avoid reading the next page
3636   __ add(ary1, ary1, len);
3637   __ subs(len, len, 8);
3638   __ br(Assembler::GT, LEN_OVER_8);
3639   __ ldr(rscratch2, Address(ary1, -8));
3640   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3641   __ lsrv(rscratch2, rscratch2, rscratch1);
3642   __ tst(rscratch2, UPPER_BIT_MASK);
3643   __ cset(result, Assembler::NE);
3644   __ leave();
3645   __ ret(lr);
3646   __ bind(LEN_OVER_8);
3647   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3648   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3649   __ tst(rscratch2, UPPER_BIT_MASK);
3650   __ br(Assembler::NE, RET_TRUE_NO_POP);
3651   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3652   __ lsrv(rscratch1, rscratch1, rscratch2);
3653   __ tst(rscratch1, UPPER_BIT_MASK);
3654   __ cset(result, Assembler::NE);
3655   __ leave();
3656   __ ret(lr);
3657 
3658   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3659   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3660 
3661   has_negatives_long = __ pc(); // 2nd entry point
3662 
3663   __ enter();
3664 
3665   __ bind(LEN_OVER_15);
3666     __ push(spilled_regs, sp);
3667     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3668     __ cbz(rscratch2, ALIGNED);
3669     __ ldp(tmp6, tmp1, Address(ary1));
3670     __ mov(tmp5, 16);
3671     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3672     __ add(ary1, ary1, rscratch1);
3673     __ sub(len, len, rscratch1);
3674     __ orr(tmp6, tmp6, tmp1);
3675     __ tst(tmp6, UPPER_BIT_MASK);
3676     __ br(Assembler::NE, RET_TRUE);
3677 
3678   __ bind(ALIGNED);
3679     __ cmp(len, large_loop_size);
3680     __ br(Assembler::LT, CHECK_16);
3681     // Perform a 16-byte load as an early return in the pre-loop to handle the case
3682     // where an initially aligned large array has negative values in its starting bytes,
3683     // since LARGE_LOOP would otherwise do 4 reads instead of 1 (in the worst case),
3684     // which is slower. Cases with negative bytes further ahead are barely affected;
3685     // in fact they get faster due to the early loads and the fewer instructions and
3686     // branches in LARGE_LOOP.
3687     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3688     __ sub(len, len, 16);
3689     __ orr(tmp6, tmp6, tmp1);
3690     __ tst(tmp6, UPPER_BIT_MASK);
3691     __ br(Assembler::NE, RET_TRUE);
3692     __ cmp(len, large_loop_size);
3693     __ br(Assembler::LT, CHECK_16);
3694 
3695     if (SoftwarePrefetchHintDistance >= 0
3696         && SoftwarePrefetchHintDistance >= dcache_line) {
3697       // initial prefetch
3698       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3699     }
3700   __ bind(LARGE_LOOP);
3701     if (SoftwarePrefetchHintDistance >= 0) {
3702       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3703     }
3704     // Issue the load instructions first, since that can save a few CPU/MEM cycles.
3705     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one per ldp),
3706     // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves
3707     // 3 instructions per iteration and has fewer branches; however, this approach
3708     // disables early return, so all 64 bytes are loaded and checked every time.
3709     __ ldp(tmp2, tmp3, Address(ary1));
3710     __ ldp(tmp4, tmp5, Address(ary1, 16));
3711     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3712     __ ldp(tmp6, tmp1, Address(ary1, 48));
3713     __ add(ary1, ary1, large_loop_size);
3714     __ sub(len, len, large_loop_size);
3715     __ orr(tmp2, tmp2, tmp3);
3716     __ orr(tmp4, tmp4, tmp5);
3717     __ orr(rscratch1, rscratch1, rscratch2);
3718     __ orr(tmp6, tmp6, tmp1);
3719     __ orr(tmp2, tmp2, tmp4);
3720     __ orr(rscratch1, rscratch1, tmp6);
3721     __ orr(tmp2, tmp2, rscratch1);
3722     __ tst(tmp2, UPPER_BIT_MASK);
3723     __ br(Assembler::NE, RET_TRUE);
3724     __ cmp(len, large_loop_size);
3725     __ br(Assembler::GE, LARGE_LOOP);
3726 
3727   __ bind(CHECK_16); // small 16-byte load pre-loop
3728     __ cmp(len, (u1)16);
3729     __ br(Assembler::LT, POST_LOOP16);
3730 
3731   __ bind(LOOP16); // small 16-byte load loop
3732     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3733     __ sub(len, len, 16);
3734     __ orr(tmp2, tmp2, tmp3);
3735     __ tst(tmp2, UPPER_BIT_MASK);
3736     __ br(Assembler::NE, RET_TRUE);
3737     __ cmp(len, (u1)16);
3738     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3739 
3740   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3741     __ cmp(len, (u1)8);
3742     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3743     __ ldr(tmp3, Address(__ post(ary1, 8)));
3744     __ sub(len, len, 8);
3745     __ tst(tmp3, UPPER_BIT_MASK);
3746     __ br(Assembler::NE, RET_TRUE);
3747 
3748   __ bind(POST_LOOP16_LOAD_TAIL);
3749     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3750     __ ldr(tmp1, Address(ary1));
3751     __ mov(tmp2, 64);
3752     __ sub(tmp4, tmp2, len, __ LSL, 3);
3753     __ lslv(tmp1, tmp1, tmp4);
3754     __ tst(tmp1, UPPER_BIT_MASK);
3755     __ br(Assembler::NE, RET_TRUE);
3756     // Fallthrough
3757 
3758   __ bind(RET_FALSE);
3759     __ pop(spilled_regs, sp);
3760     __ leave();
3761     __ mov(result, zr);
3762     __ ret(lr);
3763 
3764   __ bind(RET_TRUE);
3765     __ pop(spilled_regs, sp);
3766   __ bind(RET_TRUE_NO_POP);
3767     __ leave();
3768     __ mov(result, 1);
3769     __ ret(lr);
3770 
3771   __ bind(DONE);
3772     __ pop(spilled_regs, sp);
3773     __ leave();
3774     __ ret(lr);
3775     return entry;
3776   }
3777 
3778   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3779         bool usePrefetch, Label &NOT_EQUAL) {
3780     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3781         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3782         tmp7 = r12, tmp8 = r13;
3783     Label LOOP;
3784 
3785     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3786     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3787     __ bind(LOOP);
3788     if (usePrefetch) {
3789       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3790       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3791     }
3792     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3793     __ eor(tmp1, tmp1, tmp2);
3794     __ eor(tmp3, tmp3, tmp4);
3795     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3796     __ orr(tmp1, tmp1, tmp3);
3797     __ cbnz(tmp1, NOT_EQUAL);
3798     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3799     __ eor(tmp5, tmp5, tmp6);
3800     __ eor(tmp7, tmp7, tmp8);
3801     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3802     __ orr(tmp5, tmp5, tmp7);
3803     __ cbnz(tmp5, NOT_EQUAL);
3804     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3805     __ eor(tmp1, tmp1, tmp2);
3806     __ eor(tmp3, tmp3, tmp4);
3807     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3808     __ orr(tmp1, tmp1, tmp3);
3809     __ cbnz(tmp1, NOT_EQUAL);
3810     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3811     __ eor(tmp5, tmp5, tmp6);
3812     __ sub(cnt1, cnt1, 8 * wordSize);
3813     __ eor(tmp7, tmp7, tmp8);
3814     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3815     // tmp6 is not used. MacroAssembler::subs is used here (rather than
3816     // cmp) because subs allows an unlimited range of immediate operands.
3817     __ subs(tmp6, cnt1, loopThreshold);
3818     __ orr(tmp5, tmp5, tmp7);
3819     __ cbnz(tmp5, NOT_EQUAL);
3820     __ br(__ GE, LOOP);
3821     // post-loop
3822     __ eor(tmp1, tmp1, tmp2);
3823     __ eor(tmp3, tmp3, tmp4);
3824     __ orr(tmp1, tmp1, tmp3);
3825     __ sub(cnt1, cnt1, 2 * wordSize);
3826     __ cbnz(tmp1, NOT_EQUAL);
3827   }
3828 
3829   void generate_large_array_equals_loop_simd(int loopThreshold,
3830         bool usePrefetch, Label &NOT_EQUAL) {
3831     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3832         tmp2 = rscratch2;
3833     Label LOOP;
3834 
3835     __ bind(LOOP);
3836     if (usePrefetch) {
3837       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3838       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3839     }
3840     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3841     __ sub(cnt1, cnt1, 8 * wordSize);
3842     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3843     __ subs(tmp1, cnt1, loopThreshold);
3844     __ eor(v0, __ T16B, v0, v4);
3845     __ eor(v1, __ T16B, v1, v5);
3846     __ eor(v2, __ T16B, v2, v6);
3847     __ eor(v3, __ T16B, v3, v7);
3848     __ orr(v0, __ T16B, v0, v1);
3849     __ orr(v1, __ T16B, v2, v3);
3850     __ orr(v0, __ T16B, v0, v1);
3851     __ umov(tmp1, v0, __ D, 0);
3852     __ umov(tmp2, v0, __ D, 1);
3853     __ orr(tmp1, tmp1, tmp2);
3854     __ cbnz(tmp1, NOT_EQUAL);
3855     __ br(__ GE, LOOP);
3856   }
3857 
3858   // a1 = r1 - array1 address
3859   // a2 = r2 - array2 address
3860   // result = r0 - return value. Already contains "false"
3861   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
3862   // r3-r5 are reserved temporary registers
3863   address generate_large_array_equals() {
3864     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3865         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3866         tmp7 = r12, tmp8 = r13;
3867     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3868         SMALL_LOOP, POST_LOOP;
3869     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3870     // loop threshold chosen so that at least 32 of the prefetched bytes are actually used
3871     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3872     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3873     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3874     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3875         tmp5, tmp6, tmp7, tmp8);
3876 
3877     __ align(CodeEntryAlignment);
3878 
3879     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3880 
3881     address entry = __ pc();
3882     __ enter();
3883     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
3884     // also advance pointers to use post-increment instead of pre-increment
3885     __ add(a1, a1, wordSize);
3886     __ add(a2, a2, wordSize);
3887     if (AvoidUnalignedAccesses) {
3888       // Both implementations (SIMD/non-SIMD) use relatively large load
3889       // instructions (ld1/ldp), which carry a big penalty (up to 2x execution time)
3890       // on some CPUs when the address is not at least 16-byte aligned.
3891       // Arrays are currently 8-byte aligned, so we can do an additional 8-byte
3892       // load if needed, at least for the 1st address, to make it 16-byte aligned.
3893       Label ALIGNED16;
3894       __ tbz(a1, 3, ALIGNED16);
3895       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3896       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3897       __ sub(cnt1, cnt1, wordSize);
3898       __ eor(tmp1, tmp1, tmp2);
3899       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3900       __ bind(ALIGNED16);
3901     }
3902     if (UseSIMDForArrayEquals) {
3903       if (SoftwarePrefetchHintDistance >= 0) {
3904         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3905         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3906         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3907             /* prfm = */ true, NOT_EQUAL);
3908         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3909         __ br(__ LT, TAIL);
3910       }
3911       __ bind(NO_PREFETCH_LARGE_LOOP);
3912       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3913           /* prfm = */ false, NOT_EQUAL);
3914     } else {
3915       __ push(spilled_regs, sp);
3916       if (SoftwarePrefetchHintDistance >= 0) {
3917         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3918         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3919         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3920             /* prfm = */ true, NOT_EQUAL);
3921         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3922         __ br(__ LT, TAIL);
3923       }
3924       __ bind(NO_PREFETCH_LARGE_LOOP);
3925       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3926           /* prfm = */ false, NOT_EQUAL);
3927     }
3928     __ bind(TAIL);
3929       __ cbz(cnt1, EQUAL);
3930       __ subs(cnt1, cnt1, wordSize);
3931       __ br(__ LE, POST_LOOP);
3932     __ bind(SMALL_LOOP);
3933       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3934       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3935       __ subs(cnt1, cnt1, wordSize);
3936       __ eor(tmp1, tmp1, tmp2);
3937       __ cbnz(tmp1, NOT_EQUAL);
3938       __ br(__ GT, SMALL_LOOP);
3939     __ bind(POST_LOOP);
3940       __ ldr(tmp1, Address(a1, cnt1));
3941       __ ldr(tmp2, Address(a2, cnt1));
3942       __ eor(tmp1, tmp1, tmp2);
3943       __ cbnz(tmp1, NOT_EQUAL);
3944     __ bind(EQUAL);
3945       __ mov(result, true);
3946     __ bind(NOT_EQUAL);
3947       if (!UseSIMDForArrayEquals) {
3948         __ pop(spilled_regs, sp);
3949       }
3950     __ bind(NOT_EQUAL_NO_POP);
3951     __ leave();
3952     __ ret(lr);
3953     return entry;
3954   }
3955 
3956   address generate_dsin_dcos(bool isCos) {
3957     __ align(CodeEntryAlignment);
3958     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
3959     address start = __ pc();
3960     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
3961         (address)StubRoutines::aarch64::_two_over_pi,
3962         (address)StubRoutines::aarch64::_pio2,
3963         (address)StubRoutines::aarch64::_dsin_coef,
3964         (address)StubRoutines::aarch64::_dcos_coef);
3965     return start;
3966   }
3967 
3968   address generate_dlog() {
3969     __ align(CodeEntryAlignment);
3970     StubCodeMark mark(this, "StubRoutines", "dlog");
3971     address entry = __ pc();
3972     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
3973         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
3974     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
3975     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
3976         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
3977     return entry;
3978   }
3979 
3980   // code for comparing 16 bytes of strings with same encoding
3981   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
3982     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
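         // Software-pipelined: tmp1/tmp2 must already hold the previous 8 bytes of
         // str1/str2. They are compared here while the next 16 bytes are loaded, and
         // the last 8 bytes loaded are left in tmp1/tmp2 for the next call (or for
         // the caller's tail check).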
3983     __ ldr(rscratch1, Address(__ post(str1, 8)));
3984     __ eor(rscratch2, tmp1, tmp2);
3985     __ ldr(cnt1, Address(__ post(str2, 8)));
3986     __ cbnz(rscratch2, DIFF1);
3987     __ ldr(tmp1, Address(__ post(str1, 8)));
3988     __ eor(rscratch2, rscratch1, cnt1);
3989     __ ldr(tmp2, Address(__ post(str2, 8)));
3990     __ cbnz(rscratch2, DIFF2);
3991   }
3992 
3993   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
3994   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
3995       Label &DIFF2) {
3996     Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12;
3997     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
3998 
3999     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4000     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4001     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4002     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4003 
4004     __ fmovd(tmpL, vtmp3);
4005     __ eor(rscratch2, tmp3, tmpL);
4006     __ cbnz(rscratch2, DIFF2);
4007 
4008     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4009     __ umov(tmpL, vtmp3, __ D, 1);
4010     __ eor(rscratch2, tmpU, tmpL);
4011     __ cbnz(rscratch2, DIFF1);
4012 
4013     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4014     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4015     __ fmovd(tmpL, vtmp);
4016     __ eor(rscratch2, tmp3, tmpL);
4017     __ cbnz(rscratch2, DIFF2);
4018 
4019     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4020     __ umov(tmpL, vtmp, __ D, 1);
4021     __ eor(rscratch2, tmpU, tmpL);
4022     __ cbnz(rscratch2, DIFF1);
4023   }
4024 
4025   // r0  = result
4026   // r1  = str1
4027   // r2  = cnt1
4028   // r3  = str2
4029   // r4  = cnt2
4030   // r10 = tmp1
4031   // r11 = tmp2
4032   address generate_compare_long_string_different_encoding(bool isLU) {
4033     __ align(CodeEntryAlignment);
4034     StubCodeMark mark(this, "StubRoutines", isLU
4035         ? "compare_long_string_different_encoding LU"
4036         : "compare_long_string_different_encoding UL");
4037     address entry = __ pc();
4038     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4039         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, SMALL_LOOP_ENTER,
4040         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4041     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4042         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4043     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4044     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4045 
4046     int prefetchLoopExitCondition = MAX(32, SoftwarePrefetchHintDistance/2);
4047 
4048     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4049     // cnt2 == amount of characters left to compare
4050     // Check the already loaded first 4 symbols (vtmp and tmp2(LU)/tmp1(UL))
4051     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4052     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4053     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4054     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4055     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4056     __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1);
4057     __ eor(rscratch2, tmp1, tmp2);
4058     __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0);
4059     __ mov(rscratch1, tmp2);
4060     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4061     Register strU = isLU ? str2 : str1,
4062              strL = isLU ? str1 : str2,
4063              tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4064              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4065     __ push(spilled_regs, sp);
4066     __ sub(tmp2, strL, cnt2); // strL pointer to load from
4067     __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from
4068 
4069     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4070 
4071     if (SoftwarePrefetchHintDistance >= 0) {
4072       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4073       __ br(__ LT, SMALL_LOOP);
4074       __ bind(LARGE_LOOP_PREFETCH);
4075         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4076         __ mov(tmp4, 2);
4077         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4078         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4079           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4080           __ subs(tmp4, tmp4, 1);
4081           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4082           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4083           __ mov(tmp4, 2);
4084         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4085           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4086           __ subs(tmp4, tmp4, 1);
4087           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4088           __ sub(cnt2, cnt2, 64);
4089           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4090           __ br(__ GE, LARGE_LOOP_PREFETCH);
4091     }
4092     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4093     __ subs(cnt2, cnt2, 16);
4094     __ br(__ LT, TAIL);
4095     __ b(SMALL_LOOP_ENTER);
4096     __ bind(SMALL_LOOP); // smaller loop
4097       __ subs(cnt2, cnt2, 16);
4098     __ bind(SMALL_LOOP_ENTER);
4099       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4100       __ br(__ GE, SMALL_LOOP);
4101       __ cbz(cnt2, LOAD_LAST);
4102     __ bind(TAIL); // 1..15 characters left
4103       __ subs(zr, cnt2, -8);
4104       __ br(__ GT, TAIL_LOAD_16);
4105       __ ldrd(vtmp, Address(tmp2));
4106       __ zip1(vtmp3, __ T8B, vtmp, vtmpZ);
4107 
4108       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4109       __ fmovd(tmpL, vtmp3);
4110       __ eor(rscratch2, tmp3, tmpL);
4111       __ cbnz(rscratch2, DIFF2);
4112       __ umov(tmpL, vtmp3, __ D, 1);
4113       __ eor(rscratch2, tmpU, tmpL);
4114       __ cbnz(rscratch2, DIFF1);
4115       __ b(LOAD_LAST);
4116     __ bind(TAIL_LOAD_16);
4117       __ ldrq(vtmp, Address(tmp2));
4118       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4119       __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4120       __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4121       __ fmovd(tmpL, vtmp3);
4122       __ eor(rscratch2, tmp3, tmpL);
4123       __ cbnz(rscratch2, DIFF2);
4124 
4125       __ ldr(tmp3, Address(__ post(cnt1, 8)));
4126       __ umov(tmpL, vtmp3, __ D, 1);
4127       __ eor(rscratch2, tmpU, tmpL);
4128       __ cbnz(rscratch2, DIFF1);
4129 
4130       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4131       __ fmovd(tmpL, vtmp);
4132       __ eor(rscratch2, tmp3, tmpL);
4133       __ cbnz(rscratch2, DIFF2);
4134 
4135       __ umov(tmpL, vtmp, __ D, 1);
4136       __ eor(rscratch2, tmpU, tmpL);
4137       __ cbnz(rscratch2, DIFF1);
4138       __ b(LOAD_LAST);
4139     __ bind(DIFF2);
4140       __ mov(tmpU, tmp3);
4141     __ bind(DIFF1);
4142       __ pop(spilled_regs, sp);
4143       __ b(CALCULATE_DIFFERENCE);
4144     __ bind(LOAD_LAST);
4145       __ pop(spilled_regs, sp);
4146 
4147       __ ldrs(vtmp, Address(strL));
4148       __ ldr(tmpU, Address(strU));
4149       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4150       __ fmovd(tmpL, vtmp);
4151 
4152       __ eor(rscratch2, tmpU, tmpL);
4153       __ cbz(rscratch2, DONE);
4154 
4155     // Find the first different characters in the longwords and
4156     // compute their difference.
4157     __ bind(CALCULATE_DIFFERENCE);
4158       __ rev(rscratch2, rscratch2);
4159       __ clz(rscratch2, rscratch2);
4160       __ andr(rscratch2, rscratch2, -16);
4161       __ lsrv(tmp1, tmp1, rscratch2);
4162       __ uxthw(tmp1, tmp1);
4163       __ lsrv(rscratch1, rscratch1, rscratch2);
4164       __ uxthw(rscratch1, rscratch1);
4165       __ subw(result, tmp1, rscratch1);
4166     __ bind(DONE);
4167       __ ret(lr);
4168     return entry;
4169   }
4170 
4171   // r0  = result
4172   // r1  = str1
4173   // r2  = cnt1
4174   // r3  = str2
4175   // r4  = cnt2
4176   // r10 = tmp1
4177   // r11 = tmp2
4178   address generate_compare_long_string_same_encoding(bool isLL) {
4179     __ align(CodeEntryAlignment);
4180     StubCodeMark mark(this, "StubRoutines", isLL
4181         ? "compare_long_string_same_encoding LL"
4182         : "compare_long_string_same_encoding UU");
4183     address entry = __ pc();
4184     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4185         tmp1 = r10, tmp2 = r11;
4186     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4187         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4188         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4189     // exit the large loop when fewer than 64 bytes are left to read or we're about
4190     // to prefetch memory past the end of the array
4191     int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4192     // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used
4193     // update cnt2 counter with already loaded 8 bytes
4194     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4195     // update pointers, because of previous read
4196     __ add(str1, str1, wordSize);
4197     __ add(str2, str2, wordSize);
4198     if (SoftwarePrefetchHintDistance >= 0) {
4199       __ bind(LARGE_LOOP_PREFETCH);
4200         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4201         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4202         compare_string_16_bytes_same(DIFF, DIFF2);
4203         compare_string_16_bytes_same(DIFF, DIFF2);
4204         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4205         compare_string_16_bytes_same(DIFF, DIFF2);
4206         __ subs(rscratch2, cnt2, largeLoopExitCondition);
4207         compare_string_16_bytes_same(DIFF, DIFF2);
4208         __ br(__ GT, LARGE_LOOP_PREFETCH);
4209         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4210         // less than 16 bytes left?
4211         __ subs(cnt2, cnt2, isLL ? 16 : 8);
4212         __ br(__ LT, TAIL);
4213     }
4214     __ bind(SMALL_LOOP);
4215       compare_string_16_bytes_same(DIFF, DIFF2);
4216       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4217       __ br(__ GE, SMALL_LOOP);
4218     __ bind(TAIL);
4219       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4220       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4221       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4222       __ br(__ LE, CHECK_LAST);
4223       __ eor(rscratch2, tmp1, tmp2);
4224       __ cbnz(rscratch2, DIFF);
4225       __ ldr(tmp1, Address(__ post(str1, 8)));
4226       __ ldr(tmp2, Address(__ post(str2, 8)));
4227       __ sub(cnt2, cnt2, isLL ? 8 : 4);
4228     __ bind(CHECK_LAST);
4229       if (!isLL) {
4230         __ add(cnt2, cnt2, cnt2); // now in bytes
4231       }
4232       __ eor(rscratch2, tmp1, tmp2);
4233       __ cbnz(rscratch2, DIFF);
4234       __ ldr(rscratch1, Address(str1, cnt2));
4235       __ ldr(cnt1, Address(str2, cnt2));
4236       __ eor(rscratch2, rscratch1, cnt1);
4237       __ cbz(rscratch2, LENGTH_DIFF);
4238       // Find the first different characters in the longwords and
4239       // compute their difference.
4240     __ bind(DIFF2);
4241       __ rev(rscratch2, rscratch2);
4242       __ clz(rscratch2, rscratch2);
4243       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4244       __ lsrv(rscratch1, rscratch1, rscratch2);
4245       if (isLL) {
4246         __ lsrv(cnt1, cnt1, rscratch2);
4247         __ uxtbw(rscratch1, rscratch1);
4248         __ uxtbw(cnt1, cnt1);
4249       } else {
4250         __ lsrv(cnt1, cnt1, rscratch2);
4251         __ uxthw(rscratch1, rscratch1);
4252         __ uxthw(cnt1, cnt1);
4253       }
4254       __ subw(result, rscratch1, cnt1);
4255       __ b(LENGTH_DIFF);
4256     __ bind(DIFF);
4257       __ rev(rscratch2, rscratch2);
4258       __ clz(rscratch2, rscratch2);
4259       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4260       __ lsrv(tmp1, tmp1, rscratch2);
4261       if (isLL) {
4262         __ lsrv(tmp2, tmp2, rscratch2);
4263         __ uxtbw(tmp1, tmp1);
4264         __ uxtbw(tmp2, tmp2);
4265       } else {
4266         __ lsrv(tmp2, tmp2, rscratch2);
4267         __ uxthw(tmp1, tmp1);
4268         __ uxthw(tmp2, tmp2);
4269       }
4270       __ subw(result, tmp1, tmp2);
4271       __ b(LENGTH_DIFF);
4272     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4273       __ eor(rscratch2, tmp1, tmp2);
4274       __ cbnz(rscratch2, DIFF);
4275     __ bind(LENGTH_DIFF);
4276       __ ret(lr);
4277     return entry;
4278   }
4279 
4280   void generate_compare_long_strings() {
4281       StubRoutines::aarch64::_compare_long_string_LL
4282           = generate_compare_long_string_same_encoding(true);
4283       StubRoutines::aarch64::_compare_long_string_UU
4284           = generate_compare_long_string_same_encoding(false);
4285       StubRoutines::aarch64::_compare_long_string_LU
4286           = generate_compare_long_string_different_encoding(true);
4287       StubRoutines::aarch64::_compare_long_string_UL
4288           = generate_compare_long_string_different_encoding(false);
4289   }
4290 
4291   // R0 = result
4292   // R1 = str2
4293   // R2 = cnt1
4294   // R3 = str1
4295   // R4 = cnt2
4296   // This generic linear code uses a few additional ideas which make it faster:
4297   // 1) we can safely keep at least the 1st register of the pattern (since
4298   // length >= 8) in order to skip its initial load (helps on systems with 1 ld pipeline)
4299   // 2) we can use the "fast" algorithm for finding a single character: search
4300   // for the first symbol with fewer branches (one branch per loaded register
4301   // instead of one branch per symbol); this is where constants like
4302   // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
4303   // 3) after loading and analyzing the 1st register of the source string, it
4304   // can be reused to search for every occurrence of the 1st character, saving
4305   // a few loads compared with a simpler-but-slower implementation
4306   // 4) in order to avoid lots of push/pop operations, the code below heavily
4307   // re-uses/re-initializes/compresses register values, which makes the code
4308   // larger and a bit less readable; however, most of the extra operations are
4309   // issued during loads or branches, so the penalty is minimal
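       //
       // A rough C sketch of the SWAR first-character search from idea 2
       // (Latin-1 case, illustrative only): 'chunk' is 8 bytes of str2 and
       // 'first' holds the first pattern character:
       //   uint64_t x     = chunk ^ (first * 0x0101010101010101ULL);
       //   uint64_t zeros = (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
       // 'zeros' is nonzero iff the chunk contains the first pattern character;
       // its lowest set bit marks the first candidate, which rbit + clz locate
       // (later candidates are re-verified by the compare loops). The UTF-16
       // variants use 0x0001000100010001 and 0x7fff7fff7fff7fff instead.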
4310   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4311     const char* stubName = str1_isL
4312         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4313         : "indexof_linear_uu";
4314     __ align(CodeEntryAlignment);
4315     StubCodeMark mark(this, "StubRoutines", stubName);
4316     address entry = __ pc();
4317 
4318     int str1_chr_size = str1_isL ? 1 : 2;
4319     int str2_chr_size = str2_isL ? 1 : 2;
4320     int str1_chr_shift = str1_isL ? 0 : 1;
4321     int str2_chr_shift = str2_isL ? 0 : 1;
4322     bool isL = str1_isL && str2_isL;
4323     // parameters
4324     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4325     // temporary registers
4326     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4327     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4328     // redefinitions
4329     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4330 
4331     __ push(spilled_regs, sp);
4332     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
4333         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4334         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4335         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4336         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4337         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
4338     // Read a whole register from str1. It is safe because length >= 8 here
4339     __ ldr(ch1, Address(str1));
4340     // Read a whole register from str2. It is safe because length >= 8 here
4341     __ ldr(ch2, Address(str2));
4342     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4343     if (str1_isL != str2_isL) {
4344       __ eor(v0, __ T16B, v0, v0);
4345     }
4346     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4347     __ mul(first, first, tmp1);
4348     // check if we have less than 1 register's worth of characters to check
4349     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4350     if (str1_isL != str2_isL) {
4351       __ fmovd(v1, ch1);
4352     }
4353     __ br(__ LE, L_SMALL);
4354     __ eor(ch2, first, ch2);
4355     if (str1_isL != str2_isL) {
4356       __ zip1(v1, __ T16B, v1, v0);
4357     }
4358     __ sub(tmp2, ch2, tmp1);
4359     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4360     __ bics(tmp2, tmp2, ch2);
4361     if (str1_isL != str2_isL) {
4362       __ fmovd(ch1, v1);
4363     }
4364     __ br(__ NE, L_HAS_ZERO);
4365     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4366     __ add(result, result, wordSize/str2_chr_size);
4367     __ add(str2, str2, wordSize);
4368     __ br(__ LT, L_POST_LOOP);
4369     __ BIND(L_LOOP);
4370       __ ldr(ch2, Address(str2));
4371       __ eor(ch2, first, ch2);
4372       __ sub(tmp2, ch2, tmp1);
4373       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4374       __ bics(tmp2, tmp2, ch2);
4375       __ br(__ NE, L_HAS_ZERO);
4376     __ BIND(L_LOOP_PROCEED);
4377       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4378       __ add(str2, str2, wordSize);
4379       __ add(result, result, wordSize/str2_chr_size);
4380       __ br(__ GE, L_LOOP);
4381     __ BIND(L_POST_LOOP);
4382       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
4383       __ br(__ LE, NOMATCH);
4384       __ ldr(ch2, Address(str2));
4385       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4386       __ eor(ch2, first, ch2);
4387       __ sub(tmp2, ch2, tmp1);
4388       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4389       __ mov(tmp4, -1); // all bits set
4390       __ b(L_SMALL_PROCEED);
4391     __ align(OptoLoopAlignment);
4392     __ BIND(L_SMALL);
4393       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4394       __ eor(ch2, first, ch2);
4395       if (str1_isL != str2_isL) {
4396         __ zip1(v1, __ T16B, v1, v0);
4397       }
4398       __ sub(tmp2, ch2, tmp1);
4399       __ mov(tmp4, -1); // all bits set
4400       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4401       if (str1_isL != str2_isL) {
4402         __ fmovd(ch1, v1); // move converted 4 symbols
4403       }
4404     __ BIND(L_SMALL_PROCEED);
4405       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
4406       __ bic(tmp2, tmp2, ch2);
4407       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
4408       __ rbit(tmp2, tmp2);
4409       __ br(__ EQ, NOMATCH);
4410     __ BIND(L_SMALL_HAS_ZERO_LOOP);
4411       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
4412       __ cmp(cnt1, u1(wordSize/str2_chr_size));
4413       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4414       if (str2_isL) { // LL
4415         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4416         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4417         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4418         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4419         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4420       } else {
4421         __ mov(ch2, 0xE); // all bits in byte set except last one
4422         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4423         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4424         __ lslv(tmp2, tmp2, tmp4);
4425         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4426         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4427         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4428         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4429       }
4430       __ cmp(ch1, ch2);
4431       __ mov(tmp4, wordSize/str2_chr_size);
4432       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4433     __ BIND(L_SMALL_CMP_LOOP);
4434       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4435                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4436       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4437                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4438       __ add(tmp4, tmp4, 1);
4439       __ cmp(tmp4, cnt1);
4440       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4441       __ cmp(first, ch2);
4442       __ br(__ EQ, L_SMALL_CMP_LOOP);
4443     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4444       __ cbz(tmp2, NOMATCH); // no more matches. exit
4445       __ clz(tmp4, tmp2);
4446       __ add(result, result, 1); // advance index
4447       __ add(str2, str2, str2_chr_size); // advance pointer
4448       __ b(L_SMALL_HAS_ZERO_LOOP);
4449     __ align(OptoLoopAlignment);
4450     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4451       __ cmp(first, ch2);
4452       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4453       __ b(DONE);
4454     __ align(OptoLoopAlignment);
4455     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4456       if (str2_isL) { // LL
4457         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4458         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4459         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4460         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4461         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4462       } else {
4463         __ mov(ch2, 0xE); // all bits in byte set except last one
4464         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4465         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4466         __ lslv(tmp2, tmp2, tmp4);
4467         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4468         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4469         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4470         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4471       }
4472       __ cmp(ch1, ch2);
4473       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4474       __ b(DONE);
4475     __ align(OptoLoopAlignment);
4476     __ BIND(L_HAS_ZERO);
4477       __ rbit(tmp2, tmp2);
4478       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
4479       // Now, compress the counters (cnt2 and cnt1) into one register.
4480       // It's fine because both counters are 32-bit and are not changed in this
4481       // loop; just restore them on exit. So, cnt1 can be re-used in this loop.
4482       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
4483       __ sub(result, result, 1);
4484     __ BIND(L_HAS_ZERO_LOOP);
4485       __ mov(cnt1, wordSize/str2_chr_size);
4486       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4487       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4488       if (str2_isL) {
4489         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4490         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4491         __ lslv(tmp2, tmp2, tmp4);
4492         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4493         __ add(tmp4, tmp4, 1);
4494         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4495         __ lsl(tmp2, tmp2, 1);
4496         __ mov(tmp4, wordSize/str2_chr_size);
4497       } else {
4498         __ mov(ch2, 0xE);
4499         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4500         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4501         __ lslv(tmp2, tmp2, tmp4);
4502         __ add(tmp4, tmp4, 1);
4503         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4504         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4505         __ lsl(tmp2, tmp2, 1);
4506         __ mov(tmp4, wordSize/str2_chr_size);
4507         __ sub(str2, str2, str2_chr_size);
4508       }
4509       __ cmp(ch1, ch2);
4510       __ mov(tmp4, wordSize/str2_chr_size);
4511       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4512     __ BIND(L_CMP_LOOP);
4513       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4514                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4515       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4516                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4517       __ add(tmp4, tmp4, 1);
4518       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4519       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4520       __ cmp(cnt1, ch2);
4521       __ br(__ EQ, L_CMP_LOOP);
4522     __ BIND(L_CMP_LOOP_NOMATCH);
4523       // here the current candidate did not match
4524       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4525       __ clz(tmp4, tmp2);
4526       __ add(str2, str2, str2_chr_size); // advance pointer
4527       __ b(L_HAS_ZERO_LOOP);
4528     __ align(OptoLoopAlignment);
4529     __ BIND(L_CMP_LOOP_LAST_CMP);
4530       __ cmp(cnt1, ch2);
4531       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4532       __ b(DONE);
4533     __ align(OptoLoopAlignment);
4534     __ BIND(L_CMP_LOOP_LAST_CMP2);
4535       if (str2_isL) {
4536         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4537         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4538         __ lslv(tmp2, tmp2, tmp4);
4539         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4540         __ add(tmp4, tmp4, 1);
4541         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4542         __ lsl(tmp2, tmp2, 1);
4543       } else {
4544         __ mov(ch2, 0xE);
4545         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4546         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4547         __ lslv(tmp2, tmp2, tmp4);
4548         __ add(tmp4, tmp4, 1);
4549         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4550         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4551         __ lsl(tmp2, tmp2, 1);
4552         __ sub(str2, str2, str2_chr_size);
4553       }
4554       __ cmp(ch1, ch2);
4555       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4556       __ b(DONE);
4557     __ align(OptoLoopAlignment);
4558     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
4559       // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
4560       // until the L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP,
4561       // so result was increased by at most wordSize/str2_chr_size - 1 and the
4562       // respective high bits weren't changed. L_LOOP_PROCEED will increase result
4563       // by the number of analyzed characters, so we can just reset the lower bits
4564       // of result here (clear the 2 lower bits for UU/UL and 3 bits for LL).
4565       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
4566       // 3) Advance str2 to the next str2 octet: result & 7 (or & 3) is the index
4567       // of the last analyzed substring inside the current octet, so str2 is at
4568       // that start address and needs to be advanced to the next octet.
4569       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4570       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4571       __ bfm(result, zr, 0, 2 - str2_chr_shift);
4572       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4573       __ movw(cnt2, cnt2);
4574       __ b(L_LOOP_PROCEED);
4575     __ align(OptoLoopAlignment);
4576     __ BIND(NOMATCH);
4577       __ mov(result, -1);
4578     __ BIND(DONE);
4579       __ pop(spilled_regs, sp);
4580       __ ret(lr);
4581     return entry;
4582   }
4583 
4584   void generate_string_indexof_stubs() {
4585     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
4586     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
4587     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
4588   }
4589 
4590   void inflate_and_store_2_fp_registers(bool generatePrfm,
4591       FloatRegister src1, FloatRegister src2) {
4592     Register dst = r1;
4593     __ zip1(v1, __ T16B, src1, v0);
4594     __ zip2(v2, __ T16B, src1, v0);
4595     if (generatePrfm) {
4596       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4597     }
4598     __ zip1(v3, __ T16B, src2, v0);
4599     __ zip2(v4, __ T16B, src2, v0);
4600     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4601   }
4602 
4603   // R0 = src
4604   // R1 = dst
4605   // R2 = len
4606   // R3 = len >> 3
4607   // v0 = 0
4608   // v1 = loaded 8 bytes
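       //
       // Roughly (illustrative sketch only), this stub widens Latin-1 bytes to
       // UTF-16 chars:
       //   for (size_t i = 0; i < len; i++) dst[i] = (jchar)(src[i] & 0xff);
       // using zip1/zip2 with a zero vector (v0) to interleave a zero byte after
       // each source byte, processing 64 source bytes per main-loop iteration.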
4609   address generate_large_byte_array_inflate() {
4610     __ align(CodeEntryAlignment);
4611     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4612     address entry = __ pc();
4613     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4614     Register src = r0, dst = r1, len = r2, octetCounter = r3;
4615     const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
4616 
4617     // do one more 8-byte read so that the address is 16-byte aligned in most
4618     // cases, and use a single store instruction
4619     __ ldrd(v2, __ post(src, 8));
4620     __ sub(octetCounter, octetCounter, 2);
4621     __ zip1(v1, __ T16B, v1, v0);
4622     __ zip1(v2, __ T16B, v2, v0);
4623     __ st1(v1, v2, __ T16B, __ post(dst, 32));
4624     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4625     __ subs(rscratch1, octetCounter, large_loop_threshold);
4626     __ br(__ LE, LOOP_START);
4627     __ b(LOOP_PRFM_START);
4628     __ bind(LOOP_PRFM);
4629       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4630     __ bind(LOOP_PRFM_START);
4631       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4632       __ sub(octetCounter, octetCounter, 8);
4633       __ subs(rscratch1, octetCounter, large_loop_threshold);
4634       inflate_and_store_2_fp_registers(true, v3, v4);
4635       inflate_and_store_2_fp_registers(true, v5, v6);
4636       __ br(__ GT, LOOP_PRFM);
4637       __ cmp(octetCounter, (u1)8);
4638       __ br(__ LT, DONE);
4639     __ bind(LOOP);
4640       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4641       __ bind(LOOP_START);
4642       __ sub(octetCounter, octetCounter, 8);
4643       __ cmp(octetCounter, (u1)8);
4644       inflate_and_store_2_fp_registers(false, v3, v4);
4645       inflate_and_store_2_fp_registers(false, v5, v6);
4646       __ br(__ GE, LOOP);
4647     __ bind(DONE);
4648       __ ret(lr);
4649     return entry;
4650   }
4651 
4652   /**
4653    *  Arguments:
4654    *
4655    *  Input:
4656    *  c_rarg0   - current state address
4657    *  c_rarg1   - H key address
4658    *  c_rarg2   - data address
4659    *  c_rarg3   - number of blocks
4660    *
4661    *  Output:
4662    *  Updated state at c_rarg0
4663    */
4664   address generate_ghash_processBlocks() {
4665     // Bafflingly, GCM uses little-endian for the byte order, but
4666     // big-endian for the bit order.  For example, the polynomial 1 is
4667     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4668     //
4669     // So, we must either reverse the bytes in each word and do
4670     // everything big-endian or reverse the bits in each byte and do
4671     // it little-endian.  On AArch64 it's more idiomatic to reverse
4672     // the bits in each byte (we have an instruction, RBIT, to do
4673     // that) and keep the data in little-endian bit order throughout the
4674     // calculation, bit-reversing the inputs and outputs.
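         //
         // At a high level, each iteration of the loop below computes, in
         // GF(2^128), state = (state ^ data_block) * H, reduced modulo the
         // GHASH field polynomial.  In rough pseudocode (gf128_mul is a
         // hypothetical helper, illustrative only):
         //   for (int i = 0; i < blocks; i++)
         //     state = gf128_mul(state ^ data[i], H);
         // ghash_multiply performs the carry-less 128x128-bit multiply (using a
         // Karatsuba split) and ghash_reduce folds the 256-bit product back
         // into 128 bits using the 0x87 polynomial constant emitted below.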
4675 
4676     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4677     __ align(wordSize * 2);
4678     address p = __ pc();
4679     __ emit_int64(0x87);  // The low-order bits of the field
4680                           // polynomial (i.e. p = z^7+z^2+z+1)
4681                           // repeated in the low and high parts of a
4682                           // 128-bit vector
4683     __ emit_int64(0x87);
4684 
4685     __ align(CodeEntryAlignment);
4686     address start = __ pc();
4687 
4688     Register state   = c_rarg0;
4689     Register subkeyH = c_rarg1;
4690     Register data    = c_rarg2;
4691     Register blocks  = c_rarg3;
4692 
4693     FloatRegister vzr = v30;
4694     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4695 
4696     __ ldrq(v0, Address(state));
4697     __ ldrq(v1, Address(subkeyH));
4698 
4699     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4700     __ rbit(v0, __ T16B, v0);
4701     __ rev64(v1, __ T16B, v1);
4702     __ rbit(v1, __ T16B, v1);
4703 
4704     __ ldrq(v26, p);
4705 
4706     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4707     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4708 
4709     {
4710       Label L_ghash_loop;
4711       __ bind(L_ghash_loop);
4712 
4713       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4714                                                  // reversing each byte
4715       __ rbit(v2, __ T16B, v2);
4716       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4717 
4718       // Multiply state in v2 by subkey in v1
4719       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4720                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4721                      /*temps*/v6, v20, v18, v21);
4722       // Reduce v7:v5 by the field polynomial
4723       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4724 
4725       __ sub(blocks, blocks, 1);
4726       __ cbnz(blocks, L_ghash_loop);
4727     }
4728 
4729     // The bit-reversed result is at this point in v0
4730     __ rev64(v1, __ T16B, v0);
4731     __ rbit(v1, __ T16B, v1);
4732 
4733     __ st1(v1, __ T16B, state);
4734     __ ret(lr);
4735 
4736     return start;
4737   }
4738 
4739   // Continuation point for throwing of implicit exceptions that are
4740   // not handled in the current activation. Fabricates an exception
4741   // oop and initiates normal exception dispatching in this
4742   // frame. Since we need to preserve callee-saved values (currently
4743   // only for C2, but done for C1 as well) we need a callee-saved oop
4744   // map and therefore have to make these stubs into RuntimeStubs
4745   // rather than BufferBlobs.  If the compiler needs all registers to
4746   // be preserved between the fault point and the exception handler
4747   // then it must assume responsibility for that in
4748   // AbstractCompiler::continuation_for_implicit_null_exception or
4749   // continuation_for_implicit_division_by_zero_exception. All other
4750   // implicit exceptions (e.g., NullPointerException or
4751   // AbstractMethodError on entry) are either at call sites or
4752   // otherwise assume that stack unwinding will be initiated, so
4753   // caller saved registers were assumed volatile in the compiler.
4754 
4755 #undef __
4756 #define __ masm->
4757 
4758   address generate_throw_exception(const char* name,
4759                                    address runtime_entry,
4760                                    Register arg1 = noreg,
4761                                    Register arg2 = noreg) {
4762     // Information about frame layout at time of blocking runtime call.
4763     // Note that we only have to preserve callee-saved registers since
4764     // the compilers are responsible for supplying a continuation point
4765     // if they expect all registers to be preserved.
4766     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4767     enum layout {
4768       rfp_off = 0,
4769       rfp_off2,
4770       return_off,
4771       return_off2,
4772       framesize // inclusive of return address
4773     };
4774 
4775     int insts_size = 512;
4776     int locs_size  = 64;
4777 
4778     CodeBuffer code(name, insts_size, locs_size);
4779     OopMapSet* oop_maps  = new OopMapSet();
4780     MacroAssembler* masm = new MacroAssembler(&code);
4781 
4782     address start = __ pc();
4783 
4784     // This is an inlined and slightly modified version of call_VM
4785     // which has the ability to fetch the return PC out of
4786     // thread-local storage and also sets up last_Java_sp slightly
4787     // differently than the real call_VM
4788 
4789     __ enter(); // Save FP and LR before call
4790 
4791     assert(is_even(framesize/2), "sp not 16-byte aligned");
4792 
4793     // lr and fp are already in place
4794     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4795 
4796     int frame_complete = __ pc() - start;
4797 
4798     // Set up last_Java_sp and last_Java_fp
4799     address the_pc = __ pc();
4800     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
4801 
4802     // Call runtime
4803     if (arg1 != noreg) {
4804       assert(arg2 != c_rarg1, "clobbered");
4805       __ mov(c_rarg1, arg1);
4806     }
4807     if (arg2 != noreg) {
4808       __ mov(c_rarg2, arg2);
4809     }
4810     __ mov(c_rarg0, rthread);
4811     BLOCK_COMMENT("call runtime_entry");
4812     __ mov(rscratch1, runtime_entry);
4813     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
4814 
4815     // Generate oop map
4816     OopMap* map = new OopMap(framesize, 0);
4817 
4818     oop_maps->add_gc_map(the_pc - start, map);
4819 
4820     __ reset_last_Java_frame(true);
4821     __ maybe_isb();
4822 
4823     __ leave();
4824 
4825     // check for pending exceptions
4826 #ifdef ASSERT
4827     Label L;
4828     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4829     __ cbnz(rscratch1, L);
4830     __ should_not_reach_here();
4831     __ bind(L);
4832 #endif // ASSERT
4833     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4834 
4835 
4836     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4837     RuntimeStub* stub =
4838       RuntimeStub::new_runtime_stub(name,
4839                                     &code,
4840                                     frame_complete,
4841                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4842                                     oop_maps, false);
4843     return stub->entry_point();
4844   }
4845 
4846   class MontgomeryMultiplyGenerator : public MacroAssembler {
4847 
4848     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4849       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4850 
4851     RegSet _toSave;
4852     bool _squaring;
4853 
4854   public:
4855     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4856       : MacroAssembler(as->code()), _squaring(squaring) {
4857 
4858       // Register allocation
4859 
4860       Register reg = c_rarg0;
4861       Pa_base = reg;       // Argument registers
4862       if (squaring)
4863         Pb_base = Pa_base;
4864       else
4865         Pb_base = ++reg;
4866       Pn_base = ++reg;
4867       Rlen = ++reg;
4868       inv = ++reg;
4869       Pm_base = ++reg;
4870 
4871                           // Working registers:
4872       Ra =  ++reg;        // The current digit of a, b, n, and m.
4873       Rb =  ++reg;
4874       Rm =  ++reg;
4875       Rn =  ++reg;
4876 
4877       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4878       Pb =  ++reg;
4879       Pm =  ++reg;
4880       Pn =  ++reg;
4881 
4882       t0 =  ++reg;        // Three registers which form a
4883       t1 =  ++reg;        // triple-precision accumulator.
4884       t2 =  ++reg;
4885 
4886       Ri =  ++reg;        // Inner and outer loop indexes.
4887       Rj =  ++reg;
4888 
4889       Rhi_ab = ++reg;     // Product registers: low and high parts
4890       Rlo_ab = ++reg;     // of a*b and m*n.
4891       Rhi_mn = ++reg;
4892       Rlo_mn = ++reg;
4893 
4894       // r19 and up are callee-saved.
4895       _toSave = RegSet::range(r19, reg) + Pm_base;
4896     }
4897 
4898   private:
4899     void save_regs() {
4900       push(_toSave, sp);
4901     }
4902 
4903     void restore_regs() {
4904       pop(_toSave, sp);
4905     }
4906 
4907     template <typename T>
4908     void unroll_2(Register count, T block) {
4909       Label loop, end, odd;
4910       tbnz(count, 0, odd);
4911       cbz(count, end);
4912       align(16);
4913       bind(loop);
4914       (this->*block)();
4915       bind(odd);
4916       (this->*block)();
4917       subs(count, count, 2);
4918       br(Assembler::GT, loop);
4919       bind(end);
4920     }
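
         // A rough C equivalent of the unroll-by-2 pattern above and below
         // (illustrative only):
         //   if (count & 1) goto odd;
         //   if (count == 0) goto end;
         //  loop:
         //   block();
         //  odd:
         //   block();
         //   count -= 2;
         //   if ((long)count > 0) goto loop;
         //  end: ;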
4921 
4922     template <typename T>
4923     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4924       Label loop, end, odd;
4925       tbnz(count, 0, odd);
4926       cbz(count, end);
4927       align(16);
4928       bind(loop);
4929       (this->*block)(d, s, tmp);
4930       bind(odd);
4931       (this->*block)(d, s, tmp);
4932       subs(count, count, 2);
4933       br(Assembler::GT, loop);
4934       bind(end);
4935     }
4936 
4937     void pre1(RegisterOrConstant i) {
4938       block_comment("pre1");
4939       // Pa = Pa_base;
4940       // Pb = Pb_base + i;
4941       // Pm = Pm_base;
4942       // Pn = Pn_base + i;
4943       // Ra = *Pa;
4944       // Rb = *Pb;
4945       // Rm = *Pm;
4946       // Rn = *Pn;
4947       ldr(Ra, Address(Pa_base));
4948       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4949       ldr(Rm, Address(Pm_base));
4950       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4951       lea(Pa, Address(Pa_base));
4952       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4953       lea(Pm, Address(Pm_base));
4954       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4955 
4956       // Zero the m*n result.
4957       mov(Rhi_mn, zr);
4958       mov(Rlo_mn, zr);
4959     }
4960 
4961     // The core multiply-accumulate step of a Montgomery
4962     // multiplication.  The idea is to schedule operations as a
4963     // pipeline so that instructions with long latencies (loads and
4964     // multiplies) have time to complete before their results are
4965     // used.  This most benefits in-order implementations of the
4966     // architecture but out-of-order ones also benefit.
4967     void step() {
4968       block_comment("step");
4969       // MACC(Ra, Rb, t0, t1, t2);
4970       // Ra = *++Pa;
4971       // Rb = *--Pb;
4972       umulh(Rhi_ab, Ra, Rb);
4973       mul(Rlo_ab, Ra, Rb);
4974       ldr(Ra, pre(Pa, wordSize));
4975       ldr(Rb, pre(Pb, -wordSize));
4976       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
4977                                        // previous iteration.
4978       // MACC(Rm, Rn, t0, t1, t2);
4979       // Rm = *++Pm;
4980       // Rn = *--Pn;
4981       umulh(Rhi_mn, Rm, Rn);
4982       mul(Rlo_mn, Rm, Rn);
4983       ldr(Rm, pre(Pm, wordSize));
4984       ldr(Rn, pre(Pn, -wordSize));
4985       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4986     }
4987 
4988     void post1() {
4989       block_comment("post1");
4990 
4991       // MACC(Ra, Rb, t0, t1, t2);
4992       // Ra = *++Pa;
4993       // Rb = *--Pb;
4994       umulh(Rhi_ab, Ra, Rb);
4995       mul(Rlo_ab, Ra, Rb);
4996       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4997       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4998 
4999       // *Pm = Rm = t0 * inv;
5000       mul(Rm, t0, inv);
5001       str(Rm, Address(Pm));
5002 
5003       // MACC(Rm, Rn, t0, t1, t2);
5004       // t0 = t1; t1 = t2; t2 = 0;
5005       umulh(Rhi_mn, Rm, Rn);
5006 
5007 #ifndef PRODUCT
5008       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5009       {
5010         mul(Rlo_mn, Rm, Rn);
5011         add(Rlo_mn, t0, Rlo_mn);
5012         Label ok;
5013         cbz(Rlo_mn, ok); {
5014           stop("broken Montgomery multiply");
5015         } bind(ok);
5016       }
5017 #endif
5018       // We have very carefully set things up so that
5019       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5020       // the lower half of Rm * Rn because we know the result already:
5021       // it must be -t0.  t0 + (-t0) must generate a carry iff
5022       // t0 != 0.  So, rather than do a mul and an adds we just set
5023       // the carry flag iff t0 is nonzero.
5024       //
5025       // mul(Rlo_mn, Rm, Rn);
5026       // adds(zr, t0, Rlo_mn);
5027       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5028       adcs(t0, t1, Rhi_mn);
5029       adc(t1, t2, zr);
5030       mov(t2, zr);
5031     }
5032 
5033     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
5034       block_comment("pre2");
5035       // Pa = Pa_base + i-len;
5036       // Pb = Pb_base + len;
5037       // Pm = Pm_base + i-len;
5038       // Pn = Pn_base + len;
5039 
5040       if (i.is_register()) {
5041         sub(Rj, i.as_register(), len);
5042       } else {
5043         mov(Rj, i.as_constant());
5044         sub(Rj, Rj, len);
5045       }
5046       // Rj == i-len
5047 
5048       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5049       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5050       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5051       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5052 
5053       // Ra = *++Pa;
5054       // Rb = *--Pb;
5055       // Rm = *++Pm;
5056       // Rn = *--Pn;
5057       ldr(Ra, pre(Pa, wordSize));
5058       ldr(Rb, pre(Pb, -wordSize));
5059       ldr(Rm, pre(Pm, wordSize));
5060       ldr(Rn, pre(Pn, -wordSize));
5061 
5062       mov(Rhi_mn, zr);
5063       mov(Rlo_mn, zr);
5064     }
5065 
5066     void post2(RegisterOrConstant i, RegisterOrConstant len) {
5067       block_comment("post2");
5068       if (i.is_constant()) {
5069         mov(Rj, i.as_constant()-len.as_constant());
5070       } else {
5071         sub(Rj, i.as_register(), len);
5072       }
5073 
5074       adds(t0, t0, Rlo_mn); // The pending m*n, low part
5075 
5076       // As soon as we know the least significant digit of our result,
5077       // store it.
5078       // Pm_base[i-len] = t0;
5079       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5080 
5081       // t0 = t1; t1 = t2; t2 = 0;
5082       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5083       adc(t1, t2, zr);
5084       mov(t2, zr);
5085     }
5086 
5087     // A carry in t0 after Montgomery multiplication means that we
5088     // should subtract multiples of n from our result in m.  We'll
5089     // keep doing that until there is no carry.
5090     void normalize(RegisterOrConstant len) {
5091       block_comment("normalize");
5092       // while (t0)
5093       //   t0 = sub(Pm_base, Pn_base, t0, len);
5094       Label loop, post, again;
5095       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5096       cbz(t0, post); {
5097         bind(again); {
5098           mov(i, zr);
5099           mov(cnt, len);
5100           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5101           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5102           subs(zr, zr, zr); // set carry flag, i.e. no borrow
5103           align(16);
5104           bind(loop); {
5105             sbcs(Rm, Rm, Rn);
5106             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5107             add(i, i, 1);
5108             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5109             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5110             sub(cnt, cnt, 1);
5111           } cbnz(cnt, loop);
5112           sbc(t0, t0, zr);
5113         } cbnz(t0, again);
5114       } bind(post);
5115     }
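
         // In C, the sub() referenced in the comment above is approximately
         // (illustrative sketch only):
         //   unsigned long sub(unsigned long Pm[], unsigned long Pn[],
         //                     unsigned long t0, int len) {
         //     unsigned long borrow = 0;
         //     for (int i = 0; i < len; i++) {
         //       unsigned long d = Pm[i] - Pn[i] - borrow;
         //       borrow = (Pm[i] < Pn[i]) || (Pm[i] == Pn[i] && borrow);
         //       Pm[i] = d;
         //     }
         //     return t0 - borrow;   // propagate the final borrow into t0
         //   }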
5116 
5117     // Move memory at s to d, reversing words.
5118     //    Increments d to end of copied memory
5119     //    Destroys tmp1, tmp2
5120     //    Preserves len
5121     //    Leaves s pointing to the address which was in d at start
5122     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
5123       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5124 
5125       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
5126       mov(tmp1, len);
5127       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
5128       sub(s, d, len, ext::uxtw, LogBytesPerWord);
5129     }
5130     // where
5131     void reverse1(Register d, Register s, Register tmp) {
5132       ldr(tmp, pre(s, -wordSize));
5133       ror(tmp, tmp, 32);
5134       str(tmp, post(d, wordSize));
5135     }
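
         // In C, approximately (illustrative only): copy len 64-bit words from s
         // to d, swapping the two 32-bit halves of each word, reading s backwards
         // and writing d forwards (rotate_right_64 is a hypothetical helper):
         //   for (int i = 0; i < len; i++)
         //     d[i] = rotate_right_64(s[len - 1 - i], 32);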
5136 
5137     void step_squaring() {
5138       // An extra ACC
5139       step();
5140       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5141     }
5142 
5143     void last_squaring(RegisterOrConstant i) {
5144       Label dont;
5145       // if ((i & 1) == 0) {
5146       tbnz(i.as_register(), 0, dont); {
5147         // MACC(Ra, Rb, t0, t1, t2);
5148         // Ra = *++Pa;
5149         // Rb = *--Pb;
5150         umulh(Rhi_ab, Ra, Rb);
5151         mul(Rlo_ab, Ra, Rb);
5152         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5153       } bind(dont);
5154     }
5155 
5156     void extra_step_squaring() {
5157       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5158 
5159       // MACC(Rm, Rn, t0, t1, t2);
5160       // Rm = *++Pm;
5161       // Rn = *--Pn;
5162       umulh(Rhi_mn, Rm, Rn);
5163       mul(Rlo_mn, Rm, Rn);
5164       ldr(Rm, pre(Pm, wordSize));
5165       ldr(Rn, pre(Pn, -wordSize));
5166     }
5167 
5168     void post1_squaring() {
5169       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5170 
5171       // *Pm = Rm = t0 * inv;
5172       mul(Rm, t0, inv);
5173       str(Rm, Address(Pm));
5174 
5175       // MACC(Rm, Rn, t0, t1, t2);
5176       // t0 = t1; t1 = t2; t2 = 0;
5177       umulh(Rhi_mn, Rm, Rn);
5178 
5179 #ifndef PRODUCT
5180       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5181       {
5182         mul(Rlo_mn, Rm, Rn);
5183         add(Rlo_mn, t0, Rlo_mn);
5184         Label ok;
5185         cbz(Rlo_mn, ok); {
5186           stop("broken Montgomery multiply");
5187         } bind(ok);
5188       }
5189 #endif
5190       // We have very carefully set things up so that
5191       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5192       // the lower half of Rm * Rn because we know the result already:
5193       // it must be -t0.  t0 + (-t0) must generate a carry iff
5194       // t0 != 0.  So, rather than do a mul and an adds we just set
5195       // the carry flag iff t0 is nonzero.
5196       //
5197       // mul(Rlo_mn, Rm, Rn);
5198       // adds(zr, t0, Rlo_mn);
5199       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5200       adcs(t0, t1, Rhi_mn);
5201       adc(t1, t2, zr);
5202       mov(t2, zr);
5203     }
5204 
5205     void acc(Register Rhi, Register Rlo,
5206              Register t0, Register t1, Register t2) {
5207       adds(t0, t0, Rlo);
5208       adcs(t1, t1, Rhi);
5209       adc(t2, t2, zr);
5210     }
5211 
5212   public:
5213     /**
5214      * Fast Montgomery multiplication.  The derivation of the
5215      * algorithm is in A Cryptographic Library for the Motorola
5216      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5217      *
5218      * Arguments:
5219      *
5220      * Inputs for multiplication:
5221      *   c_rarg0   - int array elements a
5222      *   c_rarg1   - int array elements b
5223      *   c_rarg2   - int array elements n (the modulus)
5224      *   c_rarg3   - int length
5225      *   c_rarg4   - int inv
5226      *   c_rarg5   - int array elements m (the result)
5227      *
5228      * Inputs for squaring:
5229      *   c_rarg0   - int array elements a
5230      *   c_rarg1   - int array elements n (the modulus)
5231      *   c_rarg2   - int length
5232      *   c_rarg3   - int inv
5233      *   c_rarg4   - int array elements m (the result)
5234      *
5235      */
5236     address generate_multiply() {
5237       Label argh, nothing;
5238       bind(argh);
5239       stop("MontgomeryMultiply total_allocation must be <= 8192");
5240 
5241       align(CodeEntryAlignment);
5242       address entry = pc();
5243 
5244       cbzw(Rlen, nothing);
5245 
5246       enter();
5247 
5248       // Make room.
5249       cmpw(Rlen, 512);
5250       br(Assembler::HI, argh);
5251       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5252       andr(sp, Ra, -2 * wordSize);
5253 
5254       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5255 
5256       {
5257         // Copy input args, reversing as we go.  We use Ra as a
5258         // temporary variable.
5259         reverse(Ra, Pa_base, Rlen, t0, t1);
5260         if (!_squaring)
5261           reverse(Ra, Pb_base, Rlen, t0, t1);
5262         reverse(Ra, Pn_base, Rlen, t0, t1);
5263       }
5264 
5265       // Push all callee-saved registers and also Pm_base which we'll need
5266       // at the end.
5267       save_regs();
5268 
5269 #ifndef PRODUCT
5270       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
5271       {
5272         ldr(Rn, Address(Pn_base, 0));
5273         mul(Rlo_mn, Rn, inv);
5274         subs(zr, Rlo_mn, -1);
5275         Label ok;
5276         br(EQ, ok); {
5277           stop("broken inverse in Montgomery multiply");
5278         } bind(ok);
5279       }
5280 #endif
5281 
5282       mov(Pm_base, Ra);
5283 
5284       mov(t0, zr);
5285       mov(t1, zr);
5286       mov(t2, zr);
5287 
5288       block_comment("for (int i = 0; i < len; i++) {");
5289       mov(Ri, zr); {
5290         Label loop, end;
5291         cmpw(Ri, Rlen);
5292         br(Assembler::GE, end);
5293 
5294         bind(loop);
5295         pre1(Ri);
5296 
5297         block_comment("  for (j = i; j; j--) {"); {
5298           movw(Rj, Ri);
5299           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5300         } block_comment("  } // j");
5301 
5302         post1();
5303         addw(Ri, Ri, 1);
5304         cmpw(Ri, Rlen);
5305         br(Assembler::LT, loop);
5306         bind(end);
5307         block_comment("} // i");
5308       }
5309 
5310       block_comment("for (int i = len; i < 2*len; i++) {");
5311       mov(Ri, Rlen); {
5312         Label loop, end;
5313         cmpw(Ri, Rlen, Assembler::LSL, 1);
5314         br(Assembler::GE, end);
5315 
5316         bind(loop);
5317         pre2(Ri, Rlen);
5318 
5319         block_comment("  for (j = len*2-i-1; j; j--) {"); {
5320           lslw(Rj, Rlen, 1);
5321           subw(Rj, Rj, Ri);
5322           subw(Rj, Rj, 1);
5323           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5324         } block_comment("  } // j");
5325 
5326         post2(Ri, Rlen);
5327         addw(Ri, Ri, 1);
5328         cmpw(Ri, Rlen, Assembler::LSL, 1);
5329         br(Assembler::LT, loop);
5330         bind(end);
5331       }
5332       block_comment("} // i");
5333 
5334       normalize(Rlen);
5335 
5336       mov(Ra, Pm_base);  // Save Pm_base in Ra
5337       restore_regs();  // Restore caller's Pm_base
5338 
5339       // Copy our result into caller's Pm_base
5340       reverse(Pm_base, Ra, Rlen, t0, t1);
5341 
5342       leave();
5343       bind(nothing);
5344       ret(lr);
5345 
5346       return entry;
5347     }
5348     // In C, approximately:
5349 
5350     // void
5351     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
5352     //                     unsigned long Pn_base[], unsigned long Pm_base[],
5353     //                     unsigned long inv, int len) {
5354     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5355     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5356     //   unsigned long Ra, Rb, Rn, Rm;
5357 
5358     //   int i;
5359 
5360     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5361 
5362     //   for (i = 0; i < len; i++) {
5363     //     int j;
5364 
5365     //     Pa = Pa_base;
5366     //     Pb = Pb_base + i;
5367     //     Pm = Pm_base;
5368     //     Pn = Pn_base + i;
5369 
5370     //     Ra = *Pa;
5371     //     Rb = *Pb;
5372     //     Rm = *Pm;
5373     //     Rn = *Pn;
5374 
5375     //     int iters = i;
5376     //     for (j = 0; iters--; j++) {
5377     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5378     //       MACC(Ra, Rb, t0, t1, t2);
5379     //       Ra = *++Pa;
5380     //       Rb = *--Pb;
5381     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5382     //       MACC(Rm, Rn, t0, t1, t2);
5383     //       Rm = *++Pm;
5384     //       Rn = *--Pn;
5385     //     }
5386 
5387     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
5388     //     MACC(Ra, Rb, t0, t1, t2);
5389     //     *Pm = Rm = t0 * inv;
5390     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5391     //     MACC(Rm, Rn, t0, t1, t2);
5392 
5393     //     assert(t0 == 0, "broken Montgomery multiply");
5394 
5395     //     t0 = t1; t1 = t2; t2 = 0;
5396     //   }
5397 
5398     //   for (i = len; i < 2*len; i++) {
5399     //     int j;
5400 
5401     //     Pa = Pa_base + i-len;
5402     //     Pb = Pb_base + len;
5403     //     Pm = Pm_base + i-len;
5404     //     Pn = Pn_base + len;
5405 
5406     //     Ra = *++Pa;
5407     //     Rb = *--Pb;
5408     //     Rm = *++Pm;
5409     //     Rn = *--Pn;
5410 
5411     //     int iters = len*2-i-1;
5412     //     for (j = i-len+1; iters--; j++) {
5413     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5414     //       MACC(Ra, Rb, t0, t1, t2);
5415     //       Ra = *++Pa;
5416     //       Rb = *--Pb;
5417     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5418     //       MACC(Rm, Rn, t0, t1, t2);
5419     //       Rm = *++Pm;
5420     //       Rn = *--Pn;
5421     //     }
5422 
5423     //     Pm_base[i-len] = t0;
5424     //     t0 = t1; t1 = t2; t2 = 0;
5425     //   }
5426 
5427     //   while (t0)
5428     //     t0 = sub(Pm_base, Pn_base, t0, len);
5429     // }
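
         // where MACC(A, B, t0, t1, t2) is taken to mean multiply-accumulate of
         // A*B into the triple-precision accumulator t0:t1:t2, roughly
         // (illustrative only):
         //   unsigned __int128 p = (unsigned __int128)A * B;
         //   unsigned __int128 s = ((unsigned __int128)t1 << 64 | t0) + p;
         //   t0 = (unsigned long)s;
         //   t1 = (unsigned long)(s >> 64);
         //   t2 += (s < p);               // carry out of the 128-bit add
         // MACC2 (used by the squaring code further down) accumulates 2*A*B the
         // same way.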
5430 
5431     /**
5432      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
5433      * multiplies than Montgomery multiplication so it should be up to
5434      * 25% faster.  However, its loop control is more complex and it
5435      * may actually run slower on some machines.
5436      *
5437      * Arguments:
5438      *
5439      * Inputs:
5440      *   c_rarg0   - int array elements a
5441      *   c_rarg1   - int array elements n (the modulus)
5442      *   c_rarg2   - int length
5443      *   c_rarg3   - int inv
5444      *   c_rarg4   - int array elements m (the result)
5445      *
5446      */
5447     address generate_square() {
5448       Label argh;
5449       bind(argh);
5450       stop("MontgomeryMultiply total_allocation must be <= 8192");
5451 
5452       align(CodeEntryAlignment);
5453       address entry = pc();
5454 
5455       enter();
5456 
5457       // Make room.
5458       cmpw(Rlen, 512);
5459       br(Assembler::HI, argh);
5460       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5461       andr(sp, Ra, -2 * wordSize);
5462 
5463       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5464 
5465       {
5466         // Copy input args, reversing as we go.  We use Ra as a
5467         // temporary variable.
5468         reverse(Ra, Pa_base, Rlen, t0, t1);
5469         reverse(Ra, Pn_base, Rlen, t0, t1);
5470       }
5471 
5472       // Push all callee-saved registers and also Pm_base which we'll need
5473       // at the end.
5474       save_regs();
5475 
5476       mov(Pm_base, Ra);
5477 
5478       mov(t0, zr);
5479       mov(t1, zr);
5480       mov(t2, zr);
5481 
5482       block_comment("for (int i = 0; i < len; i++) {");
5483       mov(Ri, zr); {
5484         Label loop, end;
5485         bind(loop);
5486         cmp(Ri, Rlen);
5487         br(Assembler::GE, end);
5488 
5489         pre1(Ri);
5490 
5491         block_comment("for (j = (i+1)/2; j; j--) {"); {
5492           add(Rj, Ri, 1);
5493           lsr(Rj, Rj, 1);
5494           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5495         } block_comment("  } // j");
5496 
5497         last_squaring(Ri);
5498 
5499         block_comment("  for (j = i/2; j; j--) {"); {
5500           lsr(Rj, Ri, 1);
5501           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5502         } block_comment("  } // j");
5503 
5504         post1_squaring();
5505         add(Ri, Ri, 1);
5506         cmp(Ri, Rlen);
5507         br(Assembler::LT, loop);
5508 
5509         bind(end);
5510         block_comment("} // i");
5511       }
5512 
5513       block_comment("for (int i = len; i < 2*len; i++) {");
5514       mov(Ri, Rlen); {
5515         Label loop, end;
5516         bind(loop);
5517         cmp(Ri, Rlen, Assembler::LSL, 1);
5518         br(Assembler::GE, end);
5519 
5520         pre2(Ri, Rlen);
5521 
5522         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
5523           lsl(Rj, Rlen, 1);
5524           sub(Rj, Rj, Ri);
5525           sub(Rj, Rj, 1);
5526           lsr(Rj, Rj, 1);
5527           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5528         } block_comment("  } // j");
5529 
5530         last_squaring(Ri);
5531 
5532         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
5533           lsl(Rj, Rlen, 1);
5534           sub(Rj, Rj, Ri);
5535           lsr(Rj, Rj, 1);
5536           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5537         } block_comment("  } // j");
5538 
5539         post2(Ri, Rlen);
5540         add(Ri, Ri, 1);
5541         cmp(Ri, Rlen, Assembler::LSL, 1);
5542 
5543         br(Assembler::LT, loop);
5544         bind(end);
5545         block_comment("} // i");
5546       }
5547 
5548       normalize(Rlen);
5549 
5550       mov(Ra, Pm_base);  // Save Pm_base in Ra
5551       restore_regs();  // Restore caller's Pm_base
5552 
5553       // Copy our result into caller's Pm_base
5554       reverse(Pm_base, Ra, Rlen, t0, t1);
5555 
5556       leave();
5557       ret(lr);
5558 
5559       return entry;
5560     }
5561     // In C, approximately:
5562 
5563     // void
5564     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
5565     //                   unsigned long Pm_base[], unsigned long inv, int len) {
5566     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5567     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5568     //   unsigned long Ra, Rb, Rn, Rm;
5569 
5570     //   int i;
5571 
5572     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5573 
5574     //   for (i = 0; i < len; i++) {
5575     //     int j;
5576 
5577     //     Pa = Pa_base;
5578     //     Pb = Pa_base + i;
5579     //     Pm = Pm_base;
5580     //     Pn = Pn_base + i;
5581 
5582     //     Ra = *Pa;
5583     //     Rb = *Pb;
5584     //     Rm = *Pm;
5585     //     Rn = *Pn;
5586 
5587     //     int iters = (i+1)/2;
5588     //     for (j = 0; iters--; j++) {
5589     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5590     //       MACC2(Ra, Rb, t0, t1, t2);
5591     //       Ra = *++Pa;
5592     //       Rb = *--Pb;
5593     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5594     //       MACC(Rm, Rn, t0, t1, t2);
5595     //       Rm = *++Pm;
5596     //       Rn = *--Pn;
5597     //     }
5598     //     if ((i & 1) == 0) {
5599     //       assert(Ra == Pa_base[j], "must be");
5600     //       MACC(Ra, Ra, t0, t1, t2);
5601     //     }
5602     //     iters = i/2;
5603     //     assert(iters == i-j, "must be");
5604     //     for (; iters--; j++) {
5605     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5606     //       MACC(Rm, Rn, t0, t1, t2);
5607     //       Rm = *++Pm;
5608     //       Rn = *--Pn;
5609     //     }
5610 
5611     //     *Pm = Rm = t0 * inv;
5612     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5613     //     MACC(Rm, Rn, t0, t1, t2);
5614 
5615     //     assert(t0 == 0, "broken Montgomery multiply");
5616 
5617     //     t0 = t1; t1 = t2; t2 = 0;
5618     //   }
5619 
5620     //   for (i = len; i < 2*len; i++) {
5621     //     int start = i-len+1;
5622     //     int end = start + (len - start)/2;
5623     //     int j;
5624 
5625     //     Pa = Pa_base + i-len;
5626     //     Pb = Pa_base + len;
5627     //     Pm = Pm_base + i-len;
5628     //     Pn = Pn_base + len;
5629 
5630     //     Ra = *++Pa;
5631     //     Rb = *--Pb;
5632     //     Rm = *++Pm;
5633     //     Rn = *--Pn;
5634 
5635     //     int iters = (2*len-i-1)/2;
5636     //     assert(iters == end-start, "must be");
5637     //     for (j = start; iters--; j++) {
5638     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5639     //       MACC2(Ra, Rb, t0, t1, t2);
5640     //       Ra = *++Pa;
5641     //       Rb = *--Pb;
5642     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5643     //       MACC(Rm, Rn, t0, t1, t2);
5644     //       Rm = *++Pm;
5645     //       Rn = *--Pn;
5646     //     }
5647     //     if ((i & 1) == 0) {
5648     //       assert(Ra == Pa_base[j], "must be");
5649     //       MACC(Ra, Ra, t0, t1, t2);
5650     //     }
5651     //     iters =  (2*len-i)/2;
5652     //     assert(iters == len-j, "must be");
5653     //     for (; iters--; j++) {
5654     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5655     //       MACC(Rm, Rn, t0, t1, t2);
5656     //       Rm = *++Pm;
5657     //       Rn = *--Pn;
5658     //     }
5659     //     Pm_base[i-len] = t0;
5660     //     t0 = t1; t1 = t2; t2 = 0;
5661     //   }
5662 
5663     //   while (t0)
5664     //     t0 = sub(Pm_base, Pn_base, t0, len);
5665     // }
5666   };
5667 
5668 
5669   // Initialization
5670   void generate_initial() {
5671     // Generate initial stubs and initialize the entry points
5672 
5673     // Entry points that exist on all platforms. Note: this is code
5674     // that could be shared among different platforms - however, the
5675     // benefit seems to be smaller than the disadvantage of having a
5676     // much more complicated generator structure. See also the comment
5677     // in stubRoutines.hpp.
5678 
5679     StubRoutines::_forward_exception_entry = generate_forward_exception();
5680 
5681     StubRoutines::_call_stub_entry =
5682       generate_call_stub(StubRoutines::_call_stub_return_address);
5683 
5684     // is referenced by megamorphic call
5685     StubRoutines::_catch_exception_entry = generate_catch_exception();
5686 
5687     // Build this early so it's available for the interpreter.
5688     StubRoutines::_throw_StackOverflowError_entry =
5689       generate_throw_exception("StackOverflowError throw_exception",
5690                                CAST_FROM_FN_PTR(address,
5691                                                 SharedRuntime::throw_StackOverflowError));
5692     StubRoutines::_throw_delayed_StackOverflowError_entry =
5693       generate_throw_exception("delayed StackOverflowError throw_exception",
5694                                CAST_FROM_FN_PTR(address,
5695                                                 SharedRuntime::throw_delayed_StackOverflowError));
5696     if (UseCRC32Intrinsics) {
5697       // set the table address before generating the stub which uses it
5698       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5699       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5700     }
5701 
5702     if (UseCRC32CIntrinsics) {
5703       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5704     }
5705 
5706     // Disabled until JDK-8210858 is fixed
5707     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5708     //   StubRoutines::_dlog = generate_dlog();
5709     // }
5710 
5711     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5712       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5713     }
5714 
5715     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5716       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5717     }
5718   }

  void generate_all() {
    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    // has_negatives stub for large arrays.
    StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    generate_compare_long_strings();

    generate_string_indexof_stubs();

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }
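
    // For reference: both Montgomery stubs compute a Montgomery product.
    // Given operands a and b, a modulus n of 'len' 64-bit words and
    // inv == -n^-1 mod 2^64, the result is (roughly)
    //
    //     a * b * 2^(-64*len) mod n
    //
    // and squaring is simply the a == b case, which is why generate_multiply()
    // can stand in for generate_square() above.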
#endif // COMPILER2

#ifndef BUILTIN_SIM
    // generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    }
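
    // These entries back the JDK's AES intrinsics.  For orientation only (the
    // Java class and method names below are stated from memory, not taken from
    // this file), they are matched against methods such as
    //
    //   com.sun.crypto.provider.AESCrypt.implEncryptBlock / implDecryptBlock
    //   com.sun.crypto.provider.CipherBlockChaining.implEncrypt / implDecrypt
    //
    // so they only take effect when that provider is actually in use.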

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

    // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
                                                       &StubRoutines::_safefetch32_fault_pc,
                                                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                                                       &StubRoutines::_safefetchN_fault_pc,
                                                       &StubRoutines::_safefetchN_continuation_pc);
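
    // A minimal sketch of how the SafeFetch entry points are meant to be used
    // (SafeFetch32/SafeFetchN are the usual inline wrappers declared in
    // stubRoutines.hpp; treat the exact spelling here as an assumption):
    //
    //   int v = SafeFetch32((int*) addr, 0xBADBAD);
    //   if (v == 0xBADBAD) {
    //     // either the load faulted or the word really was 0xBADBAD
    //   }
    //
    // The fault/continuation PCs recorded above allow the signal handler to
    // resume the stub at its continuation point instead of crashing the VM.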
#endif
    StubRoutines::aarch64::set_completed();
  }

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}
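
// A rough sketch of how this entry point is driven at VM startup (the call
// sites live in stubRoutines.cpp; the lines below are a paraphrase, not a
// verbatim quote):
//
//   CodeBuffer buffer1(BufferBlob::create("StubRoutines (1)", code_size1));
//   StubGenerator_generate(&buffer1, false);  // early stubs, before universe_init
//   ...
//   CodeBuffer buffer2(BufferBlob::create("StubRoutines (2)", code_size2));
//   StubGenerator_generate(&buffer2, true);   // the remaining stubs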