1 /*
   2  * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "nativeInst_aarch64.hpp"
  33 #include "oops/instanceOop.hpp"
  34 #include "oops/method.hpp"
  35 #include "oops/objArrayKlass.hpp"
  36 #include "oops/oop.inline.hpp"
  37 #include "prims/methodHandles.hpp"
  38 #include "runtime/frame.inline.hpp"
  39 #include "runtime/handles.inline.hpp"
  40 #include "runtime/sharedRuntime.hpp"
  41 #include "runtime/stubCodeGenerator.hpp"
  42 #include "runtime/stubRoutines.hpp"
  43 #include "runtime/thread.inline.hpp"
  44 #include "utilities/align.hpp"
  45 #ifdef COMPILER2
  46 #include "opto/runtime.hpp"
  47 #endif
  48 
  49 #ifdef BUILTIN_SIM
  50 #include "../../../../../../simulator/simulator.hpp"
  51 #endif
  52 
  53 // Declaration and definition of StubGenerator (no .hpp file).
  54 // For a more detailed description of the stub routine structure
  55 // see the comment in stubRoutines.hpp
  56 
  57 #undef __
  58 #define __ _masm->
  59 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
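     // TIMES_OOP is the scale used to index an oop array: a sign-extended 32-bit
     // index shifted left by 2 (4-byte compressed oops) or 3 (8-byte oops).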
  60 
  61 #ifdef PRODUCT
  62 #define BLOCK_COMMENT(str) /* nothing */
  63 #else
  64 #define BLOCK_COMMENT(str) __ block_comment(str)
  65 #endif
  66 
  67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  68 
  69 // Stub Code definitions
  70 
  71 class StubGenerator: public StubCodeGenerator {
  72  private:
  73 
  74 #ifdef PRODUCT
  75 #define inc_counter_np(counter) ((void)0)
  76 #else
  77   void inc_counter_np_(int& counter) {
  78     __ lea(rscratch2, ExternalAddress((address)&counter));
  79     __ ldrw(rscratch1, Address(rscratch2));
  80     __ addw(rscratch1, rscratch1, 1);
  81     __ strw(rscratch1, Address(rscratch2));
  82   }
  83 #define inc_counter_np(counter) \
  84   BLOCK_COMMENT("inc_counter " #counter); \
  85   inc_counter_np_(counter);
  86 #endif
  87 
  88   // Call stubs are used to call Java from C
  89   //
  90   // Arguments:
  91   //    c_rarg0:   call wrapper address                   address
  92   //    c_rarg1:   result                                 address
  93   //    c_rarg2:   result type                            BasicType
  94   //    c_rarg3:   method                                 Method*
  95   //    c_rarg4:   (interpreter) entry point              address
  96   //    c_rarg5:   parameters                             intptr_t*
  97   //    c_rarg6:   parameter size (in words)              int
  98   //    c_rarg7:   thread                                 Thread*
  99   //
 100   // There is no return from the stub itself as any Java result
 101   // is written to result
 102   //
 103   // we save r30 (lr) as the return PC at the base of the frame and
 104   // link r29 (fp) below it as the frame pointer installing sp (r31)
 105   // into fp.
 106   //
 107   // we save r0-r7, which accounts for all the c arguments.
 108   //
 109   // TODO: strictly do we need to save them all? they are treated as
 110   // volatile by C so could we omit saving the ones we are going to
 111   // place in global registers (thread? method?) or those we only use
 112   // during setup of the Java call?
 113   //
 114   // we don't need to save r8 which C uses as an indirect result location
 115   // return register.
 116   //
 117   // we don't need to save r9-r15 which both C and Java treat as
 118   // volatile
 119   //
 120   // we don't need to save r16-18 because Java does not use them
 121   //
 122   // we save r19-r28 which Java uses as scratch registers and C
 123   // expects to be callee-save
 124   //
 125   // we save the bottom 64 bits of each value stored in v8-v15; it is
 126   // the responsibility of the caller to preserve larger values.
 127   //
 128   // so the stub frame looks like this when we enter Java code
 129   //
 130   //     [ return_from_Java     ] <--- sp
 131   //     [ argument word n      ]
 132   //      ...
 133   // -27 [ argument word 1      ]
 134   // -26 [ saved v15            ] <--- sp_after_call
 135   // -25 [ saved v14            ]
 136   // -24 [ saved v13            ]
 137   // -23 [ saved v12            ]
 138   // -22 [ saved v11            ]
 139   // -21 [ saved v10            ]
 140   // -20 [ saved v9             ]
 141   // -19 [ saved v8             ]
 142   // -18 [ saved r28            ]
 143   // -17 [ saved r27            ]
 144   // -16 [ saved r26            ]
 145   // -15 [ saved r25            ]
 146   // -14 [ saved r24            ]
 147   // -13 [ saved r23            ]
 148   // -12 [ saved r22            ]
 149   // -11 [ saved r21            ]
 150   // -10 [ saved r20            ]
 151   //  -9 [ saved r19            ]
 152   //  -8 [ call wrapper    (r0) ]
 153   //  -7 [ result          (r1) ]
 154   //  -6 [ result type     (r2) ]
 155   //  -5 [ method          (r3) ]
 156   //  -4 [ entry point     (r4) ]
 157   //  -3 [ parameters      (r5) ]
 158   //  -2 [ parameter size  (r6) ]
 159   //  -1 [ thread (r7)          ]
 160   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 161   //   1 [ saved lr       (r30) ]
 162 
 163   // Call stub stack layout word offsets from fp
 164   enum call_stub_layout {
 165     sp_after_call_off = -26,
 166 
 167     d15_off            = -26,
 168     d13_off            = -24,
 169     d11_off            = -22,
 170     d9_off             = -20,
 171 
 172     r28_off            = -18,
 173     r26_off            = -16,
 174     r24_off            = -14,
 175     r22_off            = -12,
 176     r20_off            = -10,
 177     call_wrapper_off   =  -8,
 178     result_off         =  -7,
 179     result_type_off    =  -6,
 180     method_off         =  -5,
 181     entry_point_off    =  -4,
 182     parameter_size_off =  -2,
 183     thread_off         =  -1,
 184     fp_f               =   0,
 185     retaddr_off        =   1,
 186   };
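       // Only even offsets are named above because registers are saved in adjacent
       // pairs with stp/stpd: e.g. r19 is stored at r20_off + 1 and v8 at d9_off + 1
       // (see the stp/stpd sequence in generate_call_stub below).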
 187 
 188   address generate_call_stub(address& return_address) {
 189     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 190            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 191            "adjust this code");
 192 
 193     StubCodeMark mark(this, "StubRoutines", "call_stub");
 194     address start = __ pc();
 195 
 196     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 197 
 198     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 199     const Address result        (rfp, result_off         * wordSize);
 200     const Address result_type   (rfp, result_type_off    * wordSize);
 201     const Address method        (rfp, method_off         * wordSize);
 202     const Address entry_point   (rfp, entry_point_off    * wordSize);
 203     const Address parameter_size(rfp, parameter_size_off * wordSize);
 204 
 205     const Address thread        (rfp, thread_off         * wordSize);
 206 
 207     const Address d15_save      (rfp, d15_off * wordSize);
 208     const Address d13_save      (rfp, d13_off * wordSize);
 209     const Address d11_save      (rfp, d11_off * wordSize);
 210     const Address d9_save       (rfp, d9_off * wordSize);
 211 
 212     const Address r28_save      (rfp, r28_off * wordSize);
 213     const Address r26_save      (rfp, r26_off * wordSize);
 214     const Address r24_save      (rfp, r24_off * wordSize);
 215     const Address r22_save      (rfp, r22_off * wordSize);
 216     const Address r20_save      (rfp, r20_off * wordSize);
 217 
 218     // stub code
 219 
 220     // we need a C prolog to bootstrap the x86 caller into the sim
 221     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 222 
 223     address aarch64_entry = __ pc();
 224 
 225 #ifdef BUILTIN_SIM
 226     // Save sender's SP for stack traces.
 227     __ mov(rscratch1, sp);
 228     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 229 #endif
 230     // set up frame and move sp to end of save area
 231     __ enter();
 232     __ sub(sp, rfp, -sp_after_call_off * wordSize);
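         // sp now points at sp_after_call, so the register save area laid out above
         // sits between the new sp and fp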
 233 
 234     // save register parameters and Java scratch/global registers
 235     // n.b. we save thread even though it gets installed in
 236     // rthread because we want to sanity check rthread later
 237     __ str(c_rarg7,  thread);
 238     __ strw(c_rarg6, parameter_size);
 239     __ stp(c_rarg4, c_rarg5,  entry_point);
 240     __ stp(c_rarg2, c_rarg3,  result_type);
 241     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 242 
 243     __ stp(r20, r19,   r20_save);
 244     __ stp(r22, r21,   r22_save);
 245     __ stp(r24, r23,   r24_save);
 246     __ stp(r26, r25,   r26_save);
 247     __ stp(r28, r27,   r28_save);
 248 
 249     __ stpd(v9,  v8,   d9_save);
 250     __ stpd(v11, v10,  d11_save);
 251     __ stpd(v13, v12,  d13_save);
 252     __ stpd(v15, v14,  d15_save);
 253 
 254     // install Java thread in global register now we have saved
 255     // whatever value it held
 256     __ mov(rthread, c_rarg7);
 257     // And method
 258     __ mov(rmethod, c_rarg3);
 259 
 260     // set up the heapbase register
 261     __ reinit_heapbase();
 262 
 263 #ifdef ASSERT
 264     // make sure we have no pending exceptions
 265     {
 266       Label L;
 267       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 268       __ cmp(rscratch1, (u1)NULL_WORD);
 269       __ br(Assembler::EQ, L);
 270       __ stop("StubRoutines::call_stub: entered with pending exception");
 271       __ BIND(L);
 272     }
 273 #endif
 274     // pass parameters if any
 275     __ mov(esp, sp);
 276     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 277     __ andr(sp, rscratch1, -2 * wordSize);
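         // this reserves space for the incoming Java parameters and rounds sp down
         // to a 16-byte boundary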
 278 
 279     BLOCK_COMMENT("pass parameters if any");
 280     Label parameters_done;
 281     // parameter count is still in c_rarg6
 282     // and parameter pointer identifying param 1 is in c_rarg5
 283     __ cbzw(c_rarg6, parameters_done);
 284 
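         // copy the parameters from the array at c_rarg5 onto the stack, one word
         // per iteration, counting down in c_rarg6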
 285     address loop = __ pc();
 286     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 287     __ subsw(c_rarg6, c_rarg6, 1);
 288     __ push(rscratch1);
 289     __ br(Assembler::GT, loop);
 290 
 291     __ BIND(parameters_done);
 292 
 293   // call Java entry -- passing Method* and current sp
 294     //      rmethod: Method*
 295     //      r13: sender sp
 296     BLOCK_COMMENT("call Java function");
 297     __ mov(r13, sp);
 298     __ blr(c_rarg4);
 299 
 300     // tell the simulator we have returned to the stub
 301 
 302     // we do this here because the notify will already have been done
 303     // if we get to the next instruction via an exception
 304     //
 305     // n.b. adding this instruction here affects the calculation of
 306     // whether or not a routine returns to the call stub (used when
 307     // doing stack walks) since the normal test is to check the return
 308     // pc against the address saved below. so we may need to allow for
 309     // this extra instruction in the check.
 310 
 311     if (NotifySimulator) {
 312       __ notify(Assembler::method_reentry);
 313     }
 314     // save current address for use by exception handling code
 315 
 316     return_address = __ pc();
 317 
 318     // store result depending on type (everything that is not
 319     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 320     // n.b. this assumes Java returns an integral result in r0
 321     // and a floating result in j_farg0
 322     __ ldr(j_rarg2, result);
 323     Label is_long, is_float, is_double, exit;
 324     __ ldr(j_rarg1, result_type);
 325     __ cmp(j_rarg1, (u1)T_OBJECT);
 326     __ br(Assembler::EQ, is_long);
 327     __ cmp(j_rarg1, (u1)T_LONG);
 328     __ br(Assembler::EQ, is_long);
 329     __ cmp(j_rarg1, (u1)T_FLOAT);
 330     __ br(Assembler::EQ, is_float);
 331     __ cmp(j_rarg1, (u1)T_DOUBLE);
 332     __ br(Assembler::EQ, is_double);
 333 
 334     // handle T_INT case
 335     __ strw(r0, Address(j_rarg2));
 336 
 337     __ BIND(exit);
 338 
 339     // pop parameters
 340     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 341 
 342 #ifdef ASSERT
 343     // verify that threads correspond
 344     {
 345       Label L, S;
 346       __ ldr(rscratch1, thread);
 347       __ cmp(rthread, rscratch1);
 348       __ br(Assembler::NE, S);
 349       __ get_thread(rscratch1);
 350       __ cmp(rthread, rscratch1);
 351       __ br(Assembler::EQ, L);
 352       __ BIND(S);
 353       __ stop("StubRoutines::call_stub: threads must correspond");
 354       __ BIND(L);
 355     }
 356 #endif
 357 
 358     // restore callee-save registers
 359     __ ldpd(v15, v14,  d15_save);
 360     __ ldpd(v13, v12,  d13_save);
 361     __ ldpd(v11, v10,  d11_save);
 362     __ ldpd(v9,  v8,   d9_save);
 363 
 364     __ ldp(r28, r27,   r28_save);
 365     __ ldp(r26, r25,   r26_save);
 366     __ ldp(r24, r23,   r24_save);
 367     __ ldp(r22, r21,   r22_save);
 368     __ ldp(r20, r19,   r20_save);
 369 
 370     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 371     __ ldrw(c_rarg2, result_type);
 372     __ ldr(c_rarg3,  method);
 373     __ ldp(c_rarg4, c_rarg5,  entry_point);
 374     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 375 
 376 #ifndef PRODUCT
 377     // tell the simulator we are about to end Java execution
 378     if (NotifySimulator) {
 379       __ notify(Assembler::method_exit);
 380     }
 381 #endif
 382     // leave frame and return to caller
 383     __ leave();
 384     __ ret(lr);
 385 
 386     // handle return types different from T_INT
 387 
 388     __ BIND(is_long);
 389     __ str(r0, Address(j_rarg2, 0));
 390     __ br(Assembler::AL, exit);
 391 
 392     __ BIND(is_float);
 393     __ strs(j_farg0, Address(j_rarg2, 0));
 394     __ br(Assembler::AL, exit);
 395 
 396     __ BIND(is_double);
 397     __ strd(j_farg0, Address(j_rarg2, 0));
 398     __ br(Assembler::AL, exit);
 399 
 400     return start;
 401   }
 402 
 403   // Return point for a Java call if there's an exception thrown in
 404   // Java code.  The exception is caught and transformed into a
 405   // pending exception stored in JavaThread that can be tested from
 406   // within the VM.
 407   //
 408   // Note: Usually the parameters are removed by the callee. In case
 409   // of an exception crossing an activation frame boundary, that is
 410   // not the case if the callee is compiled code => need to setup the
 411   // rsp.
 412   //
 413   // r0: exception oop
 414 
 415   // NOTE: this is used as a target from the signal handler so it
 416   // needs an x86 prolog which returns into the current simulator
 417   // executing the generated catch_exception code. so the prolog
 418   // needs to install rax in a sim register and adjust the sim's
 419   // restart pc to enter the generated code at the start position
 420   // then return from native to simulated execution.
 421 
 422   address generate_catch_exception() {
 423     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 424     address start = __ pc();
 425 
 426     // same as in generate_call_stub():
 427     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 428     const Address thread        (rfp, thread_off         * wordSize);
 429 
 430 #ifdef ASSERT
 431     // verify that threads correspond
 432     {
 433       Label L, S;
 434       __ ldr(rscratch1, thread);
 435       __ cmp(rthread, rscratch1);
 436       __ br(Assembler::NE, S);
 437       __ get_thread(rscratch1);
 438       __ cmp(rthread, rscratch1);
 439       __ br(Assembler::EQ, L);
 440       __ bind(S);
 441       __ stop("StubRoutines::catch_exception: threads must correspond");
 442       __ bind(L);
 443     }
 444 #endif
 445 
 446     // set pending exception
 447     __ verify_oop(r0);
 448 
 449     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 450     __ mov(rscratch1, (address)__FILE__);
 451     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 452     __ movw(rscratch1, (int)__LINE__);
 453     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 454 
 455     // complete return to VM
 456     assert(StubRoutines::_call_stub_return_address != NULL,
 457            "_call_stub_return_address must have been generated before");
 458     __ b(StubRoutines::_call_stub_return_address);
 459 
 460     return start;
 461   }
 462 
 463   // Continuation point for runtime calls returning with a pending
 464   // exception.  The pending exception check happened in the runtime
 465   // or native call stub.  The pending exception in Thread is
 466   // converted into a Java-level exception.
 467   //
 468   // Contract with Java-level exception handlers:
 469   // r0: exception
 470   // r3: throwing pc
 471   //
 472   // NOTE: At entry of this stub, exception-pc must be in LR !!
 473 
 474   // NOTE: this is always used as a jump target within generated code
 475   // so it just needs to be generated code with no x86 prolog
 476 
 477   address generate_forward_exception() {
 478     StubCodeMark mark(this, "StubRoutines", "forward exception");
 479     address start = __ pc();
 480 
 481     // Upon entry, LR points to the return address returning into
 482     // Java (interpreted or compiled) code; i.e., the return address
 483     // becomes the throwing pc.
 484     //
 485     // Arguments pushed before the runtime call are still on the stack
 486     // but the exception handler will reset the stack pointer ->
 487     // ignore them.  A potential result in registers can be ignored as
 488     // well.
 489 
 490 #ifdef ASSERT
 491     // make sure this code is only executed if there is a pending exception
 492     {
 493       Label L;
 494       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 495       __ cbnz(rscratch1, L);
 496       __ stop("StubRoutines::forward exception: no pending exception (1)");
 497       __ bind(L);
 498     }
 499 #endif
 500 
 501     // compute exception handler into r19
 502 
 503     // call the VM to find the handler address associated with the
 504     // caller address. pass thread in r0 and caller pc (ret address)
 505     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 506     // the stack.
 507     __ mov(c_rarg1, lr);
 508     // lr will be trashed by the VM call so we move it to R19
 509     // (callee-saved) because we also need to pass it to the handler
 510     // returned by this call.
 511     __ mov(r19, lr);
 512     BLOCK_COMMENT("call exception_handler_for_return_address");
 513     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 514                          SharedRuntime::exception_handler_for_return_address),
 515                     rthread, c_rarg1);
 516     // we should not really care that lr is no longer the callee
 517     // address. we saved the value the handler needs in r19 so we can
 518     // just copy it to r3. however, the C2 handler will push its own
 519     // frame and then call into the VM, and the VM code asserts that
 520     // the PC for the frame above the handler belongs to a compiled
 521     // Java method. So, we restore lr here to satisfy that assert.
 522     __ mov(lr, r19);
 523     // setup r0 & r3 & clear pending exception
 524     __ mov(r3, r19);
 525     __ mov(r19, r0);
 526     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 527     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 528 
 529 #ifdef ASSERT
 530     // make sure exception is set
 531     {
 532       Label L;
 533       __ cbnz(r0, L);
 534       __ stop("StubRoutines::forward exception: no pending exception (2)");
 535       __ bind(L);
 536     }
 537 #endif
 538 
 539     // continue at exception handler
 540     // r0: exception
 541     // r3: throwing pc
 542     // r19: exception handler
 543     __ verify_oop(r0);
 544     __ br(r19);
 545 
 546     return start;
 547   }
 548 
 549   // Non-destructive plausibility checks for oops
 550   //
 551   // Arguments:
 552   //    r0: oop to verify
 553   //    rscratch1: error message
 554   //
 555   // Stack after saving c_rarg3:
 556   //    [tos + 0]: saved c_rarg3
 557   //    [tos + 1]: saved c_rarg2
 558   //    [tos + 2]: saved lr
 559   //    [tos + 3]: saved rscratch2
 560   //    [tos + 4]: saved r0
 561   //    [tos + 5]: saved rscratch1
 562   address generate_verify_oop() {
 563 
 564     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 565     address start = __ pc();
 566 
 567     Label exit, error;
 568 
 569     // save c_rarg2 and c_rarg3
 570     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 571 
 572     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 573     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 574     __ ldr(c_rarg3, Address(c_rarg2));
 575     __ add(c_rarg3, c_rarg3, 1);
 576     __ str(c_rarg3, Address(c_rarg2));
 577 
 578     // object is in r0
 579     // make sure object is 'reasonable'
 580     __ cbz(r0, exit); // if obj is NULL it is OK
 581 
 582     // Check if the oop is in the right area of memory
 583     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 584     __ andr(c_rarg2, r0, c_rarg3);
 585     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 586 
 587     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 588     // instruction here because the flags register is live.
 589     __ eor(c_rarg2, c_rarg2, c_rarg3);
 590     __ cbnz(c_rarg2, error);
 591 
 592     // make sure klass is 'reasonable', which is not zero.
 593     __ load_klass(r0, r0);  // get klass
 594     __ cbz(r0, error);      // if klass is NULL it is broken
 595 
 596     // return if everything seems ok
 597     __ bind(exit);
 598 
 599     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 600     __ ret(lr);
 601 
 602     // handle errors
 603     __ bind(error);
 604     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 605 
 606     __ push(RegSet::range(r0, r29), sp);
 607     // debug(char* msg, int64_t pc, int64_t regs[])
 608     __ mov(c_rarg0, rscratch1);      // pass address of error message
 609     __ mov(c_rarg1, lr);             // pass return address
 610     __ mov(c_rarg2, sp);             // pass address of regs on stack
 611 #ifndef PRODUCT
 612     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 613 #endif
 614     BLOCK_COMMENT("call MacroAssembler::debug");
 615     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 616     __ blrt(rscratch1, 3, 0, 1);
 617 
 618     return start;
 619   }
 620 
 621   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 622 
 623   // The inner part of zero_words().  This is the bulk operation,
 624   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 625   // caller is responsible for zeroing the last few words.
 626   //
 627   // Inputs:
 628   // r10: the HeapWord-aligned base address of an array to zero.
 629   // r11: the count in HeapWords, r11 > 0.
 630   //
 631   // Returns r10 and r11, adjusted for the caller to clear.
 632   // r10: the base address of the tail of words left to clear.
 633   // r11: the number of words in the tail.
 634   //      r11 < MacroAssembler::zero_words_block_size.
 635 
 636   address generate_zero_blocks() {
 637     Label done;
 638     Label base_aligned;
 639 
 640     Register base = r10, cnt = r11;
 641 
 642     __ align(CodeEntryAlignment);
 643     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 644     address start = __ pc();
 645 
 646     if (UseBlockZeroing) {
 647       int zva_length = VM_Version::zva_length();
 648 
 649       // Ensure ZVA length can be divided by 16. This is required by
 650       // the subsequent operations.
 651       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 652 
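           // If base is only 8-byte aligned, zero a single word so the rest of the
           // code sees a 16-byte aligned base.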
 653       __ tbz(base, 3, base_aligned);
 654       __ str(zr, Address(__ post(base, 8)));
 655       __ sub(cnt, cnt, 1);
 656       __ bind(base_aligned);
 657 
 658       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 659       // alignment.
 660       Label small;
 661       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
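           // low_limit is in bytes while cnt is in words, hence the >> 3 below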
 662       __ subs(rscratch1, cnt, low_limit >> 3);
 663       __ br(Assembler::LT, small);
 664       __ zero_dcache_blocks(base, cnt);
 665       __ bind(small);
 666     }
 667 
 668     {
 669       // Number of stp instructions we'll unroll
 670       const int unroll =
 671         MacroAssembler::zero_words_block_size / 2;
 672       // Clear the remaining blocks.
 673       Label loop;
 674       __ subs(cnt, cnt, unroll * 2);
 675       __ br(Assembler::LT, done);
 676       __ bind(loop);
 677       for (int i = 0; i < unroll; i++)
 678         __ stp(zr, zr, __ post(base, 16));
 679       __ subs(cnt, cnt, unroll * 2);
 680       __ br(Assembler::GE, loop);
 681       __ bind(done);
 682       __ add(cnt, cnt, unroll * 2);
 683     }
 684 
 685     __ ret(lr);
 686 
 687     return start;
 688   }
 689 
 690 
 691   typedef enum {
 692     copy_forwards = 1,
 693     copy_backwards = -1
 694   } copy_direction;
 695 
 696   // Bulk copy of blocks of 8 words.
 697   //
 698   // count is a count of words.
 699   //
 700   // Precondition: count >= 8
 701   //
 702   // Postconditions:
 703   //
 704   // The least significant bit of count contains the remaining count
 705   // of words to copy.  The rest of count is trash.
 706   //
 707   // s and d are adjusted to point to the remaining words to copy
 708   //
 709   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 710                            copy_direction direction) {
 711     int unit = wordSize * direction;
 712     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
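         // for forward copies s and d are pre-adjusted by -bias so that the loop's
         // unit-scaled offsets and 8 * unit pre-increments start at the original
         // addresses; backward copies need no adjustment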
 713 
 714     int offset;
 715     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 716       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 717     const Register stride = r13;
 718 
 719     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 720     assert_different_registers(s, d, count, rscratch1);
 721 
 722     Label again, drain;
 723     const char *stub_name;
 724     if (direction == copy_forwards)
 725       stub_name = "forward_copy_longs";
 726     else
 727       stub_name = "backward_copy_longs";
 728 
 729     __ align(CodeEntryAlignment);
 730 
 731     StubCodeMark mark(this, "StubRoutines", stub_name);
 732 
 733     __ bind(start);
 734 
 735     Label unaligned_copy_long;
 736     if (AvoidUnalignedAccesses) {
 737       __ tbnz(d, 3, unaligned_copy_long);
 738     }
 739 
 740     if (direction == copy_forwards) {
 741       __ sub(s, s, bias);
 742       __ sub(d, d, bias);
 743     }
 744 
 745 #ifdef ASSERT
 746     // Make sure we are never given < 8 words
 747     {
 748       Label L;
 749       __ cmp(count, (u1)8);
 750       __ br(Assembler::GE, L);
 751       __ stop("generate_copy_longs called with < 8 words");
 752       __ bind(L);
 753     }
 754 #endif
 755 
 756     // Fill 8 registers
 757     if (UseSIMDForMemoryOps) {
 758       __ ldpq(v0, v1, Address(s, 4 * unit));
 759       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 760     } else {
 761       __ ldp(t0, t1, Address(s, 2 * unit));
 762       __ ldp(t2, t3, Address(s, 4 * unit));
 763       __ ldp(t4, t5, Address(s, 6 * unit));
 764       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 765     }
 766 
 767     __ subs(count, count, 16);
 768     __ br(Assembler::LO, drain);
 769 
 770     int prefetch = PrefetchCopyIntervalInBytes;
 771     bool use_stride = false;
 772     if (direction == copy_backwards) {
 773        use_stride = prefetch > 256;
 774        prefetch = -prefetch;
 775        if (use_stride) __ mov(stride, prefetch);
 776     }
 777 
 778     __ bind(again);
 779 
 780     if (PrefetchCopyIntervalInBytes > 0)
 781       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 782 
 783     if (UseSIMDForMemoryOps) {
 784       __ stpq(v0, v1, Address(d, 4 * unit));
 785       __ ldpq(v0, v1, Address(s, 4 * unit));
 786       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 787       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 788     } else {
 789       __ stp(t0, t1, Address(d, 2 * unit));
 790       __ ldp(t0, t1, Address(s, 2 * unit));
 791       __ stp(t2, t3, Address(d, 4 * unit));
 792       __ ldp(t2, t3, Address(s, 4 * unit));
 793       __ stp(t4, t5, Address(d, 6 * unit));
 794       __ ldp(t4, t5, Address(s, 6 * unit));
 795       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 796       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 797     }
 798 
 799     __ subs(count, count, 8);
 800     __ br(Assembler::HS, again);
 801 
 802     // Drain
 803     __ bind(drain);
 804     if (UseSIMDForMemoryOps) {
 805       __ stpq(v0, v1, Address(d, 4 * unit));
 806       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 807     } else {
 808       __ stp(t0, t1, Address(d, 2 * unit));
 809       __ stp(t2, t3, Address(d, 4 * unit));
 810       __ stp(t4, t5, Address(d, 6 * unit));
 811       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 812     }
 813 
 814     {
 815       Label L1, L2;
 816       __ tbz(count, exact_log2(4), L1);
 817       if (UseSIMDForMemoryOps) {
 818         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 819         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 820       } else {
 821         __ ldp(t0, t1, Address(s, 2 * unit));
 822         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 823         __ stp(t0, t1, Address(d, 2 * unit));
 824         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 825       }
 826       __ bind(L1);
 827 
 828       if (direction == copy_forwards) {
 829         __ add(s, s, bias);
 830         __ add(d, d, bias);
 831       }
 832 
 833       __ tbz(count, 1, L2);
 834       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 835       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 836       __ bind(L2);
 837     }
 838 
 839     __ ret(lr);
 840 
 841     if (AvoidUnalignedAccesses) {
 842       Label drain, again;
 843       // Register order for storing. Order is different for backward copy.
 844 
 845       __ bind(unaligned_copy_long);
 846 
 847       // source address is even aligned, target odd aligned
 848       //
 849       // when forward copying word pairs we read long pairs at offsets
 850       // {0, 2, 4, 6} (in long words). when backwards copying we read
 851       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 852       // address by -2 in the forwards case so we can compute the
 853       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 854       // or -1.
 855       //
 856       // when forward copying we need to store 1 word, 3 pairs and
 857       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 858       // zero offset we adjust the destination by -1, which means we
 859       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 860       //
 861       // When backwards copying we need to store 1 word, 3 pairs and
 862       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 863       // offsets {1, 3, 5, 7, 8} * unit.
 864 
 865       if (direction == copy_forwards) {
 866         __ sub(s, s, 16);
 867         __ sub(d, d, 8);
 868       }
 869 
 870       // Fill 8 registers
 871       //
 872       // for forwards copy s was offset by -16 from the original input
 873       // value of s so the register contents are at these offsets
 874       // relative to the 64 bit block addressed by that original input
 875       // and so on for each successive 64 byte block when s is updated
 876       //
 877       // t0 at offset 0,  t1 at offset 8
 878       // t2 at offset 16, t3 at offset 24
 879       // t4 at offset 32, t5 at offset 40
 880       // t6 at offset 48, t7 at offset 56
 881 
 882       // for backwards copy s was not offset so the register contents
 883       // are at these offsets into the preceding 64 byte block
 884       // relative to that original input and so on for each successive
 885       // preceding 64 byte block when s is updated. this explains the
 886       // slightly counter-intuitive looking pattern of register usage
 887       // in the stp instructions for backwards copy.
 888       //
 889       // t0 at offset -16, t1 at offset -8
 890       // t2 at offset -32, t3 at offset -24
 891       // t4 at offset -48, t5 at offset -40
 892       // t6 at offset -64, t7 at offset -56
 893 
 894       __ ldp(t0, t1, Address(s, 2 * unit));
 895       __ ldp(t2, t3, Address(s, 4 * unit));
 896       __ ldp(t4, t5, Address(s, 6 * unit));
 897       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 898 
 899       __ subs(count, count, 16);
 900       __ br(Assembler::LO, drain);
 901 
 902       int prefetch = PrefetchCopyIntervalInBytes;
 903       bool use_stride = false;
 904       if (direction == copy_backwards) {
 905          use_stride = prefetch > 256;
 906          prefetch = -prefetch;
 907          if (use_stride) __ mov(stride, prefetch);
 908       }
 909 
 910       __ bind(again);
 911 
 912       if (PrefetchCopyIntervalInBytes > 0)
 913         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 914 
 915       if (direction == copy_forwards) {
 916        // allowing for the offset of -8 the store instructions place
 917        // registers into the target 64 bit block at the following
 918        // offsets
 919        //
 920        // t0 at offset 0
 921        // t1 at offset 8,  t2 at offset 16
 922        // t3 at offset 24, t4 at offset 32
 923        // t5 at offset 40, t6 at offset 48
 924        // t7 at offset 56
 925 
 926         __ str(t0, Address(d, 1 * unit));
 927         __ stp(t1, t2, Address(d, 2 * unit));
 928         __ ldp(t0, t1, Address(s, 2 * unit));
 929         __ stp(t3, t4, Address(d, 4 * unit));
 930         __ ldp(t2, t3, Address(s, 4 * unit));
 931         __ stp(t5, t6, Address(d, 6 * unit));
 932         __ ldp(t4, t5, Address(s, 6 * unit));
 933         __ str(t7, Address(__ pre(d, 8 * unit)));
 934         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 935       } else {
 936        // d was not offset when we started so the registers are
 937        // written into the 64 bit block preceding d with the following
 938        // offsets
 939        //
 940        // t1 at offset -8
 941        // t3 at offset -24, t0 at offset -16
 942        // t5 at offset -40, t2 at offset -32
 943        // t7 at offset -56, t4 at offset -48
 944        //                   t6 at offset -64
 945        //
 946        // note that this matches the offsets previously noted for the
 947        // loads
 948 
 949         __ str(t1, Address(d, 1 * unit));
 950         __ stp(t3, t0, Address(d, 3 * unit));
 951         __ ldp(t0, t1, Address(s, 2 * unit));
 952         __ stp(t5, t2, Address(d, 5 * unit));
 953         __ ldp(t2, t3, Address(s, 4 * unit));
 954         __ stp(t7, t4, Address(d, 7 * unit));
 955         __ ldp(t4, t5, Address(s, 6 * unit));
 956         __ str(t6, Address(__ pre(d, 8 * unit)));
 957         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 958       }
 959 
 960       __ subs(count, count, 8);
 961       __ br(Assembler::HS, again);
 962 
 963       // Drain
 964       //
 965       // this uses the same pattern of offsets and register arguments
 966       // as above
 967       __ bind(drain);
 968       if (direction == copy_forwards) {
 969         __ str(t0, Address(d, 1 * unit));
 970         __ stp(t1, t2, Address(d, 2 * unit));
 971         __ stp(t3, t4, Address(d, 4 * unit));
 972         __ stp(t5, t6, Address(d, 6 * unit));
 973         __ str(t7, Address(__ pre(d, 8 * unit)));
 974       } else {
 975         __ str(t1, Address(d, 1 * unit));
 976         __ stp(t3, t0, Address(d, 3 * unit));
 977         __ stp(t5, t2, Address(d, 5 * unit));
 978         __ stp(t7, t4, Address(d, 7 * unit));
 979         __ str(t6, Address(__ pre(d, 8 * unit)));
 980       }
 981       // now we need to copy any remaining part block which may
 982       // include a 4 word subblock and/or a 2 word subblock.
 983       // bits 2 and 1 in the count are the tell-tale for whether we
 984       // have each such subblock
 985       {
 986         Label L1, L2;
 987         __ tbz(count, exact_log2(4), L1);
 988        // this is the same as above but copying only 4 longs hence
 989        // with only one intervening stp between the str instructions
 990        // but note that the offsets and registers still follow the
 991        // same pattern
 992         __ ldp(t0, t1, Address(s, 2 * unit));
 993         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 994         if (direction == copy_forwards) {
 995           __ str(t0, Address(d, 1 * unit));
 996           __ stp(t1, t2, Address(d, 2 * unit));
 997           __ str(t3, Address(__ pre(d, 4 * unit)));
 998         } else {
 999           __ str(t1, Address(d, 1 * unit));
1000           __ stp(t3, t0, Address(d, 3 * unit));
1001           __ str(t2, Address(__ pre(d, 4 * unit)));
1002         }
1003         __ bind(L1);
1004 
1005         __ tbz(count, 1, L2);
1006        // this is the same as above but copying only 2 longs hence
1007        // there is no intervening stp between the str instructions
1008        // but note that the offset and register patterns are still
1009        // the same
1010         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1011         if (direction == copy_forwards) {
1012           __ str(t0, Address(d, 1 * unit));
1013           __ str(t1, Address(__ pre(d, 2 * unit)));
1014         } else {
1015           __ str(t1, Address(d, 1 * unit));
1016           __ str(t0, Address(__ pre(d, 2 * unit)));
1017         }
1018         __ bind(L2);
1019 
1020        // for forwards copy we need to re-adjust the offsets we
1021        // applied so that s and d follow the last words written
1022 
1023        if (direction == copy_forwards) {
1024          __ add(s, s, 16);
1025          __ add(d, d, 8);
1026        }
1027 
1028       }
1029 
1030       __ ret(lr);
1031     }
1032   }
1033 
1034   // Small copy: less than 16 bytes.
1035   //
1036   // NB: Ignores all of the bits of count which represent more than 15
1037   // bytes, so a caller doesn't have to mask them.
1038 
1039   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1040     bool is_backwards = step < 0;
1041     size_t granularity = uabs(step);
1042     int direction = is_backwards ? -1 : 1;
1043     int unit = wordSize * direction;
1044 
1045     Label Lword, Lint, Lshort, Lbyte;
1046 
1047     assert(granularity
1048            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1049 
1050     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1051 
1052     // ??? I don't know if this bit-test-and-branch is the right thing
1053     // to do.  It does a lot of jumping, resulting in several
1054     // mispredicted branches.  It might make more sense to do this
1055     // with something like Duff's device with a single computed branch.
1056 
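         // Each tbz below tests the bit of the element count that corresponds to 8,
         // 4, 2 or 1 bytes and copies that many bytes when the bit is set.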
1057     __ tbz(count, 3 - exact_log2(granularity), Lword);
1058     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1059     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1060     __ bind(Lword);
1061 
1062     if (granularity <= sizeof (jint)) {
1063       __ tbz(count, 2 - exact_log2(granularity), Lint);
1064       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1065       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1066       __ bind(Lint);
1067     }
1068 
1069     if (granularity <= sizeof (jshort)) {
1070       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1071       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1072       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1073       __ bind(Lshort);
1074     }
1075 
1076     if (granularity <= sizeof (jbyte)) {
1077       __ tbz(count, 0, Lbyte);
1078       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1079       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1080       __ bind(Lbyte);
1081     }
1082   }
1083 
1084   Label copy_f, copy_b;
1085 
1086   // All-singing all-dancing memory copy.
1087   //
1088   // Copy count units of memory from s to d.  The size of a unit is
1089   // step, which can be positive or negative depending on the direction
1090   // of copy.  If is_aligned is false, we align the source address.
1091   //
1092 
1093   void copy_memory(bool is_aligned, Register s, Register d,
1094                    Register count, Register tmp, int step) {
1095     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1096     bool is_backwards = step < 0;
1097     int granularity = uabs(step);
1098     const Register t0 = r3, t1 = r4;
1099 
1100     // Copies of <= 96 bytes are done inline. Direction doesn't matter because
1101     // we always load all the data before writing anything.
1102     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1103     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1104     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1105     const Register send = r17, dend = r18;
1106 
1107     if (PrefetchCopyIntervalInBytes > 0)
1108       __ prfm(Address(s, 0), PLDL1KEEP);
1109     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1110     __ br(Assembler::HI, copy_big);
1111 
1112     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1113     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
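         // send and dend point just past the end of the source and destination;
         // the inline cases below copy a head from s/d and a tail from send/dend,
         // which may overlap, so any size within each range is handled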
1114 
1115     __ cmp(count, u1(16/granularity));
1116     __ br(Assembler::LS, copy16);
1117 
1118     __ cmp(count, u1(64/granularity));
1119     __ br(Assembler::HI, copy80);
1120 
1121     __ cmp(count, u1(32/granularity));
1122     __ br(Assembler::LS, copy32);
1123 
1124     // 33..64 bytes
1125     if (UseSIMDForMemoryOps) {
1126       __ ldpq(v0, v1, Address(s, 0));
1127       __ ldpq(v2, v3, Address(send, -32));
1128       __ stpq(v0, v1, Address(d, 0));
1129       __ stpq(v2, v3, Address(dend, -32));
1130     } else {
1131       __ ldp(t0, t1, Address(s, 0));
1132       __ ldp(t2, t3, Address(s, 16));
1133       __ ldp(t4, t5, Address(send, -32));
1134       __ ldp(t6, t7, Address(send, -16));
1135 
1136       __ stp(t0, t1, Address(d, 0));
1137       __ stp(t2, t3, Address(d, 16));
1138       __ stp(t4, t5, Address(dend, -32));
1139       __ stp(t6, t7, Address(dend, -16));
1140     }
1141     __ b(finish);
1142 
1143     // 17..32 bytes
1144     __ bind(copy32);
1145     __ ldp(t0, t1, Address(s, 0));
1146     __ ldp(t2, t3, Address(send, -16));
1147     __ stp(t0, t1, Address(d, 0));
1148     __ stp(t2, t3, Address(dend, -16));
1149     __ b(finish);
1150 
1151     // 65..80/96 bytes
1152     // (96 bytes if SIMD because we do 32 bytes per instruction)
1153     __ bind(copy80);
1154     if (UseSIMDForMemoryOps) {
1155       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1156       __ ldpq(v4, v5, Address(send, -32));
1157       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1158       __ stpq(v4, v5, Address(dend, -32));
1159     } else {
1160       __ ldp(t0, t1, Address(s, 0));
1161       __ ldp(t2, t3, Address(s, 16));
1162       __ ldp(t4, t5, Address(s, 32));
1163       __ ldp(t6, t7, Address(s, 48));
1164       __ ldp(t8, t9, Address(send, -16));
1165 
1166       __ stp(t0, t1, Address(d, 0));
1167       __ stp(t2, t3, Address(d, 16));
1168       __ stp(t4, t5, Address(d, 32));
1169       __ stp(t6, t7, Address(d, 48));
1170       __ stp(t8, t9, Address(dend, -16));
1171     }
1172     __ b(finish);
1173 
1174     // 0..16 bytes
1175     __ bind(copy16);
1176     __ cmp(count, u1(8/granularity));
1177     __ br(Assembler::LO, copy8);
1178 
1179     // 8..16 bytes
1180     __ ldr(t0, Address(s, 0));
1181     __ ldr(t1, Address(send, -8));
1182     __ str(t0, Address(d, 0));
1183     __ str(t1, Address(dend, -8));
1184     __ b(finish);
1185 
1186     if (granularity < 8) {
1187       // 4..7 bytes
1188       __ bind(copy8);
1189       __ tbz(count, 2 - exact_log2(granularity), copy4);
1190       __ ldrw(t0, Address(s, 0));
1191       __ ldrw(t1, Address(send, -4));
1192       __ strw(t0, Address(d, 0));
1193       __ strw(t1, Address(dend, -4));
1194       __ b(finish);
1195       if (granularity < 4) {
1196         // 0..3 bytes
1197         __ bind(copy4);
1198         __ cbz(count, finish); // get rid of 0 case
1199         if (granularity == 2) {
1200           __ ldrh(t0, Address(s, 0));
1201           __ strh(t0, Address(d, 0));
1202         } else { // granularity == 1
1203           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1204           // the first and last byte.
1205           // Handle the 3 byte case by loading and storing base + count/2
1206           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1207           // This does mean that in the 1 byte case we load/store the same
1208           // byte 3 times.
1209           __ lsr(count, count, 1);
1210           __ ldrb(t0, Address(s, 0));
1211           __ ldrb(t1, Address(send, -1));
1212           __ ldrb(t2, Address(s, count));
1213           __ strb(t0, Address(d, 0));
1214           __ strb(t1, Address(dend, -1));
1215           __ strb(t2, Address(d, count));
1216         }
1217         __ b(finish);
1218       }
1219     }
1220 
1221     __ bind(copy_big);
1222     if (is_backwards) {
1223       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1224       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1225     }
1226 
1227     // Now that we've got the small case out of the way, we can align the
1228     // source address on a 2-word boundary.
1229 
1230     Label aligned;
1231 
1232     if (is_aligned) {
1233       // We may have to adjust by 1 word to get s 2-word-aligned.
1234       __ tbz(s, exact_log2(wordSize), aligned);
1235       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1236       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1237       __ sub(count, count, wordSize/granularity);
1238     } else {
1239       if (is_backwards) {
1240         __ andr(rscratch2, s, 2 * wordSize - 1);
1241       } else {
1242         __ neg(rscratch2, s);
1243         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1244       }
1245       // rscratch2 is the byte adjustment needed to align s.
1246       __ cbz(rscratch2, aligned);
1247       int shift = exact_log2(granularity);
1248       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1249       __ sub(count, count, rscratch2);
1250 
1251 #if 0
1252       // ?? This code is only correct for a disjoint copy.  It may or
1253       // may not make sense to use it in that case.
1254 
1255       // Copy the first pair; s and d may not be aligned.
1256       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1257       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1258 
1259       // Align s and d, adjust count
1260       if (is_backwards) {
1261         __ sub(s, s, rscratch2);
1262         __ sub(d, d, rscratch2);
1263       } else {
1264         __ add(s, s, rscratch2);
1265         __ add(d, d, rscratch2);
1266       }
1267 #else
1268       copy_memory_small(s, d, rscratch2, rscratch1, step);
1269 #endif
1270     }
1271 
1272     __ bind(aligned);
1273 
1274     // s is now 2-word-aligned.
1275 
1276     // We have a count of units and some trailing bytes.  Adjust the
1277     // count and do a bulk copy of words.
1278     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
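         // convert the element count into a count of words for the bulk copy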
1279     if (direction == copy_forwards)
1280       __ bl(copy_f);
1281     else
1282       __ bl(copy_b);
1283 
1284     // And the tail.
1285     copy_memory_small(s, d, count, tmp, step);
1286 
1287     if (granularity >= 8) __ bind(copy8);
1288     if (granularity >= 4) __ bind(copy4);
1289     __ bind(finish);
1290   }
1291 
1292 
1293   void clobber_registers() {
1294 #ifdef ASSERT
1295     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1296     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1297     for (Register r = r3; r <= r18; r++)
1298       if (r != rscratch1) __ mov(r, rscratch1);
1299 #endif
1300   }
1301 
1302   // Scan over array at a for count oops, verifying each one.
1303   // Preserves a and count, clobbers rscratch1 and rscratch2.
1304   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1305     Label loop, end;
1306     __ mov(rscratch1, a);
1307     __ mov(rscratch2, zr);
1308     __ bind(loop);
1309     __ cmp(rscratch2, count);
1310     __ br(Assembler::HS, end);
1311     if (size == (size_t)wordSize) {
1312       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1313       __ verify_oop(temp);
1314     } else {
1315       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1316       __ decode_heap_oop(temp); // calls verify_oop
1317     }
1318     __ add(rscratch2, rscratch2, size);
1319     __ b(loop);
1320     __ bind(end);
1321   }
1322 
1323   // Arguments:
1324   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1325   //             ignored
1326   //   is_oop  - true => oop array, so generate store check code
1327   //   name    - stub name string
1328   //
1329   // Inputs:
1330   //   c_rarg0   - source array address
1331   //   c_rarg1   - destination array address
1332   //   c_rarg2   - element count, treated as ssize_t, can be zero
1333   //
1334   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1335   // the hardware handle it.  The two dwords within qwords that span
1336   // cache line boundaries will still be loaded and stored atomically.
1337   //
1338   // Side Effects:
1339   //   disjoint_int_copy_entry is set to the no-overlap entry point
1340   //   used by generate_conjoint_int_oop_copy().
1341   //
1342   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1343                                   const char *name, bool dest_uninitialized = false) {
1344     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1345     RegSet saved_reg = RegSet::of(s, d, count);
1346     __ align(CodeEntryAlignment);
1347     StubCodeMark mark(this, "StubRoutines", name);
1348     address start = __ pc();
1349     __ enter();
1350 
1351     if (entry != NULL) {
1352       *entry = __ pc();
1353       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1354       BLOCK_COMMENT("Entry:");
1355     }
1356 
1357     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1358     if (dest_uninitialized) {
1359       decorators |= IS_DEST_UNINITIALIZED;
1360     }
1361     if (aligned) {
1362       decorators |= ARRAYCOPY_ALIGNED;
1363     }
1364 
1365     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1366     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg);
1367 
1368     if (is_oop) {
1369       // save regs before copy_memory
1370       __ push(RegSet::of(d, count), sp);
1371     }
1372     copy_memory(aligned, s, d, count, rscratch1, size);
1373 
1374     if (is_oop) {
1375       __ pop(RegSet::of(d, count), sp);
1376       if (VerifyOops)
1377         verify_oop_array(size, d, count, r16);
1378     }
1379 
1380     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1381 
1382     __ leave();
1383     __ mov(r0, zr); // return 0
1384     __ ret(lr);
1385 #ifdef BUILTIN_SIM
1386     {
1387       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1388       sim->notifyCompile(const_cast<char*>(name), start);
1389     }
1390 #endif
1391     return start;
1392   }
1393 
1394   // Arguments:
1395   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1396   //             ignored
1397   //   is_oop  - true => oop array, so generate store check code
1398   //   name    - stub name string
1399   //
1400   // Inputs:
1401   //   c_rarg0   - source array address
1402   //   c_rarg1   - destination array address
1403   //   c_rarg2   - element count, treated as ssize_t, can be zero
1404   //
1405   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1406   // the hardware handle it.  The two dwords within qwords that span
1407   // cache line boundaries will still be loaded and stored atomically.
1408   //
1409   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1410                                  address *entry, const char *name,
1411                                  bool dest_uninitialized = false) {
1412     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1413     RegSet saved_regs = RegSet::of(s, d, count);
1414     StubCodeMark mark(this, "StubRoutines", name);
1415     address start = __ pc();
1416     __ enter();
1417 
1418     if (entry != NULL) {
1419       *entry = __ pc();
1420       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1421       BLOCK_COMMENT("Entry:");
1422     }
1423 
1424     // use fwd copy when (d-s) above_equal (count*size)
1425     __ sub(rscratch1, d, s);
1426     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1427     __ br(Assembler::HS, nooverlap_target);
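         // the unsigned compare also catches d < s (the subtraction wraps to a
         // large value), where a forward copy is always safe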
1428 
1429     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1430     if (dest_uninitialized) {
1431       decorators |= IS_DEST_UNINITIALIZED;
1432     }
1433     if (aligned) {
1434       decorators |= ARRAYCOPY_ALIGNED;
1435     }
1436 
1437     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1438     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs);
1439 
1440     if (is_oop) {
1441       // save regs before copy_memory
1442       __ push(RegSet::of(d, count), sp);
1443     }
1444     copy_memory(aligned, s, d, count, rscratch1, -size);
1445     if (is_oop) {
1446       __ pop(RegSet::of(d, count), sp);
1447       if (VerifyOops)
1448         verify_oop_array(size, d, count, r16);
1449     }
1450     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1451     __ leave();
1452     __ mov(r0, zr); // return 0
1453     __ ret(lr);
1454 #ifdef BUILTIN_SIM
1455     {
1456       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1457       sim->notifyCompile(const_cast<char*>(name), start);
1458     }
1459 #endif
1460     return start;
1461   }
1462 
1463   // Arguments:
1464   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1465   //             ignored
1466   //   name    - stub name string
1467   //
1468   // Inputs:
1469   //   c_rarg0   - source array address
1470   //   c_rarg1   - destination array address
1471   //   c_rarg2   - element count, treated as ssize_t, can be zero
1472   //
1473   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1474   // we let the hardware handle it.  The one to eight bytes within words,
1475   // dwords or qwords that span cache line boundaries will still be loaded
1476   // and stored atomically.
1477   //
1478   // Side Effects:
1486   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1487   //   used by generate_conjoint_byte_copy().
1488   //
1489   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1490     const bool not_oop = false;
1491     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1492   }
1493 
1494   // Arguments:
1495   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1496   //             ignored
1497   //   name    - stub name string
1498   //
1499   // Inputs:
1500   //   c_rarg0   - source array address
1501   //   c_rarg1   - destination array address
1502   //   c_rarg2   - element count, treated as ssize_t, can be zero
1503   //
1504   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1505   // we let the hardware handle it.  The one to eight bytes within words,
1506   // dwords or qwords that span cache line boundaries will still be loaded
1507   // and stored atomically.
1508   //
1509   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1510                                       address* entry, const char *name) {
1511     const bool not_oop = false;
1512     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1513   }
1514 
1515   // Arguments:
1516   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1517   //             ignored
1518   //   name    - stub name string
1519   //
1520   // Inputs:
1521   //   c_rarg0   - source array address
1522   //   c_rarg1   - destination array address
1523   //   c_rarg2   - element count, treated as ssize_t, can be zero
1524   //
1525   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1526   // let the hardware handle it.  The two or four words within dwords
1527   // or qwords that span cache line boundaries will still be loaded
1528   // and stored atomically.
1529   //
1530   // Side Effects:
1531   //   disjoint_short_copy_entry is set to the no-overlap entry point
1532   //   used by generate_conjoint_short_copy().
1533   //
1534   address generate_disjoint_short_copy(bool aligned,
1535                                        address* entry, const char *name) {
1536     const bool not_oop = false;
1537     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1538   }
1539 
1540   // Arguments:
1541   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1542   //             ignored
1543   //   name    - stub name string
1544   //
1545   // Inputs:
1546   //   c_rarg0   - source array address
1547   //   c_rarg1   - destination array address
1548   //   c_rarg2   - element count, treated as ssize_t, can be zero
1549   //
1550   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1551   // let the hardware handle it.  The two or four words within dwords
1552   // or qwords that span cache line boundaries will still be loaded
1553   // and stored atomically.
1554   //
1555   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1556                                        address *entry, const char *name) {
1557     const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
1562   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1563   //             ignored
1564   //   name    - stub name string
1565   //
1566   // Inputs:
1567   //   c_rarg0   - source array address
1568   //   c_rarg1   - destination array address
1569   //   c_rarg2   - element count, treated as ssize_t, can be zero
1570   //
1571   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1572   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1574   //
1575   // Side Effects:
1576   //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_copy().
1578   //
1579   address generate_disjoint_int_copy(bool aligned, address *entry,
1580                                          const char *name, bool dest_uninitialized = false) {
1581     const bool not_oop = false;
1582     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1583   }
1584 
1585   // Arguments:
1586   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1587   //             ignored
1588   //   name    - stub name string
1589   //
1590   // Inputs:
1591   //   c_rarg0   - source array address
1592   //   c_rarg1   - destination array address
1593   //   c_rarg2   - element count, treated as ssize_t, can be zero
1594   //
1595   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1596   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1598   //
1599   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1600                                      address *entry, const char *name,
1601                                      bool dest_uninitialized = false) {
1602     const bool not_oop = false;
1603     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1604   }
1605 
1606 
1607   // Arguments:
1608   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1609   //             ignored
1610   //   name    - stub name string
1611   //
1612   // Inputs:
1613   //   c_rarg0   - source array address
1614   //   c_rarg1   - destination array address
1615   //   c_rarg2   - element count, treated as size_t, can be zero
1616   //
1617   // Side Effects:
  //   disjoint_long_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_long_copy().
1620   //
1621   address generate_disjoint_long_copy(bool aligned, address *entry,
1622                                           const char *name, bool dest_uninitialized = false) {
1623     const bool not_oop = false;
1624     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1625   }
1626 
1627   // Arguments:
1628   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1629   //             ignored
1630   //   name    - stub name string
1631   //
1632   // Inputs:
1633   //   c_rarg0   - source array address
1634   //   c_rarg1   - destination array address
1635   //   c_rarg2   - element count, treated as size_t, can be zero
1636   //
1637   address generate_conjoint_long_copy(bool aligned,
1638                                       address nooverlap_target, address *entry,
1639                                       const char *name, bool dest_uninitialized = false) {
1640     const bool not_oop = false;
1641     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1642   }
1643 
1644   // Arguments:
1645   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1646   //             ignored
1647   //   name    - stub name string
1648   //
1649   // Inputs:
1650   //   c_rarg0   - source array address
1651   //   c_rarg1   - destination array address
1652   //   c_rarg2   - element count, treated as size_t, can be zero
1653   //
1654   // Side Effects:
  //   disjoint_oop_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_oop_copy().
1657   //
1658   address generate_disjoint_oop_copy(bool aligned, address *entry,
1659                                      const char *name, bool dest_uninitialized) {
1660     const bool is_oop = true;
1661     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1662     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1663   }
1664 
1665   // Arguments:
1666   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1667   //             ignored
1668   //   name    - stub name string
1669   //
1670   // Inputs:
1671   //   c_rarg0   - source array address
1672   //   c_rarg1   - destination array address
1673   //   c_rarg2   - element count, treated as size_t, can be zero
1674   //
1675   address generate_conjoint_oop_copy(bool aligned,
1676                                      address nooverlap_target, address *entry,
1677                                      const char *name, bool dest_uninitialized) {
1678     const bool is_oop = true;
1679     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1680     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1681                                   name, dest_uninitialized);
1682   }
1683 
1684 
1685   // Helper for generating a dynamic type check.
1686   // Smashes rscratch1, rscratch2.
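  // Branches to L_success if sub_klass is a subtype of super_klass;
  // otherwise falls through (after the slow path check) to the local
  // L_miss label below.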
1687   void generate_type_check(Register sub_klass,
1688                            Register super_check_offset,
1689                            Register super_klass,
1690                            Label& L_success) {
1691     assert_different_registers(sub_klass, super_check_offset, super_klass);
1692 
1693     BLOCK_COMMENT("type_check:");
1694 
1695     Label L_miss;
1696 
1697     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1698                                      super_check_offset);
1699     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1700 
1701     // Fall through on failure!
1702     __ BIND(L_miss);
1703   }
1704 
1705   //
1706   //  Generate checkcasting array copy stub
1707   //
1708   //  Input:
1709   //    c_rarg0   - source array address
1710   //    c_rarg1   - destination array address
1711   //    c_rarg2   - element count, treated as ssize_t, can be zero
1712   //    c_rarg3   - size_t ckoff (super_check_offset)
1713   //    c_rarg4   - oop ckval (super_klass)
1714   //
1715   //  Output:
1716   //    r0 ==  0  -  success
1717   //    r0 == -1^K - failure, where K is partial transfer count
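  //               (r0 == -1^K is simply ~K, so the caller recovers the
  //                partial count as ~r0; e.g. two copied elements => r0 == -3)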
1718   //
1719   address generate_checkcast_copy(const char *name, address *entry,
1720                                   bool dest_uninitialized = false) {
1721 
1722     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1723 
1724     // Input registers (after setup_arg_regs)
1725     const Register from        = c_rarg0;   // source array address
1726     const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
1728     const Register ckoff       = c_rarg3;   // super_check_offset
1729     const Register ckval       = c_rarg4;   // super_klass
1730 
1731     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1732     RegSet wb_post_saved_regs = RegSet::of(count);
1733 
1734     // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // orig elements count
1736     const Register start_to    = r20;       // destination array start address
1737     const Register copied_oop  = r18;       // actual oop copied
1738     const Register r19_klass   = r19;       // oop._klass
1739 
1740     //---------------------------------------------------------------
1741     // Assembler stub will be used for this call to arraycopy
1742     // if the two arrays are subtypes of Object[] but the
1743     // destination array type is not equal to or a supertype
1744     // of the source type.  Each element must be separately
1745     // checked.
1746 
1747     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1748                                copied_oop, r19_klass, count_save);
1749 
1750     __ align(CodeEntryAlignment);
1751     StubCodeMark mark(this, "StubRoutines", name);
1752     address start = __ pc();
1753 
1754     __ enter(); // required for proper stackwalking of RuntimeStub frame
1755 
1756 #ifdef ASSERT
1757     // caller guarantees that the arrays really are different
1758     // otherwise, we would have to make conjoint checks
1759     { Label L;
1760       array_overlap_test(L, TIMES_OOP);
1761       __ stop("checkcast_copy within a single array");
1762       __ bind(L);
1763     }
1764 #endif //ASSERT
1765 
1766     // Caller of this entry point must set up the argument registers.
1767     if (entry != NULL) {
1768       *entry = __ pc();
1769       BLOCK_COMMENT("Entry:");
1770     }
1771 
    // Empty array:  Nothing to do.
1773     __ cbz(count, L_done);
1774 
1775     __ push(RegSet::of(r18, r19, r20, r21), sp);
1776 
1777 #ifdef ASSERT
1778     BLOCK_COMMENT("assert consistent ckoff/ckval");
1779     // The ckoff and ckval must be mutually consistent,
1780     // even though caller generates both.
1781     { Label L;
1782       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1783       __ ldrw(start_to, Address(ckval, sco_offset));
1784       __ cmpw(ckoff, start_to);
1785       __ br(Assembler::EQ, L);
1786       __ stop("super_check_offset inconsistent");
1787       __ bind(L);
1788     }
1789 #endif //ASSERT
1790 
1791     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1792     bool is_oop = true;
1793     if (dest_uninitialized) {
1794       decorators |= IS_DEST_UNINITIALIZED;
1795     }
1796 
1797     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1798     bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs);
1799 
1800     // save the original count
1801     __ mov(count_save, count);
1802 
1803     // Copy from low to high addresses
1804     __ mov(start_to, to);              // Save destination array start address
1805     __ b(L_load_element);
1806 
1807     // ======== begin loop ========
1808     // (Loop is rotated; its entry is L_load_element.)
1809     // Loop control:
1810     //   for (; count != 0; count--) {
1811     //     copied_oop = load_heap_oop(from++);
1812     //     ... generate_type_check ...;
1813     //     store_heap_oop(to++, copied_oop);
1814     //   }
1815     __ align(OptoLoopAlignment);
1816 
1817     __ BIND(L_store_element);
1818     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1819     __ sub(count, count, 1);
1820     __ cbz(count, L_do_card_marks);
1821 
1822     // ======== loop entry is here ========
1823     __ BIND(L_load_element);
1824     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1825     __ cbz(copied_oop, L_store_element);
1826 
1827     __ load_klass(r19_klass, copied_oop);// query the object klass
1828     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1829     // ======== end loop ========
1830 
1831     // It was a real error; we must depend on the caller to finish the job.
1832     // Register count = remaining oops, count_orig = total oops.
1833     // Emit GC store barriers for the oops we have copied and report
1834     // their number to the caller.
1835 
1836     __ subs(count, count_save, count);     // K = partially copied oop count
    __ eon(count, count, zr);             // report (-1^K) to caller
1838     __ br(Assembler::EQ, L_done_pop);
1839 
1840     __ BIND(L_do_card_marks);
1841     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1842 
1843     __ bind(L_done_pop);
1844     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1845     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1846 
1847     __ bind(L_done);
1848     __ mov(r0, count);
1849     __ leave();
1850     __ ret(lr);
1851 
1852     return start;
1853   }
1854 
1855   // Perform range checks on the proposed arraycopy.
1856   // Kills temp, but nothing else.
1857   // Also, clean the sign bits of src_pos and dst_pos.
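  //
  // In effect (all comparisons unsigned, 32-bit):
  //   if ((uint32_t)src_pos + length > (uint32_t)src->length()) goto L_failed;
  //   if ((uint32_t)dst_pos + length > (uint32_t)dst->length()) goto L_failed;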
1858   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1859                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
1861                               Register dst_pos, // destination position (c_rarg3)
1862                               Register length,
1863                               Register temp,
1864                               Label& L_failed) {
1865     BLOCK_COMMENT("arraycopy_range_checks:");
1866 
1867     assert_different_registers(rscratch1, temp);
1868 
1869     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1870     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1871     __ addw(temp, length, src_pos);
1872     __ cmpw(temp, rscratch1);
1873     __ br(Assembler::HI, L_failed);
1874 
1875     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1876     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1877     __ addw(temp, length, dst_pos);
1878     __ cmpw(temp, rscratch1);
1879     __ br(Assembler::HI, L_failed);
1880 
1881     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1882     __ movw(src_pos, src_pos);
1883     __ movw(dst_pos, dst_pos);
1884 
1885     BLOCK_COMMENT("arraycopy_range_checks done");
1886   }
1887 
1888   // These stubs get called from some dumb test routine.
1889   // I'll write them properly when they're called from
1890   // something that's actually doing something.
1891   static void fake_arraycopy_stub(address src, address dst, int count) {
1892     assert(count == 0, "huh?");
1893   }
1894 
1895 
1896   //
1897   //  Generate 'unsafe' array copy stub
1898   //  Though just as safe as the other stubs, it takes an unscaled
1899   //  size_t argument instead of an element count.
1900   //
1901   //  Input:
1902   //    c_rarg0   - source array address
1903   //    c_rarg1   - destination array address
1904   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1905   //
1906   // Examines the alignment of the operands and dispatches
1907   // to a long, int, short, or byte copy loop.
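  //
  // Roughly: if s, d and count are all multiples of 8, take the long copy;
  // else if all multiples of 4, the int copy; else if all multiples of 2,
  // the short copy; otherwise the byte copy (see the alignment tests below).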
1908   //
1909   address generate_unsafe_copy(const char *name,
1910                                address byte_copy_entry,
1911                                address short_copy_entry,
1912                                address int_copy_entry,
1913                                address long_copy_entry) {
1914     Label L_long_aligned, L_int_aligned, L_short_aligned;
1915     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1916 
1917     __ align(CodeEntryAlignment);
1918     StubCodeMark mark(this, "StubRoutines", name);
1919     address start = __ pc();
1920     __ enter(); // required for proper stackwalking of RuntimeStub frame
1921 
1922     // bump this on entry, not on exit:
1923     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1924 
1925     __ orr(rscratch1, s, d);
1926     __ orr(rscratch1, rscratch1, count);
1927 
1928     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1929     __ cbz(rscratch1, L_long_aligned);
1930     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1931     __ cbz(rscratch1, L_int_aligned);
1932     __ tbz(rscratch1, 0, L_short_aligned);
1933     __ b(RuntimeAddress(byte_copy_entry));
1934 
1935     __ BIND(L_short_aligned);
1936     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1937     __ b(RuntimeAddress(short_copy_entry));
1938     __ BIND(L_int_aligned);
1939     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1940     __ b(RuntimeAddress(int_copy_entry));
1941     __ BIND(L_long_aligned);
1942     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1943     __ b(RuntimeAddress(long_copy_entry));
1944 
1945     return start;
1946   }
1947 
1948   //
1949   //  Generate generic array copy stubs
1950   //
1951   //  Input:
1952   //    c_rarg0    -  src oop
1953   //    c_rarg1    -  src_pos (32-bits)
1954   //    c_rarg2    -  dst oop
1955   //    c_rarg3    -  dst_pos (32-bits)
1956   //    c_rarg4    -  element count (32-bits)
1957   //
1958   //  Output:
1959   //    r0 ==  0  -  success
1960   //    r0 == -1^K - failure, where K is partial transfer count
1961   //
1962   address generate_generic_copy(const char *name,
1963                                 address byte_copy_entry, address short_copy_entry,
1964                                 address int_copy_entry, address oop_copy_entry,
1965                                 address long_copy_entry, address checkcast_copy_entry) {
1966 
1967     Label L_failed, L_objArray;
1968     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1969 
1970     // Input registers
1971     const Register src        = c_rarg0;  // source array oop
1972     const Register src_pos    = c_rarg1;  // source position
1973     const Register dst        = c_rarg2;  // destination array oop
1974     const Register dst_pos    = c_rarg3;  // destination position
1975     const Register length     = c_rarg4;
1976 
1977 
1978     // Registers used as temps
1979     const Register dst_klass  = c_rarg5;
1980 
1981     __ align(CodeEntryAlignment);
1982 
1983     StubCodeMark mark(this, "StubRoutines", name);
1984 
1985     address start = __ pc();
1986 
1987     __ enter(); // required for proper stackwalking of RuntimeStub frame
1988 
1989     // bump this on entry, not on exit:
1990     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1991 
1992     //-----------------------------------------------------------------------
1993     // Assembler stub will be used for this call to arraycopy
1994     // if the following conditions are met:
1995     //
1996     // (1) src and dst must not be null.
1997     // (2) src_pos must not be negative.
1998     // (3) dst_pos must not be negative.
1999     // (4) length  must not be negative.
2000     // (5) src klass and dst klass should be the same and not NULL.
2001     // (6) src and dst should be arrays.
2002     // (7) src_pos + length must not exceed length of src.
2003     // (8) dst_pos + length must not exceed length of dst.
2004     //
2005 
2006     //  if (src == NULL) return -1;
2007     __ cbz(src, L_failed);
2008 
2009     //  if (src_pos < 0) return -1;
2010     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2011 
2012     //  if (dst == NULL) return -1;
2013     __ cbz(dst, L_failed);
2014 
2015     //  if (dst_pos < 0) return -1;
2016     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2017 
2018     // registers used as temp
2019     const Register scratch_length    = r16; // elements count to copy
2020     const Register scratch_src_klass = r17; // array klass
2021     const Register lh                = r18; // layout helper
2022 
2023     //  if (length < 0) return -1;
2024     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2025     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2026 
2027     __ load_klass(scratch_src_klass, src);
2028 #ifdef ASSERT
2029     //  assert(src->klass() != NULL);
2030     {
2031       BLOCK_COMMENT("assert klasses not null {");
2032       Label L1, L2;
2033       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2034       __ bind(L1);
2035       __ stop("broken null klass");
2036       __ bind(L2);
2037       __ load_klass(rscratch1, dst);
2038       __ cbz(rscratch1, L1);     // this would be broken also
2039       BLOCK_COMMENT("} assert klasses not null done");
2040     }
2041 #endif
2042 
2043     // Load layout helper (32-bits)
2044     //
2045     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2046     // 32        30    24            16              8     2                 0
2047     //
2048     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2049     //
2050 
2051     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2052 
2053     // Handle objArrays completely differently...
2054     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2055     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2056     __ movw(rscratch1, objArray_lh);
2057     __ eorw(rscratch2, lh, rscratch1);
2058     __ cbzw(rscratch2, L_objArray);
2059 
2060     //  if (src->klass() != dst->klass()) return -1;
2061     __ load_klass(rscratch2, dst);
2062     __ eor(rscratch2, rscratch2, scratch_src_klass);
2063     __ cbnz(rscratch2, L_failed);
2064 
2065     //  if (!src->is_Array()) return -1;
2066     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2067 
2068     // At this point, it is known to be a typeArray (array_tag 0x3).
2069 #ifdef ASSERT
2070     {
2071       BLOCK_COMMENT("assert primitive array {");
2072       Label L;
2073       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2074       __ cmpw(lh, rscratch2);
2075       __ br(Assembler::GE, L);
2076       __ stop("must be a primitive array");
2077       __ bind(L);
2078       BLOCK_COMMENT("} assert primitive array done");
2079     }
2080 #endif
2081 
2082     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2083                            rscratch2, L_failed);
2084 
2085     // TypeArrayKlass
2086     //
2087     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2088     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2089     //
2090 
2091     const Register rscratch1_offset = rscratch1;    // array offset
2092     const Register r18_elsize = lh; // element size
2093 
2094     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2095            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2096     __ add(src, src, rscratch1_offset);           // src array offset
2097     __ add(dst, dst, rscratch1_offset);           // dst array offset
2098     BLOCK_COMMENT("choose copy loop based on element size");
2099 
    // The following registers must be set before the jump to the corresponding stub.
2101     const Register from     = c_rarg0;  // source array address
2102     const Register to       = c_rarg1;  // destination array address
2103     const Register count    = c_rarg2;  // elements count
2104 
    // 'from', 'to' and 'count' must be set in this order because they are
    // the same registers as 'src', 'src_pos' and 'dst'.
2107 
2108     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2109 
2110     // The possible values of elsize are 0-3, i.e. exact_log2(element
2111     // size in bytes).  We do a simple bitwise binary search.
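    //
    //   bit 1 of elsize separates {byte, short} from {int, long};
    //   bit 0 then selects within each pair:
    //     if (elsize & 2) { (elsize & 1) ? long copy  : int copy  }
    //     else            { (elsize & 1) ? short copy : byte copy }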
2112   __ BIND(L_copy_bytes);
2113     __ tbnz(r18_elsize, 1, L_copy_ints);
2114     __ tbnz(r18_elsize, 0, L_copy_shorts);
2115     __ lea(from, Address(src, src_pos));// src_addr
2116     __ lea(to,   Address(dst, dst_pos));// dst_addr
2117     __ movw(count, scratch_length); // length
2118     __ b(RuntimeAddress(byte_copy_entry));
2119 
2120   __ BIND(L_copy_shorts);
2121     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2122     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2123     __ movw(count, scratch_length); // length
2124     __ b(RuntimeAddress(short_copy_entry));
2125 
2126   __ BIND(L_copy_ints);
2127     __ tbnz(r18_elsize, 0, L_copy_longs);
2128     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2129     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2130     __ movw(count, scratch_length); // length
2131     __ b(RuntimeAddress(int_copy_entry));
2132 
2133   __ BIND(L_copy_longs);
2134 #ifdef ASSERT
2135     {
2136       BLOCK_COMMENT("assert long copy {");
2137       Label L;
2138       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2139       __ cmpw(r18_elsize, LogBytesPerLong);
2140       __ br(Assembler::EQ, L);
2141       __ stop("must be long copy, but elsize is wrong");
2142       __ bind(L);
2143       BLOCK_COMMENT("} assert long copy done");
2144     }
2145 #endif
2146     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2147     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2148     __ movw(count, scratch_length); // length
2149     __ b(RuntimeAddress(long_copy_entry));
2150 
2151     // ObjArrayKlass
2152   __ BIND(L_objArray);
2153     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2154 
2155     Label L_plain_copy, L_checkcast_copy;
2156     //  test array classes for subtyping
2157     __ load_klass(r18, dst);
2158     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2159     __ br(Assembler::NE, L_checkcast_copy);
2160 
2161     // Identically typed arrays can be copied without element-wise checks.
2162     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2163                            rscratch2, L_failed);
2164 
2165     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2166     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2167     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2168     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2169     __ movw(count, scratch_length); // length
2170   __ BIND(L_plain_copy);
2171     __ b(RuntimeAddress(oop_copy_entry));
2172 
2173   __ BIND(L_checkcast_copy);
2174     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2175     {
2176       // Before looking at dst.length, make sure dst is also an objArray.
2177       __ ldrw(rscratch1, Address(r18, lh_offset));
2178       __ movw(rscratch2, objArray_lh);
2179       __ eorw(rscratch1, rscratch1, rscratch2);
2180       __ cbnzw(rscratch1, L_failed);
2181 
2182       // It is safe to examine both src.length and dst.length.
2183       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2184                              r18, L_failed);
2185 
2186       __ load_klass(dst_klass, dst); // reload
2187 
2188       // Marshal the base address arguments now, freeing registers.
2189       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2190       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2191       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2192       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2193       __ movw(count, length);           // length (reloaded)
2194       Register sco_temp = c_rarg3;      // this register is free now
2195       assert_different_registers(from, to, count, sco_temp,
2196                                  dst_klass, scratch_src_klass);
2197       // assert_clean_int(count, sco_temp);
2198 
2199       // Generate the type check.
2200       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2201       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2202 
2203       // Smashes rscratch1, rscratch2
2204       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2205 
2206       // Fetch destination element klass from the ObjArrayKlass header.
2207       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2208       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2209       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2210 
2211       // the checkcast_copy loop needs two extra arguments:
2212       assert(c_rarg3 == sco_temp, "#3 already in place");
2213       // Set up arguments for checkcast_copy_entry.
2214       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2215       __ b(RuntimeAddress(checkcast_copy_entry));
2216     }
2217 
2218   __ BIND(L_failed);
2219     __ mov(r0, -1);
2220     __ leave();   // required for proper stackwalking of RuntimeStub frame
2221     __ ret(lr);
2222 
2223     return start;
2224   }
2225 
2226   //
2227   // Generate stub for array fill. If "aligned" is true, the
2228   // "to" address is assumed to be heapword aligned.
2229   //
2230   // Arguments for generated stub:
2231   //   to:    c_rarg0
2232   //   value: c_rarg1
2233   //   count: c_rarg2 treated as signed
2234   //
2235   address generate_fill(BasicType t, bool aligned, const char *name) {
2236     __ align(CodeEntryAlignment);
2237     StubCodeMark mark(this, "StubRoutines", name);
2238     address start = __ pc();
2239 
2240     BLOCK_COMMENT("Entry:");
2241 
    const Register to        = c_rarg0;  // destination array address
2243     const Register value     = c_rarg1;  // value
2244     const Register count     = c_rarg2;  // elements count
2245 
2246     const Register bz_base = r10;        // base for block_zero routine
2247     const Register cnt_words = r11;      // temp register
2248 
2249     __ enter();
2250 
2251     Label L_fill_elements, L_exit1;
2252 
2253     int shift = -1;
2254     switch (t) {
2255       case T_BYTE:
2256         shift = 0;
2257         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2258         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2259         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2260         __ br(Assembler::LO, L_fill_elements);
2261         break;
2262       case T_SHORT:
2263         shift = 1;
2264         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2265         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2266         __ br(Assembler::LO, L_fill_elements);
2267         break;
2268       case T_INT:
2269         shift = 2;
2270         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2271         __ br(Assembler::LO, L_fill_elements);
2272         break;
2273       default: ShouldNotReachHere();
2274     }
2275 
2276     // Align source address at 8 bytes address boundary.
2277     Label L_skip_align1, L_skip_align2, L_skip_align4;
2278     if (!aligned) {
2279       switch (t) {
2280         case T_BYTE:
2281           // One byte misalignment happens only for byte arrays.
2282           __ tbz(to, 0, L_skip_align1);
2283           __ strb(value, Address(__ post(to, 1)));
2284           __ subw(count, count, 1);
2285           __ bind(L_skip_align1);
2286           // Fallthrough
2287         case T_SHORT:
2288           // Two bytes misalignment happens only for byte and short (char) arrays.
2289           __ tbz(to, 1, L_skip_align2);
2290           __ strh(value, Address(__ post(to, 2)));
2291           __ subw(count, count, 2 >> shift);
2292           __ bind(L_skip_align2);
2293           // Fallthrough
2294         case T_INT:
2295           // Align to 8 bytes, we know we are 4 byte aligned to start.
2296           __ tbz(to, 2, L_skip_align4);
2297           __ strw(value, Address(__ post(to, 4)));
2298           __ subw(count, count, 4 >> shift);
2299           __ bind(L_skip_align4);
2300           break;
2301         default: ShouldNotReachHere();
2302       }
2303     }
2304 
2305     //
2306     //  Fill large chunks
2307     //
2308     __ lsrw(cnt_words, count, 3 - shift); // number of words
2309     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2310     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2311     if (UseBlockZeroing) {
2312       Label non_block_zeroing, rest;
2313       // If the fill value is zero we can use the fast zero_words().
2314       __ cbnz(value, non_block_zeroing);
2315       __ mov(bz_base, to);
2316       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2317       __ zero_words(bz_base, cnt_words);
2318       __ b(rest);
2319       __ bind(non_block_zeroing);
2320       __ fill_words(to, cnt_words, value);
2321       __ bind(rest);
2322     } else {
2323       __ fill_words(to, cnt_words, value);
2324     }
2325 
2326     // Remaining count is less than 8 bytes. Fill it by a single store.
2327     // Note that the total length is no less than 8 bytes.
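    // The final str may overwrite bytes already written by fill_words
    // (e.g. a 13-byte byte fill rewrites bytes 5..12), which is harmless
    // because every byte holds the same replicated value.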
2328     if (t == T_BYTE || t == T_SHORT) {
2329       Label L_exit1;
2330       __ cbzw(count, L_exit1);
2331       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2332       __ str(value, Address(to, -8));    // overwrite some elements
2333       __ bind(L_exit1);
2334       __ leave();
2335       __ ret(lr);
2336     }
2337 
    // Handle fills of less than 8 bytes.
2339     Label L_fill_2, L_fill_4, L_exit2;
2340     __ bind(L_fill_elements);
2341     switch (t) {
2342       case T_BYTE:
2343         __ tbz(count, 0, L_fill_2);
2344         __ strb(value, Address(__ post(to, 1)));
2345         __ bind(L_fill_2);
2346         __ tbz(count, 1, L_fill_4);
2347         __ strh(value, Address(__ post(to, 2)));
2348         __ bind(L_fill_4);
2349         __ tbz(count, 2, L_exit2);
2350         __ strw(value, Address(to));
2351         break;
2352       case T_SHORT:
2353         __ tbz(count, 0, L_fill_4);
2354         __ strh(value, Address(__ post(to, 2)));
2355         __ bind(L_fill_4);
2356         __ tbz(count, 1, L_exit2);
2357         __ strw(value, Address(to));
2358         break;
2359       case T_INT:
2360         __ cbzw(count, L_exit2);
2361         __ strw(value, Address(to));
2362         break;
2363       default: ShouldNotReachHere();
2364     }
2365     __ bind(L_exit2);
2366     __ leave();
2367     __ ret(lr);
2368     return start;
2369   }
2370 
2371   void generate_arraycopy_stubs() {
2372     address entry;
2373     address entry_jbyte_arraycopy;
2374     address entry_jshort_arraycopy;
2375     address entry_jint_arraycopy;
2376     address entry_oop_arraycopy;
2377     address entry_jlong_arraycopy;
2378     address entry_checkcast_arraycopy;
2379 
2380     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2381     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2382 
2383     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2384 
2385     //*** jbyte
2386     // Always need aligned and unaligned versions
2387     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2388                                                                                   "jbyte_disjoint_arraycopy");
2389     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2390                                                                                   &entry_jbyte_arraycopy,
2391                                                                                   "jbyte_arraycopy");
2392     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2393                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2394     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2395                                                                                   "arrayof_jbyte_arraycopy");
2396 
2397     //*** jshort
2398     // Always need aligned and unaligned versions
2399     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2400                                                                                     "jshort_disjoint_arraycopy");
2401     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2402                                                                                     &entry_jshort_arraycopy,
2403                                                                                     "jshort_arraycopy");
2404     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2405                                                                                     "arrayof_jshort_disjoint_arraycopy");
2406     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2407                                                                                     "arrayof_jshort_arraycopy");
2408 
2409     //*** jint
2410     // Aligned versions
2411     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2412                                                                                 "arrayof_jint_disjoint_arraycopy");
2413     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2414                                                                                 "arrayof_jint_arraycopy");
2415     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2416     // entry_jint_arraycopy always points to the unaligned version
2417     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2418                                                                                 "jint_disjoint_arraycopy");
2419     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2420                                                                                 &entry_jint_arraycopy,
2421                                                                                 "jint_arraycopy");
2422 
2423     //*** jlong
2424     // It is always aligned
2425     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2426                                                                                   "arrayof_jlong_disjoint_arraycopy");
2427     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2428                                                                                   "arrayof_jlong_arraycopy");
2429     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2430     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2431 
2432     //*** oops
2433     {
2434       // With compressed oops we need unaligned versions; notice that
2435       // we overwrite entry_oop_arraycopy.
2436       bool aligned = !UseCompressedOops;
2437 
2438       StubRoutines::_arrayof_oop_disjoint_arraycopy
2439         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2440                                      /*dest_uninitialized*/false);
2441       StubRoutines::_arrayof_oop_arraycopy
2442         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2443                                      /*dest_uninitialized*/false);
2444       // Aligned versions without pre-barriers
2445       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2446         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2447                                      /*dest_uninitialized*/true);
2448       StubRoutines::_arrayof_oop_arraycopy_uninit
2449         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2450                                      /*dest_uninitialized*/true);
2451     }
2452 
2453     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2454     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2455     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2456     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2457 
2458     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2459     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2460                                                                         /*dest_uninitialized*/true);
2461 
2462     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2463                                                               entry_jbyte_arraycopy,
2464                                                               entry_jshort_arraycopy,
2465                                                               entry_jint_arraycopy,
2466                                                               entry_jlong_arraycopy);
2467 
2468     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2469                                                                entry_jbyte_arraycopy,
2470                                                                entry_jshort_arraycopy,
2471                                                                entry_jint_arraycopy,
2472                                                                entry_oop_arraycopy,
2473                                                                entry_jlong_arraycopy,
2474                                                                entry_checkcast_arraycopy);
2475 
2476     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2477     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2478     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2479     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2480     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2481     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2482   }
2483 
2484   void generate_math_stubs() { Unimplemented(); }
2485 
2486   // Arguments:
2487   //
2488   // Inputs:
2489   //   c_rarg0   - source byte array address
2490   //   c_rarg1   - destination byte array address
2491   //   c_rarg2   - K (key) in little endian int array
2492   //
2493   address generate_aescrypt_encryptBlock() {
2494     __ align(CodeEntryAlignment);
2495     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2496 
2497     Label L_doLast;
2498 
2499     const Register from        = c_rarg0;  // source array address
2500     const Register to          = c_rarg1;  // destination array address
2501     const Register key         = c_rarg2;  // key array address
2502     const Register keylen      = rscratch1;
2503 
2504     address start = __ pc();
2505     __ enter();
2506 
2507     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
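    // keylen is the expanded key length in 32-bit words:
    // 44 for AES-128, 52 for AES-192 and 60 for AES-256.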
2508 
2509     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2510 
2511     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2512     __ rev32(v1, __ T16B, v1);
2513     __ rev32(v2, __ T16B, v2);
2514     __ rev32(v3, __ T16B, v3);
2515     __ rev32(v4, __ T16B, v4);
2516     __ aese(v0, v1);
2517     __ aesmc(v0, v0);
2518     __ aese(v0, v2);
2519     __ aesmc(v0, v0);
2520     __ aese(v0, v3);
2521     __ aesmc(v0, v0);
2522     __ aese(v0, v4);
2523     __ aesmc(v0, v0);
2524 
2525     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2526     __ rev32(v1, __ T16B, v1);
2527     __ rev32(v2, __ T16B, v2);
2528     __ rev32(v3, __ T16B, v3);
2529     __ rev32(v4, __ T16B, v4);
2530     __ aese(v0, v1);
2531     __ aesmc(v0, v0);
2532     __ aese(v0, v2);
2533     __ aesmc(v0, v0);
2534     __ aese(v0, v3);
2535     __ aesmc(v0, v0);
2536     __ aese(v0, v4);
2537     __ aesmc(v0, v0);
2538 
2539     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2540     __ rev32(v1, __ T16B, v1);
2541     __ rev32(v2, __ T16B, v2);
2542 
2543     __ cmpw(keylen, 44);
2544     __ br(Assembler::EQ, L_doLast);
2545 
2546     __ aese(v0, v1);
2547     __ aesmc(v0, v0);
2548     __ aese(v0, v2);
2549     __ aesmc(v0, v0);
2550 
2551     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2552     __ rev32(v1, __ T16B, v1);
2553     __ rev32(v2, __ T16B, v2);
2554 
2555     __ cmpw(keylen, 52);
2556     __ br(Assembler::EQ, L_doLast);
2557 
2558     __ aese(v0, v1);
2559     __ aesmc(v0, v0);
2560     __ aese(v0, v2);
2561     __ aesmc(v0, v0);
2562 
2563     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2564     __ rev32(v1, __ T16B, v1);
2565     __ rev32(v2, __ T16B, v2);
2566 
2567     __ BIND(L_doLast);
2568 
2569     __ aese(v0, v1);
2570     __ aesmc(v0, v0);
2571     __ aese(v0, v2);
2572 
2573     __ ld1(v1, __ T16B, key);
2574     __ rev32(v1, __ T16B, v1);
2575     __ eor(v0, __ T16B, v0, v1);
2576 
2577     __ st1(v0, __ T16B, to);
2578 
2579     __ mov(r0, 0);
2580 
2581     __ leave();
2582     __ ret(lr);
2583 
2584     return start;
2585   }
2586 
2587   // Arguments:
2588   //
2589   // Inputs:
2590   //   c_rarg0   - source byte array address
2591   //   c_rarg1   - destination byte array address
2592   //   c_rarg2   - K (key) in little endian int array
2593   //
2594   address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES instruction support");
2596     __ align(CodeEntryAlignment);
2597     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2598     Label L_doLast;
2599 
2600     const Register from        = c_rarg0;  // source array address
2601     const Register to          = c_rarg1;  // destination array address
2602     const Register key         = c_rarg2;  // key array address
2603     const Register keylen      = rscratch1;
2604 
2605     address start = __ pc();
2606     __ enter(); // required for proper stackwalking of RuntimeStub frame
2607 
2608     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2609 
2610     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2611 
2612     __ ld1(v5, __ T16B, __ post(key, 16));
2613     __ rev32(v5, __ T16B, v5);
2614 
2615     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2616     __ rev32(v1, __ T16B, v1);
2617     __ rev32(v2, __ T16B, v2);
2618     __ rev32(v3, __ T16B, v3);
2619     __ rev32(v4, __ T16B, v4);
2620     __ aesd(v0, v1);
2621     __ aesimc(v0, v0);
2622     __ aesd(v0, v2);
2623     __ aesimc(v0, v0);
2624     __ aesd(v0, v3);
2625     __ aesimc(v0, v0);
2626     __ aesd(v0, v4);
2627     __ aesimc(v0, v0);
2628 
2629     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2630     __ rev32(v1, __ T16B, v1);
2631     __ rev32(v2, __ T16B, v2);
2632     __ rev32(v3, __ T16B, v3);
2633     __ rev32(v4, __ T16B, v4);
2634     __ aesd(v0, v1);
2635     __ aesimc(v0, v0);
2636     __ aesd(v0, v2);
2637     __ aesimc(v0, v0);
2638     __ aesd(v0, v3);
2639     __ aesimc(v0, v0);
2640     __ aesd(v0, v4);
2641     __ aesimc(v0, v0);
2642 
2643     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2644     __ rev32(v1, __ T16B, v1);
2645     __ rev32(v2, __ T16B, v2);
2646 
2647     __ cmpw(keylen, 44);
2648     __ br(Assembler::EQ, L_doLast);
2649 
2650     __ aesd(v0, v1);
2651     __ aesimc(v0, v0);
2652     __ aesd(v0, v2);
2653     __ aesimc(v0, v0);
2654 
2655     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2656     __ rev32(v1, __ T16B, v1);
2657     __ rev32(v2, __ T16B, v2);
2658 
2659     __ cmpw(keylen, 52);
2660     __ br(Assembler::EQ, L_doLast);
2661 
2662     __ aesd(v0, v1);
2663     __ aesimc(v0, v0);
2664     __ aesd(v0, v2);
2665     __ aesimc(v0, v0);
2666 
2667     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2668     __ rev32(v1, __ T16B, v1);
2669     __ rev32(v2, __ T16B, v2);
2670 
2671     __ BIND(L_doLast);
2672 
2673     __ aesd(v0, v1);
2674     __ aesimc(v0, v0);
2675     __ aesd(v0, v2);
2676 
2677     __ eor(v0, __ T16B, v0, v5);
2678 
2679     __ st1(v0, __ T16B, to);
2680 
2681     __ mov(r0, 0);
2682 
2683     __ leave();
2684     __ ret(lr);
2685 
2686     return start;
2687   }
2688 
2689   // Arguments:
2690   //
2691   // Inputs:
2692   //   c_rarg0   - source byte array address
2693   //   c_rarg1   - destination byte array address
2694   //   c_rarg2   - K (key) in little endian int array
2695   //   c_rarg3   - r vector byte array address
2696   //   c_rarg4   - input length
2697   //
2698   // Output:
  //   r0        - input length
2700   //
2701   address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES instruction support");
2703     __ align(CodeEntryAlignment);
2704     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2705 
2706     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2707 
2708     const Register from        = c_rarg0;  // source array address
2709     const Register to          = c_rarg1;  // destination array address
2710     const Register key         = c_rarg2;  // key array address
2711     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2712                                            // and left with the results of the last encryption block
2713     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2714     const Register keylen      = rscratch1;
2715 
2716     address start = __ pc();
2717 
2718       __ enter();
2719 
2720       __ movw(rscratch2, len_reg);
2721 
2722       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2723 
2724       __ ld1(v0, __ T16B, rvec);
2725 
2726       __ cmpw(keylen, 52);
2727       __ br(Assembler::CC, L_loadkeys_44);
2728       __ br(Assembler::EQ, L_loadkeys_52);
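      // keylen < 52  -> AES-128: load only the 11 round keys from L_loadkeys_44
      // keylen == 52 -> AES-192: load 13 round keys starting at L_loadkeys_52
      // otherwise    -> AES-256: fall through and load all 15 round keys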
2729 
2730       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2731       __ rev32(v17, __ T16B, v17);
2732       __ rev32(v18, __ T16B, v18);
2733     __ BIND(L_loadkeys_52);
2734       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2735       __ rev32(v19, __ T16B, v19);
2736       __ rev32(v20, __ T16B, v20);
2737     __ BIND(L_loadkeys_44);
2738       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2739       __ rev32(v21, __ T16B, v21);
2740       __ rev32(v22, __ T16B, v22);
2741       __ rev32(v23, __ T16B, v23);
2742       __ rev32(v24, __ T16B, v24);
2743       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2744       __ rev32(v25, __ T16B, v25);
2745       __ rev32(v26, __ T16B, v26);
2746       __ rev32(v27, __ T16B, v27);
2747       __ rev32(v28, __ T16B, v28);
2748       __ ld1(v29, v30, v31, __ T16B, key);
2749       __ rev32(v29, __ T16B, v29);
2750       __ rev32(v30, __ T16B, v30);
2751       __ rev32(v31, __ T16B, v31);
2752 
2753     __ BIND(L_aes_loop);
2754       __ ld1(v1, __ T16B, __ post(from, 16));
2755       __ eor(v0, __ T16B, v0, v1);
2756 
2757       __ br(Assembler::CC, L_rounds_44);
2758       __ br(Assembler::EQ, L_rounds_52);
2759 
2760       __ aese(v0, v17); __ aesmc(v0, v0);
2761       __ aese(v0, v18); __ aesmc(v0, v0);
2762     __ BIND(L_rounds_52);
2763       __ aese(v0, v19); __ aesmc(v0, v0);
2764       __ aese(v0, v20); __ aesmc(v0, v0);
2765     __ BIND(L_rounds_44);
2766       __ aese(v0, v21); __ aesmc(v0, v0);
2767       __ aese(v0, v22); __ aesmc(v0, v0);
2768       __ aese(v0, v23); __ aesmc(v0, v0);
2769       __ aese(v0, v24); __ aesmc(v0, v0);
2770       __ aese(v0, v25); __ aesmc(v0, v0);
2771       __ aese(v0, v26); __ aesmc(v0, v0);
2772       __ aese(v0, v27); __ aesmc(v0, v0);
2773       __ aese(v0, v28); __ aesmc(v0, v0);
2774       __ aese(v0, v29); __ aesmc(v0, v0);
2775       __ aese(v0, v30);
2776       __ eor(v0, __ T16B, v0, v31);
2777 
2778       __ st1(v0, __ T16B, __ post(to, 16));
2779 
2780       __ subw(len_reg, len_reg, 16);
2781       __ cbnzw(len_reg, L_aes_loop);
2782 
2783       __ st1(v0, __ T16B, rvec);
2784 
2785       __ mov(r0, rscratch2);
2786 
2787       __ leave();
2788       __ ret(lr);
2789 
2790       return start;
2791   }
2792 
2793   // Arguments:
2794   //
2795   // Inputs:
2796   //   c_rarg0   - source byte array address
2797   //   c_rarg1   - destination byte array address
2798   //   c_rarg2   - K (key) in little endian int array
2799   //   c_rarg3   - r vector byte array address
2800   //   c_rarg4   - input length
2801   //
2802   // Output:
2803   //   r0        - input length
2804   //
2805   address generate_cipherBlockChaining_decryptAESCrypt() {
2806     assert(UseAES, "need AES cryptographic extension support");
2807     __ align(CodeEntryAlignment);
2808     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2809 
2810     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2811 
2812     const Register from        = c_rarg0;  // source array address
2813     const Register to          = c_rarg1;  // destination array address
2814     const Register key         = c_rarg2;  // key array address
2815     const Register rvec        = c_rarg3;  // r byte array initialized from the initvector array address
2816                                            // and left with the last input (cipher) block, used as the IV for the next call
2817     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2818     const Register keylen      = rscratch1;
2819 
2820     address start = __ pc();
2821 
2822       __ enter();
2823 
2824       __ movw(rscratch2, len_reg);
2825 
2826       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2827 
2828       __ ld1(v2, __ T16B, rvec);
2829 
2830       __ ld1(v31, __ T16B, __ post(key, 16));
2831       __ rev32(v31, __ T16B, v31);
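           // v31 is loaded from the start of the key array; in the decryption
           // rounds below it is the last key applied (the eor after the final aesd).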
2832 
2833       __ cmpw(keylen, 52);
2834       __ br(Assembler::CC, L_loadkeys_44);
2835       __ br(Assembler::EQ, L_loadkeys_52);
2836 
2837       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2838       __ rev32(v17, __ T16B, v17);
2839       __ rev32(v18, __ T16B, v18);
2840     __ BIND(L_loadkeys_52);
2841       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2842       __ rev32(v19, __ T16B, v19);
2843       __ rev32(v20, __ T16B, v20);
2844     __ BIND(L_loadkeys_44);
2845       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2846       __ rev32(v21, __ T16B, v21);
2847       __ rev32(v22, __ T16B, v22);
2848       __ rev32(v23, __ T16B, v23);
2849       __ rev32(v24, __ T16B, v24);
2850       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2851       __ rev32(v25, __ T16B, v25);
2852       __ rev32(v26, __ T16B, v26);
2853       __ rev32(v27, __ T16B, v27);
2854       __ rev32(v28, __ T16B, v28);
2855       __ ld1(v29, v30, __ T16B, key);
2856       __ rev32(v29, __ T16B, v29);
2857       __ rev32(v30, __ T16B, v30);
2858 
2859     __ BIND(L_aes_loop);
2860       __ ld1(v0, __ T16B, __ post(from, 16));
2861       __ orr(v1, __ T16B, v0, v0); // keep a copy of the cipher block for CBC chaining
2862 
2863       __ br(Assembler::CC, L_rounds_44);
2864       __ br(Assembler::EQ, L_rounds_52);
2865 
2866       __ aesd(v0, v17); __ aesimc(v0, v0);
2867       __ aesd(v0, v18); __ aesimc(v0, v0);
2868     __ BIND(L_rounds_52);
2869       __ aesd(v0, v19); __ aesimc(v0, v0);
2870       __ aesd(v0, v20); __ aesimc(v0, v0);
2871     __ BIND(L_rounds_44);
2872       __ aesd(v0, v21); __ aesimc(v0, v0);
2873       __ aesd(v0, v22); __ aesimc(v0, v0);
2874       __ aesd(v0, v23); __ aesimc(v0, v0);
2875       __ aesd(v0, v24); __ aesimc(v0, v0);
2876       __ aesd(v0, v25); __ aesimc(v0, v0);
2877       __ aesd(v0, v26); __ aesimc(v0, v0);
2878       __ aesd(v0, v27); __ aesimc(v0, v0);
2879       __ aesd(v0, v28); __ aesimc(v0, v0);
2880       __ aesd(v0, v29); __ aesimc(v0, v0);
2881       __ aesd(v0, v30);
2882       __ eor(v0, __ T16B, v0, v31);
2883       __ eor(v0, __ T16B, v0, v2);
2884 
2885       __ st1(v0, __ T16B, __ post(to, 16));
2886       __ orr(v2, __ T16B, v1, v1); // the cipher block just consumed becomes the next IV
2887 
2888       __ subw(len_reg, len_reg, 16);
2889       __ cbnzw(len_reg, L_aes_loop);
2890 
2891       __ st1(v2, __ T16B, rvec);
2892 
2893       __ mov(r0, rscratch2);
2894 
2895       __ leave();
2896       __ ret(lr);
2897 
2898     return start;
2899   }
2900 
2901   // Arguments:
2902   //
2903   // Inputs:
2904   //   c_rarg0   - byte[]  source+offset
2905   //   c_rarg1   - int[]   SHA.state
2906   //   c_rarg2   - int     offset
2907   //   c_rarg3   - int     limit
2908   //
2909   address generate_sha1_implCompress(bool multi_block, const char *name) {
2910     __ align(CodeEntryAlignment);
2911     StubCodeMark mark(this, "StubRoutines", name);
2912     address start = __ pc();
2913 
2914     Register buf   = c_rarg0;
2915     Register state = c_rarg1;
2916     Register ofs   = c_rarg2;
2917     Register limit = c_rarg3;
2918 
2919     Label keys;
2920     Label sha1_loop;
2921 
2922     // load the keys into v0..v3
2923     __ adr(rscratch1, keys);
2924     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2925     // load the 5-word state into v6, v7
2926     __ ldrq(v6, Address(state, 0));
2927     __ ldrs(v7, Address(state, 16));
2928 
2929 
2930     __ BIND(sha1_loop);
2931     // load 64 bytes of data into v16..v19
2932     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2933     __ rev32(v16, __ T16B, v16);
2934     __ rev32(v17, __ T16B, v17);
2935     __ rev32(v18, __ T16B, v18);
2936     __ rev32(v19, __ T16B, v19);
2937 
2938     // do the sha1
2939     __ addv(v4, __ T4S, v16, v0);
2940     __ orr(v20, __ T16B, v6, v6);
2941 
2942     FloatRegister d0 = v16;
2943     FloatRegister d1 = v17;
2944     FloatRegister d2 = v18;
2945     FloatRegister d3 = v19;
2946 
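         // Each iteration below handles 4 of the 80 SHA-1 rounds: sha1c/sha1p/sha1m
         // each perform four rounds, while sha1su0/sha1su1 extend the message schedule.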
2947     for (int round = 0; round < 20; round++) {
2948       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2949       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2950       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2951       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2952       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2953 
2954       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2955       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2956       __ sha1h(tmp2, __ T4S, v20);
2957       if (round < 5)
2958         __ sha1c(v20, __ T4S, tmp3, tmp4);
2959       else if (round < 10 || round >= 15)
2960         __ sha1p(v20, __ T4S, tmp3, tmp4);
2961       else
2962         __ sha1m(v20, __ T4S, tmp3, tmp4);
2963       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2964 
2965       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2966     }
2967 
2968     __ addv(v7, __ T2S, v7, v21);
2969     __ addv(v6, __ T4S, v6, v20);
2970 
2971     if (multi_block) {
2972       __ add(ofs, ofs, 64);
2973       __ cmp(ofs, limit);
2974       __ br(Assembler::LE, sha1_loop);
2975       __ mov(c_rarg0, ofs); // return ofs
2976     }
2977 
2978     __ strq(v6, Address(state, 0));
2979     __ strs(v7, Address(state, 16));
2980 
2981     __ ret(lr);
2982 
2983     __ bind(keys);
2984     __ emit_int32(0x5a827999);
2985     __ emit_int32(0x6ed9eba1);
2986     __ emit_int32(0x8f1bbcdc);
2987     __ emit_int32(0xca62c1d6);
2988 
2989     return start;
2990   }
2991 
2992 
2993   // Arguments:
2994   //
2995   // Inputs:
2996   //   c_rarg0   - byte[]  source+offset
2997   //   c_rarg1   - int[]   SHA.state
2998   //   c_rarg2   - int     offset
2999   //   c_rarg3   - int     limit
3000   //
3001   address generate_sha256_implCompress(bool multi_block, const char *name) {
3002     static const uint32_t round_consts[64] = {
3003       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3004       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3005       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3006       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3007       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3008       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3009       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3010       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3011       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3012       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3013       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3014       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3015       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3016       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3017       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3018       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3019     };
3020     __ align(CodeEntryAlignment);
3021     StubCodeMark mark(this, "StubRoutines", name);
3022     address start = __ pc();
3023 
3024     Register buf   = c_rarg0;
3025     Register state = c_rarg1;
3026     Register ofs   = c_rarg2;
3027     Register limit = c_rarg3;
3028 
3029     Label sha1_loop;
3030 
3031     __ stpd(v8, v9, __ pre(sp, -32));
3032     __ stpd(v10, v11, Address(sp, 16));
3033 
3034 // dga == v0
3035 // dgb == v1
3036 // dg0 == v2
3037 // dg1 == v3
3038 // dg2 == v4
3039 // t0 == v6
3040 // t1 == v7
3041 
3042     // load the 64 round constants into v16..v31 (four per register)
3043     __ lea(rscratch1, ExternalAddress((address)round_consts));
3044     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3045     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3046     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3047     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3048 
3049     // load the 8-word (256-bit) state
3050     __ ldpq(v0, v1, state);
3051 
3052     __ BIND(sha1_loop);
3053     // load 64 bytes of data into v8..v11
3054     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3055     __ rev32(v8, __ T16B, v8);
3056     __ rev32(v9, __ T16B, v9);
3057     __ rev32(v10, __ T16B, v10);
3058     __ rev32(v11, __ T16B, v11);
3059 
3060     __ addv(v6, __ T4S, v8, v16);
3061     __ orr(v2, __ T16B, v0, v0);
3062     __ orr(v3, __ T16B, v1, v1);
3063 
3064     FloatRegister d0 = v8;
3065     FloatRegister d1 = v9;
3066     FloatRegister d2 = v10;
3067     FloatRegister d3 = v11;
3068 
3069 
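         // Each of the 16 iterations below performs 4 of the 64 SHA-256 rounds:
         // sha256h/sha256h2 update the two halves of the state and
         // sha256su0/sha256su1 extend the message schedule.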
3070     for (int round = 0; round < 16; round++) {
3071       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3072       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3073       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3074       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3075 
3076       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3077        __ orr(v4, __ T16B, v2, v2);
3078       if (round < 15)
3079         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3080       __ sha256h(v2, __ T4S, v3, tmp2);
3081       __ sha256h2(v3, __ T4S, v4, tmp2);
3082       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3083 
3084       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3085     }
3086 
3087     __ addv(v0, __ T4S, v0, v2);
3088     __ addv(v1, __ T4S, v1, v3);
3089 
3090     if (multi_block) {
3091       __ add(ofs, ofs, 64);
3092       __ cmp(ofs, limit);
3093       __ br(Assembler::LE, sha1_loop);
3094       __ mov(c_rarg0, ofs); // return ofs
3095     }
3096 
3097     __ ldpd(v10, v11, Address(sp, 16));
3098     __ ldpd(v8, v9, __ post(sp, 32));
3099 
3100     __ stpq(v0, v1, state);
3101 
3102     __ ret(lr);
3103 
3104     return start;
3105   }
3106 
3107 #ifndef BUILTIN_SIM
3108   // Safefetch stubs.
3109   void generate_safefetch(const char* name, int size, address* entry,
3110                           address* fault_pc, address* continuation_pc) {
3111     // safefetch signatures:
3112     //   int      SafeFetch32(int*      adr, int      errValue);
3113     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3114     //
3115     // arguments:
3116     //   c_rarg0 = adr
3117     //   c_rarg1 = errValue
3118     //
3119     // result:
3120     //   r0 = *adr or errValue
3121 
3122     StubCodeMark mark(this, "StubRoutines", name);
3123 
3124     // Entry point, pc or function descriptor.
3125     *entry = __ pc();
3126 
3127     // Load *adr into c_rarg1, may fault.
3128     *fault_pc = __ pc();
3129     switch (size) {
3130       case 4:
3131         // int32_t
3132         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3133         break;
3134       case 8:
3135         // int64_t
3136         __ ldr(c_rarg1, Address(c_rarg0, 0));
3137         break;
3138       default:
3139         ShouldNotReachHere();
3140     }
3141 
3142     // return errValue or *adr
3143     *continuation_pc = __ pc();
3144     __ mov(r0, c_rarg1);
3145     __ ret(lr);
3146   }
3147 #endif
3148 
3149   /**
3150    *  Arguments:
3151    *
3152    * Inputs:
3153    *   c_rarg0   - int crc
3154    *   c_rarg1   - byte* buf
3155    *   c_rarg2   - int length
3156    *
3157    * Output:
3158    *       r0   - int crc result
3159    */
3160   address generate_updateBytesCRC32() {
3161     assert(UseCRC32Intrinsics, "what are we doing here?");
3162 
3163     __ align(CodeEntryAlignment);
3164     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3165 
3166     address start = __ pc();
3167 
3168     const Register crc   = c_rarg0;  // crc
3169     const Register buf   = c_rarg1;  // source java byte array address
3170     const Register len   = c_rarg2;  // length
3171     const Register table0 = c_rarg3; // crc_table address
3172     const Register table1 = c_rarg4;
3173     const Register table2 = c_rarg5;
3174     const Register table3 = c_rarg6;
3175     const Register tmp3 = c_rarg7;
3176 
3177     BLOCK_COMMENT("Entry:");
3178     __ enter(); // required for proper stackwalking of RuntimeStub frame
3179 
3180     __ kernel_crc32(crc, buf, len,
3181               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3182 
3183     __ leave(); // required for proper stackwalking of RuntimeStub frame
3184     __ ret(lr);
3185 
3186     return start;
3187   }
3188 
3189   /**
3190    *  Arguments:
3191    *
3192    * Inputs:
3193    *   c_rarg0   - int crc
3194    *   c_rarg1   - byte* buf
3195    *   c_rarg2   - int length
3196    *   c_rarg3   - int* table
3197    *
3198    * Output:
3199    *       r0   - int crc result
3200    */
3201   address generate_updateBytesCRC32C() {
3202     assert(UseCRC32CIntrinsics, "what are we doing here?");
3203 
3204     __ align(CodeEntryAlignment);
3205     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3206 
3207     address start = __ pc();
3208 
3209     const Register crc   = c_rarg0;  // crc
3210     const Register buf   = c_rarg1;  // source java byte array address
3211     const Register len   = c_rarg2;  // length
3212     const Register table0 = c_rarg3; // crc_table address
3213     const Register table1 = c_rarg4;
3214     const Register table2 = c_rarg5;
3215     const Register table3 = c_rarg6;
3216     const Register tmp3 = c_rarg7;
3217 
3218     BLOCK_COMMENT("Entry:");
3219     __ enter(); // required for proper stackwalking of RuntimeStub frame
3220 
3221     __ kernel_crc32c(crc, buf, len,
3222               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3223 
3224     __ leave(); // required for proper stackwalking of RuntimeStub frame
3225     __ ret(lr);
3226 
3227     return start;
3228   }
3229 
3230   /***
3231    *  Arguments:
3232    *
3233    *  Inputs:
3234    *   c_rarg0   - int   adler
3235    *   c_rarg1   - byte* buff
3236    *   c_rarg2   - int   len
3237    *
3238    * Output:
3239    *   c_rarg0   - int adler result
3240    */
3241   address generate_updateBytesAdler32() {
3242     __ align(CodeEntryAlignment);
3243     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3244     address start = __ pc();
3245 
3246     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3247 
3248     // Aliases
3249     Register adler  = c_rarg0;
3250     Register s1     = c_rarg0;
3251     Register s2     = c_rarg3;
3252     Register buff   = c_rarg1;
3253     Register len    = c_rarg2;
3254     Register nmax  = r4;
3255     Register base  = r5;
3256     Register count = r6;
3257     Register temp0 = rscratch1;
3258     Register temp1 = rscratch2;
3259     FloatRegister vbytes = v0;
3260     FloatRegister vs1acc = v1;
3261     FloatRegister vs2acc = v2;
3262     FloatRegister vtable = v3;
3263 
3264     // Max number of bytes we can process before having to take the mod
3265     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3266     unsigned long BASE = 0xfff1;
3267     unsigned long NMAX = 0x15B0;
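         // NMAX is a multiple of 16 (5552 = 16 * 347), so the 16-byte vector loop
         // below can consume a full NMAX chunk before a modulo reduction is needed.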
3268 
3269     __ mov(base, BASE);
3270     __ mov(nmax, NMAX);
3271 
3272     // Load accumulation coefficients for the upper 16 bits
3273     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3274     __ ld1(vtable, __ T16B, Address(temp0));
3275 
3276     // s1 is initialized to the lower 16 bits of adler
3277     // s2 is initialized to the upper 16 bits of adler
3278     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3279     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3280 
3281     // The pipelined loop needs at least 16 elements for one iteration.
3282     // It checks this itself, but it is more efficient to branch straight to the cleanup loop for short inputs.
3283     __ cmp(len, (u1)16);
3284     __ br(Assembler::HS, L_nmax);
3285     __ cbz(len, L_combine);
3286 
3287     __ bind(L_simple_by1_loop);
3288     __ ldrb(temp0, Address(__ post(buff, 1)));
3289     __ add(s1, s1, temp0);
3290     __ add(s2, s2, s1);
3291     __ subs(len, len, 1);
3292     __ br(Assembler::HI, L_simple_by1_loop);
3293 
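         // Reduction modulo BASE without division: BASE = 0xfff1 = 2^16 - 15, so
         //   x mod BASE == (x & 0xffff) + 15 * (x >> 16)   (mod BASE)
         // where 15 * t is computed as (t << 4) - t. The sequences below apply this
         // fold once or twice (depending on how large the accumulator can get) and
         // finish with a conditional subtract of BASE.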
3294     // s1 = s1 % BASE
3295     __ subs(temp0, s1, base);
3296     __ csel(s1, temp0, s1, Assembler::HS);
3297 
3298     // s2 = s2 % BASE
3299     __ lsr(temp0, s2, 16);
3300     __ lsl(temp1, temp0, 4);
3301     __ sub(temp1, temp1, temp0);
3302     __ add(s2, temp1, s2, ext::uxth);
3303 
3304     __ subs(temp0, s2, base);
3305     __ csel(s2, temp0, s2, Assembler::HS);
3306 
3307     __ b(L_combine);
3308 
3309     __ bind(L_nmax);
3310     __ subs(len, len, nmax);
3311     __ sub(count, nmax, 16);
3312     __ br(Assembler::LO, L_by16);
3313 
3314     __ bind(L_nmax_loop);
3315 
3316     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3317                                       vbytes, vs1acc, vs2acc, vtable);
3318 
3319     __ subs(count, count, 16);
3320     __ br(Assembler::HS, L_nmax_loop);
3321 
3322     // s1 = s1 % BASE
3323     __ lsr(temp0, s1, 16);
3324     __ lsl(temp1, temp0, 4);
3325     __ sub(temp1, temp1, temp0);
3326     __ add(temp1, temp1, s1, ext::uxth);
3327 
3328     __ lsr(temp0, temp1, 16);
3329     __ lsl(s1, temp0, 4);
3330     __ sub(s1, s1, temp0);
3331     __ add(s1, s1, temp1, ext::uxth);
3332 
3333     __ subs(temp0, s1, base);
3334     __ csel(s1, temp0, s1, Assembler::HS);
3335 
3336     // s2 = s2 % BASE
3337     __ lsr(temp0, s2, 16);
3338     __ lsl(temp1, temp0, 4);
3339     __ sub(temp1, temp1, temp0);
3340     __ add(temp1, temp1, s2, ext::uxth);
3341 
3342     __ lsr(temp0, temp1, 16);
3343     __ lsl(s2, temp0, 4);
3344     __ sub(s2, s2, temp0);
3345     __ add(s2, s2, temp1, ext::uxth);
3346 
3347     __ subs(temp0, s2, base);
3348     __ csel(s2, temp0, s2, Assembler::HS);
3349 
3350     __ subs(len, len, nmax);
3351     __ sub(count, nmax, 16);
3352     __ br(Assembler::HS, L_nmax_loop);
3353 
3354     __ bind(L_by16);
3355     __ adds(len, len, count);
3356     __ br(Assembler::LO, L_by1);
3357 
3358     __ bind(L_by16_loop);
3359 
3360     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3361                                       vbytes, vs1acc, vs2acc, vtable);
3362 
3363     __ subs(len, len, 16);
3364     __ br(Assembler::HS, L_by16_loop);
3365 
3366     __ bind(L_by1);
3367     __ adds(len, len, 15);
3368     __ br(Assembler::LO, L_do_mod);
3369 
3370     __ bind(L_by1_loop);
3371     __ ldrb(temp0, Address(__ post(buff, 1)));
3372     __ add(s1, temp0, s1);
3373     __ add(s2, s2, s1);
3374     __ subs(len, len, 1);
3375     __ br(Assembler::HS, L_by1_loop);
3376 
3377     __ bind(L_do_mod);
3378     // s1 = s1 % BASE
3379     __ lsr(temp0, s1, 16);
3380     __ lsl(temp1, temp0, 4);
3381     __ sub(temp1, temp1, temp0);
3382     __ add(temp1, temp1, s1, ext::uxth);
3383 
3384     __ lsr(temp0, temp1, 16);
3385     __ lsl(s1, temp0, 4);
3386     __ sub(s1, s1, temp0);
3387     __ add(s1, s1, temp1, ext::uxth);
3388 
3389     __ subs(temp0, s1, base);
3390     __ csel(s1, temp0, s1, Assembler::HS);
3391 
3392     // s2 = s2 % BASE
3393     __ lsr(temp0, s2, 16);
3394     __ lsl(temp1, temp0, 4);
3395     __ sub(temp1, temp1, temp0);
3396     __ add(temp1, temp1, s2, ext::uxth);
3397 
3398     __ lsr(temp0, temp1, 16);
3399     __ lsl(s2, temp0, 4);
3400     __ sub(s2, s2, temp0);
3401     __ add(s2, s2, temp1, ext::uxth);
3402 
3403     __ subs(temp0, s2, base);
3404     __ csel(s2, temp0, s2, Assembler::HS);
3405 
3406     // Combine lower bits and higher bits
3407     __ bind(L_combine);
3408     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3409 
3410     __ ret(lr);
3411 
3412     return start;
3413   }
3414 
3415   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
3416           Register temp0, Register temp1, FloatRegister vbytes,
3417           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
3418     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
3419     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
3420     // In non-vectorized code, we update s1 and s2 as:
3421     //   s1 <- s1 + b1
3422     //   s2 <- s2 + s1
3423     //   s1 <- s1 + b2
3424     //   s2 <- s2 + s1
3425     //   ...
3426     //   s1 <- s1 + b16
3427     //   s2 <- s2 + s1
3428     // Putting above assignments together, we have:
3429     //   s1_new = s1 + b1 + b2 + ... + b16
3430     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
3431     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
3432     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
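         // Scalar reference for one 16-byte step (what the vector code computes):
         //   for (int i = 0; i < 16; i++) { s1 += b[i]; s2 += s1; }
         // i.e.  s2 += 16 * s1;  s1 += sum(b);  s2 += dot(b, {16, 15, ..., 1})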
3433     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
3434 
3435     // s2 = s2 + s1 * 16
3436     __ add(s2, s2, s1, Assembler::LSL, 4);
3437 
3438     // vs1acc = b1 + b2 + b3 + ... + b16
3439     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
3440     __ umullv(vs2acc, __ T8B, vtable, vbytes);
3441     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
3442     __ uaddlv(vs1acc, __ T16B, vbytes);
3443     __ uaddlv(vs2acc, __ T8H, vs2acc);
3444 
3445     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
3446     __ fmovd(temp0, vs1acc);
3447     __ fmovd(temp1, vs2acc);
3448     __ add(s1, s1, temp0);
3449     __ add(s2, s2, temp1);
3450   }
3451 
3452   /**
3453    *  Arguments:
3454    *
3455    *  Input:
3456    *    c_rarg0   - x address
3457    *    c_rarg1   - x length
3458    *    c_rarg2   - y address
3459    *    c_rarg3   - y length
3460    *    c_rarg4   - z address
3461    *    c_rarg5   - z length
3462    */
3463   address generate_multiplyToLen() {
3464     __ align(CodeEntryAlignment);
3465     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3466 
3467     address start = __ pc();
3468     const Register x     = r0;
3469     const Register xlen  = r1;
3470     const Register y     = r2;
3471     const Register ylen  = r3;
3472     const Register z     = r4;
3473     const Register zlen  = r5;
3474 
3475     const Register tmp1  = r10;
3476     const Register tmp2  = r11;
3477     const Register tmp3  = r12;
3478     const Register tmp4  = r13;
3479     const Register tmp5  = r14;
3480     const Register tmp6  = r15;
3481     const Register tmp7  = r16;
3482 
3483     BLOCK_COMMENT("Entry:");
3484     __ enter(); // required for proper stackwalking of RuntimeStub frame
3485     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3486     __ leave(); // required for proper stackwalking of RuntimeStub frame
3487     __ ret(lr);
3488 
3489     return start;
3490   }
3491 
3492   address generate_squareToLen() {
3493     // The squareToLen algorithm for sizes 1..127 described in the Java code is
3494     // faster than multiply_to_len on some CPUs and slower on others, but
3495     // multiply_to_len shows slightly better overall results.
3496     __ align(CodeEntryAlignment);
3497     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3498     address start = __ pc();
3499 
3500     const Register x     = r0;
3501     const Register xlen  = r1;
3502     const Register z     = r2;
3503     const Register zlen  = r3;
3504     const Register y     = r4; // == x
3505     const Register ylen  = r5; // == xlen
3506 
3507     const Register tmp1  = r10;
3508     const Register tmp2  = r11;
3509     const Register tmp3  = r12;
3510     const Register tmp4  = r13;
3511     const Register tmp5  = r14;
3512     const Register tmp6  = r15;
3513     const Register tmp7  = r16;
3514 
3515     RegSet spilled_regs = RegSet::of(y, ylen);
3516     BLOCK_COMMENT("Entry:");
3517     __ enter();
3518     __ push(spilled_regs, sp);
3519     __ mov(y, x);
3520     __ mov(ylen, xlen);
3521     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3522     __ pop(spilled_regs, sp);
3523     __ leave();
3524     __ ret(lr);
3525     return start;
3526   }
3527 
3528   address generate_mulAdd() {
3529     __ align(CodeEntryAlignment);
3530     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3531 
3532     address start = __ pc();
3533 
3534     const Register out     = r0;
3535     const Register in      = r1;
3536     const Register offset  = r2;
3537     const Register len     = r3;
3538     const Register k       = r4;
3539 
3540     BLOCK_COMMENT("Entry:");
3541     __ enter();
3542     __ mul_add(out, in, offset, len, k);
3543     __ leave();
3544     __ ret(lr);
3545 
3546     return start;
3547   }
3548 
3549   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3550                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3551                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3552     // Karatsuba multiplication performs a 128*128 -> 256-bit
3553     // multiplication in three 128-bit multiplications and a few
3554     // additions.
3555     //
3556     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3557     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3558     //
3559     // Inputs:
3560     //
3561     // A0 in a.d[0]     (subkey)
3562     // A1 in a.d[1]
3563     // (A1+A0) in a1_xor_a0.d[0]
3564     //
3565     // B0 in b.d[0]     (state)
3566     // B1 in b.d[1]
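         //
         // "Addition" here is addition in GF(2), i.e. XOR, so A1+A0 and B1+B0 are
         // formed with eor and the three partial products are combined with eor too.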
3567 
3568     __ ext(tmp1, __ T16B, b, b, 0x08);
3569     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3570     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3571     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3572     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3573 
3574     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3575     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3576     __ eor(tmp2, __ T16B, tmp2, tmp4);
3577     __ eor(tmp2, __ T16B, tmp2, tmp3);
3578 
3579     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3580     __ ins(result_hi, __ D, tmp2, 0, 1);
3581     __ ins(result_lo, __ D, tmp2, 1, 0);
3582   }
3583 
3584   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3585                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3586     const FloatRegister t0 = result;
3587 
3588     // The GCM field polynomial f is z^128 + p(z), where p =
3589     // z^7+z^2+z+1.
3590     //
3591     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3592     //
3593     // so, given that the product we're reducing is
3594     //    a == lo + hi * z^128
3595     // substituting,
3596     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3597     //
3598     // we reduce by multiplying hi by p(z) and subtracting the result
3599     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3600     // bits we can do this with two 64-bit multiplications, lo*p and
3601     // hi*p.
3602 
3603     __ pmull2(t0, __ T1Q, hi, p, __ T2D); // t0 = hi.d[1] * p, fold the top 64 bits first
3604     __ ext(t1, __ T16B, t0, z, 8);        // t1 = t0 >> 64
3605     __ eor(hi, __ T16B, hi, t1);          // xor the high half of the product into hi
3606     __ ext(t1, __ T16B, z, t0, 8);        // t1 = t0 << 64
3607     __ eor(lo, __ T16B, lo, t1);          // xor the low half of the product into lo
3608     __ pmull(t0, __ T1Q, hi, p, __ T1D);  // t0 = hi.d[0] * p, fold the remaining 64 bits
3609     __ eor(result, __ T16B, lo, t0);      // result = lo ^ hi.d[0] * p
3610   }
3611 
3612   address generate_has_negatives(address &has_negatives_long) {
3613     const u1 large_loop_size = 64;
3614     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3615     int dcache_line = VM_Version::dcache_line_size();
3616 
3617     Register ary1 = r1, len = r2, result = r0;
3618 
3619     __ align(CodeEntryAlignment);
3620 
3621     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3622 
3623     address entry = __ pc();
3624 
3625     __ enter();
3626 
3627   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3628         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3629 
3630   __ cmp(len, (u1)15);
3631   __ br(Assembler::GT, LEN_OVER_15);
3632   // Execution only falls into this code when the pointer is near the end of a
3633   // memory page and we have to avoid reading past it into the next page
3634   __ add(ary1, ary1, len);
3635   __ subs(len, len, 8);
3636   __ br(Assembler::GT, LEN_OVER_8);
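       // Here the original length is at most 8: load the 8 bytes ending at
       // ary1 + len (this may include bytes just before the array start) and
       // shift out those leading bytes before testing the per-byte sign bits.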
3637   __ ldr(rscratch2, Address(ary1, -8));
3638   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3639   __ lsrv(rscratch2, rscratch2, rscratch1);
3640   __ tst(rscratch2, UPPER_BIT_MASK);
3641   __ cset(result, Assembler::NE);
3642   __ leave();
3643   __ ret(lr);
3644   __ bind(LEN_OVER_8);
3645   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3646   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3647   __ tst(rscratch2, UPPER_BIT_MASK);
3648   __ br(Assembler::NE, RET_TRUE_NO_POP);
3649   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3650   __ lsrv(rscratch1, rscratch1, rscratch2);
3651   __ tst(rscratch1, UPPER_BIT_MASK);
3652   __ cset(result, Assembler::NE);
3653   __ leave();
3654   __ ret(lr);
3655 
3656   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3657   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3658 
3659   has_negatives_long = __ pc(); // 2nd entry point
3660 
3661   __ enter();
3662 
3663   __ bind(LEN_OVER_15);
3664     __ push(spilled_regs, sp);
3665     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3666     __ cbz(rscratch2, ALIGNED);
3667     __ ldp(tmp6, tmp1, Address(ary1));
3668     __ mov(tmp5, 16);
3669     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3670     __ add(ary1, ary1, rscratch1);
3671     __ sub(len, len, rscratch1);
3672     __ orr(tmp6, tmp6, tmp1);
3673     __ tst(tmp6, UPPER_BIT_MASK);
3674     __ br(Assembler::NE, RET_TRUE);
3675 
3676   __ bind(ALIGNED);
3677     __ cmp(len, large_loop_size);
3678     __ br(Assembler::LT, CHECK_16);
3679     // Perform a 16-byte load as an early return in the pre-loop to handle the
3680     // situation where an initially aligned large array has negative values in
3681     // its starting bytes, so that LARGE_LOOP would otherwise do 4 reads instead
3682     // of 1 (in the worst case), which is slower. Cases with negative bytes
3683     // further ahead are not affected much; in fact they are faster due to early
3684     // loads, fewer instructions and fewer branches in LARGE_LOOP.
3685     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3686     __ sub(len, len, 16);
3687     __ orr(tmp6, tmp6, tmp1);
3688     __ tst(tmp6, UPPER_BIT_MASK);
3689     __ br(Assembler::NE, RET_TRUE);
3690     __ cmp(len, large_loop_size);
3691     __ br(Assembler::LT, CHECK_16);
3692 
3693     if (SoftwarePrefetchHintDistance >= 0
3694         && SoftwarePrefetchHintDistance >= dcache_line) {
3695       // initial prefetch
3696       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3697     }
3698   __ bind(LARGE_LOOP);
3699     if (SoftwarePrefetchHintDistance >= 0) {
3700       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3701     }
3702     // Issue the load instructions first, since that can save a few CPU/memory
3703     // cycles. Also, instead of 4 triples of "orr(...); addr(...); cbnz(...)" (one
3704     // per ldp), it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...),
3705     // which saves instructions and has fewer branches, but this approach disables
3706     // the early return, so all 64 bytes are loaded and checked every time.
3707     __ ldp(tmp2, tmp3, Address(ary1));
3708     __ ldp(tmp4, tmp5, Address(ary1, 16));
3709     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3710     __ ldp(tmp6, tmp1, Address(ary1, 48));
3711     __ add(ary1, ary1, large_loop_size);
3712     __ sub(len, len, large_loop_size);
3713     __ orr(tmp2, tmp2, tmp3);
3714     __ orr(tmp4, tmp4, tmp5);
3715     __ orr(rscratch1, rscratch1, rscratch2);
3716     __ orr(tmp6, tmp6, tmp1);
3717     __ orr(tmp2, tmp2, tmp4);
3718     __ orr(rscratch1, rscratch1, tmp6);
3719     __ orr(tmp2, tmp2, rscratch1);
3720     __ tst(tmp2, UPPER_BIT_MASK);
3721     __ br(Assembler::NE, RET_TRUE);
3722     __ cmp(len, large_loop_size);
3723     __ br(Assembler::GE, LARGE_LOOP);
3724 
3725   __ bind(CHECK_16); // small 16-byte load pre-loop
3726     __ cmp(len, (u1)16);
3727     __ br(Assembler::LT, POST_LOOP16);
3728 
3729   __ bind(LOOP16); // small 16-byte load loop
3730     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3731     __ sub(len, len, 16);
3732     __ orr(tmp2, tmp2, tmp3);
3733     __ tst(tmp2, UPPER_BIT_MASK);
3734     __ br(Assembler::NE, RET_TRUE);
3735     __ cmp(len, (u1)16);
3736     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3737 
3738   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3739     __ cmp(len, (u1)8);
3740     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3741     __ ldr(tmp3, Address(__ post(ary1, 8)));
3742     __ sub(len, len, 8);
3743     __ tst(tmp3, UPPER_BIT_MASK);
3744     __ br(Assembler::NE, RET_TRUE);
3745 
3746   __ bind(POST_LOOP16_LOAD_TAIL);
3747     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3748     __ ldr(tmp1, Address(ary1));
3749     __ mov(tmp2, 64);
3750     __ sub(tmp4, tmp2, len, __ LSL, 3);
3751     __ lslv(tmp1, tmp1, tmp4);
3752     __ tst(tmp1, UPPER_BIT_MASK);
3753     __ br(Assembler::NE, RET_TRUE);
3754     // Fallthrough
3755 
3756   __ bind(RET_FALSE);
3757     __ pop(spilled_regs, sp);
3758     __ leave();
3759     __ mov(result, zr);
3760     __ ret(lr);
3761 
3762   __ bind(RET_TRUE);
3763     __ pop(spilled_regs, sp);
3764   __ bind(RET_TRUE_NO_POP);
3765     __ leave();
3766     __ mov(result, 1);
3767     __ ret(lr);
3768 
3769   __ bind(DONE);
3770     __ pop(spilled_regs, sp);
3771     __ leave();
3772     __ ret(lr);
3773     return entry;
3774   }
3775 
3776   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3777         bool usePrefetch, Label &NOT_EQUAL) {
3778     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3779         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3780         tmp7 = r12, tmp8 = r13;
3781     Label LOOP;
3782 
3783     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3784     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3785     __ bind(LOOP);
3786     if (usePrefetch) {
3787       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3788       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3789     }
3790     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3791     __ eor(tmp1, tmp1, tmp2);
3792     __ eor(tmp3, tmp3, tmp4);
3793     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3794     __ orr(tmp1, tmp1, tmp3);
3795     __ cbnz(tmp1, NOT_EQUAL);
3796     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3797     __ eor(tmp5, tmp5, tmp6);
3798     __ eor(tmp7, tmp7, tmp8);
3799     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3800     __ orr(tmp5, tmp5, tmp7);
3801     __ cbnz(tmp5, NOT_EQUAL);
3802     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3803     __ eor(tmp1, tmp1, tmp2);
3804     __ eor(tmp3, tmp3, tmp4);
3805     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3806     __ orr(tmp1, tmp1, tmp3);
3807     __ cbnz(tmp1, NOT_EQUAL);
3808     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3809     __ eor(tmp5, tmp5, tmp6);
3810     __ sub(cnt1, cnt1, 8 * wordSize);
3811     __ eor(tmp7, tmp7, tmp8);
3812     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3813     // tmp6 is not used. MacroAssembler::subs is used here (rather than
3814     // cmp) because subs allows an unlimited range of immediate operands.
3815     __ subs(tmp6, cnt1, loopThreshold);
3816     __ orr(tmp5, tmp5, tmp7);
3817     __ cbnz(tmp5, NOT_EQUAL);
3818     __ br(__ GE, LOOP);
3819     // post-loop
3820     __ eor(tmp1, tmp1, tmp2);
3821     __ eor(tmp3, tmp3, tmp4);
3822     __ orr(tmp1, tmp1, tmp3);
3823     __ sub(cnt1, cnt1, 2 * wordSize);
3824     __ cbnz(tmp1, NOT_EQUAL);
3825   }
3826 
3827   void generate_large_array_equals_loop_simd(int loopThreshold,
3828         bool usePrefetch, Label &NOT_EQUAL) {
3829     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3830         tmp2 = rscratch2;
3831     Label LOOP;
3832 
3833     __ bind(LOOP);
3834     if (usePrefetch) {
3835       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3836       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3837     }
3838     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3839     __ sub(cnt1, cnt1, 8 * wordSize);
3840     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3841     __ subs(tmp1, cnt1, loopThreshold);
3842     __ eor(v0, __ T16B, v0, v4);
3843     __ eor(v1, __ T16B, v1, v5);
3844     __ eor(v2, __ T16B, v2, v6);
3845     __ eor(v3, __ T16B, v3, v7);
3846     __ orr(v0, __ T16B, v0, v1);
3847     __ orr(v1, __ T16B, v2, v3);
3848     __ orr(v0, __ T16B, v0, v1);
3849     __ umov(tmp1, v0, __ D, 0);
3850     __ umov(tmp2, v0, __ D, 1);
3851     __ orr(tmp1, tmp1, tmp2);
3852     __ cbnz(tmp1, NOT_EQUAL);
3853     __ br(__ GE, LOOP);
3854   }
3855 
3856   // a1 = r1 - array1 address
3857   // a2 = r2 - array2 address
3858   // result = r0 - return value. Already contains "false"
3859   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
3860   // r3-r5 are reserved temporary registers
3861   address generate_large_array_equals() {
3862     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3863         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3864         tmp7 = r12, tmp8 = r13;
3865     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3866         SMALL_LOOP, POST_LOOP;
3867     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3868     // calculate if at least 32 prefetched bytes are used
3869     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3870     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3871     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3872     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3873         tmp5, tmp6, tmp7, tmp8);
3874 
3875     __ align(CodeEntryAlignment);
3876 
3877     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3878 
3879     address entry = __ pc();
3880     __ enter();
3881     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
3882     // also advance pointers to use post-increment instead of pre-increment
3883     __ add(a1, a1, wordSize);
3884     __ add(a2, a2, wordSize);
3885     if (AvoidUnalignedAccesses) {
3886       // both implementations (SIMD/nonSIMD) are using relatively large load
3887       // instructions (ld1/ldp), which has huge penalty (up to x2 exec time)
3888       // on some CPUs in case of address is not at least 16-byte aligned.
3889       // Arrays are 8-byte aligned currently, so, we can make additional 8-byte
3890       // load if needed at least for 1st address and make if 16-byte aligned.
3891       Label ALIGNED16;
3892       __ tbz(a1, 3, ALIGNED16);
3893       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3894       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3895       __ sub(cnt1, cnt1, wordSize);
3896       __ eor(tmp1, tmp1, tmp2);
3897       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3898       __ bind(ALIGNED16);
3899     }
3900     if (UseSIMDForArrayEquals) {
3901       if (SoftwarePrefetchHintDistance >= 0) {
3902         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3903         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3904         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3905             /* prfm = */ true, NOT_EQUAL);
3906         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3907         __ br(__ LT, TAIL);
3908       }
3909       __ bind(NO_PREFETCH_LARGE_LOOP);
3910       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3911           /* prfm = */ false, NOT_EQUAL);
3912     } else {
3913       __ push(spilled_regs, sp);
3914       if (SoftwarePrefetchHintDistance >= 0) {
3915         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3916         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3917         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3918             /* prfm = */ true, NOT_EQUAL);
3919         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3920         __ br(__ LT, TAIL);
3921       }
3922       __ bind(NO_PREFETCH_LARGE_LOOP);
3923       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3924           /* prfm = */ false, NOT_EQUAL);
3925     }
3926     __ bind(TAIL);
3927       __ cbz(cnt1, EQUAL);
3928       __ subs(cnt1, cnt1, wordSize);
3929       __ br(__ LE, POST_LOOP);
3930     __ bind(SMALL_LOOP);
3931       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3932       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3933       __ subs(cnt1, cnt1, wordSize);
3934       __ eor(tmp1, tmp1, tmp2);
3935       __ cbnz(tmp1, NOT_EQUAL);
3936       __ br(__ GT, SMALL_LOOP);
3937     __ bind(POST_LOOP);
3938       __ ldr(tmp1, Address(a1, cnt1));
3939       __ ldr(tmp2, Address(a2, cnt1));
3940       __ eor(tmp1, tmp1, tmp2);
3941       __ cbnz(tmp1, NOT_EQUAL);
3942     __ bind(EQUAL);
3943       __ mov(result, true);
3944     __ bind(NOT_EQUAL);
3945       if (!UseSIMDForArrayEquals) {
3946         __ pop(spilled_regs, sp);
3947       }
3948     __ bind(NOT_EQUAL_NO_POP);
3949     __ leave();
3950     __ ret(lr);
3951     return entry;
3952   }
3953 
3954   address generate_dsin_dcos(bool isCos) {
3955     __ align(CodeEntryAlignment);
3956     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
3957     address start = __ pc();
3958     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
3959         (address)StubRoutines::aarch64::_two_over_pi,
3960         (address)StubRoutines::aarch64::_pio2,
3961         (address)StubRoutines::aarch64::_dsin_coef,
3962         (address)StubRoutines::aarch64::_dcos_coef);
3963     return start;
3964   }
3965 
3966   address generate_dlog() {
3967     __ align(CodeEntryAlignment);
3968     StubCodeMark mark(this, "StubRoutines", "dlog");
3969     address entry = __ pc();
3970     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
3971         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
3972     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
3973     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
3974         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
3975     return entry;
3976   }
3977 
3978   // code for comparing 16 bytes of strings with same encoding
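       // On entry tmp1/tmp2 must already hold the current 8 bytes of each string;
       // the loads of the next 16 bytes are interleaved with the comparisons
       // (software pipelining), and on exit tmp1/tmp2 hold the next pair of words.
       // DIFF1 is taken on a mismatch in tmp1/tmp2, DIFF2 on one in rscratch1/cnt1.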
3979   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
3980     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
3981     __ ldr(rscratch1, Address(__ post(str1, 8)));
3982     __ eor(rscratch2, tmp1, tmp2);
3983     __ ldr(cnt1, Address(__ post(str2, 8)));
3984     __ cbnz(rscratch2, DIFF1);
3985     __ ldr(tmp1, Address(__ post(str1, 8)));
3986     __ eor(rscratch2, rscratch1, cnt1);
3987     __ ldr(tmp2, Address(__ post(str2, 8)));
3988     __ cbnz(rscratch2, DIFF2);
3989   }
3990 
3991   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
3992   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
3993       Label &DIFF2) {
3994     Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12;
3995     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
3996 
3997     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
3998     __ ldr(tmpU, Address(__ post(cnt1, 8)));
3999     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4000     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4001 
4002     __ fmovd(tmpL, vtmp3);
4003     __ eor(rscratch2, tmp3, tmpL);
4004     __ cbnz(rscratch2, DIFF2);
4005 
4006     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4007     __ umov(tmpL, vtmp3, __ D, 1);
4008     __ eor(rscratch2, tmpU, tmpL);
4009     __ cbnz(rscratch2, DIFF1);
4010 
4011     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4012     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4013     __ fmovd(tmpL, vtmp);
4014     __ eor(rscratch2, tmp3, tmpL);
4015     __ cbnz(rscratch2, DIFF2);
4016 
4017     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4018     __ umov(tmpL, vtmp, __ D, 1);
4019     __ eor(rscratch2, tmpU, tmpL);
4020     __ cbnz(rscratch2, DIFF1);
4021   }
4022 
4023   // r0  = result
4024   // r1  = str1
4025   // r2  = cnt1
4026   // r3  = str2
4027   // r4  = cnt2
4028   // r10 = tmp1
4029   // r11 = tmp2
4030   address generate_compare_long_string_different_encoding(bool isLU) {
4031     __ align(CodeEntryAlignment);
4032     StubCodeMark mark(this, "StubRoutines", isLU
4033         ? "compare_long_string_different_encoding LU"
4034         : "compare_long_string_different_encoding UL");
4035     address entry = __ pc();
4036     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4037         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, SMALL_LOOP_ENTER,
4038         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4039     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4040         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4041     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4042     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4043 
4044     int prefetchLoopExitCondition = MAX(32, SoftwarePrefetchHintDistance/2);
4045 
4046     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4047     // cnt2 == number of characters left to compare
4048     // Check the already loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
4049     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4050     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4051     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4052     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4053     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4054     __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1);
4055     __ eor(rscratch2, tmp1, tmp2);
4056     __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0);
4057     __ mov(rscratch1, tmp2);
4058     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4059     Register strU = isLU ? str2 : str1,
4060              strL = isLU ? str1 : str2,
4061              tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4062              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4063     __ push(spilled_regs, sp);
4064     __ sub(tmp2, strL, cnt2); // strL pointer to load from
4065     __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from
4066 
4067     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4068 
4069     if (SoftwarePrefetchHintDistance >= 0) {
4070       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4071       __ br(__ LT, SMALL_LOOP);
4072       __ bind(LARGE_LOOP_PREFETCH);
4073         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4074         __ mov(tmp4, 2);
4075         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4076         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4077           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4078           __ subs(tmp4, tmp4, 1);
4079           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4080           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4081           __ mov(tmp4, 2);
4082         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4083           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4084           __ subs(tmp4, tmp4, 1);
4085           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4086           __ sub(cnt2, cnt2, 64);
4087           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4088           __ br(__ GE, LARGE_LOOP_PREFETCH);
4089     }
4090     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4091     __ subs(cnt2, cnt2, 16);
4092     __ br(__ LT, TAIL);
4093     __ b(SMALL_LOOP_ENTER);
4094     __ bind(SMALL_LOOP); // smaller loop
4095       __ subs(cnt2, cnt2, 16);
4096     __ bind(SMALL_LOOP_ENTER);
4097       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4098       __ br(__ GE, SMALL_LOOP);
4099       __ cbz(cnt2, LOAD_LAST);
4100     __ bind(TAIL); // 1..15 characters left
4101       __ subs(zr, cnt2, -8);
4102       __ br(__ GT, TAIL_LOAD_16);
4103       __ ldrd(vtmp, Address(tmp2));
4104       __ zip1(vtmp3, __ T8B, vtmp, vtmpZ);
4105 
4106       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4107       __ fmovd(tmpL, vtmp3);
4108       __ eor(rscratch2, tmp3, tmpL);
4109       __ cbnz(rscratch2, DIFF2);
4110       __ umov(tmpL, vtmp3, __ D, 1);
4111       __ eor(rscratch2, tmpU, tmpL);
4112       __ cbnz(rscratch2, DIFF1);
4113       __ b(LOAD_LAST);
4114     __ bind(TAIL_LOAD_16);
4115       __ ldrq(vtmp, Address(tmp2));
4116       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4117       __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4118       __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4119       __ fmovd(tmpL, vtmp3);
4120       __ eor(rscratch2, tmp3, tmpL);
4121       __ cbnz(rscratch2, DIFF2);
4122 
4123       __ ldr(tmp3, Address(__ post(cnt1, 8)));
4124       __ umov(tmpL, vtmp3, __ D, 1);
4125       __ eor(rscratch2, tmpU, tmpL);
4126       __ cbnz(rscratch2, DIFF1);
4127 
4128       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4129       __ fmovd(tmpL, vtmp);
4130       __ eor(rscratch2, tmp3, tmpL);
4131       __ cbnz(rscratch2, DIFF2);
4132 
4133       __ umov(tmpL, vtmp, __ D, 1);
4134       __ eor(rscratch2, tmpU, tmpL);
4135       __ cbnz(rscratch2, DIFF1);
4136       __ b(LOAD_LAST);
4137     __ bind(DIFF2);
4138       __ mov(tmpU, tmp3);
4139     __ bind(DIFF1);
4140       __ pop(spilled_regs, sp);
4141       __ b(CALCULATE_DIFFERENCE);
4142     __ bind(LOAD_LAST);
4143       __ pop(spilled_regs, sp);
4144 
4145       __ ldrs(vtmp, Address(strL));
4146       __ ldr(tmpU, Address(strU));
4147       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4148       __ fmovd(tmpL, vtmp);
4149 
4150       __ eor(rscratch2, tmpU, tmpL);
4151       __ cbz(rscratch2, DONE);
4152 
4153     // Find the first different characters in the longwords and
4154     // compute their difference.
4155     __ bind(CALCULATE_DIFFERENCE);
4156       __ rev(rscratch2, rscratch2);
4157       __ clz(rscratch2, rscratch2);
4158       __ andr(rscratch2, rscratch2, -16);
4159       __ lsrv(tmp1, tmp1, rscratch2);
4160       __ uxthw(tmp1, tmp1);
4161       __ lsrv(rscratch1, rscratch1, rscratch2);
4162       __ uxthw(rscratch1, rscratch1);
4163       __ subw(result, tmp1, rscratch1);
4164     __ bind(DONE);
4165       __ ret(lr);
4166     return entry;
4167   }
4168 
4169   // r0  = result
4170   // r1  = str1
4171   // r2  = cnt1
4172   // r3  = str2
4173   // r4  = cnt2
4174   // r10 = tmp1
4175   // r11 = tmp2
4176   address generate_compare_long_string_same_encoding(bool isLL) {
4177     __ align(CodeEntryAlignment);
4178     StubCodeMark mark(this, "StubRoutines", isLL
4179         ? "compare_long_string_same_encoding LL"
4180         : "compare_long_string_same_encoding UU");
4181     address entry = __ pc();
4182     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4183         tmp1 = r10, tmp2 = r11;
4184     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4185         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4186         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4187     // exit the large loop when fewer than 64 bytes are left to read or we are
4188     // about to prefetch memory beyond the array boundary
4189     int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4190     // cnt1/cnt2 contain the number of characters to compare; cnt1 can be re-used.
4191     // Update the cnt2 counter to account for the 8 bytes already loaded.
4192     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4193     // update pointers, because of previous read
4194     __ add(str1, str1, wordSize);
4195     __ add(str2, str2, wordSize);
4196     if (SoftwarePrefetchHintDistance >= 0) {
4197       __ bind(LARGE_LOOP_PREFETCH);
4198         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4199         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4200         compare_string_16_bytes_same(DIFF, DIFF2);
4201         compare_string_16_bytes_same(DIFF, DIFF2);
4202         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4203         compare_string_16_bytes_same(DIFF, DIFF2);
4204         __ subs(rscratch2, cnt2, largeLoopExitCondition);
4205         compare_string_16_bytes_same(DIFF, DIFF2);
4206         __ br(__ GT, LARGE_LOOP_PREFETCH);
4207         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4208         // less than 16 bytes left?
4209         __ subs(cnt2, cnt2, isLL ? 16 : 8);
4210         __ br(__ LT, TAIL);
4211     }
4212     __ bind(SMALL_LOOP);
4213       compare_string_16_bytes_same(DIFF, DIFF2);
4214       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4215       __ br(__ GE, SMALL_LOOP);
4216     __ bind(TAIL);
4217       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4218       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4219       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4220       __ br(__ LE, CHECK_LAST);
4221       __ eor(rscratch2, tmp1, tmp2);
4222       __ cbnz(rscratch2, DIFF);
4223       __ ldr(tmp1, Address(__ post(str1, 8)));
4224       __ ldr(tmp2, Address(__ post(str2, 8)));
4225       __ sub(cnt2, cnt2, isLL ? 8 : 4);
4226     __ bind(CHECK_LAST);
4227       if (!isLL) {
4228         __ add(cnt2, cnt2, cnt2); // now in bytes
4229       }
4230       __ eor(rscratch2, tmp1, tmp2);
4231       __ cbnz(rscratch2, DIFF);
4232       __ ldr(rscratch1, Address(str1, cnt2));
4233       __ ldr(cnt1, Address(str2, cnt2));
4234       __ eor(rscratch2, rscratch1, cnt1);
4235       __ cbz(rscratch2, LENGTH_DIFF);
4236       // Find the first different characters in the longwords and
4237       // compute their difference.
4238     __ bind(DIFF2);
4239       __ rev(rscratch2, rscratch2);
4240       __ clz(rscratch2, rscratch2);
4241       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4242       __ lsrv(rscratch1, rscratch1, rscratch2);
4243       if (isLL) {
4244         __ lsrv(cnt1, cnt1, rscratch2);
4245         __ uxtbw(rscratch1, rscratch1);
4246         __ uxtbw(cnt1, cnt1);
4247       } else {
4248         __ lsrv(cnt1, cnt1, rscratch2);
4249         __ uxthw(rscratch1, rscratch1);
4250         __ uxthw(cnt1, cnt1);
4251       }
4252       __ subw(result, rscratch1, cnt1);
4253       __ b(LENGTH_DIFF);
4254     __ bind(DIFF);
4255       __ rev(rscratch2, rscratch2);
4256       __ clz(rscratch2, rscratch2);
4257       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4258       __ lsrv(tmp1, tmp1, rscratch2);
4259       if (isLL) {
4260         __ lsrv(tmp2, tmp2, rscratch2);
4261         __ uxtbw(tmp1, tmp1);
4262         __ uxtbw(tmp2, tmp2);
4263       } else {
4264         __ lsrv(tmp2, tmp2, rscratch2);
4265         __ uxthw(tmp1, tmp1);
4266         __ uxthw(tmp2, tmp2);
4267       }
4268       __ subw(result, tmp1, tmp2);
4269       __ b(LENGTH_DIFF);
4270     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4271       __ eor(rscratch2, tmp1, tmp2);
4272       __ cbnz(rscratch2, DIFF);
4273     __ bind(LENGTH_DIFF);
4274       __ ret(lr);
4275     return entry;
4276   }
4277 
4278   void generate_compare_long_strings() {
4279       StubRoutines::aarch64::_compare_long_string_LL
4280           = generate_compare_long_string_same_encoding(true);
4281       StubRoutines::aarch64::_compare_long_string_UU
4282           = generate_compare_long_string_same_encoding(false);
4283       StubRoutines::aarch64::_compare_long_string_LU
4284           = generate_compare_long_string_different_encoding(true);
4285       StubRoutines::aarch64::_compare_long_string_UL
4286           = generate_compare_long_string_different_encoding(false);
4287   }
4288 
4289   // R0 = result
4290   // R1 = str2
4291   // R2 = cnt1
4292   // R3 = str1
4293   // R4 = cnt2
  // This generic linear code uses a few additional ideas which make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since
  // length >= 8) in order to skip the initial load (helps on systems with a
  // single load pipeline)
  // 2) we can use the "fast" single-character search algorithm to find the
  // first symbol with fewer branches (1 branch per loaded register instead of
  // a branch per symbol); this is where constants like
  // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  // 3) after loading and analyzing the 1st register of the source string, it
  // can be used to search for every occurrence of the 1st character, saving a
  // few loads compared with a "simpler-but-slower" implementation
  // 4) in order to avoid lots of push/pop operations, the code below heavily
  // re-uses/re-initializes/compresses register values, which makes the code
  // larger and a bit less readable; however, most of the extra operations are
  // issued during loads or branches, so the penalty is minimal
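  //
  // As a rough C sketch, the per-register match detection referenced in
  // point 2 above looks like this for the Latin-1 case (the UTF-16 case uses
  // the 16-bit constants instead; "chunk" is only an illustrative name for
  // the 8 bytes loaded from str2):
  //   uint64_t x    = chunk ^ first;  // 'first' = 1st pattern char replicated
  //   uint64_t hits = (x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
  //   // 'hits' is nonzero iff some byte of x is zero, i.e. some char matched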
4308   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4309     const char* stubName = str1_isL
4310         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4311         : "indexof_linear_uu";
4312     __ align(CodeEntryAlignment);
4313     StubCodeMark mark(this, "StubRoutines", stubName);
4314     address entry = __ pc();
4315 
4316     int str1_chr_size = str1_isL ? 1 : 2;
4317     int str2_chr_size = str2_isL ? 1 : 2;
4318     int str1_chr_shift = str1_isL ? 0 : 1;
4319     int str2_chr_shift = str2_isL ? 0 : 1;
4320     bool isL = str1_isL && str2_isL;
    // parameters
4322     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4323     // temporary registers
4324     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4325     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4326     // redefinitions
4327     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4328 
4329     __ push(spilled_regs, sp);
4330     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
4331         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4332         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4333         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4334         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4335         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
    // Read a whole register from str1. It is safe because length >= 8 here
4337     __ ldr(ch1, Address(str1));
    // Read a whole register from str2. It is safe because length >= 8 here
4339     __ ldr(ch2, Address(str2));
4340     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4341     if (str1_isL != str2_isL) {
4342       __ eor(v0, __ T16B, v0, v0);
4343     }
4344     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4345     __ mul(first, first, tmp1);
    // check if we have less than one register's worth of characters to check
4347     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4348     if (str1_isL != str2_isL) {
4349       __ fmovd(v1, ch1);
4350     }
4351     __ br(__ LE, L_SMALL);
4352     __ eor(ch2, first, ch2);
4353     if (str1_isL != str2_isL) {
4354       __ zip1(v1, __ T16B, v1, v0);
4355     }
4356     __ sub(tmp2, ch2, tmp1);
4357     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4358     __ bics(tmp2, tmp2, ch2);
4359     if (str1_isL != str2_isL) {
4360       __ fmovd(ch1, v1);
4361     }
4362     __ br(__ NE, L_HAS_ZERO);
4363     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4364     __ add(result, result, wordSize/str2_chr_size);
4365     __ add(str2, str2, wordSize);
4366     __ br(__ LT, L_POST_LOOP);
4367     __ BIND(L_LOOP);
4368       __ ldr(ch2, Address(str2));
4369       __ eor(ch2, first, ch2);
4370       __ sub(tmp2, ch2, tmp1);
4371       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4372       __ bics(tmp2, tmp2, ch2);
4373       __ br(__ NE, L_HAS_ZERO);
4374     __ BIND(L_LOOP_PROCEED);
4375       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4376       __ add(str2, str2, wordSize);
4377       __ add(result, result, wordSize/str2_chr_size);
4378       __ br(__ GE, L_LOOP);
4379     __ BIND(L_POST_LOOP);
4380       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
4381       __ br(__ LE, NOMATCH);
4382       __ ldr(ch2, Address(str2));
4383       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4384       __ eor(ch2, first, ch2);
4385       __ sub(tmp2, ch2, tmp1);
4386       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4387       __ mov(tmp4, -1); // all bits set
4388       __ b(L_SMALL_PROCEED);
4389     __ align(OptoLoopAlignment);
4390     __ BIND(L_SMALL);
4391       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4392       __ eor(ch2, first, ch2);
4393       if (str1_isL != str2_isL) {
4394         __ zip1(v1, __ T16B, v1, v0);
4395       }
4396       __ sub(tmp2, ch2, tmp1);
4397       __ mov(tmp4, -1); // all bits set
4398       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4399       if (str1_isL != str2_isL) {
4400         __ fmovd(ch1, v1); // move converted 4 symbols
4401       }
4402     __ BIND(L_SMALL_PROCEED);
4403       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
4404       __ bic(tmp2, tmp2, ch2);
4405       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
4406       __ rbit(tmp2, tmp2);
4407       __ br(__ EQ, NOMATCH);
4408     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
4410       __ cmp(cnt1, u1(wordSize/str2_chr_size));
4411       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4412       if (str2_isL) { // LL
4413         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4414         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4415         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4416         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4417         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4418       } else {
        __ mov(ch2, 0xE); // 0b1110: clears the lowest bit of the byte index (even offset)
4420         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4421         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4422         __ lslv(tmp2, tmp2, tmp4);
4423         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4424         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4425         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4426         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4427       }
4428       __ cmp(ch1, ch2);
4429       __ mov(tmp4, wordSize/str2_chr_size);
4430       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4431     __ BIND(L_SMALL_CMP_LOOP);
4432       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4433                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4434       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4435                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4436       __ add(tmp4, tmp4, 1);
4437       __ cmp(tmp4, cnt1);
4438       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4439       __ cmp(first, ch2);
4440       __ br(__ EQ, L_SMALL_CMP_LOOP);
4441     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4442       __ cbz(tmp2, NOMATCH); // no more matches. exit
4443       __ clz(tmp4, tmp2);
4444       __ add(result, result, 1); // advance index
4445       __ add(str2, str2, str2_chr_size); // advance pointer
4446       __ b(L_SMALL_HAS_ZERO_LOOP);
4447     __ align(OptoLoopAlignment);
4448     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4449       __ cmp(first, ch2);
4450       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4451       __ b(DONE);
4452     __ align(OptoLoopAlignment);
4453     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4454       if (str2_isL) { // LL
4455         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4456         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4457         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4458         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4459         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4460       } else {
        __ mov(ch2, 0xE); // 0b1110: clears the lowest bit of the byte index (even offset)
4462         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4463         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4464         __ lslv(tmp2, tmp2, tmp4);
4465         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4466         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4467         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4468         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4469       }
4470       __ cmp(ch1, ch2);
4471       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4472       __ b(DONE);
4473     __ align(OptoLoopAlignment);
4474     __ BIND(L_HAS_ZERO);
4475       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
      // Now, compress the counters (cnt2 and cnt1) into one register. This is
      // fine because both counters are 32-bit and are not changed in this
      // loop; they are restored on exit, so cnt1 can be re-used in this loop.
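      // In C, the compression below is roughly:
      //   cnt2 = ((uint64_t)cnt1 << 32) | (uint32_t)cnt2;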
4480       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
4481       __ sub(result, result, 1);
4482     __ BIND(L_HAS_ZERO_LOOP);
4483       __ mov(cnt1, wordSize/str2_chr_size);
4484       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4485       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4486       if (str2_isL) {
4487         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4488         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4489         __ lslv(tmp2, tmp2, tmp4);
4490         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4491         __ add(tmp4, tmp4, 1);
4492         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4493         __ lsl(tmp2, tmp2, 1);
4494         __ mov(tmp4, wordSize/str2_chr_size);
4495       } else {
4496         __ mov(ch2, 0xE);
4497         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4498         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4499         __ lslv(tmp2, tmp2, tmp4);
4500         __ add(tmp4, tmp4, 1);
4501         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4502         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4503         __ lsl(tmp2, tmp2, 1);
4504         __ mov(tmp4, wordSize/str2_chr_size);
4505         __ sub(str2, str2, str2_chr_size);
4506       }
4507       __ cmp(ch1, ch2);
4508       __ mov(tmp4, wordSize/str2_chr_size);
4509       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4510     __ BIND(L_CMP_LOOP);
4511       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4512                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4513       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4514                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4515       __ add(tmp4, tmp4, 1);
4516       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4517       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4518       __ cmp(cnt1, ch2);
4519       __ br(__ EQ, L_CMP_LOOP);
4520     __ BIND(L_CMP_LOOP_NOMATCH);
      // no match at the current candidate position
4522       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4523       __ clz(tmp4, tmp2);
4524       __ add(str2, str2, str2_chr_size); // advance pointer
4525       __ b(L_HAS_ZERO_LOOP);
4526     __ align(OptoLoopAlignment);
4527     __ BIND(L_CMP_LOOP_LAST_CMP);
4528       __ cmp(cnt1, ch2);
4529       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4530       __ b(DONE);
4531     __ align(OptoLoopAlignment);
4532     __ BIND(L_CMP_LOOP_LAST_CMP2);
4533       if (str2_isL) {
4534         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4535         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4536         __ lslv(tmp2, tmp2, tmp4);
4537         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4538         __ add(tmp4, tmp4, 1);
4539         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4540         __ lsl(tmp2, tmp2, 1);
4541       } else {
4542         __ mov(ch2, 0xE);
4543         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4544         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4545         __ lslv(tmp2, tmp2, tmp4);
4546         __ add(tmp4, tmp4, 1);
4547         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4548         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4549         __ lsl(tmp2, tmp2, 1);
4550         __ sub(str2, str2, str2_chr_size);
4551       }
4552       __ cmp(ch1, ch2);
4553       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4554       __ b(DONE);
4555     __ align(OptoLoopAlignment);
4556     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. The index was a multiple of
      // wordSize/str2_chr_size until the L_HAS_ZERO block. A byte octet was
      // analyzed in L_HAS_ZERO_LOOP, so result was increased by at most
      // wordSize/str2_chr_size - 1 and the higher bits weren't changed.
      // L_LOOP_PROCEED will increase result by the number of analyzed
      // characters, so we can just reset the lower bits of result here:
      // clear the 2 lower bits for UU/UL and the 3 lower bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Restore str2: result & 7 (or & 3) is the index of the last analyzed
      // substring inside the current octet, so str2 is moved back to the
      // respective octet start address; L_LOOP_PROCEED then advances it to the
      // next octet.
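      // Roughly, in C:
      //   analyzed = result & (wordSize/str2_chr_size - 1);
      //   result  &= ~(wordSize/str2_chr_size - 1);
      //   cnt1     = cnt2 >> 32;  cnt2 = (uint32_t)cnt2;
      //   str2    -= analyzed << str2_chr_shift;  // back to the octet start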
4567       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4568       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4569       __ bfm(result, zr, 0, 2 - str2_chr_shift);
4570       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4571       __ movw(cnt2, cnt2);
4572       __ b(L_LOOP_PROCEED);
4573     __ align(OptoLoopAlignment);
4574     __ BIND(NOMATCH);
4575       __ mov(result, -1);
4576     __ BIND(DONE);
4577       __ pop(spilled_regs, sp);
4578       __ ret(lr);
4579     return entry;
4580   }
4581 
4582   void generate_string_indexof_stubs() {
4583     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
4584     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
4585     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
4586   }
4587 
4588   void inflate_and_store_2_fp_registers(bool generatePrfm,
4589       FloatRegister src1, FloatRegister src2) {
4590     Register dst = r1;
4591     __ zip1(v1, __ T16B, src1, v0);
4592     __ zip2(v2, __ T16B, src1, v0);
4593     if (generatePrfm) {
4594       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4595     }
4596     __ zip1(v3, __ T16B, src2, v0);
4597     __ zip2(v4, __ T16B, src2, v0);
4598     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4599   }
4600 
4601   // R0 = src
4602   // R1 = dst
4603   // R2 = len
4604   // R3 = len >> 3
4605   // V0 = 0
  // V1 = loaded 8 bytes
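  //
  // In C, approximately (a sketch of the Latin-1 -> UTF-16 inflation this
  // stub performs; the real code below works 64 bytes at a time with NEON,
  // and the function name is only illustrative):
  //   void byte_array_inflate(const jbyte* src, jchar* dst, int len) {
  //     for (int i = 0; i < len; i++)
  //       dst[i] = (jchar)(src[i] & 0xff);
  //   }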
4607   address generate_large_byte_array_inflate() {
4608     __ align(CodeEntryAlignment);
4609     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4610     address entry = __ pc();
4611     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4612     Register src = r0, dst = r1, len = r2, octetCounter = r3;
4613     const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
4614 
    // do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also lets us use a single store instruction
4617     __ ldrd(v2, __ post(src, 8));
4618     __ sub(octetCounter, octetCounter, 2);
4619     __ zip1(v1, __ T16B, v1, v0);
4620     __ zip1(v2, __ T16B, v2, v0);
4621     __ st1(v1, v2, __ T16B, __ post(dst, 32));
4622     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4623     __ subs(rscratch1, octetCounter, large_loop_threshold);
4624     __ br(__ LE, LOOP_START);
4625     __ b(LOOP_PRFM_START);
4626     __ bind(LOOP_PRFM);
4627       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4628     __ bind(LOOP_PRFM_START);
4629       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4630       __ sub(octetCounter, octetCounter, 8);
4631       __ subs(rscratch1, octetCounter, large_loop_threshold);
4632       inflate_and_store_2_fp_registers(true, v3, v4);
4633       inflate_and_store_2_fp_registers(true, v5, v6);
4634       __ br(__ GT, LOOP_PRFM);
4635       __ cmp(octetCounter, (u1)8);
4636       __ br(__ LT, DONE);
4637     __ bind(LOOP);
4638       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4639       __ bind(LOOP_START);
4640       __ sub(octetCounter, octetCounter, 8);
4641       __ cmp(octetCounter, (u1)8);
4642       inflate_and_store_2_fp_registers(false, v3, v4);
4643       inflate_and_store_2_fp_registers(false, v5, v6);
4644       __ br(__ GE, LOOP);
4645     __ bind(DONE);
4646       __ ret(lr);
4647     return entry;
4648   }
4649 
4650   /**
4651    *  Arguments:
4652    *
4653    *  Input:
4654    *  c_rarg0   - current state address
4655    *  c_rarg1   - H key address
4656    *  c_rarg2   - data address
4657    *  c_rarg3   - number of blocks
4658    *
4659    *  Output:
4660    *  Updated state at c_rarg0
4661    */
4662   address generate_ghash_processBlocks() {
4663     // Bafflingly, GCM uses little-endian for the byte order, but
4664     // big-endian for the bit order.  For example, the polynomial 1 is
4665     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4666     //
4667     // So, we must either reverse the bytes in each word and do
4668     // everything big-endian or reverse the bits in each byte and do
4669     // it little-endian.  On AArch64 it's more idiomatic to reverse
4670     // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order throughout the
4672     // calculation, bit-reversing the inputs and outputs.
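    //
    // In outline (a sketch, not the exact register usage below), the loop
    // performs the usual GHASH update per 16-byte block:
    //   state = gf128_mul(state ^ block, H);
    // with every value kept in bit-reversed form (RBIT on load/store) and
    // gf128_mul being a carry-less multiply reduced by x^128 + x^7 + x^2 + x + 1.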
4673 
4674     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4675     __ align(wordSize * 2);
4676     address p = __ pc();
4677     __ emit_int64(0x87);  // The low-order bits of the field
4678                           // polynomial (i.e. p = z^7+z^2+z+1)
4679                           // repeated in the low and high parts of a
4680                           // 128-bit vector
4681     __ emit_int64(0x87);
4682 
4683     __ align(CodeEntryAlignment);
4684     address start = __ pc();
4685 
4686     Register state   = c_rarg0;
4687     Register subkeyH = c_rarg1;
4688     Register data    = c_rarg2;
4689     Register blocks  = c_rarg3;
4690 
4691     FloatRegister vzr = v30;
4692     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4693 
4694     __ ldrq(v0, Address(state));
4695     __ ldrq(v1, Address(subkeyH));
4696 
4697     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4698     __ rbit(v0, __ T16B, v0);
4699     __ rev64(v1, __ T16B, v1);
4700     __ rbit(v1, __ T16B, v1);
4701 
4702     __ ldrq(v26, p);
4703 
4704     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4705     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4706 
4707     {
4708       Label L_ghash_loop;
4709       __ bind(L_ghash_loop);
4710 
4711       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4712                                                  // reversing each byte
4713       __ rbit(v2, __ T16B, v2);
4714       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4715 
4716       // Multiply state in v2 by subkey in v1
4717       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4718                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4719                      /*temps*/v6, v20, v18, v21);
4720       // Reduce v7:v5 by the field polynomial
4721       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4722 
4723       __ sub(blocks, blocks, 1);
4724       __ cbnz(blocks, L_ghash_loop);
4725     }
4726 
4727     // The bit-reversed result is at this point in v0
4728     __ rev64(v1, __ T16B, v0);
4729     __ rbit(v1, __ T16B, v1);
4730 
4731     __ st1(v1, __ T16B, state);
4732     __ ret(lr);
4733 
4734     return start;
4735   }
4736 
4737   // Continuation point for throwing of implicit exceptions that are
4738   // not handled in the current activation. Fabricates an exception
4739   // oop and initiates normal exception dispatching in this
4740   // frame. Since we need to preserve callee-saved values (currently
4741   // only for C2, but done for C1 as well) we need a callee-saved oop
4742   // map and therefore have to make these stubs into RuntimeStubs
4743   // rather than BufferBlobs.  If the compiler needs all registers to
4744   // be preserved between the fault point and the exception handler
4745   // then it must assume responsibility for that in
4746   // AbstractCompiler::continuation_for_implicit_null_exception or
4747   // continuation_for_implicit_division_by_zero_exception. All other
4748   // implicit exceptions (e.g., NullPointerException or
4749   // AbstractMethodError on entry) are either at call sites or
4750   // otherwise assume that stack unwinding will be initiated, so
4751   // caller saved registers were assumed volatile in the compiler.
4752 
4753 #undef __
4754 #define __ masm->
4755 
4756   address generate_throw_exception(const char* name,
4757                                    address runtime_entry,
4758                                    Register arg1 = noreg,
4759                                    Register arg2 = noreg) {
4760     // Information about frame layout at time of blocking runtime call.
4761     // Note that we only have to preserve callee-saved registers since
4762     // the compilers are responsible for supplying a continuation point
4763     // if they expect all registers to be preserved.
4764     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4765     enum layout {
4766       rfp_off = 0,
4767       rfp_off2,
4768       return_off,
4769       return_off2,
4770       framesize // inclusive of return address
4771     };
4772 
4773     int insts_size = 512;
4774     int locs_size  = 64;
4775 
4776     CodeBuffer code(name, insts_size, locs_size);
4777     OopMapSet* oop_maps  = new OopMapSet();
4778     MacroAssembler* masm = new MacroAssembler(&code);
4779 
4780     address start = __ pc();
4781 
4782     // This is an inlined and slightly modified version of call_VM
4783     // which has the ability to fetch the return PC out of
4784     // thread-local storage and also sets up last_Java_sp slightly
4785     // differently than the real call_VM
4786 
4787     __ enter(); // Save FP and LR before call
4788 
4789     assert(is_even(framesize/2), "sp not 16-byte aligned");
4790 
4791     // lr and fp are already in place
4792     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4793 
4794     int frame_complete = __ pc() - start;
4795 
4796     // Set up last_Java_sp and last_Java_fp
4797     address the_pc = __ pc();
4798     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
4799 
4800     // Call runtime
4801     if (arg1 != noreg) {
4802       assert(arg2 != c_rarg1, "clobbered");
4803       __ mov(c_rarg1, arg1);
4804     }
4805     if (arg2 != noreg) {
4806       __ mov(c_rarg2, arg2);
4807     }
4808     __ mov(c_rarg0, rthread);
4809     BLOCK_COMMENT("call runtime_entry");
4810     __ mov(rscratch1, runtime_entry);
4811     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
4812 
4813     // Generate oop map
4814     OopMap* map = new OopMap(framesize, 0);
4815 
4816     oop_maps->add_gc_map(the_pc - start, map);
4817 
4818     __ reset_last_Java_frame(true);
4819     __ maybe_isb();
4820 
4821     __ leave();
4822 
4823     // check for pending exceptions
4824 #ifdef ASSERT
4825     Label L;
4826     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4827     __ cbnz(rscratch1, L);
4828     __ should_not_reach_here();
4829     __ bind(L);
4830 #endif // ASSERT
4831     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4832 
4833 
4834     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4835     RuntimeStub* stub =
4836       RuntimeStub::new_runtime_stub(name,
4837                                     &code,
4838                                     frame_complete,
4839                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4840                                     oop_maps, false);
4841     return stub->entry_point();
4842   }
4843 
4844   class MontgomeryMultiplyGenerator : public MacroAssembler {
4845 
4846     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4847       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4848 
4849     RegSet _toSave;
4850     bool _squaring;
4851 
4852   public:
4853     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4854       : MacroAssembler(as->code()), _squaring(squaring) {
4855 
4856       // Register allocation
4857 
4858       Register reg = c_rarg0;
4859       Pa_base = reg;       // Argument registers
4860       if (squaring)
4861         Pb_base = Pa_base;
4862       else
4863         Pb_base = ++reg;
4864       Pn_base = ++reg;
4865       Rlen= ++reg;
4866       inv = ++reg;
4867       Pm_base = ++reg;
4868 
4869                           // Working registers:
4870       Ra =  ++reg;        // The current digit of a, b, n, and m.
4871       Rb =  ++reg;
4872       Rm =  ++reg;
4873       Rn =  ++reg;
4874 
4875       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4876       Pb =  ++reg;
4877       Pm =  ++reg;
4878       Pn =  ++reg;
4879 
4880       t0 =  ++reg;        // Three registers which form a
      t1 =  ++reg;        // triple-precision accumulator.
4882       t2 =  ++reg;
4883 
4884       Ri =  ++reg;        // Inner and outer loop indexes.
4885       Rj =  ++reg;
4886 
4887       Rhi_ab = ++reg;     // Product registers: low and high parts
4888       Rlo_ab = ++reg;     // of a*b and m*n.
4889       Rhi_mn = ++reg;
4890       Rlo_mn = ++reg;
4891 
4892       // r19 and up are callee-saved.
4893       _toSave = RegSet::range(r19, reg) + Pm_base;
4894     }
4895 
4896   private:
4897     void save_regs() {
4898       push(_toSave, sp);
4899     }
4900 
4901     void restore_regs() {
4902       pop(_toSave, sp);
4903     }
4904 
4905     template <typename T>
4906     void unroll_2(Register count, T block) {
4907       Label loop, end, odd;
4908       tbnz(count, 0, odd);
4909       cbz(count, end);
4910       align(16);
4911       bind(loop);
4912       (this->*block)();
4913       bind(odd);
4914       (this->*block)();
4915       subs(count, count, 2);
4916       br(Assembler::GT, loop);
4917       bind(end);
4918     }
4919 
4920     template <typename T>
4921     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4922       Label loop, end, odd;
4923       tbnz(count, 0, odd);
4924       cbz(count, end);
4925       align(16);
4926       bind(loop);
4927       (this->*block)(d, s, tmp);
4928       bind(odd);
4929       (this->*block)(d, s, tmp);
4930       subs(count, count, 2);
4931       br(Assembler::GT, loop);
4932       bind(end);
4933     }
4934 
4935     void pre1(RegisterOrConstant i) {
4936       block_comment("pre1");
4937       // Pa = Pa_base;
4938       // Pb = Pb_base + i;
4939       // Pm = Pm_base;
4940       // Pn = Pn_base + i;
4941       // Ra = *Pa;
4942       // Rb = *Pb;
4943       // Rm = *Pm;
4944       // Rn = *Pn;
4945       ldr(Ra, Address(Pa_base));
4946       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4947       ldr(Rm, Address(Pm_base));
4948       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4949       lea(Pa, Address(Pa_base));
4950       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4951       lea(Pm, Address(Pm_base));
4952       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4953 
4954       // Zero the m*n result.
4955       mov(Rhi_mn, zr);
4956       mov(Rlo_mn, zr);
4957     }
4958 
4959     // The core multiply-accumulate step of a Montgomery
4960     // multiplication.  The idea is to schedule operations as a
4961     // pipeline so that instructions with long latencies (loads and
4962     // multiplies) have time to complete before their results are
4963     // used.  This most benefits in-order implementations of the
4964     // architecture but out-of-order ones also benefit.
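    //
    // MACC, as used in the comments and C sketches in this class, denotes a
    // multiply-accumulate into the triple-precision accumulator t2:t1:t0;
    // roughly, in C:
    //   #define MACC(A, B, t0, t1, t2) do {                                   \
    //     unsigned __int128 p = (unsigned __int128)(A) * (B);                 \
    //     unsigned long lo = (unsigned long)p, hi = (unsigned long)(p >> 64); \
    //     t0 += lo;  hi += (t0 < lo);   /* carry out of t0 */                 \
    //     t1 += hi;  t2 += (t1 < hi);   /* carry out of t1 */                 \
    //   } while (0)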
4965     void step() {
4966       block_comment("step");
4967       // MACC(Ra, Rb, t0, t1, t2);
4968       // Ra = *++Pa;
4969       // Rb = *--Pb;
4970       umulh(Rhi_ab, Ra, Rb);
4971       mul(Rlo_ab, Ra, Rb);
4972       ldr(Ra, pre(Pa, wordSize));
4973       ldr(Rb, pre(Pb, -wordSize));
4974       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
4975                                        // previous iteration.
4976       // MACC(Rm, Rn, t0, t1, t2);
4977       // Rm = *++Pm;
4978       // Rn = *--Pn;
4979       umulh(Rhi_mn, Rm, Rn);
4980       mul(Rlo_mn, Rm, Rn);
4981       ldr(Rm, pre(Pm, wordSize));
4982       ldr(Rn, pre(Pn, -wordSize));
4983       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4984     }
4985 
4986     void post1() {
4987       block_comment("post1");
4988 
4989       // MACC(Ra, Rb, t0, t1, t2);
4990       // Ra = *++Pa;
4991       // Rb = *--Pb;
4992       umulh(Rhi_ab, Ra, Rb);
4993       mul(Rlo_ab, Ra, Rb);
4994       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4995       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4996 
4997       // *Pm = Rm = t0 * inv;
4998       mul(Rm, t0, inv);
4999       str(Rm, Address(Pm));
5000 
5001       // MACC(Rm, Rn, t0, t1, t2);
5002       // t0 = t1; t1 = t2; t2 = 0;
5003       umulh(Rhi_mn, Rm, Rn);
5004 
5005 #ifndef PRODUCT
5006       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5007       {
5008         mul(Rlo_mn, Rm, Rn);
5009         add(Rlo_mn, t0, Rlo_mn);
5010         Label ok;
5011         cbz(Rlo_mn, ok); {
5012           stop("broken Montgomery multiply");
5013         } bind(ok);
5014       }
5015 #endif
5016       // We have very carefully set things up so that
5017       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5018       // the lower half of Rm * Rn because we know the result already:
5019       // it must be -t0.  t0 + (-t0) must generate a carry iff
5020       // t0 != 0.  So, rather than do a mul and an adds we just set
5021       // the carry flag iff t0 is nonzero.
5022       //
5023       // mul(Rlo_mn, Rm, Rn);
5024       // adds(zr, t0, Rlo_mn);
5025       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5026       adcs(t0, t1, Rhi_mn);
5027       adc(t1, t2, zr);
5028       mov(t2, zr);
5029     }
5030 
5031     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
5032       block_comment("pre2");
5033       // Pa = Pa_base + i-len;
5034       // Pb = Pb_base + len;
5035       // Pm = Pm_base + i-len;
5036       // Pn = Pn_base + len;
5037 
5038       if (i.is_register()) {
5039         sub(Rj, i.as_register(), len);
5040       } else {
5041         mov(Rj, i.as_constant());
5042         sub(Rj, Rj, len);
5043       }
5044       // Rj == i-len
5045 
5046       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5047       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5048       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5049       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5050 
5051       // Ra = *++Pa;
5052       // Rb = *--Pb;
5053       // Rm = *++Pm;
5054       // Rn = *--Pn;
5055       ldr(Ra, pre(Pa, wordSize));
5056       ldr(Rb, pre(Pb, -wordSize));
5057       ldr(Rm, pre(Pm, wordSize));
5058       ldr(Rn, pre(Pn, -wordSize));
5059 
5060       mov(Rhi_mn, zr);
5061       mov(Rlo_mn, zr);
5062     }
5063 
5064     void post2(RegisterOrConstant i, RegisterOrConstant len) {
5065       block_comment("post2");
5066       if (i.is_constant()) {
5067         mov(Rj, i.as_constant()-len.as_constant());
5068       } else {
5069         sub(Rj, i.as_register(), len);
5070       }
5071 
5072       adds(t0, t0, Rlo_mn); // The pending m*n, low part
5073 
5074       // As soon as we know the least significant digit of our result,
5075       // store it.
5076       // Pm_base[i-len] = t0;
5077       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5078 
5079       // t0 = t1; t1 = t2; t2 = 0;
5080       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5081       adc(t1, t2, zr);
5082       mov(t2, zr);
5083     }
5084 
5085     // A carry in t0 after Montgomery multiplication means that we
5086     // should subtract multiples of n from our result in m.  We'll
5087     // keep doing that until there is no carry.
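    //
    // The sub() referenced in the pseudocode below is, roughly, in C:
    //   unsigned long sub(unsigned long Pm[], unsigned long Pn[],
    //                     unsigned long t0, int len) {
    //     unsigned long borrow = 0;
    //     for (int i = 0; i < len; i++) {
    //       unsigned __int128 d = (unsigned __int128)Pm[i] - Pn[i] - borrow;
    //       Pm[i]  = (unsigned long)d;
    //       borrow = (unsigned long)(d >> 64) & 1;  // 1 if we borrowed
    //     }
    //     return t0 - borrow;
    //   }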
5088     void normalize(RegisterOrConstant len) {
5089       block_comment("normalize");
5090       // while (t0)
5091       //   t0 = sub(Pm_base, Pn_base, t0, len);
5092       Label loop, post, again;
5093       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5094       cbz(t0, post); {
5095         bind(again); {
5096           mov(i, zr);
5097           mov(cnt, len);
5098           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5099           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5100           subs(zr, zr, zr); // set carry flag, i.e. no borrow
5101           align(16);
5102           bind(loop); {
5103             sbcs(Rm, Rm, Rn);
5104             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5105             add(i, i, 1);
5106             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5107             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5108             sub(cnt, cnt, 1);
5109           } cbnz(cnt, loop);
5110           sbc(t0, t0, zr);
5111         } cbnz(t0, again);
5112       } bind(post);
5113     }
5114 
5115     // Move memory at s to d, reversing words.
5116     //    Increments d to end of copied memory
5117     //    Destroys tmp1, tmp2
5118     //    Preserves len
5119     //    Leaves s pointing to the address which was in d at start
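    //
    // In C, approximately (the 64-bit words are copied in reverse order with
    // their two 32-bit halves swapped, since the caller's data are int
    // arrays; rotate_right64 is only an illustrative name):
    //   for (int i = 0; i < len; i++)
    //     d[i] = rotate_right64(s[len - 1 - i], 32);
    //   d += len;  s = original d;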
5120     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
5121       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5122 
5123       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
5124       mov(tmp1, len);
5125       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
5126       sub(s, d, len, ext::uxtw, LogBytesPerWord);
5127     }
5128     // where
5129     void reverse1(Register d, Register s, Register tmp) {
5130       ldr(tmp, pre(s, -wordSize));
5131       ror(tmp, tmp, 32);
5132       str(tmp, post(d, wordSize));
5133     }
5134 
5135     void step_squaring() {
5136       // An extra ACC
5137       step();
5138       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5139     }
5140 
5141     void last_squaring(RegisterOrConstant i) {
5142       Label dont;
5143       // if ((i & 1) == 0) {
5144       tbnz(i.as_register(), 0, dont); {
5145         // MACC(Ra, Rb, t0, t1, t2);
5146         // Ra = *++Pa;
5147         // Rb = *--Pb;
5148         umulh(Rhi_ab, Ra, Rb);
5149         mul(Rlo_ab, Ra, Rb);
5150         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5151       } bind(dont);
5152     }
5153 
5154     void extra_step_squaring() {
5155       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5156 
5157       // MACC(Rm, Rn, t0, t1, t2);
5158       // Rm = *++Pm;
5159       // Rn = *--Pn;
5160       umulh(Rhi_mn, Rm, Rn);
5161       mul(Rlo_mn, Rm, Rn);
5162       ldr(Rm, pre(Pm, wordSize));
5163       ldr(Rn, pre(Pn, -wordSize));
5164     }
5165 
5166     void post1_squaring() {
5167       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5168 
5169       // *Pm = Rm = t0 * inv;
5170       mul(Rm, t0, inv);
5171       str(Rm, Address(Pm));
5172 
5173       // MACC(Rm, Rn, t0, t1, t2);
5174       // t0 = t1; t1 = t2; t2 = 0;
5175       umulh(Rhi_mn, Rm, Rn);
5176 
5177 #ifndef PRODUCT
5178       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5179       {
5180         mul(Rlo_mn, Rm, Rn);
5181         add(Rlo_mn, t0, Rlo_mn);
5182         Label ok;
5183         cbz(Rlo_mn, ok); {
5184           stop("broken Montgomery multiply");
5185         } bind(ok);
5186       }
5187 #endif
5188       // We have very carefully set things up so that
5189       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5190       // the lower half of Rm * Rn because we know the result already:
5191       // it must be -t0.  t0 + (-t0) must generate a carry iff
5192       // t0 != 0.  So, rather than do a mul and an adds we just set
5193       // the carry flag iff t0 is nonzero.
5194       //
5195       // mul(Rlo_mn, Rm, Rn);
5196       // adds(zr, t0, Rlo_mn);
5197       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5198       adcs(t0, t1, Rhi_mn);
5199       adc(t1, t2, zr);
5200       mov(t2, zr);
5201     }
5202 
5203     void acc(Register Rhi, Register Rlo,
5204              Register t0, Register t1, Register t2) {
5205       adds(t0, t0, Rlo);
5206       adcs(t1, t1, Rhi);
5207       adc(t2, t2, zr);
5208     }
5209 
5210   public:
5211     /**
5212      * Fast Montgomery multiplication.  The derivation of the
5213      * algorithm is in A Cryptographic Library for the Motorola
5214      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5215      *
5216      * Arguments:
5217      *
5218      * Inputs for multiplication:
5219      *   c_rarg0   - int array elements a
5220      *   c_rarg1   - int array elements b
5221      *   c_rarg2   - int array elements n (the modulus)
5222      *   c_rarg3   - int length
5223      *   c_rarg4   - int inv
5224      *   c_rarg5   - int array elements m (the result)
5225      *
5226      * Inputs for squaring:
5227      *   c_rarg0   - int array elements a
5228      *   c_rarg1   - int array elements n (the modulus)
5229      *   c_rarg2   - int length
5230      *   c_rarg3   - int inv
5231      *   c_rarg4   - int array elements m (the result)
5232      *
5233      */
5234     address generate_multiply() {
5235       Label argh, nothing;
5236       bind(argh);
5237       stop("MontgomeryMultiply total_allocation must be <= 8192");
5238 
5239       align(CodeEntryAlignment);
5240       address entry = pc();
5241 
5242       cbzw(Rlen, nothing);
5243 
5244       enter();
5245 
5246       // Make room.
5247       cmpw(Rlen, 512);
5248       br(Assembler::HI, argh);
5249       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5250       andr(sp, Ra, -2 * wordSize);
5251 
5252       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5253 
5254       {
5255         // Copy input args, reversing as we go.  We use Ra as a
5256         // temporary variable.
5257         reverse(Ra, Pa_base, Rlen, t0, t1);
5258         if (!_squaring)
5259           reverse(Ra, Pb_base, Rlen, t0, t1);
5260         reverse(Ra, Pn_base, Rlen, t0, t1);
5261       }
5262 
5263       // Push all call-saved registers and also Pm_base which we'll need
5264       // at the end.
5265       save_regs();
5266 
5267 #ifndef PRODUCT
5268       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
5269       {
5270         ldr(Rn, Address(Pn_base, 0));
5271         mul(Rlo_mn, Rn, inv);
5272         subs(zr, Rlo_mn, -1);
5273         Label ok;
5274         br(EQ, ok); {
5275           stop("broken inverse in Montgomery multiply");
5276         } bind(ok);
5277       }
5278 #endif
5279 
5280       mov(Pm_base, Ra);
5281 
5282       mov(t0, zr);
5283       mov(t1, zr);
5284       mov(t2, zr);
5285 
5286       block_comment("for (int i = 0; i < len; i++) {");
5287       mov(Ri, zr); {
5288         Label loop, end;
5289         cmpw(Ri, Rlen);
5290         br(Assembler::GE, end);
5291 
5292         bind(loop);
5293         pre1(Ri);
5294 
5295         block_comment("  for (j = i; j; j--) {"); {
5296           movw(Rj, Ri);
5297           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5298         } block_comment("  } // j");
5299 
5300         post1();
5301         addw(Ri, Ri, 1);
5302         cmpw(Ri, Rlen);
5303         br(Assembler::LT, loop);
5304         bind(end);
5305         block_comment("} // i");
5306       }
5307 
5308       block_comment("for (int i = len; i < 2*len; i++) {");
5309       mov(Ri, Rlen); {
5310         Label loop, end;
5311         cmpw(Ri, Rlen, Assembler::LSL, 1);
5312         br(Assembler::GE, end);
5313 
5314         bind(loop);
5315         pre2(Ri, Rlen);
5316 
5317         block_comment("  for (j = len*2-i-1; j; j--) {"); {
5318           lslw(Rj, Rlen, 1);
5319           subw(Rj, Rj, Ri);
5320           subw(Rj, Rj, 1);
5321           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5322         } block_comment("  } // j");
5323 
5324         post2(Ri, Rlen);
5325         addw(Ri, Ri, 1);
5326         cmpw(Ri, Rlen, Assembler::LSL, 1);
5327         br(Assembler::LT, loop);
5328         bind(end);
5329       }
5330       block_comment("} // i");
5331 
5332       normalize(Rlen);
5333 
5334       mov(Ra, Pm_base);  // Save Pm_base in Ra
5335       restore_regs();  // Restore caller's Pm_base
5336 
5337       // Copy our result into caller's Pm_base
5338       reverse(Pm_base, Ra, Rlen, t0, t1);
5339 
5340       leave();
5341       bind(nothing);
5342       ret(lr);
5343 
5344       return entry;
5345     }
5346     // In C, approximately:
5347 
5348     // void
5349     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
5350     //                     unsigned long Pn_base[], unsigned long Pm_base[],
5351     //                     unsigned long inv, int len) {
5352     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5353     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5354     //   unsigned long Ra, Rb, Rn, Rm;
5355 
5356     //   int i;
5357 
5358     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5359 
5360     //   for (i = 0; i < len; i++) {
5361     //     int j;
5362 
5363     //     Pa = Pa_base;
5364     //     Pb = Pb_base + i;
5365     //     Pm = Pm_base;
5366     //     Pn = Pn_base + i;
5367 
5368     //     Ra = *Pa;
5369     //     Rb = *Pb;
5370     //     Rm = *Pm;
5371     //     Rn = *Pn;
5372 
5373     //     int iters = i;
5374     //     for (j = 0; iters--; j++) {
5375     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5376     //       MACC(Ra, Rb, t0, t1, t2);
5377     //       Ra = *++Pa;
5378     //       Rb = *--Pb;
5379     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5380     //       MACC(Rm, Rn, t0, t1, t2);
5381     //       Rm = *++Pm;
5382     //       Rn = *--Pn;
5383     //     }
5384 
5385     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
5386     //     MACC(Ra, Rb, t0, t1, t2);
5387     //     *Pm = Rm = t0 * inv;
5388     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5389     //     MACC(Rm, Rn, t0, t1, t2);
5390 
5391     //     assert(t0 == 0, "broken Montgomery multiply");
5392 
5393     //     t0 = t1; t1 = t2; t2 = 0;
5394     //   }
5395 
5396     //   for (i = len; i < 2*len; i++) {
5397     //     int j;
5398 
5399     //     Pa = Pa_base + i-len;
5400     //     Pb = Pb_base + len;
5401     //     Pm = Pm_base + i-len;
5402     //     Pn = Pn_base + len;
5403 
5404     //     Ra = *++Pa;
5405     //     Rb = *--Pb;
5406     //     Rm = *++Pm;
5407     //     Rn = *--Pn;
5408 
5409     //     int iters = len*2-i-1;
5410     //     for (j = i-len+1; iters--; j++) {
5411     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5412     //       MACC(Ra, Rb, t0, t1, t2);
5413     //       Ra = *++Pa;
5414     //       Rb = *--Pb;
5415     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5416     //       MACC(Rm, Rn, t0, t1, t2);
5417     //       Rm = *++Pm;
5418     //       Rn = *--Pn;
5419     //     }
5420 
5421     //     Pm_base[i-len] = t0;
5422     //     t0 = t1; t1 = t2; t2 = 0;
5423     //   }
5424 
5425     //   while (t0)
5426     //     t0 = sub(Pm_base, Pn_base, t0, len);
5427     // }
5428 
5429     /**
5430      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
5431      * multiplies than Montgomery multiplication so it should be up to
5432      * 25% faster.  However, its loop control is more complex and it
5433      * may actually run slower on some machines.
5434      *
5435      * Arguments:
5436      *
5437      * Inputs:
5438      *   c_rarg0   - int array elements a
5439      *   c_rarg1   - int array elements n (the modulus)
5440      *   c_rarg2   - int length
5441      *   c_rarg3   - int inv
5442      *   c_rarg4   - int array elements m (the result)
5443      *
5444      */
5445     address generate_square() {
5446       Label argh;
5447       bind(argh);
5448       stop("MontgomeryMultiply total_allocation must be <= 8192");
5449 
5450       align(CodeEntryAlignment);
5451       address entry = pc();
5452 
5453       enter();
5454 
5455       // Make room.
5456       cmpw(Rlen, 512);
5457       br(Assembler::HI, argh);
5458       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5459       andr(sp, Ra, -2 * wordSize);
5460 
5461       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5462 
5463       {
5464         // Copy input args, reversing as we go.  We use Ra as a
5465         // temporary variable.
5466         reverse(Ra, Pa_base, Rlen, t0, t1);
5467         reverse(Ra, Pn_base, Rlen, t0, t1);
5468       }
5469 
5470       // Push all call-saved registers and also Pm_base which we'll need
5471       // at the end.
5472       save_regs();
5473 
5474       mov(Pm_base, Ra);
5475 
5476       mov(t0, zr);
5477       mov(t1, zr);
5478       mov(t2, zr);
5479 
5480       block_comment("for (int i = 0; i < len; i++) {");
5481       mov(Ri, zr); {
5482         Label loop, end;
5483         bind(loop);
5484         cmp(Ri, Rlen);
5485         br(Assembler::GE, end);
5486 
5487         pre1(Ri);
5488 
5489         block_comment("for (j = (i+1)/2; j; j--) {"); {
5490           add(Rj, Ri, 1);
5491           lsr(Rj, Rj, 1);
5492           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5493         } block_comment("  } // j");
5494 
5495         last_squaring(Ri);
5496 
5497         block_comment("  for (j = i/2; j; j--) {"); {
5498           lsr(Rj, Ri, 1);
5499           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5500         } block_comment("  } // j");
5501 
5502         post1_squaring();
5503         add(Ri, Ri, 1);
5504         cmp(Ri, Rlen);
5505         br(Assembler::LT, loop);
5506 
5507         bind(end);
5508         block_comment("} // i");
5509       }
5510 
5511       block_comment("for (int i = len; i < 2*len; i++) {");
5512       mov(Ri, Rlen); {
5513         Label loop, end;
5514         bind(loop);
5515         cmp(Ri, Rlen, Assembler::LSL, 1);
5516         br(Assembler::GE, end);
5517 
5518         pre2(Ri, Rlen);
5519 
5520         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
5521           lsl(Rj, Rlen, 1);
5522           sub(Rj, Rj, Ri);
5523           sub(Rj, Rj, 1);
5524           lsr(Rj, Rj, 1);
5525           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5526         } block_comment("  } // j");
5527 
5528         last_squaring(Ri);
5529 
5530         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
5531           lsl(Rj, Rlen, 1);
5532           sub(Rj, Rj, Ri);
5533           lsr(Rj, Rj, 1);
5534           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5535         } block_comment("  } // j");
5536 
5537         post2(Ri, Rlen);
5538         add(Ri, Ri, 1);
5539         cmp(Ri, Rlen, Assembler::LSL, 1);
5540 
5541         br(Assembler::LT, loop);
5542         bind(end);
5543         block_comment("} // i");
5544       }
5545 
5546       normalize(Rlen);
5547 
5548       mov(Ra, Pm_base);  // Save Pm_base in Ra
5549       restore_regs();  // Restore caller's Pm_base
5550 
5551       // Copy our result into caller's Pm_base
5552       reverse(Pm_base, Ra, Rlen, t0, t1);
5553 
5554       leave();
5555       ret(lr);
5556 
5557       return entry;
5558     }
5559     // In C, approximately:
5560 
5561     // void
5562     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
5563     //                   unsigned long Pm_base[], unsigned long inv, int len) {
5564     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5565     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5566     //   unsigned long Ra, Rb, Rn, Rm;
5567 
5568     //   int i;
5569 
5570     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5571 
5572     //   for (i = 0; i < len; i++) {
5573     //     int j;
5574 
5575     //     Pa = Pa_base;
5576     //     Pb = Pa_base + i;
5577     //     Pm = Pm_base;
5578     //     Pn = Pn_base + i;
5579 
5580     //     Ra = *Pa;
5581     //     Rb = *Pb;
5582     //     Rm = *Pm;
5583     //     Rn = *Pn;
5584 
5585     //     int iters = (i+1)/2;
5586     //     for (j = 0; iters--; j++) {
5587     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5588     //       MACC2(Ra, Rb, t0, t1, t2);
5589     //       Ra = *++Pa;
5590     //       Rb = *--Pb;
5591     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5592     //       MACC(Rm, Rn, t0, t1, t2);
5593     //       Rm = *++Pm;
5594     //       Rn = *--Pn;
5595     //     }
5596     //     if ((i & 1) == 0) {
5597     //       assert(Ra == Pa_base[j], "must be");
5598     //       MACC(Ra, Ra, t0, t1, t2);
5599     //     }
5600     //     iters = i/2;
5601     //     assert(iters == i-j, "must be");
5602     //     for (; iters--; j++) {
5603     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5604     //       MACC(Rm, Rn, t0, t1, t2);
5605     //       Rm = *++Pm;
5606     //       Rn = *--Pn;
5607     //     }
5608 
5609     //     *Pm = Rm = t0 * inv;
5610     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5611     //     MACC(Rm, Rn, t0, t1, t2);
5612 
5613     //     assert(t0 == 0, "broken Montgomery multiply");
5614 
5615     //     t0 = t1; t1 = t2; t2 = 0;
5616     //   }
5617 
5618     //   for (i = len; i < 2*len; i++) {
5619     //     int start = i-len+1;
5620     //     int end = start + (len - start)/2;
5621     //     int j;
5622 
5623     //     Pa = Pa_base + i-len;
5624     //     Pb = Pa_base + len;
5625     //     Pm = Pm_base + i-len;
5626     //     Pn = Pn_base + len;
5627 
5628     //     Ra = *++Pa;
5629     //     Rb = *--Pb;
5630     //     Rm = *++Pm;
5631     //     Rn = *--Pn;
5632 
5633     //     int iters = (2*len-i-1)/2;
5634     //     assert(iters == end-start, "must be");
5635     //     for (j = start; iters--; j++) {
5636     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5637     //       MACC2(Ra, Rb, t0, t1, t2);
5638     //       Ra = *++Pa;
5639     //       Rb = *--Pb;
5640     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5641     //       MACC(Rm, Rn, t0, t1, t2);
5642     //       Rm = *++Pm;
5643     //       Rn = *--Pn;
5644     //     }
5645     //     if ((i & 1) == 0) {
5646     //       assert(Ra == Pa_base[j], "must be");
5647     //       MACC(Ra, Ra, t0, t1, t2);
5648     //     }
5649     //     iters =  (2*len-i)/2;
5650     //     assert(iters == len-j, "must be");
5651     //     for (; iters--; j++) {
5652     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5653     //       MACC(Rm, Rn, t0, t1, t2);
5654     //       Rm = *++Pm;
5655     //       Rn = *--Pn;
5656     //     }
5657     //     Pm_base[i-len] = t0;
5658     //     t0 = t1; t1 = t2; t2 = 0;
5659     //   }
5660 
5661     //   while (t0)
5662     //     t0 = sub(Pm_base, Pn_base, t0, len);
5663     // }
5664   };
5665 
5666 
5667   // Initialization
5668   void generate_initial() {
    // Generate the initial stubs and initialize the entry points
5670 
    // Entry points that exist on all platforms. Note: this is code
    // that could be shared among different platforms; however, the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.
5676 
5677     StubRoutines::_forward_exception_entry = generate_forward_exception();
5678 
5679     StubRoutines::_call_stub_entry =
5680       generate_call_stub(StubRoutines::_call_stub_return_address);
5681 
5682     // is referenced by megamorphic call
5683     StubRoutines::_catch_exception_entry = generate_catch_exception();
5684 
5685     // Build this early so it's available for the interpreter.
5686     StubRoutines::_throw_StackOverflowError_entry =
5687       generate_throw_exception("StackOverflowError throw_exception",
5688                                CAST_FROM_FN_PTR(address,
5689                                                 SharedRuntime::throw_StackOverflowError));
5690     StubRoutines::_throw_delayed_StackOverflowError_entry =
5691       generate_throw_exception("delayed StackOverflowError throw_exception",
5692                                CAST_FROM_FN_PTR(address,
5693                                                 SharedRuntime::throw_delayed_StackOverflowError));
5694     if (UseCRC32Intrinsics) {
      // set the table address before generating the stubs that use it
5696       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5697       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5698     }
5699 
5700     if (UseCRC32CIntrinsics) {
5701       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5702     }
5703 
5704     // Disabled until JDK-8210858 is fixed
5705     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5706     //   StubRoutines::_dlog = generate_dlog();
5707     // }
5708 
5709     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5710       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5711     }
5712 
5713     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5714       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5715     }
5716   }

  void generate_all() {
    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    // has negatives stub for large arrays.
    StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    generate_compare_long_strings();

    generate_string_indexof_stubs();

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }
#endif // COMPILER2

#ifndef BUILTIN_SIM
    // generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

    // Safefetch stubs.
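    // SafeFetch loads a value from an address that may be unmapped; if the
    // load faults, the signal handler resumes execution at the recorded
    // continuation pc and the caller receives its supplied default value.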
    generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
                                                       &StubRoutines::_safefetch32_fault_pc,
                                                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                                                       &StubRoutines::_safefetchN_fault_pc,
                                                       &StubRoutines::_safefetchN_continuation_pc);
#endif
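    // Record that stub generation has completed.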
    StubRoutines::aarch64::set_completed();
  }

 public:
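  // The 'all' flag selects the generation phase: the initial stubs are
  // generated early so that they are available to the interpreter, while
  // the remaining stubs are generated later (after universe_init).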
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

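// External entry point for stub generation; 'all' selects between the
// initial pass (generate_initial) and the full pass (generate_all).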
void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}