1 /*
   2  * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "nativeInst_aarch64.hpp"
  33 #include "oops/instanceOop.hpp"
  34 #include "oops/method.hpp"
  35 #include "oops/objArrayKlass.hpp"
  36 #include "oops/oop.inline.hpp"
  37 #include "prims/methodHandles.hpp"
  38 #include "runtime/frame.inline.hpp"
  39 #include "runtime/handles.inline.hpp"
  40 #include "runtime/sharedRuntime.hpp"
  41 #include "runtime/stubCodeGenerator.hpp"
  42 #include "runtime/stubRoutines.hpp"
  43 #include "runtime/thread.inline.hpp"
  44 #include "utilities/align.hpp"
  45 #ifdef COMPILER2
  46 #include "opto/runtime.hpp"
  47 #endif
  48 #if INCLUDE_ZGC
  49 #include "gc/z/zThreadLocalData.hpp"
  50 #endif
  51 
  52 #ifdef BUILTIN_SIM
  53 #include "../../../../../../simulator/simulator.hpp"
  54 #endif
  55 
  56 // Declaration and definition of StubGenerator (no .hpp file).
  57 // For a more detailed description of the stub routine structure
  58 // see the comment in stubRoutines.hpp
  59 
  60 #undef __
  61 #define __ _masm->
  62 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
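// n.b. TIMES_OOP sign-extends a 32-bit index and scales it by the
// in-heap oop size: a shift of 2 (4-byte narrow oops) when
// UseCompressedOops is set, otherwise a shift of 3 (8-byte oops).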
  63 
  64 #ifdef PRODUCT
  65 #define BLOCK_COMMENT(str) /* nothing */
  66 #else
  67 #define BLOCK_COMMENT(str) __ block_comment(str)
  68 #endif
  69 
  70 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  71 
  72 // Stub Code definitions
  73 
  74 class StubGenerator: public StubCodeGenerator {
  75  private:
  76 
  77 #ifdef PRODUCT
  78 #define inc_counter_np(counter) ((void)0)
  79 #else
  80   void inc_counter_np_(int& counter) {
  81     __ lea(rscratch2, ExternalAddress((address)&counter));
  82     __ ldrw(rscratch1, Address(rscratch2));
  83     __ addw(rscratch1, rscratch1, 1);
  84     __ strw(rscratch1, Address(rscratch2));
  85   }
  86 #define inc_counter_np(counter) \
  87   BLOCK_COMMENT("inc_counter " #counter); \
  88   inc_counter_np_(counter);
  89 #endif
  90 
  91   // Call stubs are used to call Java from C
  92   //
  93   // Arguments:
  94   //    c_rarg0:   call wrapper address                   address
  95   //    c_rarg1:   result                                 address
  96   //    c_rarg2:   result type                            BasicType
  97   //    c_rarg3:   method                                 Method*
  98   //    c_rarg4:   (interpreter) entry point              address
  99   //    c_rarg5:   parameters                             intptr_t*
 100   //    c_rarg6:   parameter size (in words)              int
 101   //    c_rarg7:   thread                                 Thread*
 102   //
 103   // There is no return from the stub itself as any Java result
 104   // is written to result
 105   //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer, then install sp (r31)
  // into fp.
 109   //
 110   // we save r0-r7, which accounts for all the c arguments.
 111   //
 112   // TODO: strictly do we need to save them all? they are treated as
 113   // volatile by C so could we omit saving the ones we are going to
 114   // place in global registers (thread? method?) or those we only use
 115   // during setup of the Java call?
 116   //
 117   // we don't need to save r8 which C uses as an indirect result location
 118   // return register.
 119   //
 120   // we don't need to save r9-r15 which both C and Java treat as
 121   // volatile
 122   //
 123   // we don't need to save r16-18 because Java does not use them
 124   //
 125   // we save r19-r28 which Java uses as scratch registers and C
 126   // expects to be callee-save
 127   //
 128   // we save the bottom 64 bits of each value stored in v8-v15; it is
 129   // the responsibility of the caller to preserve larger values.
 130   //
 131   // so the stub frame looks like this when we enter Java code
 132   //
 133   //     [ return_from_Java     ] <--- sp
 134   //     [ argument word n      ]
 135   //      ...
 136   // -27 [ argument word 1      ]
 137   // -26 [ saved v15            ] <--- sp_after_call
 138   // -25 [ saved v14            ]
 139   // -24 [ saved v13            ]
 140   // -23 [ saved v12            ]
 141   // -22 [ saved v11            ]
 142   // -21 [ saved v10            ]
 143   // -20 [ saved v9             ]
 144   // -19 [ saved v8             ]
 145   // -18 [ saved r28            ]
 146   // -17 [ saved r27            ]
 147   // -16 [ saved r26            ]
 148   // -15 [ saved r25            ]
 149   // -14 [ saved r24            ]
 150   // -13 [ saved r23            ]
 151   // -12 [ saved r22            ]
 152   // -11 [ saved r21            ]
 153   // -10 [ saved r20            ]
 154   //  -9 [ saved r19            ]
 155   //  -8 [ call wrapper    (r0) ]
 156   //  -7 [ result          (r1) ]
 157   //  -6 [ result type     (r2) ]
 158   //  -5 [ method          (r3) ]
 159   //  -4 [ entry point     (r4) ]
 160   //  -3 [ parameters      (r5) ]
 161   //  -2 [ parameter size  (r6) ]
 162   //  -1 [ thread (r7)          ]
 163   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 164   //   1 [ saved lr       (r30) ]
 165 
 166   // Call stub stack layout word offsets from fp
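  // n.b. only the first register of each stp/ldp pair gets a named
  // offset below; its partner (r19, r21, ..., v8, v10, ...) is saved
  // one word above it, as shown in the diagram above.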
 167   enum call_stub_layout {
 168     sp_after_call_off = -26,
 169 
 170     d15_off            = -26,
 171     d13_off            = -24,
 172     d11_off            = -22,
 173     d9_off             = -20,
 174 
 175     r28_off            = -18,
 176     r26_off            = -16,
 177     r24_off            = -14,
 178     r22_off            = -12,
 179     r20_off            = -10,
 180     call_wrapper_off   =  -8,
 181     result_off         =  -7,
 182     result_type_off    =  -6,
 183     method_off         =  -5,
 184     entry_point_off    =  -4,
 185     parameter_size_off =  -2,
 186     thread_off         =  -1,
 187     fp_f               =   0,
 188     retaddr_off        =   1,
 189   };
 190 
 191   address generate_call_stub(address& return_address) {
 192     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 193            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 194            "adjust this code");
 195 
 196     StubCodeMark mark(this, "StubRoutines", "call_stub");
 197     address start = __ pc();
 198 
 199     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 200 
 201     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 202     const Address result        (rfp, result_off         * wordSize);
 203     const Address result_type   (rfp, result_type_off    * wordSize);
 204     const Address method        (rfp, method_off         * wordSize);
 205     const Address entry_point   (rfp, entry_point_off    * wordSize);
 206     const Address parameter_size(rfp, parameter_size_off * wordSize);
 207 
 208     const Address thread        (rfp, thread_off         * wordSize);
 209 
 210     const Address d15_save      (rfp, d15_off * wordSize);
 211     const Address d13_save      (rfp, d13_off * wordSize);
 212     const Address d11_save      (rfp, d11_off * wordSize);
 213     const Address d9_save       (rfp, d9_off * wordSize);
 214 
 215     const Address r28_save      (rfp, r28_off * wordSize);
 216     const Address r26_save      (rfp, r26_off * wordSize);
 217     const Address r24_save      (rfp, r24_off * wordSize);
 218     const Address r22_save      (rfp, r22_off * wordSize);
 219     const Address r20_save      (rfp, r20_off * wordSize);
 220 
 221     // stub code
 222 
 223     // we need a C prolog to bootstrap the x86 caller into the sim
 224     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 225 
 226     address aarch64_entry = __ pc();
 227 
 228 #ifdef BUILTIN_SIM
 229     // Save sender's SP for stack traces.
 230     __ mov(rscratch1, sp);
 231     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 232 #endif
 233     // set up frame and move sp to end of save area
 234     __ enter();
 235     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 236 
 237     // save register parameters and Java scratch/global registers
 238     // n.b. we save thread even though it gets installed in
 239     // rthread because we want to sanity check rthread later
 240     __ str(c_rarg7,  thread);
 241     __ strw(c_rarg6, parameter_size);
 242     __ stp(c_rarg4, c_rarg5,  entry_point);
 243     __ stp(c_rarg2, c_rarg3,  result_type);
 244     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 245 
 246     __ stp(r20, r19,   r20_save);
 247     __ stp(r22, r21,   r22_save);
 248     __ stp(r24, r23,   r24_save);
 249     __ stp(r26, r25,   r26_save);
 250     __ stp(r28, r27,   r28_save);
 251 
 252     __ stpd(v9,  v8,   d9_save);
 253     __ stpd(v11, v10,  d11_save);
 254     __ stpd(v13, v12,  d13_save);
 255     __ stpd(v15, v14,  d15_save);
 256 
 257     // install Java thread in global register now we have saved
 258     // whatever value it held
 259     __ mov(rthread, c_rarg7);
 260     // And method
 261     __ mov(rmethod, c_rarg3);
 262 
 263     // set up the heapbase register
 264     __ reinit_heapbase();
 265 
 266 #ifdef ASSERT
 267     // make sure we have no pending exceptions
 268     {
 269       Label L;
 270       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 271       __ cmp(rscratch1, (u1)NULL_WORD);
 272       __ br(Assembler::EQ, L);
 273       __ stop("StubRoutines::call_stub: entered with pending exception");
 274       __ BIND(L);
 275     }
 276 #endif
 277     // pass parameters if any
 278     __ mov(esp, sp);
 279     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 280     __ andr(sp, rscratch1, -2 * wordSize);
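    // rscratch1 = sp - (c_rarg6 parameter words); rounding the result
    // down to a multiple of 2 * wordSize keeps sp 16-byte aligned, as
    // AArch64 requires for sp-relative accesses.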
 281 
 282     BLOCK_COMMENT("pass parameters if any");
 283     Label parameters_done;
 284     // parameter count is still in c_rarg6
 285     // and parameter pointer identifying param 1 is in c_rarg5
 286     __ cbzw(c_rarg6, parameters_done);
 287 
 288     address loop = __ pc();
 289     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 290     __ subsw(c_rarg6, c_rarg6, 1);
 291     __ push(rscratch1);
 292     __ br(Assembler::GT, loop);
 293 
 294     __ BIND(parameters_done);
 295 
    // call Java entry -- passing Method* and current sp
 297     //      rmethod: Method*
 298     //      r13: sender sp
 299     BLOCK_COMMENT("call Java function");
 300     __ mov(r13, sp);
 301     __ blr(c_rarg4);
 302 
 303     // tell the simulator we have returned to the stub
 304 
 305     // we do this here because the notify will already have been done
 306     // if we get to the next instruction via an exception
 307     //
 308     // n.b. adding this instruction here affects the calculation of
 309     // whether or not a routine returns to the call stub (used when
 310     // doing stack walks) since the normal test is to check the return
 311     // pc against the address saved below. so we may need to allow for
 312     // this extra instruction in the check.
 313 
 314     if (NotifySimulator) {
 315       __ notify(Assembler::method_reentry);
 316     }
 317     // save current address for use by exception handling code
 318 
 319     return_address = __ pc();
 320 
 321     // store result depending on type (everything that is not
 322     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 323     // n.b. this assumes Java returns an integral result in r0
 324     // and a floating result in j_farg0
 325     __ ldr(j_rarg2, result);
 326     Label is_long, is_float, is_double, exit;
 327     __ ldr(j_rarg1, result_type);
 328     __ cmp(j_rarg1, (u1)T_OBJECT);
 329     __ br(Assembler::EQ, is_long);
 330     __ cmp(j_rarg1, (u1)T_LONG);
 331     __ br(Assembler::EQ, is_long);
 332     __ cmp(j_rarg1, (u1)T_FLOAT);
 333     __ br(Assembler::EQ, is_float);
 334     __ cmp(j_rarg1, (u1)T_DOUBLE);
 335     __ br(Assembler::EQ, is_double);
 336 
 337     // handle T_INT case
 338     __ strw(r0, Address(j_rarg2));
 339 
 340     __ BIND(exit);
 341 
 342     // pop parameters
 343     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 344 
 345 #ifdef ASSERT
 346     // verify that threads correspond
 347     {
 348       Label L, S;
 349       __ ldr(rscratch1, thread);
 350       __ cmp(rthread, rscratch1);
 351       __ br(Assembler::NE, S);
 352       __ get_thread(rscratch1);
 353       __ cmp(rthread, rscratch1);
 354       __ br(Assembler::EQ, L);
 355       __ BIND(S);
 356       __ stop("StubRoutines::call_stub: threads must correspond");
 357       __ BIND(L);
 358     }
 359 #endif
 360 
 361     // restore callee-save registers
 362     __ ldpd(v15, v14,  d15_save);
 363     __ ldpd(v13, v12,  d13_save);
 364     __ ldpd(v11, v10,  d11_save);
 365     __ ldpd(v9,  v8,   d9_save);
 366 
 367     __ ldp(r28, r27,   r28_save);
 368     __ ldp(r26, r25,   r26_save);
 369     __ ldp(r24, r23,   r24_save);
 370     __ ldp(r22, r21,   r22_save);
 371     __ ldp(r20, r19,   r20_save);
 372 
 373     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 374     __ ldrw(c_rarg2, result_type);
 375     __ ldr(c_rarg3,  method);
 376     __ ldp(c_rarg4, c_rarg5,  entry_point);
 377     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 378 
 379 #ifndef PRODUCT
 380     // tell the simulator we are about to end Java execution
 381     if (NotifySimulator) {
 382       __ notify(Assembler::method_exit);
 383     }
 384 #endif
 385     // leave frame and return to caller
 386     __ leave();
 387     __ ret(lr);
 388 
 389     // handle return types different from T_INT
 390 
 391     __ BIND(is_long);
 392     __ str(r0, Address(j_rarg2, 0));
 393     __ br(Assembler::AL, exit);
 394 
 395     __ BIND(is_float);
 396     __ strs(j_farg0, Address(j_rarg2, 0));
 397     __ br(Assembler::AL, exit);
 398 
 399     __ BIND(is_double);
 400     __ strd(j_farg0, Address(j_rarg2, 0));
 401     __ br(Assembler::AL, exit);
 402 
 403     return start;
 404   }
 405 
 406   // Return point for a Java call if there's an exception thrown in
 407   // Java code.  The exception is caught and transformed into a
 408   // pending exception stored in JavaThread that can be tested from
 409   // within the VM.
 410   //
 411   // Note: Usually the parameters are removed by the callee. In case
 412   // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // sp.
 415   //
 416   // r0: exception oop
 417 
 418   // NOTE: this is used as a target from the signal handler so it
 419   // needs an x86 prolog which returns into the current simulator
 420   // executing the generated catch_exception code. so the prolog
 421   // needs to install rax in a sim register and adjust the sim's
 422   // restart pc to enter the generated code at the start position
 423   // then return from native to simulated execution.
 424 
 425   address generate_catch_exception() {
 426     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 427     address start = __ pc();
 428 
 429     // same as in generate_call_stub():
 430     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 431     const Address thread        (rfp, thread_off         * wordSize);
 432 
 433 #ifdef ASSERT
 434     // verify that threads correspond
 435     {
 436       Label L, S;
 437       __ ldr(rscratch1, thread);
 438       __ cmp(rthread, rscratch1);
 439       __ br(Assembler::NE, S);
 440       __ get_thread(rscratch1);
 441       __ cmp(rthread, rscratch1);
 442       __ br(Assembler::EQ, L);
 443       __ bind(S);
 444       __ stop("StubRoutines::catch_exception: threads must correspond");
 445       __ bind(L);
 446     }
 447 #endif
 448 
 449     // set pending exception
 450     __ verify_oop(r0);
 451 
 452     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 453     __ mov(rscratch1, (address)__FILE__);
 454     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 455     __ movw(rscratch1, (int)__LINE__);
 456     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 457 
 458     // complete return to VM
 459     assert(StubRoutines::_call_stub_return_address != NULL,
 460            "_call_stub_return_address must have been generated before");
 461     __ b(StubRoutines::_call_stub_return_address);
 462 
 463     return start;
 464   }
 465 
 466   // Continuation point for runtime calls returning with a pending
 467   // exception.  The pending exception check happened in the runtime
 468   // or native call stub.  The pending exception in Thread is
 469   // converted into a Java-level exception.
 470   //
 471   // Contract with Java-level exception handlers:
 472   // r0: exception
 473   // r3: throwing pc
 474   //
 475   // NOTE: At entry of this stub, exception-pc must be in LR !!
 476 
 477   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog
 479 
 480   address generate_forward_exception() {
 481     StubCodeMark mark(this, "StubRoutines", "forward exception");
 482     address start = __ pc();
 483 
 484     // Upon entry, LR points to the return address returning into
 485     // Java (interpreted or compiled) code; i.e., the return address
 486     // becomes the throwing pc.
 487     //
 488     // Arguments pushed before the runtime call are still on the stack
 489     // but the exception handler will reset the stack pointer ->
 490     // ignore them.  A potential result in registers can be ignored as
 491     // well.
 492 
 493 #ifdef ASSERT
 494     // make sure this code is only executed if there is a pending exception
 495     {
 496       Label L;
 497       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 498       __ cbnz(rscratch1, L);
 499       __ stop("StubRoutines::forward exception: no pending exception (1)");
 500       __ bind(L);
 501     }
 502 #endif
 503 
 504     // compute exception handler into r19
 505 
 506     // call the VM to find the handler address associated with the
 507     // caller address. pass thread in r0 and caller pc (ret address)
 508     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 509     // the stack.
 510     __ mov(c_rarg1, lr);
 511     // lr will be trashed by the VM call so we move it to R19
 512     // (callee-saved) because we also need to pass it to the handler
 513     // returned by this call.
 514     __ mov(r19, lr);
 515     BLOCK_COMMENT("call exception_handler_for_return_address");
 516     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 517                          SharedRuntime::exception_handler_for_return_address),
 518                     rthread, c_rarg1);
 519     // we should not really care that lr is no longer the callee
 520     // address. we saved the value the handler needs in r19 so we can
 521     // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
 523     // the PC for the frame above the handler belongs to a compiled
 524     // Java method. So, we restore lr here to satisfy that assert.
 525     __ mov(lr, r19);
 526     // setup r0 & r3 & clear pending exception
 527     __ mov(r3, r19);
 528     __ mov(r19, r0);
 529     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 530     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 531 
 532 #ifdef ASSERT
 533     // make sure exception is set
 534     {
 535       Label L;
 536       __ cbnz(r0, L);
 537       __ stop("StubRoutines::forward exception: no pending exception (2)");
 538       __ bind(L);
 539     }
 540 #endif
 541 
 542     // continue at exception handler
 543     // r0: exception
 544     // r3: throwing pc
 545     // r19: exception handler
 546     __ verify_oop(r0);
 547     __ br(r19);
 548 
 549     return start;
 550   }
 551 
 552   // Non-destructive plausibility checks for oops
 553   //
 554   // Arguments:
 555   //    r0: oop to verify
 556   //    rscratch1: error message
 557   //
 558   // Stack after saving c_rarg3:
 559   //    [tos + 0]: saved c_rarg3
 560   //    [tos + 1]: saved c_rarg2
 561   //    [tos + 2]: saved lr
 562   //    [tos + 3]: saved rscratch2
 563   //    [tos + 4]: saved r0
 564   //    [tos + 5]: saved rscratch1
 565   address generate_verify_oop() {
 566 
 567     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 568     address start = __ pc();
 569 
 570     Label exit, error;
 571 
 572     // save c_rarg2 and c_rarg3
 573     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 574 
 575     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 576     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 577     __ ldr(c_rarg3, Address(c_rarg2));
 578     __ add(c_rarg3, c_rarg3, 1);
 579     __ str(c_rarg3, Address(c_rarg2));
 580 
 581     // object is in r0
 582     // make sure object is 'reasonable'
 583     __ cbz(r0, exit); // if obj is NULL it is OK
 584 
 585 #if INCLUDE_ZGC
 586     if (UseZGC) {
 587       // Check if mask is good.
 588       // verifies that ZAddressBadMask & r0 == 0
 589       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 590       __ andr(c_rarg2, r0, c_rarg3);
 591       __ cbnz(c_rarg2, error);
 592     }
 593 #endif
 594 
 595     // Check if the oop is in the right area of memory
 596     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 597     __ andr(c_rarg2, r0, c_rarg3);
 598     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 599 
 600     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 601     // instruction here because the flags register is live.
 602     __ eor(c_rarg2, c_rarg2, c_rarg3);
 603     __ cbnz(c_rarg2, error);
 604 
    // make sure klass is 'reasonable', i.e. not NULL.
 606     __ load_klass(r0, r0);  // get klass
 607     __ cbz(r0, error);      // if klass is NULL it is broken
 608 
 609     // return if everything seems ok
 610     __ bind(exit);
 611 
 612     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 613     __ ret(lr);
 614 
 615     // handle errors
 616     __ bind(error);
 617     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 618 
 619     __ push(RegSet::range(r0, r29), sp);
 620     // debug(char* msg, int64_t pc, int64_t regs[])
 621     __ mov(c_rarg0, rscratch1);      // pass address of error message
 622     __ mov(c_rarg1, lr);             // pass return address
 623     __ mov(c_rarg2, sp);             // pass address of regs on stack
 624 #ifndef PRODUCT
 625     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 626 #endif
 627     BLOCK_COMMENT("call MacroAssembler::debug");
 628     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 629     __ blrt(rscratch1, 3, 0, 1);
 630 
 631     return start;
 632   }
 633 
 634   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 635 
 636   // The inner part of zero_words().  This is the bulk operation,
 637   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 638   // caller is responsible for zeroing the last few words.
 639   //
 640   // Inputs:
 641   // r10: the HeapWord-aligned base address of an array to zero.
 642   // r11: the count in HeapWords, r11 > 0.
 643   //
 644   // Returns r10 and r11, adjusted for the caller to clear.
 645   // r10: the base address of the tail of words left to clear.
 646   // r11: the number of words in the tail.
 647   //      r11 < MacroAssembler::zero_words_block_size.
 648 
 649   address generate_zero_blocks() {
 650     Label done;
 651     Label base_aligned;
 652 
 653     Register base = r10, cnt = r11;
 654 
 655     __ align(CodeEntryAlignment);
 656     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 657     address start = __ pc();
 658 
 659     if (UseBlockZeroing) {
 660       int zva_length = VM_Version::zva_length();
 661 
      // Ensure the ZVA length is a multiple of 16. This is required
      // by the subsequent operations.
 664       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 665 
 666       __ tbz(base, 3, base_aligned);
 667       __ str(zr, Address(__ post(base, 8)));
 668       __ sub(cnt, cnt, 1);
 669       __ bind(base_aligned);
 670 
 671       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 672       // alignment.
 673       Label small;
 674       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
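      // low_limit is in bytes but cnt is in words, hence the >> 3
      // (log2 of the word size) before the comparison below.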
 675       __ subs(rscratch1, cnt, low_limit >> 3);
 676       __ br(Assembler::LT, small);
 677       __ zero_dcache_blocks(base, cnt);
 678       __ bind(small);
 679     }
 680 
 681     {
 682       // Number of stp instructions we'll unroll
 683       const int unroll =
 684         MacroAssembler::zero_words_block_size / 2;
 685       // Clear the remaining blocks.
 686       Label loop;
 687       __ subs(cnt, cnt, unroll * 2);
 688       __ br(Assembler::LT, done);
 689       __ bind(loop);
 690       for (int i = 0; i < unroll; i++)
 691         __ stp(zr, zr, __ post(base, 16));
 692       __ subs(cnt, cnt, unroll * 2);
 693       __ br(Assembler::GE, loop);
 694       __ bind(done);
 695       __ add(cnt, cnt, unroll * 2);
 696     }
 697 
 698     __ ret(lr);
 699 
 700     return start;
 701   }
 702 
 703 
 704   typedef enum {
 705     copy_forwards = 1,
 706     copy_backwards = -1
 707   } copy_direction;
 708 
 709   // Bulk copy of blocks of 8 words.
 710   //
 711   // count is a count of words.
 712   //
 713   // Precondition: count >= 8
 714   //
 715   // Postconditions:
 716   //
 717   // The least significant bit of count contains the remaining count
 718   // of words to copy.  The rest of count is trash.
 719   //
 720   // s and d are adjusted to point to the remaining words to copy
 721   //
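  // n.b. unit is +/- wordSize according to direction; bias is the
  // byte span of one ldp/stp pair (or one ldpq/stpq pair when SIMD is
  // used) and is subtracted from s and d for forward copies so that
  // the same positive multiples of unit serve as offsets for both
  // directions.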
 722   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 723                            copy_direction direction) {
 724     int unit = wordSize * direction;
 725     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 726 
 727     int offset;
 728     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 729       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 730     const Register stride = r13;
 731 
 732     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 733     assert_different_registers(s, d, count, rscratch1);
 734 
 735     Label again, drain;
 736     const char *stub_name;
 737     if (direction == copy_forwards)
 738       stub_name = "forward_copy_longs";
 739     else
 740       stub_name = "backward_copy_longs";
 741 
 742     __ align(CodeEntryAlignment);
 743 
 744     StubCodeMark mark(this, "StubRoutines", stub_name);
 745 
 746     __ bind(start);
 747 
 748     Label unaligned_copy_long;
 749     if (AvoidUnalignedAccesses) {
 750       __ tbnz(d, 3, unaligned_copy_long);
 751     }
 752 
 753     if (direction == copy_forwards) {
 754       __ sub(s, s, bias);
 755       __ sub(d, d, bias);
 756     }
 757 
 758 #ifdef ASSERT
 759     // Make sure we are never given < 8 words
 760     {
 761       Label L;
 762       __ cmp(count, (u1)8);
 763       __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
 765       __ bind(L);
 766     }
 767 #endif
 768 
 769     // Fill 8 registers
 770     if (UseSIMDForMemoryOps) {
 771       __ ldpq(v0, v1, Address(s, 4 * unit));
 772       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 773     } else {
 774       __ ldp(t0, t1, Address(s, 2 * unit));
 775       __ ldp(t2, t3, Address(s, 4 * unit));
 776       __ ldp(t4, t5, Address(s, 6 * unit));
 777       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 778     }
 779 
 780     __ subs(count, count, 16);
 781     __ br(Assembler::LO, drain);
 782 
 783     int prefetch = PrefetchCopyIntervalInBytes;
 784     bool use_stride = false;
 785     if (direction == copy_backwards) {
 786        use_stride = prefetch > 256;
 787        prefetch = -prefetch;
 788        if (use_stride) __ mov(stride, prefetch);
 789     }
 790 
 791     __ bind(again);
 792 
 793     if (PrefetchCopyIntervalInBytes > 0)
 794       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 795 
 796     if (UseSIMDForMemoryOps) {
 797       __ stpq(v0, v1, Address(d, 4 * unit));
 798       __ ldpq(v0, v1, Address(s, 4 * unit));
 799       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 800       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 801     } else {
 802       __ stp(t0, t1, Address(d, 2 * unit));
 803       __ ldp(t0, t1, Address(s, 2 * unit));
 804       __ stp(t2, t3, Address(d, 4 * unit));
 805       __ ldp(t2, t3, Address(s, 4 * unit));
 806       __ stp(t4, t5, Address(d, 6 * unit));
 807       __ ldp(t4, t5, Address(s, 6 * unit));
 808       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 809       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 810     }
 811 
 812     __ subs(count, count, 8);
 813     __ br(Assembler::HS, again);
 814 
 815     // Drain
 816     __ bind(drain);
 817     if (UseSIMDForMemoryOps) {
 818       __ stpq(v0, v1, Address(d, 4 * unit));
 819       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 820     } else {
 821       __ stp(t0, t1, Address(d, 2 * unit));
 822       __ stp(t2, t3, Address(d, 4 * unit));
 823       __ stp(t4, t5, Address(d, 6 * unit));
 824       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 825     }
 826 
 827     {
 828       Label L1, L2;
 829       __ tbz(count, exact_log2(4), L1);
 830       if (UseSIMDForMemoryOps) {
 831         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 832         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 833       } else {
 834         __ ldp(t0, t1, Address(s, 2 * unit));
 835         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 836         __ stp(t0, t1, Address(d, 2 * unit));
 837         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 838       }
 839       __ bind(L1);
 840 
 841       if (direction == copy_forwards) {
 842         __ add(s, s, bias);
 843         __ add(d, d, bias);
 844       }
 845 
 846       __ tbz(count, 1, L2);
 847       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 848       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 849       __ bind(L2);
 850     }
 851 
 852     __ ret(lr);
 853 
 854     if (AvoidUnalignedAccesses) {
 855       Label drain, again;
 856       // Register order for storing. Order is different for backward copy.
 857 
 858       __ bind(unaligned_copy_long);
 859 
      // source address is even word (16-byte) aligned, target is odd
      // word aligned
 861       //
 862       // when forward copying word pairs we read long pairs at offsets
 863       // {0, 2, 4, 6} (in long words). when backwards copying we read
 864       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 865       // address by -2 in the forwards case so we can compute the
 866       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 867       // or -1.
 868       //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1, which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8}, i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
 877 
 878       if (direction == copy_forwards) {
 879         __ sub(s, s, 16);
 880         __ sub(d, d, 8);
 881       }
 882 
 883       // Fill 8 registers
 884       //
 885       // for forwards copy s was offset by -16 from the original input
 886       // value of s so the register contents are at these offsets
      // relative to the 64 byte block addressed by that original input
 888       // and so on for each successive 64 byte block when s is updated
 889       //
 890       // t0 at offset 0,  t1 at offset 8
 891       // t2 at offset 16, t3 at offset 24
 892       // t4 at offset 32, t5 at offset 40
 893       // t6 at offset 48, t7 at offset 56
 894 
 895       // for backwards copy s was not offset so the register contents
 896       // are at these offsets into the preceding 64 byte block
 897       // relative to that original input and so on for each successive
 898       // preceding 64 byte block when s is updated. this explains the
 899       // slightly counter-intuitive looking pattern of register usage
 900       // in the stp instructions for backwards copy.
 901       //
 902       // t0 at offset -16, t1 at offset -8
 903       // t2 at offset -32, t3 at offset -24
 904       // t4 at offset -48, t5 at offset -40
 905       // t6 at offset -64, t7 at offset -56
 906 
 907       __ ldp(t0, t1, Address(s, 2 * unit));
 908       __ ldp(t2, t3, Address(s, 4 * unit));
 909       __ ldp(t4, t5, Address(s, 6 * unit));
 910       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 911 
 912       __ subs(count, count, 16);
 913       __ br(Assembler::LO, drain);
 914 
 915       int prefetch = PrefetchCopyIntervalInBytes;
 916       bool use_stride = false;
 917       if (direction == copy_backwards) {
 918          use_stride = prefetch > 256;
 919          prefetch = -prefetch;
 920          if (use_stride) __ mov(stride, prefetch);
 921       }
 922 
 923       __ bind(again);
 924 
 925       if (PrefetchCopyIntervalInBytes > 0)
 926         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 927 
 928       if (direction == copy_forwards) {
 929        // allowing for the offset of -8 the store instructions place
       // registers into the target 64 byte block at the following
 931        // offsets
 932        //
 933        // t0 at offset 0
 934        // t1 at offset 8,  t2 at offset 16
 935        // t3 at offset 24, t4 at offset 32
 936        // t5 at offset 40, t6 at offset 48
 937        // t7 at offset 56
 938 
 939         __ str(t0, Address(d, 1 * unit));
 940         __ stp(t1, t2, Address(d, 2 * unit));
 941         __ ldp(t0, t1, Address(s, 2 * unit));
 942         __ stp(t3, t4, Address(d, 4 * unit));
 943         __ ldp(t2, t3, Address(s, 4 * unit));
 944         __ stp(t5, t6, Address(d, 6 * unit));
 945         __ ldp(t4, t5, Address(s, 6 * unit));
 946         __ str(t7, Address(__ pre(d, 8 * unit)));
 947         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 948       } else {
 949        // d was not offset when we started so the registers are
       // written into the 64 byte block preceding d with the following
 951        // offsets
 952        //
 953        // t1 at offset -8
 954        // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
 956        // t7 at offset -56, t4 at offset -48
 957        //                   t6 at offset -64
 958        //
 959        // note that this matches the offsets previously noted for the
 960        // loads
 961 
 962         __ str(t1, Address(d, 1 * unit));
 963         __ stp(t3, t0, Address(d, 3 * unit));
 964         __ ldp(t0, t1, Address(s, 2 * unit));
 965         __ stp(t5, t2, Address(d, 5 * unit));
 966         __ ldp(t2, t3, Address(s, 4 * unit));
 967         __ stp(t7, t4, Address(d, 7 * unit));
 968         __ ldp(t4, t5, Address(s, 6 * unit));
 969         __ str(t6, Address(__ pre(d, 8 * unit)));
 970         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 971       }
 972 
 973       __ subs(count, count, 8);
 974       __ br(Assembler::HS, again);
 975 
 976       // Drain
 977       //
 978       // this uses the same pattern of offsets and register arguments
 979       // as above
 980       __ bind(drain);
 981       if (direction == copy_forwards) {
 982         __ str(t0, Address(d, 1 * unit));
 983         __ stp(t1, t2, Address(d, 2 * unit));
 984         __ stp(t3, t4, Address(d, 4 * unit));
 985         __ stp(t5, t6, Address(d, 6 * unit));
 986         __ str(t7, Address(__ pre(d, 8 * unit)));
 987       } else {
 988         __ str(t1, Address(d, 1 * unit));
 989         __ stp(t3, t0, Address(d, 3 * unit));
 990         __ stp(t5, t2, Address(d, 5 * unit));
 991         __ stp(t7, t4, Address(d, 7 * unit));
 992         __ str(t6, Address(__ pre(d, 8 * unit)));
 993       }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
 998       {
 999         Label L1, L2;
1000         __ tbz(count, exact_log2(4), L1);
       // this is the same as above but copying only 4 longs hence
       // with only one intervening stp between the str instructions
       // but note that the offsets and registers still follow the
       // same pattern
1005         __ ldp(t0, t1, Address(s, 2 * unit));
1006         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1007         if (direction == copy_forwards) {
1008           __ str(t0, Address(d, 1 * unit));
1009           __ stp(t1, t2, Address(d, 2 * unit));
1010           __ str(t3, Address(__ pre(d, 4 * unit)));
1011         } else {
1012           __ str(t1, Address(d, 1 * unit));
1013           __ stp(t3, t0, Address(d, 3 * unit));
1014           __ str(t2, Address(__ pre(d, 4 * unit)));
1015         }
1016         __ bind(L1);
1017 
1018         __ tbz(count, 1, L2);
1019        // this is the same as above but copying only 2 longs hence
1020        // there is no intervening stp between the str instructions
1021        // but note that the offset and register patterns are still
1022        // the same
1023         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1024         if (direction == copy_forwards) {
1025           __ str(t0, Address(d, 1 * unit));
1026           __ str(t1, Address(__ pre(d, 2 * unit)));
1027         } else {
1028           __ str(t1, Address(d, 1 * unit));
1029           __ str(t0, Address(__ pre(d, 2 * unit)));
1030         }
1031         __ bind(L2);
1032 
       // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written
1035 
1036        if (direction == copy_forwards) {
1037          __ add(s, s, 16);
1038          __ add(d, d, 8);
1039        }
1040 
1041       }
1042 
1043       __ ret(lr);
1044       }
1045   }
1046 
1047   // Small copy: less than 16 bytes.
1048   //
1049   // NB: Ignores all of the bits of count which represent more than 15
1050   // bytes, so a caller doesn't have to mask them.
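  //
  // Illustrative example: for a byte copy (step == 1, granularity == 1)
  // with count == 13 (0b1101) the bit tests below copy 8 + 4 + 1 bytes.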
1051 
1052   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1053     bool is_backwards = step < 0;
1054     size_t granularity = uabs(step);
1055     int direction = is_backwards ? -1 : 1;
1056     int unit = wordSize * direction;
1057 
1058     Label Lword, Lint, Lshort, Lbyte;
1059 
1060     assert(granularity
1061            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1062 
1063     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1064 
1065     // ??? I don't know if this bit-test-and-branch is the right thing
1066     // to do.  It does a lot of jumping, resulting in several
1067     // mispredicted branches.  It might make more sense to do this
1068     // with something like Duff's device with a single computed branch.
1069 
1070     __ tbz(count, 3 - exact_log2(granularity), Lword);
1071     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1072     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1073     __ bind(Lword);
1074 
1075     if (granularity <= sizeof (jint)) {
1076       __ tbz(count, 2 - exact_log2(granularity), Lint);
1077       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1078       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1079       __ bind(Lint);
1080     }
1081 
1082     if (granularity <= sizeof (jshort)) {
1083       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1084       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1085       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1086       __ bind(Lshort);
1087     }
1088 
1089     if (granularity <= sizeof (jbyte)) {
1090       __ tbz(count, 0, Lbyte);
1091       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1092       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1093       __ bind(Lbyte);
1094     }
1095   }
1096 
1097   Label copy_f, copy_b;
1098 
1099   // All-singing all-dancing memory copy.
1100   //
1101   // Copy count units of memory from s to d.  The size of a unit is
1102   // step, which can be positive or negative depending on the direction
1103   // of copy.  If is_aligned is false, we align the source address.
1104   //
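  // Roughly: counts of up to 80 bytes (96 with SIMD) are copied with
  // the straight-line cases below; anything larger aligns s to a
  // 2-word boundary and calls the bulk copy_f/copy_b stubs, with
  // copy_memory_small handling any misaligned head and the tail.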
1105 
1106   void copy_memory(bool is_aligned, Register s, Register d,
1107                    Register count, Register tmp, int step) {
1108     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1109     bool is_backwards = step < 0;
1110     int granularity = uabs(step);
1111     const Register t0 = r3, t1 = r4;
1112 
    // Copies of <= 96 bytes are done inline. Direction doesn't matter
    // because we always load all the data before writing anything
1115     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1116     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1117     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1118     const Register send = r17, dend = r18;
1119 
1120     if (PrefetchCopyIntervalInBytes > 0)
1121       __ prfm(Address(s, 0), PLDL1KEEP);
1122     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1123     __ br(Assembler::HI, copy_big);
1124 
1125     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1126     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
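    // send/dend point just past the last element; the fixed-size cases
    // below copy a leading chunk from s/d plus a possibly overlapping
    // trailing chunk ending at send/dend (e.g. a 20 byte copy is done
    // as two overlapping 16 byte moves).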
1127 
1128     __ cmp(count, u1(16/granularity));
1129     __ br(Assembler::LS, copy16);
1130 
1131     __ cmp(count, u1(64/granularity));
1132     __ br(Assembler::HI, copy80);
1133 
1134     __ cmp(count, u1(32/granularity));
1135     __ br(Assembler::LS, copy32);
1136 
1137     // 33..64 bytes
1138     if (UseSIMDForMemoryOps) {
1139       __ ldpq(v0, v1, Address(s, 0));
1140       __ ldpq(v2, v3, Address(send, -32));
1141       __ stpq(v0, v1, Address(d, 0));
1142       __ stpq(v2, v3, Address(dend, -32));
1143     } else {
1144       __ ldp(t0, t1, Address(s, 0));
1145       __ ldp(t2, t3, Address(s, 16));
1146       __ ldp(t4, t5, Address(send, -32));
1147       __ ldp(t6, t7, Address(send, -16));
1148 
1149       __ stp(t0, t1, Address(d, 0));
1150       __ stp(t2, t3, Address(d, 16));
1151       __ stp(t4, t5, Address(dend, -32));
1152       __ stp(t6, t7, Address(dend, -16));
1153     }
1154     __ b(finish);
1155 
1156     // 17..32 bytes
1157     __ bind(copy32);
1158     __ ldp(t0, t1, Address(s, 0));
1159     __ ldp(t2, t3, Address(send, -16));
1160     __ stp(t0, t1, Address(d, 0));
1161     __ stp(t2, t3, Address(dend, -16));
1162     __ b(finish);
1163 
1164     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
1166     __ bind(copy80);
1167     if (UseSIMDForMemoryOps) {
1168       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1169       __ ldpq(v4, v5, Address(send, -32));
1170       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1171       __ stpq(v4, v5, Address(dend, -32));
1172     } else {
1173       __ ldp(t0, t1, Address(s, 0));
1174       __ ldp(t2, t3, Address(s, 16));
1175       __ ldp(t4, t5, Address(s, 32));
1176       __ ldp(t6, t7, Address(s, 48));
1177       __ ldp(t8, t9, Address(send, -16));
1178 
1179       __ stp(t0, t1, Address(d, 0));
1180       __ stp(t2, t3, Address(d, 16));
1181       __ stp(t4, t5, Address(d, 32));
1182       __ stp(t6, t7, Address(d, 48));
1183       __ stp(t8, t9, Address(dend, -16));
1184     }
1185     __ b(finish);
1186 
1187     // 0..16 bytes
1188     __ bind(copy16);
1189     __ cmp(count, u1(8/granularity));
1190     __ br(Assembler::LO, copy8);
1191 
1192     // 8..16 bytes
1193     __ ldr(t0, Address(s, 0));
1194     __ ldr(t1, Address(send, -8));
1195     __ str(t0, Address(d, 0));
1196     __ str(t1, Address(dend, -8));
1197     __ b(finish);
1198 
1199     if (granularity < 8) {
1200       // 4..7 bytes
1201       __ bind(copy8);
1202       __ tbz(count, 2 - exact_log2(granularity), copy4);
1203       __ ldrw(t0, Address(s, 0));
1204       __ ldrw(t1, Address(send, -4));
1205       __ strw(t0, Address(d, 0));
1206       __ strw(t1, Address(dend, -4));
1207       __ b(finish);
1208       if (granularity < 4) {
1209         // 0..3 bytes
1210         __ bind(copy4);
1211         __ cbz(count, finish); // get rid of 0 case
1212         if (granularity == 2) {
1213           __ ldrh(t0, Address(s, 0));
1214           __ strh(t0, Address(d, 0));
1215         } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1: (s+0)->(d+0), count == 2,3: (s+1)->(d+1)).
          // This means that in the 1 byte case we load/store the same
          // byte 3 times.
1222           __ lsr(count, count, 1);
1223           __ ldrb(t0, Address(s, 0));
1224           __ ldrb(t1, Address(send, -1));
1225           __ ldrb(t2, Address(s, count));
1226           __ strb(t0, Address(d, 0));
1227           __ strb(t1, Address(dend, -1));
1228           __ strb(t2, Address(d, count));
1229         }
1230         __ b(finish);
1231       }
1232     }
1233 
1234     __ bind(copy_big);
1235     if (is_backwards) {
1236       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1237       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1238     }
1239 
    // Now that we've got the small case out of the way we can align
    // the source address on a 2-word boundary.
1242 
1243     Label aligned;
1244 
1245     if (is_aligned) {
1246       // We may have to adjust by 1 word to get s 2-word-aligned.
1247       __ tbz(s, exact_log2(wordSize), aligned);
1248       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1249       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1250       __ sub(count, count, wordSize/granularity);
1251     } else {
1252       if (is_backwards) {
1253         __ andr(rscratch2, s, 2 * wordSize - 1);
1254       } else {
1255         __ neg(rscratch2, s);
1256         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1257       }
1258       // rscratch2 is the byte adjustment needed to align s.
1259       __ cbz(rscratch2, aligned);
1260       int shift = exact_log2(granularity);
1261       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1262       __ sub(count, count, rscratch2);
1263 
1264 #if 0
1265       // ?? This code is only correct for a disjoint copy.  It may or
1266       // may not make sense to use it in that case.
1267 
1268       // Copy the first pair; s and d may not be aligned.
1269       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1270       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1271 
1272       // Align s and d, adjust count
1273       if (is_backwards) {
1274         __ sub(s, s, rscratch2);
1275         __ sub(d, d, rscratch2);
1276       } else {
1277         __ add(s, s, rscratch2);
1278         __ add(d, d, rscratch2);
1279       }
1280 #else
1281       copy_memory_small(s, d, rscratch2, rscratch1, step);
1282 #endif
1283     }
1284 
1285     __ bind(aligned);
1286 
1287     // s is now 2-word-aligned.
1288 
1289     // We have a count of units and some trailing bytes.  Adjust the
1290     // count and do a bulk copy of words.
1291     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1292     if (direction == copy_forwards)
1293       __ bl(copy_f);
1294     else
1295       __ bl(copy_b);
1296 
1297     // And the tail.
1298     copy_memory_small(s, d, count, tmp, step);
1299 
1300     if (granularity >= 8) __ bind(copy8);
1301     if (granularity >= 4) __ bind(copy4);
1302     __ bind(finish);
1303   }
1304 
1305 
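  // In debug builds, poison r3..r18 with 0xdeadbeefdeadbeef so that
  // stale values left over after a stub call are easy to spot.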
1306   void clobber_registers() {
1307 #ifdef ASSERT
1308     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1309     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1310     for (Register r = r3; r <= r18; r++)
1311       if (r != rscratch1) __ mov(r, rscratch1);
1312 #endif
1313   }
1314 
1315   // Scan over array at a for count oops, verifying each one.
1316   // Preserves a and count, clobbers rscratch1 and rscratch2.
1317   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1318     Label loop, end;
1319     __ mov(rscratch1, a);
1320     __ mov(rscratch2, zr);
1321     __ bind(loop);
1322     __ cmp(rscratch2, count);
1323     __ br(Assembler::HS, end);
1324     if (size == (size_t)wordSize) {
1325       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1326       __ verify_oop(temp);
1327     } else {
1328       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1329       __ decode_heap_oop(temp); // calls verify_oop
1330     }
1331     __ add(rscratch2, rscratch2, size);
1332     __ b(loop);
1333     __ bind(end);
1334   }
1335 
1336   // Arguments:
1337   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1338   //             ignored
1339   //   is_oop  - true => oop array, so generate store check code
1340   //   name    - stub name string
1341   //
1342   // Inputs:
1343   //   c_rarg0   - source array address
1344   //   c_rarg1   - destination array address
1345   //   c_rarg2   - element count, treated as ssize_t, can be zero
1346   //
1347   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1348   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1350   //
1351   // Side Effects:
  //   *entry is set to the no-overlap entry point so that it can be
  //   used by the corresponding conjoint copy routine.
1354   //
1355   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1356                                   const char *name, bool dest_uninitialized = false) {
1357     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1358     RegSet saved_reg = RegSet::of(s, d, count);
1359     __ align(CodeEntryAlignment);
1360     StubCodeMark mark(this, "StubRoutines", name);
1361     address start = __ pc();
1362     __ enter();
1363 
1364     if (entry != NULL) {
1365       *entry = __ pc();
1366       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1367       BLOCK_COMMENT("Entry:");
1368     }
1369 
1370     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1371     if (dest_uninitialized) {
1372       decorators |= IS_DEST_UNINITIALIZED;
1373     }
1374     if (aligned) {
1375       decorators |= ARRAYCOPY_ALIGNED;
1376     }
1377 
1378     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1379     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1380 
1381     if (is_oop) {
1382       // save regs before copy_memory
1383       __ push(RegSet::of(d, count), sp);
1384     }
1385     copy_memory(aligned, s, d, count, rscratch1, size);
1386 
1387     if (is_oop) {
1388       __ pop(RegSet::of(d, count), sp);
1389       if (VerifyOops)
1390         verify_oop_array(size, d, count, r16);
1391       __ sub(count, count, 1); // make an inclusive end pointer
1392       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1393     }
1394 
1395     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1396 
1397     __ leave();
1398     __ mov(r0, zr); // return 0
1399     __ ret(lr);
1400 #ifdef BUILTIN_SIM
1401     {
1402       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1403       sim->notifyCompile(const_cast<char*>(name), start);
1404     }
1405 #endif
1406     return start;
1407   }
1408 
1409   // Arguments:
1410   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1411   //             ignored
1412   //   is_oop  - true => oop array, so generate store check code
1413   //   name    - stub name string
1414   //
1415   // Inputs:
1416   //   c_rarg0   - source array address
1417   //   c_rarg1   - destination array address
1418   //   c_rarg2   - element count, treated as ssize_t, can be zero
1419   //
1420   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1421   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1423   //
1424   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1425                                  address *entry, const char *name,
1426                                  bool dest_uninitialized = false) {
1427     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1428     RegSet saved_regs = RegSet::of(s, d, count);
1429     StubCodeMark mark(this, "StubRoutines", name);
1430     address start = __ pc();
1431     __ enter();
1432 
1433     if (entry != NULL) {
1434       *entry = __ pc();
1435       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1436       BLOCK_COMMENT("Entry:");
1437     }
1438 
1439     // use fwd copy when (d-s) above_equal (count*size)
1440     __ sub(rscratch1, d, s);
1441     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1442     __ br(Assembler::HS, nooverlap_target);
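    // (the compare is unsigned, so when d is below s the subtraction
    //  wraps and the branch is also taken: a forward copy is safe
    //  whenever the destination starts below the source)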
1443 
1444     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1445     if (dest_uninitialized) {
1446       decorators |= IS_DEST_UNINITIALIZED;
1447     }
1448     if (aligned) {
1449       decorators |= ARRAYCOPY_ALIGNED;
1450     }
1451 
1452     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1453     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1454 
1455     if (is_oop) {
1456       // save regs before copy_memory
1457       __ push(RegSet::of(d, count), sp);
1458     }
1459     copy_memory(aligned, s, d, count, rscratch1, -size);
1460     if (is_oop) {
1461       __ pop(RegSet::of(d, count), sp);
1462       if (VerifyOops)
1463         verify_oop_array(size, d, count, r16);
1464       __ sub(count, count, 1); // make an inclusive end pointer
1465       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1466     }
1467     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1468     __ leave();
1469     __ mov(r0, zr); // return 0
1470     __ ret(lr);
1471 #ifdef BUILTIN_SIM
1472     {
1473       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1474       sim->notifyCompile(const_cast<char*>(name), start);
1475     }
1476 #endif
1477     return start;
1478   }
1479 
1480   // Arguments:
1481   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1482   //             ignored
1483   //   name    - stub name string
1484   //
1485   // Inputs:
1486   //   c_rarg0   - source array address
1487   //   c_rarg1   - destination array address
1488   //   c_rarg2   - element count, treated as ssize_t, can be zero
1489   //
1490   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1491   // we let the hardware handle it.  The one to eight bytes within words,
1492   // dwords or qwords that span cache line boundaries will still be loaded
1493   // and stored atomically.
1494   //
1495   // Side Effects:
1496   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1504   //   used by generate_conjoint_byte_copy().
1505   //
1506   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1507     const bool not_oop = false;
1508     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1509   }
1510 
1511   // Arguments:
1512   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1513   //             ignored
1514   //   name    - stub name string
1515   //
1516   // Inputs:
1517   //   c_rarg0   - source array address
1518   //   c_rarg1   - destination array address
1519   //   c_rarg2   - element count, treated as ssize_t, can be zero
1520   //
1521   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1522   // we let the hardware handle it.  The one to eight bytes within words,
1523   // dwords or qwords that span cache line boundaries will still be loaded
1524   // and stored atomically.
1525   //
1526   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1527                                       address* entry, const char *name) {
1528     const bool not_oop = false;
1529     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1530   }
1531 
1532   // Arguments:
1533   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1534   //             ignored
1535   //   name    - stub name string
1536   //
1537   // Inputs:
1538   //   c_rarg0   - source array address
1539   //   c_rarg1   - destination array address
1540   //   c_rarg2   - element count, treated as ssize_t, can be zero
1541   //
1542   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1543   // let the hardware handle it.  The two or four words within dwords
1544   // or qwords that span cache line boundaries will still be loaded
1545   // and stored atomically.
1546   //
1547   // Side Effects:
1548   //   disjoint_short_copy_entry is set to the no-overlap entry point
1549   //   used by generate_conjoint_short_copy().
1550   //
1551   address generate_disjoint_short_copy(bool aligned,
1552                                        address* entry, const char *name) {
1553     const bool not_oop = false;
1554     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1555   }
1556 
1557   // Arguments:
1558   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1559   //             ignored
1560   //   name    - stub name string
1561   //
1562   // Inputs:
1563   //   c_rarg0   - source array address
1564   //   c_rarg1   - destination array address
1565   //   c_rarg2   - element count, treated as ssize_t, can be zero
1566   //
1567   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1568   // let the hardware handle it.  The two or four words within dwords
1569   // or qwords that span cache line boundaries will still be loaded
1570   // and stored atomically.
1571   //
1572   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1573                                        address *entry, const char *name) {
1574     const bool not_oop = false;
1575     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1576   }
1577
1578   // Arguments:
1579   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1580   //             ignored
1581   //   name    - stub name string
1582   //
1583   // Inputs:
1584   //   c_rarg0   - source array address
1585   //   c_rarg1   - destination array address
1586   //   c_rarg2   - element count, treated as ssize_t, can be zero
1587   //
1588   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1589   // the hardware handle it.  The two dwords within qwords that span
1590   // cache line boundaries will still be loaded and stored atomically.
1591   //
1592   // Side Effects:
1593   //   disjoint_int_copy_entry is set to the no-overlap entry point
1594   //   used by generate_conjoint_int_oop_copy().
1595   //
1596   address generate_disjoint_int_copy(bool aligned, address *entry,
1597                                          const char *name, bool dest_uninitialized = false) {
1598     const bool not_oop = false;
1599     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1600   }
1601 
1602   // Arguments:
1603   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1604   //             ignored
1605   //   name    - stub name string
1606   //
1607   // Inputs:
1608   //   c_rarg0   - source array address
1609   //   c_rarg1   - destination array address
1610   //   c_rarg2   - element count, treated as ssize_t, can be zero
1611   //
1612   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1613   // the hardware handle it.  The two dwords within qwords that span
1614   // cache line boundaries will still be loaded and stored atomically.
1615   //
1616   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1617                                      address *entry, const char *name,
1618                                      bool dest_uninitialized = false) {
1619     const bool not_oop = false;
1620     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1621   }
1622 
1623 
1624   // Arguments:
1625   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1626   //             ignored
1627   //   name    - stub name string
1628   //
1629   // Inputs:
1630   //   c_rarg0   - source array address
1631   //   c_rarg1   - destination array address
1632   //   c_rarg2   - element count, treated as size_t, can be zero
1633   //
1634   // Side Effects:
1635   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1636   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1637   //
1638   address generate_disjoint_long_copy(bool aligned, address *entry,
1639                                           const char *name, bool dest_uninitialized = false) {
1640     const bool not_oop = false;
1641     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1642   }
1643 
1644   // Arguments:
1645   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1646   //             ignored
1647   //   name    - stub name string
1648   //
1649   // Inputs:
1650   //   c_rarg0   - source array address
1651   //   c_rarg1   - destination array address
1652   //   c_rarg2   - element count, treated as size_t, can be zero
1653   //
1654   address generate_conjoint_long_copy(bool aligned,
1655                                       address nooverlap_target, address *entry,
1656                                       const char *name, bool dest_uninitialized = false) {
1657     const bool not_oop = false;
1658     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1659   }
1660 
1661   // Arguments:
1662   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1663   //             ignored
1664   //   name    - stub name string
1665   //
1666   // Inputs:
1667   //   c_rarg0   - source array address
1668   //   c_rarg1   - destination array address
1669   //   c_rarg2   - element count, treated as size_t, can be zero
1670   //
1671   // Side Effects:
1672   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1673   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1674   //
1675   address generate_disjoint_oop_copy(bool aligned, address *entry,
1676                                      const char *name, bool dest_uninitialized) {
1677     const bool is_oop = true;
1678     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1679     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1680   }
1681 
1682   // Arguments:
1683   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1684   //             ignored
1685   //   name    - stub name string
1686   //
1687   // Inputs:
1688   //   c_rarg0   - source array address
1689   //   c_rarg1   - destination array address
1690   //   c_rarg2   - element count, treated as size_t, can be zero
1691   //
1692   address generate_conjoint_oop_copy(bool aligned,
1693                                      address nooverlap_target, address *entry,
1694                                      const char *name, bool dest_uninitialized) {
1695     const bool is_oop = true;
1696     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1697     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1698                                   name, dest_uninitialized);
1699   }
1700 
1701 
1702   // Helper for generating a dynamic type check.
1703   // Smashes rscratch1, rscratch2.
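  // Conceptually (hedged sketch, not a real VM call sequence):
  //   if (sub_klass is a subtype of super_klass) goto L_success;
  //   // otherwise fall through to L_miss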
1704   void generate_type_check(Register sub_klass,
1705                            Register super_check_offset,
1706                            Register super_klass,
1707                            Label& L_success) {
1708     assert_different_registers(sub_klass, super_check_offset, super_klass);
1709 
1710     BLOCK_COMMENT("type_check:");
1711 
1712     Label L_miss;
1713 
1714     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1715                                      super_check_offset);
1716     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1717 
1718     // Fall through on failure!
1719     __ BIND(L_miss);
1720   }
1721 
1722   //
1723   //  Generate checkcasting array copy stub
1724   //
1725   //  Input:
1726   //    c_rarg0   - source array address
1727   //    c_rarg1   - destination array address
1728   //    c_rarg2   - element count, treated as ssize_t, can be zero
1729   //    c_rarg3   - size_t ckoff (super_check_offset)
1730   //    c_rarg4   - oop ckval (super_klass)
1731   //
1732   //  Output:
1733   //    r0 ==  0  -  success
1734   //    r0 == -1^K - failure, where K is partial transfer count
1735   //
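  // In other words the stub returns 0 on full success and ~K (i.e. -1 ^ K)
  // when a type check fails after K elements have been copied, so the caller
  // can recover K as ~r0. A hedged C-style sketch of the contract (the helper
  // name is illustrative only):
  //   copied = copy_elements_until_check_fails(...);
  //   return (copied == count) ? 0 : ~copied;
  //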
1736   address generate_checkcast_copy(const char *name, address *entry,
1737                                   bool dest_uninitialized = false) {
1738 
1739     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1740 
1741     // Input registers (after setup_arg_regs)
1742     const Register from        = c_rarg0;   // source array address
1743     const Register to          = c_rarg1;   // destination array address
1744     const Register count       = c_rarg2;   // elements count
1745     const Register ckoff       = c_rarg3;   // super_check_offset
1746     const Register ckval       = c_rarg4;   // super_klass
1747 
1748     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1749     RegSet wb_post_saved_regs = RegSet::of(count);
1750 
1751     // Registers used as temps (r18, r19, r20 are save-on-entry)
1752     const Register count_save  = r21;       // orig elements count
1753     const Register start_to    = r20;       // destination array start address
1754     const Register copied_oop  = r18;       // actual oop copied
1755     const Register r19_klass   = r19;       // oop._klass
1756 
1757     //---------------------------------------------------------------
1758     // Assembler stub will be used for this call to arraycopy
1759     // if the two arrays are subtypes of Object[] but the
1760     // destination array type is not equal to or a supertype
1761     // of the source type.  Each element must be separately
1762     // checked.
1763 
1764     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1765                                copied_oop, r19_klass, count_save);
1766 
1767     __ align(CodeEntryAlignment);
1768     StubCodeMark mark(this, "StubRoutines", name);
1769     address start = __ pc();
1770 
1771     __ enter(); // required for proper stackwalking of RuntimeStub frame
1772 
1773 #ifdef ASSERT
1774     // caller guarantees that the arrays really are different
1775     // otherwise, we would have to make conjoint checks
1776     { Label L;
1777       array_overlap_test(L, TIMES_OOP);
1778       __ stop("checkcast_copy within a single array");
1779       __ bind(L);
1780     }
1781 #endif //ASSERT
1782 
1783     // Caller of this entry point must set up the argument registers.
1784     if (entry != NULL) {
1785       *entry = __ pc();
1786       BLOCK_COMMENT("Entry:");
1787     }
1788 
1789      // Empty array:  Nothing to do.
1790     __ cbz(count, L_done);
1791 
1792     __ push(RegSet::of(r18, r19, r20, r21), sp);
1793 
1794 #ifdef ASSERT
1795     BLOCK_COMMENT("assert consistent ckoff/ckval");
1796     // The ckoff and ckval must be mutually consistent,
1797     // even though caller generates both.
1798     { Label L;
1799       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1800       __ ldrw(start_to, Address(ckval, sco_offset));
1801       __ cmpw(ckoff, start_to);
1802       __ br(Assembler::EQ, L);
1803       __ stop("super_check_offset inconsistent");
1804       __ bind(L);
1805     }
1806 #endif //ASSERT
1807 
1808     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
1809     bool is_oop = true;
1810     if (dest_uninitialized) {
1811       decorators |= IS_DEST_UNINITIALIZED;
1812     }
1813 
1814     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1815     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1816 
1817     // save the original count
1818     __ mov(count_save, count);
1819 
1820     // Copy from low to high addresses
1821     __ mov(start_to, to);              // Save destination array start address
1822     __ b(L_load_element);
1823 
1824     // ======== begin loop ========
1825     // (Loop is rotated; its entry is L_load_element.)
1826     // Loop control:
1827     //   for (; count != 0; count--) {
1828     //     copied_oop = load_heap_oop(from++);
1829     //     ... generate_type_check ...;
1830     //     store_heap_oop(to++, copied_oop);
1831     //   }
1832     __ align(OptoLoopAlignment);
1833 
1834     __ BIND(L_store_element);
1835     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1836     __ sub(count, count, 1);
1837     __ cbz(count, L_do_card_marks);
1838 
1839     // ======== loop entry is here ========
1840     __ BIND(L_load_element);
1841     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1842     __ cbz(copied_oop, L_store_element);
1843 
1844     __ load_klass(r19_klass, copied_oop);// query the object klass
1845     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1846     // ======== end loop ========
1847 
1848     // It was a real error; we must depend on the caller to finish the job.
1849     // Register count = remaining oops, count_orig = total oops.
1850     // Emit GC store barriers for the oops we have copied and report
1851     // their number to the caller.
1852 
1853     __ subs(count, count_save, count);     // K = partially copied oop count
1854     __ eon(count, count, zr);                   // report (-1^K) to caller
1855     __ br(Assembler::EQ, L_done_pop);
1856 
1857     __ BIND(L_do_card_marks);
1858     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1859     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1, wb_post_saved_regs);
1860 
1861     __ bind(L_done_pop);
1862     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1863     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1864 
1865     __ bind(L_done);
1866     __ mov(r0, count);
1867     __ leave();
1868     __ ret(lr);
1869 
1870     return start;
1871   }
1872 
1873   // Perform range checks on the proposed arraycopy.
1874   // Kills temp, but nothing else.
1875   // Also, clean the sign bits of src_pos and dst_pos.
1876   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1877                               Register src_pos, // source position (c_rarg1)
1878                               Register dst,     // destination array oop (c_rarg2)
1879                               Register dst_pos, // destination position (c_rarg3)
1880                               Register length,
1881                               Register temp,
1882                               Label& L_failed) {
1883     BLOCK_COMMENT("arraycopy_range_checks:");
1884 
1885     assert_different_registers(rscratch1, temp);
1886 
1887     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1888     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1889     __ addw(temp, length, src_pos);
1890     __ cmpw(temp, rscratch1);
1891     __ br(Assembler::HI, L_failed);
1892 
1893     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1894     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1895     __ addw(temp, length, dst_pos);
1896     __ cmpw(temp, rscratch1);
1897     __ br(Assembler::HI, L_failed);
1898 
1899     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1900     __ movw(src_pos, src_pos);
1901     __ movw(dst_pos, dst_pos);
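    // (A 32-bit register write on AArch64 zero-extends into the full 64-bit
    //  register, so the movw-to-self above is enough to clear bits 63:32.)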
1902 
1903     BLOCK_COMMENT("arraycopy_range_checks done");
1904   }
1905 
1906   // These stubs get called from some dumb test routine.
1907   // I'll write them properly when they're called from
1908   // something that's actually doing something.
1909   static void fake_arraycopy_stub(address src, address dst, int count) {
1910     assert(count == 0, "huh?");
1911   }
1912 
1913 
1914   //
1915   //  Generate 'unsafe' array copy stub
1916   //  Though just as safe as the other stubs, it takes an unscaled
1917   //  size_t argument instead of an element count.
1918   //
1919   //  Input:
1920   //    c_rarg0   - source array address
1921   //    c_rarg1   - destination array address
1922   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1923   //
1924   // Examines the alignment of the operands and dispatches
1925   // to a long, int, short, or byte copy loop.
1926   //
1927   address generate_unsafe_copy(const char *name,
1928                                address byte_copy_entry,
1929                                address short_copy_entry,
1930                                address int_copy_entry,
1931                                address long_copy_entry) {
1932     Label L_long_aligned, L_int_aligned, L_short_aligned;
1933     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1934 
1935     __ align(CodeEntryAlignment);
1936     StubCodeMark mark(this, "StubRoutines", name);
1937     address start = __ pc();
1938     __ enter(); // required for proper stackwalking of RuntimeStub frame
1939 
1940     // bump this on entry, not on exit:
1941     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1942 
1943     __ orr(rscratch1, s, d);
1944     __ orr(rscratch1, rscratch1, count);
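    // rscratch1 now holds s | d | count, so its low bits reflect the coarsest
    // alignment common to all three. Dispatch sketch (hedged C, 'bits' is
    // just that OR):
    //   if ((bits & 7) == 0) goto long_aligned;
    //   if ((bits & 3) == 0) goto int_aligned;
    //   if ((bits & 1) == 0) goto short_aligned;
    //   goto byte_copy;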
1945 
1946     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1947     __ cbz(rscratch1, L_long_aligned);
1948     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1949     __ cbz(rscratch1, L_int_aligned);
1950     __ tbz(rscratch1, 0, L_short_aligned);
1951     __ b(RuntimeAddress(byte_copy_entry));
1952 
1953     __ BIND(L_short_aligned);
1954     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1955     __ b(RuntimeAddress(short_copy_entry));
1956     __ BIND(L_int_aligned);
1957     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1958     __ b(RuntimeAddress(int_copy_entry));
1959     __ BIND(L_long_aligned);
1960     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1961     __ b(RuntimeAddress(long_copy_entry));
1962 
1963     return start;
1964   }
1965 
1966   //
1967   //  Generate generic array copy stubs
1968   //
1969   //  Input:
1970   //    c_rarg0    -  src oop
1971   //    c_rarg1    -  src_pos (32-bits)
1972   //    c_rarg2    -  dst oop
1973   //    c_rarg3    -  dst_pos (32-bits)
1974   //    c_rarg4    -  element count (32-bits)
1975   //
1976   //  Output:
1977   //    r0 ==  0  -  success
1978   //    r0 == -1^K - failure, where K is partial transfer count
1979   //
1980   address generate_generic_copy(const char *name,
1981                                 address byte_copy_entry, address short_copy_entry,
1982                                 address int_copy_entry, address oop_copy_entry,
1983                                 address long_copy_entry, address checkcast_copy_entry) {
1984 
1985     Label L_failed, L_objArray;
1986     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1987 
1988     // Input registers
1989     const Register src        = c_rarg0;  // source array oop
1990     const Register src_pos    = c_rarg1;  // source position
1991     const Register dst        = c_rarg2;  // destination array oop
1992     const Register dst_pos    = c_rarg3;  // destination position
1993     const Register length     = c_rarg4;
1994 
1995 
1996     // Registers used as temps
1997     const Register dst_klass  = c_rarg5;
1998 
1999     __ align(CodeEntryAlignment);
2000 
2001     StubCodeMark mark(this, "StubRoutines", name);
2002 
2003     address start = __ pc();
2004 
2005     __ enter(); // required for proper stackwalking of RuntimeStub frame
2006 
2007     // bump this on entry, not on exit:
2008     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2009 
2010     //-----------------------------------------------------------------------
2011     // Assembler stub will be used for this call to arraycopy
2012     // if the following conditions are met:
2013     //
2014     // (1) src and dst must not be null.
2015     // (2) src_pos must not be negative.
2016     // (3) dst_pos must not be negative.
2017     // (4) length  must not be negative.
2018     // (5) src klass and dst klass should be the same and not NULL.
2019     // (6) src and dst should be arrays.
2020     // (7) src_pos + length must not exceed length of src.
2021     // (8) dst_pos + length must not exceed length of dst.
2022     //
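    // A hedged C-style sketch of the guard implemented below (illustrative
    // names; the objArray path relaxes the klass-equality test to a subtype
    // check):
    //   if (src == NULL || dst == NULL)                  return -1;
    //   if (src_pos < 0 || dst_pos < 0 || length < 0)    return -1;
    //   if (src->klass() != dst->klass())                return -1;
    //   if (src_pos + length > src->length() ||
    //       dst_pos + length > dst->length())            return -1;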
2023 
2024     //  if (src == NULL) return -1;
2025     __ cbz(src, L_failed);
2026 
2027     //  if (src_pos < 0) return -1;
2028     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2029 
2030     //  if (dst == NULL) return -1;
2031     __ cbz(dst, L_failed);
2032 
2033     //  if (dst_pos < 0) return -1;
2034     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2035 
2036     // registers used as temp
2037     const Register scratch_length    = r16; // elements count to copy
2038     const Register scratch_src_klass = r17; // array klass
2039     const Register lh                = r18; // layout helper
2040 
2041     //  if (length < 0) return -1;
2042     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2043     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2044 
2045     __ load_klass(scratch_src_klass, src);
2046 #ifdef ASSERT
2047     //  assert(src->klass() != NULL);
2048     {
2049       BLOCK_COMMENT("assert klasses not null {");
2050       Label L1, L2;
2051       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2052       __ bind(L1);
2053       __ stop("broken null klass");
2054       __ bind(L2);
2055       __ load_klass(rscratch1, dst);
2056       __ cbz(rscratch1, L1);     // this would be broken also
2057       BLOCK_COMMENT("} assert klasses not null done");
2058     }
2059 #endif
2060 
2061     // Load layout helper (32-bits)
2062     //
2063     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2064     // 32        30    24            16              8     2                 0
2065     //
2066     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2067     //
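    // Decoding sketch for the fields drawn above (using the mask/shift names
    // that appear in the code below):
    //   array_tag   = lh >> Klass::_lh_array_tag_shift;
    //   header_size = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
    //   log2_esize  = lh & Klass::_lh_log2_element_size_mask;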
2068 
2069     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2070 
2071     // Handle objArrays completely differently...
2072     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2073     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2074     __ movw(rscratch1, objArray_lh);
2075     __ eorw(rscratch2, lh, rscratch1);
2076     __ cbzw(rscratch2, L_objArray);
2077 
2078     //  if (src->klass() != dst->klass()) return -1;
2079     __ load_klass(rscratch2, dst);
2080     __ eor(rscratch2, rscratch2, scratch_src_klass);
2081     __ cbnz(rscratch2, L_failed);
2082 
2083     //  if (!src->is_Array()) return -1;
2084     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2085 
2086     // At this point, it is known to be a typeArray (array_tag 0x3).
2087 #ifdef ASSERT
2088     {
2089       BLOCK_COMMENT("assert primitive array {");
2090       Label L;
2091       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2092       __ cmpw(lh, rscratch2);
2093       __ br(Assembler::GE, L);
2094       __ stop("must be a primitive array");
2095       __ bind(L);
2096       BLOCK_COMMENT("} assert primitive array done");
2097     }
2098 #endif
2099 
2100     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2101                            rscratch2, L_failed);
2102 
2103     // TypeArrayKlass
2104     //
2105     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2106     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2107     //
2108 
2109     const Register rscratch1_offset = rscratch1;    // array offset
2110     const Register r18_elsize = lh; // element size
2111 
2112     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2113            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2114     __ add(src, src, rscratch1_offset);           // src array offset
2115     __ add(dst, dst, rscratch1_offset);           // dst array offset
2116     BLOCK_COMMENT("choose copy loop based on element size");
2117 
2118     // next registers should be set before the jump to corresponding stub
2119     const Register from     = c_rarg0;  // source array address
2120     const Register to       = c_rarg1;  // destination array address
2121     const Register count    = c_rarg2;  // elements count
2122 
2123     // 'from', 'to', 'count' registers should be set in such order
2124     // since they are the same as 'src', 'src_pos', 'dst'.
2125 
2126     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2127 
2128     // The possible values of elsize are 0-3, i.e. exact_log2(element
2129     // size in bytes).  We do a simple bitwise binary search.
2130   __ BIND(L_copy_bytes);
2131     __ tbnz(r18_elsize, 1, L_copy_ints);
2132     __ tbnz(r18_elsize, 0, L_copy_shorts);
2133     __ lea(from, Address(src, src_pos));// src_addr
2134     __ lea(to,   Address(dst, dst_pos));// dst_addr
2135     __ movw(count, scratch_length); // length
2136     __ b(RuntimeAddress(byte_copy_entry));
2137 
2138   __ BIND(L_copy_shorts);
2139     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2140     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2141     __ movw(count, scratch_length); // length
2142     __ b(RuntimeAddress(short_copy_entry));
2143 
2144   __ BIND(L_copy_ints);
2145     __ tbnz(r18_elsize, 0, L_copy_longs);
2146     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2147     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2148     __ movw(count, scratch_length); // length
2149     __ b(RuntimeAddress(int_copy_entry));
2150 
2151   __ BIND(L_copy_longs);
2152 #ifdef ASSERT
2153     {
2154       BLOCK_COMMENT("assert long copy {");
2155       Label L;
2156       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2157       __ cmpw(r18_elsize, LogBytesPerLong);
2158       __ br(Assembler::EQ, L);
2159       __ stop("must be long copy, but elsize is wrong");
2160       __ bind(L);
2161       BLOCK_COMMENT("} assert long copy done");
2162     }
2163 #endif
2164     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2165     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2166     __ movw(count, scratch_length); // length
2167     __ b(RuntimeAddress(long_copy_entry));
2168 
2169     // ObjArrayKlass
2170   __ BIND(L_objArray);
2171     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2172 
2173     Label L_plain_copy, L_checkcast_copy;
2174     //  test array classes for subtyping
2175     __ load_klass(r18, dst);
2176     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2177     __ br(Assembler::NE, L_checkcast_copy);
2178 
2179     // Identically typed arrays can be copied without element-wise checks.
2180     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2181                            rscratch2, L_failed);
2182 
2183     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2184     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2185     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2186     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2187     __ movw(count, scratch_length); // length
2188   __ BIND(L_plain_copy);
2189     __ b(RuntimeAddress(oop_copy_entry));
2190 
2191   __ BIND(L_checkcast_copy);
2192     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2193     {
2194       // Before looking at dst.length, make sure dst is also an objArray.
2195       __ ldrw(rscratch1, Address(r18, lh_offset));
2196       __ movw(rscratch2, objArray_lh);
2197       __ eorw(rscratch1, rscratch1, rscratch2);
2198       __ cbnzw(rscratch1, L_failed);
2199 
2200       // It is safe to examine both src.length and dst.length.
2201       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2202                              r18, L_failed);
2203 
2204       __ load_klass(dst_klass, dst); // reload
2205 
2206       // Marshal the base address arguments now, freeing registers.
2207       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2208       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2209       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2210       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2211       __ movw(count, length);           // length (reloaded)
2212       Register sco_temp = c_rarg3;      // this register is free now
2213       assert_different_registers(from, to, count, sco_temp,
2214                                  dst_klass, scratch_src_klass);
2215       // assert_clean_int(count, sco_temp);
2216 
2217       // Generate the type check.
2218       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2219       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2220 
2221       // Smashes rscratch1, rscratch2
2222       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2223 
2224       // Fetch destination element klass from the ObjArrayKlass header.
2225       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2226       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2227       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2228 
2229       // the checkcast_copy loop needs two extra arguments:
2230       assert(c_rarg3 == sco_temp, "#3 already in place");
2231       // Set up arguments for checkcast_copy_entry.
2232       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2233       __ b(RuntimeAddress(checkcast_copy_entry));
2234     }
2235 
2236   __ BIND(L_failed);
2237     __ mov(r0, -1);
2238     __ leave();   // required for proper stackwalking of RuntimeStub frame
2239     __ ret(lr);
2240 
2241     return start;
2242   }
2243 
2244   //
2245   // Generate stub for array fill. If "aligned" is true, the
2246   // "to" address is assumed to be heapword aligned.
2247   //
2248   // Arguments for generated stub:
2249   //   to:    c_rarg0
2250   //   value: c_rarg1
2251   //   count: c_rarg2 treated as signed
2252   //
2253   address generate_fill(BasicType t, bool aligned, const char *name) {
2254     __ align(CodeEntryAlignment);
2255     StubCodeMark mark(this, "StubRoutines", name);
2256     address start = __ pc();
2257 
2258     BLOCK_COMMENT("Entry:");
2259 
2260     const Register to        = c_rarg0;  // destination array address
2261     const Register value     = c_rarg1;  // value
2262     const Register count     = c_rarg2;  // elements count
2263 
2264     const Register bz_base = r10;        // base for block_zero routine
2265     const Register cnt_words = r11;      // temp register
2266 
2267     __ enter();
2268 
2269     Label L_fill_elements, L_exit1;
2270 
2271     int shift = -1;
2272     switch (t) {
2273       case T_BYTE:
2274         shift = 0;
2275         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2276         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2277         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2278         __ br(Assembler::LO, L_fill_elements);
2279         break;
2280       case T_SHORT:
2281         shift = 1;
2282         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2283         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2284         __ br(Assembler::LO, L_fill_elements);
2285         break;
2286       case T_INT:
2287         shift = 2;
2288         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2289         __ br(Assembler::LO, L_fill_elements);
2290         break;
2291       default: ShouldNotReachHere();
2292     }
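    // At this point 'value' holds the fill pattern replicated across 32 bits;
    // it is widened to 64 bits below (bfi(value, value, 32, 32)) before the
    // word-at-a-time fill.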
2293 
2294     // Align source address at 8 bytes address boundary.
2295     Label L_skip_align1, L_skip_align2, L_skip_align4;
2296     if (!aligned) {
2297       switch (t) {
2298         case T_BYTE:
2299           // One byte misalignment happens only for byte arrays.
2300           __ tbz(to, 0, L_skip_align1);
2301           __ strb(value, Address(__ post(to, 1)));
2302           __ subw(count, count, 1);
2303           __ bind(L_skip_align1);
2304           // Fallthrough
2305         case T_SHORT:
2306           // Two bytes misalignment happens only for byte and short (char) arrays.
2307           __ tbz(to, 1, L_skip_align2);
2308           __ strh(value, Address(__ post(to, 2)));
2309           __ subw(count, count, 2 >> shift);
2310           __ bind(L_skip_align2);
2311           // Fallthrough
2312         case T_INT:
2313           // Align to 8 bytes, we know we are 4 byte aligned to start.
2314           __ tbz(to, 2, L_skip_align4);
2315           __ strw(value, Address(__ post(to, 4)));
2316           __ subw(count, count, 4 >> shift);
2317           __ bind(L_skip_align4);
2318           break;
2319         default: ShouldNotReachHere();
2320       }
2321     }
2322 
2323     //
2324     //  Fill large chunks
2325     //
2326     __ lsrw(cnt_words, count, 3 - shift); // number of words
2327     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2328     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2329     if (UseBlockZeroing) {
2330       Label non_block_zeroing, rest;
2331       // If the fill value is zero we can use the fast zero_words().
2332       __ cbnz(value, non_block_zeroing);
2333       __ mov(bz_base, to);
2334       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2335       __ zero_words(bz_base, cnt_words);
2336       __ b(rest);
2337       __ bind(non_block_zeroing);
2338       __ fill_words(to, cnt_words, value);
2339       __ bind(rest);
2340     } else {
2341       __ fill_words(to, cnt_words, value);
2342     }
2343 
2344     // Remaining count is less than 8 bytes. Fill it by a single store.
2345     // Note that the total length is no less than 8 bytes.
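    // The unaligned 8-byte store below may rewrite up to 7 bytes that were
    // already filled, but it rewrites them with the same pattern, which is
    // cheaper than a byte/short tail loop.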
2346     if (t == T_BYTE || t == T_SHORT) {
2347       Label L_exit1;
2348       __ cbzw(count, L_exit1);
2349       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2350       __ str(value, Address(to, -8));    // overwrite some elements
2351       __ bind(L_exit1);
2352       __ leave();
2353       __ ret(lr);
2354     }
2355 
2356     // Handle copies less than 8 bytes.
2357     Label L_fill_2, L_fill_4, L_exit2;
2358     __ bind(L_fill_elements);
2359     switch (t) {
2360       case T_BYTE:
2361         __ tbz(count, 0, L_fill_2);
2362         __ strb(value, Address(__ post(to, 1)));
2363         __ bind(L_fill_2);
2364         __ tbz(count, 1, L_fill_4);
2365         __ strh(value, Address(__ post(to, 2)));
2366         __ bind(L_fill_4);
2367         __ tbz(count, 2, L_exit2);
2368         __ strw(value, Address(to));
2369         break;
2370       case T_SHORT:
2371         __ tbz(count, 0, L_fill_4);
2372         __ strh(value, Address(__ post(to, 2)));
2373         __ bind(L_fill_4);
2374         __ tbz(count, 1, L_exit2);
2375         __ strw(value, Address(to));
2376         break;
2377       case T_INT:
2378         __ cbzw(count, L_exit2);
2379         __ strw(value, Address(to));
2380         break;
2381       default: ShouldNotReachHere();
2382     }
2383     __ bind(L_exit2);
2384     __ leave();
2385     __ ret(lr);
2386     return start;
2387   }
2388 
2389   void generate_arraycopy_stubs() {
2390     address entry;
2391     address entry_jbyte_arraycopy;
2392     address entry_jshort_arraycopy;
2393     address entry_jint_arraycopy;
2394     address entry_oop_arraycopy;
2395     address entry_jlong_arraycopy;
2396     address entry_checkcast_arraycopy;
2397 
2398     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2399     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2400 
2401     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2402 
2403     //*** jbyte
2404     // Always need aligned and unaligned versions
2405     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2406                                                                                   "jbyte_disjoint_arraycopy");
2407     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2408                                                                                   &entry_jbyte_arraycopy,
2409                                                                                   "jbyte_arraycopy");
2410     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2411                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2412     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2413                                                                                   "arrayof_jbyte_arraycopy");
2414 
2415     //*** jshort
2416     // Always need aligned and unaligned versions
2417     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2418                                                                                     "jshort_disjoint_arraycopy");
2419     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2420                                                                                     &entry_jshort_arraycopy,
2421                                                                                     "jshort_arraycopy");
2422     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2423                                                                                     "arrayof_jshort_disjoint_arraycopy");
2424     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2425                                                                                     "arrayof_jshort_arraycopy");
2426 
2427     //*** jint
2428     // Aligned versions
2429     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2430                                                                                 "arrayof_jint_disjoint_arraycopy");
2431     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2432                                                                                 "arrayof_jint_arraycopy");
2433     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2434     // entry_jint_arraycopy always points to the unaligned version
2435     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2436                                                                                 "jint_disjoint_arraycopy");
2437     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2438                                                                                 &entry_jint_arraycopy,
2439                                                                                 "jint_arraycopy");
2440 
2441     //*** jlong
2442     // It is always aligned
2443     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2444                                                                                   "arrayof_jlong_disjoint_arraycopy");
2445     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2446                                                                                   "arrayof_jlong_arraycopy");
2447     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2448     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2449 
2450     //*** oops
2451     {
2452       // With compressed oops we need unaligned versions; notice that
2453       // we overwrite entry_oop_arraycopy.
2454       bool aligned = !UseCompressedOops;
2455 
2456       StubRoutines::_arrayof_oop_disjoint_arraycopy
2457         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2458                                      /*dest_uninitialized*/false);
2459       StubRoutines::_arrayof_oop_arraycopy
2460         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2461                                      /*dest_uninitialized*/false);
2462       // Aligned versions without pre-barriers
2463       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2464         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2465                                      /*dest_uninitialized*/true);
2466       StubRoutines::_arrayof_oop_arraycopy_uninit
2467         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2468                                      /*dest_uninitialized*/true);
2469     }
2470 
2471     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2472     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2473     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2474     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2475 
2476     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2477     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2478                                                                         /*dest_uninitialized*/true);
2479 
2480     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2481                                                               entry_jbyte_arraycopy,
2482                                                               entry_jshort_arraycopy,
2483                                                               entry_jint_arraycopy,
2484                                                               entry_jlong_arraycopy);
2485 
2486     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2487                                                                entry_jbyte_arraycopy,
2488                                                                entry_jshort_arraycopy,
2489                                                                entry_jint_arraycopy,
2490                                                                entry_oop_arraycopy,
2491                                                                entry_jlong_arraycopy,
2492                                                                entry_checkcast_arraycopy);
2493 
2494     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2495     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2496     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2497     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2498     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2499     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2500   }
2501 
2502   void generate_math_stubs() { Unimplemented(); }
2503 
2504   // Arguments:
2505   //
2506   // Inputs:
2507   //   c_rarg0   - source byte array address
2508   //   c_rarg1   - destination byte array address
2509   //   c_rarg2   - K (key) in little endian int array
2510   //
2511   address generate_aescrypt_encryptBlock() {
2512     __ align(CodeEntryAlignment);
2513     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2514 
2515     Label L_doLast;
2516 
2517     const Register from        = c_rarg0;  // source array address
2518     const Register to          = c_rarg1;  // destination array address
2519     const Register key         = c_rarg2;  // key array address
2520     const Register keylen      = rscratch1;
2521 
2522     address start = __ pc();
2523     __ enter();
2524 
2525     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2526 
2527     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2528 
2529     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2530     __ rev32(v1, __ T16B, v1);
2531     __ rev32(v2, __ T16B, v2);
2532     __ rev32(v3, __ T16B, v3);
2533     __ rev32(v4, __ T16B, v4);
2534     __ aese(v0, v1);
2535     __ aesmc(v0, v0);
2536     __ aese(v0, v2);
2537     __ aesmc(v0, v0);
2538     __ aese(v0, v3);
2539     __ aesmc(v0, v0);
2540     __ aese(v0, v4);
2541     __ aesmc(v0, v0);
2542 
2543     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2544     __ rev32(v1, __ T16B, v1);
2545     __ rev32(v2, __ T16B, v2);
2546     __ rev32(v3, __ T16B, v3);
2547     __ rev32(v4, __ T16B, v4);
2548     __ aese(v0, v1);
2549     __ aesmc(v0, v0);
2550     __ aese(v0, v2);
2551     __ aesmc(v0, v0);
2552     __ aese(v0, v3);
2553     __ aesmc(v0, v0);
2554     __ aese(v0, v4);
2555     __ aesmc(v0, v0);
2556 
2557     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2558     __ rev32(v1, __ T16B, v1);
2559     __ rev32(v2, __ T16B, v2);
2560 
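    // keylen is the expanded key length in ints: 44, 52 or 60 for AES-128,
    // AES-192 and AES-256 respectively, which selects how many of the
    // optional rounds below are executed.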
2561     __ cmpw(keylen, 44);
2562     __ br(Assembler::EQ, L_doLast);
2563 
2564     __ aese(v0, v1);
2565     __ aesmc(v0, v0);
2566     __ aese(v0, v2);
2567     __ aesmc(v0, v0);
2568 
2569     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2570     __ rev32(v1, __ T16B, v1);
2571     __ rev32(v2, __ T16B, v2);
2572 
2573     __ cmpw(keylen, 52);
2574     __ br(Assembler::EQ, L_doLast);
2575 
2576     __ aese(v0, v1);
2577     __ aesmc(v0, v0);
2578     __ aese(v0, v2);
2579     __ aesmc(v0, v0);
2580 
2581     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2582     __ rev32(v1, __ T16B, v1);
2583     __ rev32(v2, __ T16B, v2);
2584 
2585     __ BIND(L_doLast);
2586 
2587     __ aese(v0, v1);
2588     __ aesmc(v0, v0);
2589     __ aese(v0, v2);
2590 
2591     __ ld1(v1, __ T16B, key);
2592     __ rev32(v1, __ T16B, v1);
2593     __ eor(v0, __ T16B, v0, v1);
2594 
2595     __ st1(v0, __ T16B, to);
2596 
2597     __ mov(r0, 0);
2598 
2599     __ leave();
2600     __ ret(lr);
2601 
2602     return start;
2603   }
2604 
2605   // Arguments:
2606   //
2607   // Inputs:
2608   //   c_rarg0   - source byte array address
2609   //   c_rarg1   - destination byte array address
2610   //   c_rarg2   - K (key) in little endian int array
2611   //
2612   address generate_aescrypt_decryptBlock() {
2613     assert(UseAES, "need AES instructions and misaligned SSE support");
2614     __ align(CodeEntryAlignment);
2615     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2616     Label L_doLast;
2617 
2618     const Register from        = c_rarg0;  // source array address
2619     const Register to          = c_rarg1;  // destination array address
2620     const Register key         = c_rarg2;  // key array address
2621     const Register keylen      = rscratch1;
2622 
2623     address start = __ pc();
2624     __ enter(); // required for proper stackwalking of RuntimeStub frame
2625 
2626     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2627 
2628     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2629 
2630     __ ld1(v5, __ T16B, __ post(key, 16));
2631     __ rev32(v5, __ T16B, v5);
2632 
2633     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2634     __ rev32(v1, __ T16B, v1);
2635     __ rev32(v2, __ T16B, v2);
2636     __ rev32(v3, __ T16B, v3);
2637     __ rev32(v4, __ T16B, v4);
2638     __ aesd(v0, v1);
2639     __ aesimc(v0, v0);
2640     __ aesd(v0, v2);
2641     __ aesimc(v0, v0);
2642     __ aesd(v0, v3);
2643     __ aesimc(v0, v0);
2644     __ aesd(v0, v4);
2645     __ aesimc(v0, v0);
2646 
2647     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2648     __ rev32(v1, __ T16B, v1);
2649     __ rev32(v2, __ T16B, v2);
2650     __ rev32(v3, __ T16B, v3);
2651     __ rev32(v4, __ T16B, v4);
2652     __ aesd(v0, v1);
2653     __ aesimc(v0, v0);
2654     __ aesd(v0, v2);
2655     __ aesimc(v0, v0);
2656     __ aesd(v0, v3);
2657     __ aesimc(v0, v0);
2658     __ aesd(v0, v4);
2659     __ aesimc(v0, v0);
2660 
2661     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2662     __ rev32(v1, __ T16B, v1);
2663     __ rev32(v2, __ T16B, v2);
2664 
2665     __ cmpw(keylen, 44);
2666     __ br(Assembler::EQ, L_doLast);
2667 
2668     __ aesd(v0, v1);
2669     __ aesimc(v0, v0);
2670     __ aesd(v0, v2);
2671     __ aesimc(v0, v0);
2672 
2673     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2674     __ rev32(v1, __ T16B, v1);
2675     __ rev32(v2, __ T16B, v2);
2676 
2677     __ cmpw(keylen, 52);
2678     __ br(Assembler::EQ, L_doLast);
2679 
2680     __ aesd(v0, v1);
2681     __ aesimc(v0, v0);
2682     __ aesd(v0, v2);
2683     __ aesimc(v0, v0);
2684 
2685     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2686     __ rev32(v1, __ T16B, v1);
2687     __ rev32(v2, __ T16B, v2);
2688 
2689     __ BIND(L_doLast);
2690 
2691     __ aesd(v0, v1);
2692     __ aesimc(v0, v0);
2693     __ aesd(v0, v2);
2694 
2695     __ eor(v0, __ T16B, v0, v5);
2696 
2697     __ st1(v0, __ T16B, to);
2698 
2699     __ mov(r0, 0);
2700 
2701     __ leave();
2702     __ ret(lr);
2703 
2704     return start;
2705   }
2706 
2707   // Arguments:
2708   //
2709   // Inputs:
2710   //   c_rarg0   - source byte array address
2711   //   c_rarg1   - destination byte array address
2712   //   c_rarg2   - K (key) in little endian int array
2713   //   c_rarg3   - r vector byte array address
2714   //   c_rarg4   - input length
2715   //
2716   // Output:
2717   //   x0        - input length
2718   //
2719   address generate_cipherBlockChaining_encryptAESCrypt() {
2720     assert(UseAES, "need AES instructions and misaligned SSE support");
2721     __ align(CodeEntryAlignment);
2722     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2723 
2724     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2725 
2726     const Register from        = c_rarg0;  // source array address
2727     const Register to          = c_rarg1;  // destination array address
2728     const Register key         = c_rarg2;  // key array address
2729     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector array address,
2730                                            // and left holding the last encryption block on exit
2731     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2732     const Register keylen      = rscratch1;
2733 
2734     address start = __ pc();
2735 
2736       __ enter();
2737 
2738       __ movw(rscratch2, len_reg);
2739 
2740       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2741 
2742       __ ld1(v0, __ T16B, rvec);
2743 
2744       __ cmpw(keylen, 52);
2745       __ br(Assembler::CC, L_loadkeys_44);
2746       __ br(Assembler::EQ, L_loadkeys_52);
2747 
2748       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2749       __ rev32(v17, __ T16B, v17);
2750       __ rev32(v18, __ T16B, v18);
2751     __ BIND(L_loadkeys_52);
2752       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2753       __ rev32(v19, __ T16B, v19);
2754       __ rev32(v20, __ T16B, v20);
2755     __ BIND(L_loadkeys_44);
2756       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2757       __ rev32(v21, __ T16B, v21);
2758       __ rev32(v22, __ T16B, v22);
2759       __ rev32(v23, __ T16B, v23);
2760       __ rev32(v24, __ T16B, v24);
2761       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2762       __ rev32(v25, __ T16B, v25);
2763       __ rev32(v26, __ T16B, v26);
2764       __ rev32(v27, __ T16B, v27);
2765       __ rev32(v28, __ T16B, v28);
2766       __ ld1(v29, v30, v31, __ T16B, key);
2767       __ rev32(v29, __ T16B, v29);
2768       __ rev32(v30, __ T16B, v30);
2769       __ rev32(v31, __ T16B, v31);
2770 
2771     __ BIND(L_aes_loop);
2772       __ ld1(v1, __ T16B, __ post(from, 16));
2773       __ eor(v0, __ T16B, v0, v1);
2774 
2775       __ br(Assembler::CC, L_rounds_44);
2776       __ br(Assembler::EQ, L_rounds_52);
2777 
2778       __ aese(v0, v17); __ aesmc(v0, v0);
2779       __ aese(v0, v18); __ aesmc(v0, v0);
2780     __ BIND(L_rounds_52);
2781       __ aese(v0, v19); __ aesmc(v0, v0);
2782       __ aese(v0, v20); __ aesmc(v0, v0);
2783     __ BIND(L_rounds_44);
2784       __ aese(v0, v21); __ aesmc(v0, v0);
2785       __ aese(v0, v22); __ aesmc(v0, v0);
2786       __ aese(v0, v23); __ aesmc(v0, v0);
2787       __ aese(v0, v24); __ aesmc(v0, v0);
2788       __ aese(v0, v25); __ aesmc(v0, v0);
2789       __ aese(v0, v26); __ aesmc(v0, v0);
2790       __ aese(v0, v27); __ aesmc(v0, v0);
2791       __ aese(v0, v28); __ aesmc(v0, v0);
2792       __ aese(v0, v29); __ aesmc(v0, v0);
2793       __ aese(v0, v30);
2794       __ eor(v0, __ T16B, v0, v31);
2795 
2796       __ st1(v0, __ T16B, __ post(to, 16));
2797 
2798       __ subw(len_reg, len_reg, 16);
2799       __ cbnzw(len_reg, L_aes_loop);
2800 
2801       __ st1(v0, __ T16B, rvec);
2802 
2803       __ mov(r0, rscratch2);
2804 
2805       __ leave();
2806       __ ret(lr);
2807 
2808       return start;
2809   }
2810 
2811   // Arguments:
2812   //
2813   // Inputs:
2814   //   c_rarg0   - source byte array address
2815   //   c_rarg1   - destination byte array address
2816   //   c_rarg2   - K (key) in little endian int array
2817   //   c_rarg3   - r vector byte array address
2818   //   c_rarg4   - input length
2819   //
2820   // Output:
2821   //   r0        - input length
2822   //
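       // A reference model of what this stub computes (CBC decryption), where
       // D_K denotes a single-block AES decryption under the expanded key K:
       //
       //   P[0] = D_K(C[0]) ^ IV
       //   P[i] = D_K(C[i]) ^ C[i-1]   for i > 0
       //
       // rvec holds the IV on entry and the last ciphertext block on exit.
       //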
2823   address generate_cipherBlockChaining_decryptAESCrypt() {
2824     assert(UseAES, "need AES instruction support");
2825     __ align(CodeEntryAlignment);
2826     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2827 
2828     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2829 
2830     const Register from        = c_rarg0;  // source array address
2831     const Register to          = c_rarg1;  // destination array address
2832     const Register key         = c_rarg2;  // key array address
2833     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector (IV)
2834                                            // and left holding the last input (ciphertext) block
2835     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2836     const Register keylen      = rscratch1;
2837 
2838     address start = __ pc();
2839 
2840       __ enter();
2841 
2842       __ movw(rscratch2, len_reg);
2843 
2844       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2845 
2846       __ ld1(v2, __ T16B, rvec);
2847 
2848       __ ld1(v31, __ T16B, __ post(key, 16));
2849       __ rev32(v31, __ T16B, v31);
2850 
2851       __ cmpw(keylen, 52);
2852       __ br(Assembler::CC, L_loadkeys_44);
2853       __ br(Assembler::EQ, L_loadkeys_52);
2854 
2855       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2856       __ rev32(v17, __ T16B, v17);
2857       __ rev32(v18, __ T16B, v18);
2858     __ BIND(L_loadkeys_52);
2859       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2860       __ rev32(v19, __ T16B, v19);
2861       __ rev32(v20, __ T16B, v20);
2862     __ BIND(L_loadkeys_44);
2863       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2864       __ rev32(v21, __ T16B, v21);
2865       __ rev32(v22, __ T16B, v22);
2866       __ rev32(v23, __ T16B, v23);
2867       __ rev32(v24, __ T16B, v24);
2868       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2869       __ rev32(v25, __ T16B, v25);
2870       __ rev32(v26, __ T16B, v26);
2871       __ rev32(v27, __ T16B, v27);
2872       __ rev32(v28, __ T16B, v28);
2873       __ ld1(v29, v30, __ T16B, key);
2874       __ rev32(v29, __ T16B, v29);
2875       __ rev32(v30, __ T16B, v30);
2876 
2877     __ BIND(L_aes_loop);
2878       __ ld1(v0, __ T16B, __ post(from, 16));
2879       __ orr(v1, __ T16B, v0, v0);
2880 
2881       __ br(Assembler::CC, L_rounds_44);
2882       __ br(Assembler::EQ, L_rounds_52);
2883 
2884       __ aesd(v0, v17); __ aesimc(v0, v0);
2885       __ aesd(v0, v18); __ aesimc(v0, v0);
2886     __ BIND(L_rounds_52);
2887       __ aesd(v0, v19); __ aesimc(v0, v0);
2888       __ aesd(v0, v20); __ aesimc(v0, v0);
2889     __ BIND(L_rounds_44);
2890       __ aesd(v0, v21); __ aesimc(v0, v0);
2891       __ aesd(v0, v22); __ aesimc(v0, v0);
2892       __ aesd(v0, v23); __ aesimc(v0, v0);
2893       __ aesd(v0, v24); __ aesimc(v0, v0);
2894       __ aesd(v0, v25); __ aesimc(v0, v0);
2895       __ aesd(v0, v26); __ aesimc(v0, v0);
2896       __ aesd(v0, v27); __ aesimc(v0, v0);
2897       __ aesd(v0, v28); __ aesimc(v0, v0);
2898       __ aesd(v0, v29); __ aesimc(v0, v0);
2899       __ aesd(v0, v30);
2900       __ eor(v0, __ T16B, v0, v31);
2901       __ eor(v0, __ T16B, v0, v2);
2902 
2903       __ st1(v0, __ T16B, __ post(to, 16));
2904       __ orr(v2, __ T16B, v1, v1);
2905 
2906       __ subw(len_reg, len_reg, 16);
2907       __ cbnzw(len_reg, L_aes_loop);
2908 
2909       __ st1(v2, __ T16B, rvec);
2910 
2911       __ mov(r0, rscratch2);
2912 
2913       __ leave();
2914       __ ret(lr);
2915 
2916     return start;
2917   }
2918 
2919   // Arguments:
2920   //
2921   // Inputs:
2922   //   c_rarg0   - byte[]  source+offset
2923   //   c_rarg1   - int[]   SHA.state
2924   //   c_rarg2   - int     offset
2925   //   c_rarg3   - int     limit
2926   //
2927   address generate_sha1_implCompress(bool multi_block, const char *name) {
2928     __ align(CodeEntryAlignment);
2929     StubCodeMark mark(this, "StubRoutines", name);
2930     address start = __ pc();
2931 
2932     Register buf   = c_rarg0;
2933     Register state = c_rarg1;
2934     Register ofs   = c_rarg2;
2935     Register limit = c_rarg3;
2936 
2937     Label keys;
2938     Label sha1_loop;
2939 
2940     // load the keys into v0..v3
2941     __ adr(rscratch1, keys);
2942     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2943     // load the 5-word (160-bit) SHA-1 state into v6, v7
2944     __ ldrq(v6, Address(state, 0));
2945     __ ldrs(v7, Address(state, 16));
2946 
2947 
2948     __ BIND(sha1_loop);
2949     // load 64 bytes of data into v16..v19
2950     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2951     __ rev32(v16, __ T16B, v16);
2952     __ rev32(v17, __ T16B, v17);
2953     __ rev32(v18, __ T16B, v18);
2954     __ rev32(v19, __ T16B, v19);
2955 
2956     // do the sha1
2957     __ addv(v4, __ T4S, v16, v0);
2958     __ orr(v20, __ T16B, v6, v6);
2959 
2960     FloatRegister d0 = v16;
2961     FloatRegister d1 = v17;
2962     FloatRegister d2 = v18;
2963     FloatRegister d3 = v19;
2964 
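         // Each iteration of this loop performs four of SHA-1's 80 rounds:
         // quads 0..4 use sha1c (Ch, K = 0x5a827999), quads 5..9 and 15..19 use
         // sha1p (Parity, K = 0x6ed9eba1 and 0xca62c1d6), and quads 10..14 use
         // sha1m (Maj, K = 0x8f1bbcdc); sha1su0/sha1su1 extend the message
         // schedule for the first 16 quads.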
2965     for (int round = 0; round < 20; round++) {
2966       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2967       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2968       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2969       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2970       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2971 
2972       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2973       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2974       __ sha1h(tmp2, __ T4S, v20);
2975       if (round < 5)
2976         __ sha1c(v20, __ T4S, tmp3, tmp4);
2977       else if (round < 10 || round >= 15)
2978         __ sha1p(v20, __ T4S, tmp3, tmp4);
2979       else
2980         __ sha1m(v20, __ T4S, tmp3, tmp4);
2981       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2982 
2983       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2984     }
2985 
2986     __ addv(v7, __ T2S, v7, v21);
2987     __ addv(v6, __ T4S, v6, v20);
2988 
2989     if (multi_block) {
2990       __ add(ofs, ofs, 64);
2991       __ cmp(ofs, limit);
2992       __ br(Assembler::LE, sha1_loop);
2993       __ mov(c_rarg0, ofs); // return ofs
2994     }
2995 
2996     __ strq(v6, Address(state, 0));
2997     __ strs(v7, Address(state, 16));
2998 
2999     __ ret(lr);
3000 
3001     __ bind(keys);
3002     __ emit_int32(0x5a827999);
3003     __ emit_int32(0x6ed9eba1);
3004     __ emit_int32(0x8f1bbcdc);
3005     __ emit_int32(0xca62c1d6);
3006 
3007     return start;
3008   }
3009 
3010 
3011   // Arguments:
3012   //
3013   // Inputs:
3014   //   c_rarg0   - byte[]  source+offset
3015   //   c_rarg1   - int[]   SHA.state
3016   //   c_rarg2   - int     offset
3017   //   c_rarg3   - int     limit
3018   //
3019   address generate_sha256_implCompress(bool multi_block, const char *name) {
3020     static const uint32_t round_consts[64] = {
3021       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3022       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3023       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3024       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3025       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3026       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3027       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3028       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3029       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3030       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3031       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3032       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3033       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3034       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3035       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3036       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3037     };
3038     __ align(CodeEntryAlignment);
3039     StubCodeMark mark(this, "StubRoutines", name);
3040     address start = __ pc();
3041 
3042     Register buf   = c_rarg0;
3043     Register state = c_rarg1;
3044     Register ofs   = c_rarg2;
3045     Register limit = c_rarg3;
3046 
3047     Label sha1_loop;
3048 
3049     __ stpd(v8, v9, __ pre(sp, -32));
3050     __ stpd(v10, v11, Address(sp, 16));
3051 
3052 // dga == v0
3053 // dgb == v1
3054 // dg0 == v2
3055 // dg1 == v3
3056 // dg2 == v4
3057 // t0 == v6
3058 // t1 == v7
3059 
3060     // load the 64 round constants, as 16 4-word vectors, into v16..v31
3061     __ lea(rscratch1, ExternalAddress((address)round_consts));
3062     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3063     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3064     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3065     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3066 
3067     // load the 8-word (256-bit) state
3068     __ ldpq(v0, v1, state);
3069 
3070     __ BIND(sha1_loop);
3071     // load 64 bytes of data into v8..v11
3072     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3073     __ rev32(v8, __ T16B, v8);
3074     __ rev32(v9, __ T16B, v9);
3075     __ rev32(v10, __ T16B, v10);
3076     __ rev32(v11, __ T16B, v11);
3077 
3078     __ addv(v6, __ T4S, v8, v16);
3079     __ orr(v2, __ T16B, v0, v0);
3080     __ orr(v3, __ T16B, v1, v1);
3081 
3082     FloatRegister d0 = v8;
3083     FloatRegister d1 = v9;
3084     FloatRegister d2 = v10;
3085     FloatRegister d3 = v11;
3086 
3087 
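         // Each iteration of this loop performs four of SHA-256's 64 rounds:
         // sha256h/sha256h2 update the two halves of the working state using the
         // message words plus the round constants pre-loaded in v16..v31, while
         // sha256su0/sha256su1 extend the message schedule during the first 12
         // iterations.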
3088     for (int round = 0; round < 16; round++) {
3089       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3090       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3091       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3092       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3093 
3094       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3095        __ orr(v4, __ T16B, v2, v2);
3096       if (round < 15)
3097         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3098       __ sha256h(v2, __ T4S, v3, tmp2);
3099       __ sha256h2(v3, __ T4S, v4, tmp2);
3100       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3101 
3102       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3103     }
3104 
3105     __ addv(v0, __ T4S, v0, v2);
3106     __ addv(v1, __ T4S, v1, v3);
3107 
3108     if (multi_block) {
3109       __ add(ofs, ofs, 64);
3110       __ cmp(ofs, limit);
3111       __ br(Assembler::LE, sha1_loop);
3112       __ mov(c_rarg0, ofs); // return ofs
3113     }
3114 
3115     __ ldpd(v10, v11, Address(sp, 16));
3116     __ ldpd(v8, v9, __ post(sp, 32));
3117 
3118     __ stpq(v0, v1, state);
3119 
3120     __ ret(lr);
3121 
3122     return start;
3123   }
3124 
3125 #ifndef BUILTIN_SIM
3126   // Safefetch stubs.
3127   void generate_safefetch(const char* name, int size, address* entry,
3128                           address* fault_pc, address* continuation_pc) {
3129     // safefetch signatures:
3130     //   int      SafeFetch32(int*      adr, int      errValue);
3131     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3132     //
3133     // arguments:
3134     //   c_rarg0 = adr
3135     //   c_rarg1 = errValue
3136     //
3137     // result:
3138     //   r0 = *adr or errValue
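         //
         // Expected use (a sketch, not part of the generated code): if the load at
         // *fault_pc faults, the VM's signal handler recognizes that pc and resumes
         // execution at *continuation_pc, where c_rarg1 still holds errValue because
         // the faulting load never wrote it; the stub then returns it in r0.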
3139 
3140     StubCodeMark mark(this, "StubRoutines", name);
3141 
3142     // Entry point, pc or function descriptor.
3143     *entry = __ pc();
3144 
3145     // Load *adr into c_rarg1, may fault.
3146     *fault_pc = __ pc();
3147     switch (size) {
3148       case 4:
3149         // int32_t
3150         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3151         break;
3152       case 8:
3153         // int64_t
3154         __ ldr(c_rarg1, Address(c_rarg0, 0));
3155         break;
3156       default:
3157         ShouldNotReachHere();
3158     }
3159 
3160     // return errValue or *adr
3161     *continuation_pc = __ pc();
3162     __ mov(r0, c_rarg1);
3163     __ ret(lr);
3164   }
3165 #endif
3166 
3167   /**
3168    *  Arguments:
3169    *
3170    * Inputs:
3171    *   c_rarg0   - int crc
3172    *   c_rarg1   - byte* buf
3173    *   c_rarg2   - int length
3174    *
3175    * Output:
3176    *        r0   - int crc result
3177    */
3178   address generate_updateBytesCRC32() {
3179     assert(UseCRC32Intrinsics, "what are we doing here?");
3180 
3181     __ align(CodeEntryAlignment);
3182     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3183 
3184     address start = __ pc();
3185 
3186     const Register crc   = c_rarg0;  // crc
3187     const Register buf   = c_rarg1;  // source java byte array address
3188     const Register len   = c_rarg2;  // length
3189     const Register table0 = c_rarg3; // crc_table address
3190     const Register table1 = c_rarg4;
3191     const Register table2 = c_rarg5;
3192     const Register table3 = c_rarg6;
3193     const Register tmp3 = c_rarg7;
3194 
3195     BLOCK_COMMENT("Entry:");
3196     __ enter(); // required for proper stackwalking of RuntimeStub frame
3197 
3198     __ kernel_crc32(crc, buf, len,
3199               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3200 
3201     __ leave(); // required for proper stackwalking of RuntimeStub frame
3202     __ ret(lr);
3203 
3204     return start;
3205   }
3206 
3207   /**
3208    *  Arguments:
3209    *
3210    * Inputs:
3211    *   c_rarg0   - int crc
3212    *   c_rarg1   - byte* buf
3213    *   c_rarg2   - int length
3214    *   c_rarg3   - int* table
3215    *
3216    * Output:
3217    *       r0   - int crc result
3218    */
3219   address generate_updateBytesCRC32C() {
3220     assert(UseCRC32CIntrinsics, "what are we doing here?");
3221 
3222     __ align(CodeEntryAlignment);
3223     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3224 
3225     address start = __ pc();
3226 
3227     const Register crc   = c_rarg0;  // crc
3228     const Register buf   = c_rarg1;  // source java byte array address
3229     const Register len   = c_rarg2;  // length
3230     const Register table0 = c_rarg3; // crc_table address
3231     const Register table1 = c_rarg4;
3232     const Register table2 = c_rarg5;
3233     const Register table3 = c_rarg6;
3234     const Register tmp3 = c_rarg7;
3235 
3236     BLOCK_COMMENT("Entry:");
3237     __ enter(); // required for proper stackwalking of RuntimeStub frame
3238 
3239     __ kernel_crc32c(crc, buf, len,
3240               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3241 
3242     __ leave(); // required for proper stackwalking of RuntimeStub frame
3243     __ ret(lr);
3244 
3245     return start;
3246   }
3247 
3248   /***
3249    *  Arguments:
3250    *
3251    *  Inputs:
3252    *   c_rarg0   - int   adler
3253    *   c_rarg1   - byte* buff
3254    *   c_rarg2   - int   len
3255    *
3256    * Output:
3257    *   c_rarg0   - int adler result
3258    */
3259   address generate_updateBytesAdler32() {
3260     __ align(CodeEntryAlignment);
3261     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3262     address start = __ pc();
3263 
3264     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3265 
3266     // Aliases
3267     Register adler  = c_rarg0;
3268     Register s1     = c_rarg0;
3269     Register s2     = c_rarg3;
3270     Register buff   = c_rarg1;
3271     Register len    = c_rarg2;
3272     Register nmax  = r4;
3273     Register base  = r5;
3274     Register count = r6;
3275     Register temp0 = rscratch1;
3276     Register temp1 = rscratch2;
3277     FloatRegister vbytes = v0;
3278     FloatRegister vs1acc = v1;
3279     FloatRegister vs2acc = v2;
3280     FloatRegister vtable = v3;
3281 
3282     // Max number of bytes we can process before having to take the mod
3283     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3284     unsigned long BASE = 0xfff1;
3285     unsigned long NMAX = 0x15B0;
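         // The modular reductions below rely on 2^16 == 15 (mod BASE), so x mod BASE
         // can be computed iteratively as x = (x >> 16) * 15 + (x & 0xffff), followed
         // by a final conditional subtraction of BASE. For example, 0x12345 (74565)
         // becomes 1 * 15 + 0x2345 = 9044, and 74565 mod 65521 is indeed 9044.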
3286 
3287     __ mov(base, BASE);
3288     __ mov(nmax, NMAX);
3289 
3290     // Load accumulation coefficients for the upper 16 bits
3291     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3292     __ ld1(vtable, __ T16B, Address(temp0));
3293 
3294     // s1 is initialized to the lower 16 bits of adler
3295     // s2 is initialized to the upper 16 bits of adler
3296     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3297     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3298 
3299     // The pipelined loop needs at least 16 elements for one iteration.
3300     // It checks this itself, but it is more efficient to branch straight to the cleanup loop here.
3301     __ cmp(len, (u1)16);
3302     __ br(Assembler::HS, L_nmax);
3303     __ cbz(len, L_combine);
3304 
3305     __ bind(L_simple_by1_loop);
3306     __ ldrb(temp0, Address(__ post(buff, 1)));
3307     __ add(s1, s1, temp0);
3308     __ add(s2, s2, s1);
3309     __ subs(len, len, 1);
3310     __ br(Assembler::HI, L_simple_by1_loop);
3311 
3312     // s1 = s1 % BASE
3313     __ subs(temp0, s1, base);
3314     __ csel(s1, temp0, s1, Assembler::HS);
3315 
3316     // s2 = s2 % BASE
3317     __ lsr(temp0, s2, 16);
3318     __ lsl(temp1, temp0, 4);
3319     __ sub(temp1, temp1, temp0);
3320     __ add(s2, temp1, s2, ext::uxth);
3321 
3322     __ subs(temp0, s2, base);
3323     __ csel(s2, temp0, s2, Assembler::HS);
3324 
3325     __ b(L_combine);
3326 
3327     __ bind(L_nmax);
3328     __ subs(len, len, nmax);
3329     __ sub(count, nmax, 16);
3330     __ br(Assembler::LO, L_by16);
3331 
3332     __ bind(L_nmax_loop);
3333 
3334     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3335                                       vbytes, vs1acc, vs2acc, vtable);
3336 
3337     __ subs(count, count, 16);
3338     __ br(Assembler::HS, L_nmax_loop);
3339 
3340     // s1 = s1 % BASE
3341     __ lsr(temp0, s1, 16);
3342     __ lsl(temp1, temp0, 4);
3343     __ sub(temp1, temp1, temp0);
3344     __ add(temp1, temp1, s1, ext::uxth);
3345 
3346     __ lsr(temp0, temp1, 16);
3347     __ lsl(s1, temp0, 4);
3348     __ sub(s1, s1, temp0);
3349     __ add(s1, s1, temp1, ext::uxth);
3350 
3351     __ subs(temp0, s1, base);
3352     __ csel(s1, temp0, s1, Assembler::HS);
3353 
3354     // s2 = s2 % BASE
3355     __ lsr(temp0, s2, 16);
3356     __ lsl(temp1, temp0, 4);
3357     __ sub(temp1, temp1, temp0);
3358     __ add(temp1, temp1, s2, ext::uxth);
3359 
3360     __ lsr(temp0, temp1, 16);
3361     __ lsl(s2, temp0, 4);
3362     __ sub(s2, s2, temp0);
3363     __ add(s2, s2, temp1, ext::uxth);
3364 
3365     __ subs(temp0, s2, base);
3366     __ csel(s2, temp0, s2, Assembler::HS);
3367 
3368     __ subs(len, len, nmax);
3369     __ sub(count, nmax, 16);
3370     __ br(Assembler::HS, L_nmax_loop);
3371 
3372     __ bind(L_by16);
3373     __ adds(len, len, count);
3374     __ br(Assembler::LO, L_by1);
3375 
3376     __ bind(L_by16_loop);
3377 
3378     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3379                                       vbytes, vs1acc, vs2acc, vtable);
3380 
3381     __ subs(len, len, 16);
3382     __ br(Assembler::HS, L_by16_loop);
3383 
3384     __ bind(L_by1);
3385     __ adds(len, len, 15);
3386     __ br(Assembler::LO, L_do_mod);
3387 
3388     __ bind(L_by1_loop);
3389     __ ldrb(temp0, Address(__ post(buff, 1)));
3390     __ add(s1, temp0, s1);
3391     __ add(s2, s2, s1);
3392     __ subs(len, len, 1);
3393     __ br(Assembler::HS, L_by1_loop);
3394 
3395     __ bind(L_do_mod);
3396     // s1 = s1 % BASE
3397     __ lsr(temp0, s1, 16);
3398     __ lsl(temp1, temp0, 4);
3399     __ sub(temp1, temp1, temp0);
3400     __ add(temp1, temp1, s1, ext::uxth);
3401 
3402     __ lsr(temp0, temp1, 16);
3403     __ lsl(s1, temp0, 4);
3404     __ sub(s1, s1, temp0);
3405     __ add(s1, s1, temp1, ext::uxth);
3406 
3407     __ subs(temp0, s1, base);
3408     __ csel(s1, temp0, s1, Assembler::HS);
3409 
3410     // s2 = s2 % BASE
3411     __ lsr(temp0, s2, 16);
3412     __ lsl(temp1, temp0, 4);
3413     __ sub(temp1, temp1, temp0);
3414     __ add(temp1, temp1, s2, ext::uxth);
3415 
3416     __ lsr(temp0, temp1, 16);
3417     __ lsl(s2, temp0, 4);
3418     __ sub(s2, s2, temp0);
3419     __ add(s2, s2, temp1, ext::uxth);
3420 
3421     __ subs(temp0, s2, base);
3422     __ csel(s2, temp0, s2, Assembler::HS);
3423 
3424     // Combine lower bits and higher bits
3425     __ bind(L_combine);
3426     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3427 
3428     __ ret(lr);
3429 
3430     return start;
3431   }
3432 
3433   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
3434           Register temp0, Register temp1, FloatRegister vbytes,
3435           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
3436     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
3437     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
3438     // In non-vectorized code, we update s1 and s2 as:
3439     //   s1 <- s1 + b1
3440     //   s2 <- s2 + s1
3441     //   s1 <- s1 + b2
3442     //   s2 <- s2 + s1
3443     //   ...
3444     //   s1 <- s1 + b16
3445     //   s2 <- s2 + s1
3446     // Putting above assignments together, we have:
3447     //   s1_new = s1 + b1 + b2 + ... + b16
3448     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
3449     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
3450     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
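         //
         // Scalar reference for this 16-byte step (illustration only, not part of the
         // generated code), assuming b[0..15] holds the 16 loaded bytes:
         //
         //   s2 += 16 * s1;
         //   for (int i = 0; i < 16; i++) {
         //     s1 += b[i];
         //     s2 += (16 - i) * b[i];
         //   }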
3451     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
3452 
3453     // s2 = s2 + s1 * 16
3454     __ add(s2, s2, s1, Assembler::LSL, 4);
3455 
3456     // vs1acc = b1 + b2 + b3 + ... + b16
3457     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
3458     __ umullv(vs2acc, __ T8B, vtable, vbytes);
3459     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
3460     __ uaddlv(vs1acc, __ T16B, vbytes);
3461     __ uaddlv(vs2acc, __ T8H, vs2acc);
3462 
3463     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
3464     __ fmovd(temp0, vs1acc);
3465     __ fmovd(temp1, vs2acc);
3466     __ add(s1, s1, temp0);
3467     __ add(s2, s2, temp1);
3468   }
3469 
3470   /**
3471    *  Arguments:
3472    *
3473    *  Input:
3474    *    c_rarg0   - x address
3475    *    c_rarg1   - x length
3476    *    c_rarg2   - y address
3477    *    c_rarg3   - y length
3478    *    c_rarg4   - z address
3479    *    c_rarg5   - z length
3480    */
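       //
       // This is the intrinsic behind java.math.BigInteger::multiplyToLen: roughly,
       // it computes z = x * y, where x and y are big-endian int arrays of xlen and
       // ylen 32-bit words and z has room for zlen = xlen + ylen words. The heavy
       // lifting is done by MacroAssembler::multiply_to_len.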
3481   address generate_multiplyToLen() {
3482     __ align(CodeEntryAlignment);
3483     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3484 
3485     address start = __ pc();
3486     const Register x     = r0;
3487     const Register xlen  = r1;
3488     const Register y     = r2;
3489     const Register ylen  = r3;
3490     const Register z     = r4;
3491     const Register zlen  = r5;
3492 
3493     const Register tmp1  = r10;
3494     const Register tmp2  = r11;
3495     const Register tmp3  = r12;
3496     const Register tmp4  = r13;
3497     const Register tmp5  = r14;
3498     const Register tmp6  = r15;
3499     const Register tmp7  = r16;
3500 
3501     BLOCK_COMMENT("Entry:");
3502     __ enter(); // required for proper stackwalking of RuntimeStub frame
3503     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3504     __ leave(); // required for proper stackwalking of RuntimeStub frame
3505     __ ret(lr);
3506 
3507     return start;
3508   }
3509 
3510   address generate_squareToLen() {
3511     // The squareToLen algorithm for sizes 1..127 described in the Java code is
3512     // faster than multiply_to_len on some CPUs and slower on others, but
3513     // multiply_to_len gives slightly better results overall, so it is used here.
3514     __ align(CodeEntryAlignment);
3515     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3516     address start = __ pc();
3517 
3518     const Register x     = r0;
3519     const Register xlen  = r1;
3520     const Register z     = r2;
3521     const Register zlen  = r3;
3522     const Register y     = r4; // == x
3523     const Register ylen  = r5; // == xlen
3524 
3525     const Register tmp1  = r10;
3526     const Register tmp2  = r11;
3527     const Register tmp3  = r12;
3528     const Register tmp4  = r13;
3529     const Register tmp5  = r14;
3530     const Register tmp6  = r15;
3531     const Register tmp7  = r16;
3532 
3533     RegSet spilled_regs = RegSet::of(y, ylen);
3534     BLOCK_COMMENT("Entry:");
3535     __ enter();
3536     __ push(spilled_regs, sp);
3537     __ mov(y, x);
3538     __ mov(ylen, xlen);
3539     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3540     __ pop(spilled_regs, sp);
3541     __ leave();
3542     __ ret(lr);
3543     return start;
3544   }
3545 
3546   address generate_mulAdd() {
3547     __ align(CodeEntryAlignment);
3548     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3549 
3550     address start = __ pc();
3551 
3552     const Register out     = r0;
3553     const Register in      = r1;
3554     const Register offset  = r2;
3555     const Register len     = r3;
3556     const Register k       = r4;
3557 
3558     BLOCK_COMMENT("Entry:");
3559     __ enter();
3560     __ mul_add(out, in, offset, len, k);
3561     __ leave();
3562     __ ret(lr);
3563 
3564     return start;
3565   }
3566 
3567   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3568                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3569                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3570     // Karatsuba multiplication performs a 128*128 -> 256-bit
3571     // multiplication in three 128-bit multiplications and a few
3572     // additions.
3573     //
3574     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3575     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3576     //
3577     // Inputs:
3578     //
3579     // A0 in a.d[0]     (subkey)
3580     // A1 in a.d[1]
3581     // (A1+A0) in a1_xor_a0.d[0]
3582     //
3583     // B0 in b.d[0]     (state)
3584     // B1 in b.d[1]
3585 
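         // Note that all of the '+' operations in the identities above are additions
         // in GF(2), i.e. bitwise XORs, which is why the combining steps below use
         // eor rather than integer adds.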
3586     __ ext(tmp1, __ T16B, b, b, 0x08);
3587     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3588     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3589     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3590     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3591 
3592     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3593     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3594     __ eor(tmp2, __ T16B, tmp2, tmp4);
3595     __ eor(tmp2, __ T16B, tmp2, tmp3);
3596 
3597     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3598     __ ins(result_hi, __ D, tmp2, 0, 1);
3599     __ ins(result_lo, __ D, tmp2, 1, 0);
3600   }
3601 
3602   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3603                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3604     const FloatRegister t0 = result;
3605 
3606     // The GCM field polynomial f is z^128 + p(z), where p =
3607     // z^7+z^2+z+1.
3608     //
3609     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3610     //
3611     // so, given that the product we're reducing is
3612     //    a == lo + hi * z^128
3613     // substituting,
3614     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3615     //
3616     // we reduce by multiplying hi by p(z) and subtracting the result
3617     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3618     // bits we can do this with two 64-bit multiplications, lo*p and
3619     // hi*p.
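         //
         // Sketch of the folding below (d[1]/d[0] denote the high/low 64-bit halves of
         // a vector register): the first pmull2 computes hi.d[1]*p and folds it into
         // hi.d[0]:lo.d[1]; the second pmull then computes the updated hi.d[0]*p and
         // folds it into lo, giving the fully reduced 128-bit result.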
3620 
3621     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3622     __ ext(t1, __ T16B, t0, z, 8);
3623     __ eor(hi, __ T16B, hi, t1);
3624     __ ext(t1, __ T16B, z, t0, 8);
3625     __ eor(lo, __ T16B, lo, t1);
3626     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3627     __ eor(result, __ T16B, lo, t0);
3628   }
3629 
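       // has_negatives stub: sets result (r0) to 1 if any byte in the ary1/len range
       // (r1/r2) has its sign bit (0x80) set, and to 0 otherwise. has_negatives_long
       // is a second entry point that skips the short-array (< 16 byte) handling.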
3630   address generate_has_negatives(address &has_negatives_long) {
3631     const u1 large_loop_size = 64;
3632     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3633     int dcache_line = VM_Version::dcache_line_size();
3634 
3635     Register ary1 = r1, len = r2, result = r0;
3636 
3637     __ align(CodeEntryAlignment);
3638 
3639     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3640 
3641     address entry = __ pc();
3642 
3643     __ enter();
3644 
3645   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3646         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3647 
3648   __ cmp(len, (u1)15);
3649   __ br(Assembler::GT, LEN_OVER_15);
3650   // Execution only falls into this code when the pointer is near the end of a
3651   // memory page and we have to avoid reading past it into the next page.
3652   __ add(ary1, ary1, len);
3653   __ subs(len, len, 8);
3654   __ br(Assembler::GT, LEN_OVER_8);
3655   __ ldr(rscratch2, Address(ary1, -8));
3656   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3657   __ lsrv(rscratch2, rscratch2, rscratch1);
3658   __ tst(rscratch2, UPPER_BIT_MASK);
3659   __ cset(result, Assembler::NE);
3660   __ leave();
3661   __ ret(lr);
3662   __ bind(LEN_OVER_8);
3663   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3664   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3665   __ tst(rscratch2, UPPER_BIT_MASK);
3666   __ br(Assembler::NE, RET_TRUE_NO_POP);
3667   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3668   __ lsrv(rscratch1, rscratch1, rscratch2);
3669   __ tst(rscratch1, UPPER_BIT_MASK);
3670   __ cset(result, Assembler::NE);
3671   __ leave();
3672   __ ret(lr);
3673 
3674   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3675   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3676 
3677   has_negatives_long = __ pc(); // 2nd entry point
3678 
3679   __ enter();
3680 
3681   __ bind(LEN_OVER_15);
3682     __ push(spilled_regs, sp);
3683     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3684     __ cbz(rscratch2, ALIGNED);
3685     __ ldp(tmp6, tmp1, Address(ary1));
3686     __ mov(tmp5, 16);
3687     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3688     __ add(ary1, ary1, rscratch1);
3689     __ sub(len, len, rscratch1);
3690     __ orr(tmp6, tmp6, tmp1);
3691     __ tst(tmp6, UPPER_BIT_MASK);
3692     __ br(Assembler::NE, RET_TRUE);
3693 
3694   __ bind(ALIGNED);
3695     __ cmp(len, large_loop_size);
3696     __ br(Assembler::LT, CHECK_16);
3697     // Perform a 16-byte load here as an early return from the pre-loop, to handle the
3698     // case where an already-aligned large array has negative values in its first bytes:
3699     // otherwise LARGE_LOOP would do 4 reads instead of 1 in the worst case, which is
3700     // slower. Cases with negative bytes further ahead are not affected much; in fact
3701     // they become faster thanks to the early loads and the fewer instructions and
3702     // branches in LARGE_LOOP.
3703     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3704     __ sub(len, len, 16);
3705     __ orr(tmp6, tmp6, tmp1);
3706     __ tst(tmp6, UPPER_BIT_MASK);
3707     __ br(Assembler::NE, RET_TRUE);
3708     __ cmp(len, large_loop_size);
3709     __ br(Assembler::LT, CHECK_16);
3710 
3711     if (SoftwarePrefetchHintDistance >= 0
3712         && SoftwarePrefetchHintDistance >= dcache_line) {
3713       // initial prefetch
3714       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3715     }
3716   __ bind(LARGE_LOOP);
3717     if (SoftwarePrefetchHintDistance >= 0) {
3718       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3719     }
3720     // Issue the load instructions first, since that can save a few CPU/memory cycles.
3721     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one per ldp), it is
3722     // better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves a few
3723     // instructions per iteration and has fewer branches. The downside is that this
3724     // disables the early return, so all 64 bytes are loaded and checked every time.
3725     __ ldp(tmp2, tmp3, Address(ary1));
3726     __ ldp(tmp4, tmp5, Address(ary1, 16));
3727     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3728     __ ldp(tmp6, tmp1, Address(ary1, 48));
3729     __ add(ary1, ary1, large_loop_size);
3730     __ sub(len, len, large_loop_size);
3731     __ orr(tmp2, tmp2, tmp3);
3732     __ orr(tmp4, tmp4, tmp5);
3733     __ orr(rscratch1, rscratch1, rscratch2);
3734     __ orr(tmp6, tmp6, tmp1);
3735     __ orr(tmp2, tmp2, tmp4);
3736     __ orr(rscratch1, rscratch1, tmp6);
3737     __ orr(tmp2, tmp2, rscratch1);
3738     __ tst(tmp2, UPPER_BIT_MASK);
3739     __ br(Assembler::NE, RET_TRUE);
3740     __ cmp(len, large_loop_size);
3741     __ br(Assembler::GE, LARGE_LOOP);
3742 
3743   __ bind(CHECK_16); // small 16-byte load pre-loop
3744     __ cmp(len, (u1)16);
3745     __ br(Assembler::LT, POST_LOOP16);
3746 
3747   __ bind(LOOP16); // small 16-byte load loop
3748     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3749     __ sub(len, len, 16);
3750     __ orr(tmp2, tmp2, tmp3);
3751     __ tst(tmp2, UPPER_BIT_MASK);
3752     __ br(Assembler::NE, RET_TRUE);
3753     __ cmp(len, (u1)16);
3754     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3755 
3756   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3757     __ cmp(len, (u1)8);
3758     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3759     __ ldr(tmp3, Address(__ post(ary1, 8)));
3760     __ sub(len, len, 8);
3761     __ tst(tmp3, UPPER_BIT_MASK);
3762     __ br(Assembler::NE, RET_TRUE);
3763 
3764   __ bind(POST_LOOP16_LOAD_TAIL);
3765     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3766     __ ldr(tmp1, Address(ary1));
3767     __ mov(tmp2, 64);
3768     __ sub(tmp4, tmp2, len, __ LSL, 3);
3769     __ lslv(tmp1, tmp1, tmp4);
3770     __ tst(tmp1, UPPER_BIT_MASK);
3771     __ br(Assembler::NE, RET_TRUE);
3772     // Fallthrough
3773 
3774   __ bind(RET_FALSE);
3775     __ pop(spilled_regs, sp);
3776     __ leave();
3777     __ mov(result, zr);
3778     __ ret(lr);
3779 
3780   __ bind(RET_TRUE);
3781     __ pop(spilled_regs, sp);
3782   __ bind(RET_TRUE_NO_POP);
3783     __ leave();
3784     __ mov(result, 1);
3785     __ ret(lr);
3786 
3787   __ bind(DONE);
3788     __ pop(spilled_regs, sp);
3789     __ leave();
3790     __ ret(lr);
3791     return entry;
3792   }
3793 
3794   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3795         bool usePrefetch, Label &NOT_EQUAL) {
3796     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3797         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3798         tmp7 = r12, tmp8 = r13;
3799     Label LOOP;
3800 
3801     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3802     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3803     __ bind(LOOP);
3804     if (usePrefetch) {
3805       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3806       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3807     }
3808     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3809     __ eor(tmp1, tmp1, tmp2);
3810     __ eor(tmp3, tmp3, tmp4);
3811     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3812     __ orr(tmp1, tmp1, tmp3);
3813     __ cbnz(tmp1, NOT_EQUAL);
3814     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3815     __ eor(tmp5, tmp5, tmp6);
3816     __ eor(tmp7, tmp7, tmp8);
3817     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3818     __ orr(tmp5, tmp5, tmp7);
3819     __ cbnz(tmp5, NOT_EQUAL);
3820     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3821     __ eor(tmp1, tmp1, tmp2);
3822     __ eor(tmp3, tmp3, tmp4);
3823     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3824     __ orr(tmp1, tmp1, tmp3);
3825     __ cbnz(tmp1, NOT_EQUAL);
3826     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3827     __ eor(tmp5, tmp5, tmp6);
3828     __ sub(cnt1, cnt1, 8 * wordSize);
3829     __ eor(tmp7, tmp7, tmp8);
3830     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3831     // tmp6 is not used. MacroAssembler::subs is used here (rather than
3832     // cmp) because subs allows an unlimited range of immediate operand.
3833     __ subs(tmp6, cnt1, loopThreshold);
3834     __ orr(tmp5, tmp5, tmp7);
3835     __ cbnz(tmp5, NOT_EQUAL);
3836     __ br(__ GE, LOOP);
3837     // post-loop
3838     __ eor(tmp1, tmp1, tmp2);
3839     __ eor(tmp3, tmp3, tmp4);
3840     __ orr(tmp1, tmp1, tmp3);
3841     __ sub(cnt1, cnt1, 2 * wordSize);
3842     __ cbnz(tmp1, NOT_EQUAL);
3843   }
3844 
3845   void generate_large_array_equals_loop_simd(int loopThreshold,
3846         bool usePrefetch, Label &NOT_EQUAL) {
3847     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3848         tmp2 = rscratch2;
3849     Label LOOP;
3850 
3851     __ bind(LOOP);
3852     if (usePrefetch) {
3853       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3854       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3855     }
3856     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3857     __ sub(cnt1, cnt1, 8 * wordSize);
3858     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3859     __ subs(tmp1, cnt1, loopThreshold);
3860     __ eor(v0, __ T16B, v0, v4);
3861     __ eor(v1, __ T16B, v1, v5);
3862     __ eor(v2, __ T16B, v2, v6);
3863     __ eor(v3, __ T16B, v3, v7);
3864     __ orr(v0, __ T16B, v0, v1);
3865     __ orr(v1, __ T16B, v2, v3);
3866     __ orr(v0, __ T16B, v0, v1);
3867     __ umov(tmp1, v0, __ D, 0);
3868     __ umov(tmp2, v0, __ D, 1);
3869     __ orr(tmp1, tmp1, tmp2);
3870     __ cbnz(tmp1, NOT_EQUAL);
3871     __ br(__ GE, LOOP);
3872   }
3873 
3874   // a1 = r1 - array1 address
3875   // a2 = r2 - array2 address
3876   // result = r0 - return value. Already contains "false"
3877   // cnt1 = r10 - number of elements left to check, reduced by wordSize
3878   // r3-r5 are reserved temporary registers
3879   address generate_large_array_equals() {
3880     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3881         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3882         tmp7 = r12, tmp8 = r13;
3883     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3884         SMALL_LOOP, POST_LOOP;
3885     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3886     // threshold chosen so that at least 32 of the prefetched bytes are actually used
3887     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3888     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3889     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3890     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3891         tmp5, tmp6, tmp7, tmp8);
3892 
3893     __ align(CodeEntryAlignment);
3894 
3895     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3896 
3897     address entry = __ pc();
3898     __ enter();
3899     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
3900     // also advance pointers to use post-increment instead of pre-increment
3901     __ add(a1, a1, wordSize);
3902     __ add(a2, a2, wordSize);
3903     if (AvoidUnalignedAccesses) {
3904       // Both implementations (SIMD/non-SIMD) use relatively large load
3905       // instructions (ld1/ldp), which carry a big penalty (up to 2x execution time)
3906       // on some CPUs when the address is not at least 16-byte aligned.
3907       // Arrays are currently 8-byte aligned, so if needed we do one extra 8-byte
3908       // load to make at least the first source address 16-byte aligned.
3909       Label ALIGNED16;
3910       __ tbz(a1, 3, ALIGNED16);
3911       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3912       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3913       __ sub(cnt1, cnt1, wordSize);
3914       __ eor(tmp1, tmp1, tmp2);
3915       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3916       __ bind(ALIGNED16);
3917     }
3918     if (UseSIMDForArrayEquals) {
3919       if (SoftwarePrefetchHintDistance >= 0) {
3920         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3921         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3922         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3923             /* prfm = */ true, NOT_EQUAL);
3924         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3925         __ br(__ LT, TAIL);
3926       }
3927       __ bind(NO_PREFETCH_LARGE_LOOP);
3928       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3929           /* prfm = */ false, NOT_EQUAL);
3930     } else {
3931       __ push(spilled_regs, sp);
3932       if (SoftwarePrefetchHintDistance >= 0) {
3933         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3934         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3935         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3936             /* prfm = */ true, NOT_EQUAL);
3937         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3938         __ br(__ LT, TAIL);
3939       }
3940       __ bind(NO_PREFETCH_LARGE_LOOP);
3941       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3942           /* prfm = */ false, NOT_EQUAL);
3943     }
3944     __ bind(TAIL);
3945       __ cbz(cnt1, EQUAL);
3946       __ subs(cnt1, cnt1, wordSize);
3947       __ br(__ LE, POST_LOOP);
3948     __ bind(SMALL_LOOP);
3949       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3950       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3951       __ subs(cnt1, cnt1, wordSize);
3952       __ eor(tmp1, tmp1, tmp2);
3953       __ cbnz(tmp1, NOT_EQUAL);
3954       __ br(__ GT, SMALL_LOOP);
3955     __ bind(POST_LOOP);
3956       __ ldr(tmp1, Address(a1, cnt1));
3957       __ ldr(tmp2, Address(a2, cnt1));
3958       __ eor(tmp1, tmp1, tmp2);
3959       __ cbnz(tmp1, NOT_EQUAL);
3960     __ bind(EQUAL);
3961       __ mov(result, true);
3962     __ bind(NOT_EQUAL);
3963       if (!UseSIMDForArrayEquals) {
3964         __ pop(spilled_regs, sp);
3965       }
3966     __ bind(NOT_EQUAL_NO_POP);
3967     __ leave();
3968     __ ret(lr);
3969     return entry;
3970   }
3971 
3972   address generate_dsin_dcos(bool isCos) {
3973     __ align(CodeEntryAlignment);
3974     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
3975     address start = __ pc();
3976     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
3977         (address)StubRoutines::aarch64::_two_over_pi,
3978         (address)StubRoutines::aarch64::_pio2,
3979         (address)StubRoutines::aarch64::_dsin_coef,
3980         (address)StubRoutines::aarch64::_dcos_coef);
3981     return start;
3982   }
3983 
3984   address generate_dlog() {
3985     __ align(CodeEntryAlignment);
3986     StubCodeMark mark(this, "StubRoutines", "dlog");
3987     address entry = __ pc();
3988     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
3989         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
3990     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
3991     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
3992         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
3993     return entry;
3994   }
3995 
3996   // code for comparing 16 bytes of two strings with the same encoding
3997   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
3998     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
3999     __ ldr(rscratch1, Address(__ post(str1, 8)));
4000     __ eor(rscratch2, tmp1, tmp2);
4001     __ ldr(cnt1, Address(__ post(str2, 8)));
4002     __ cbnz(rscratch2, DIFF1);
4003     __ ldr(tmp1, Address(__ post(str1, 8)));
4004     __ eor(rscratch2, rscratch1, cnt1);
4005     __ ldr(tmp2, Address(__ post(str2, 8)));
4006     __ cbnz(rscratch2, DIFF2);
4007   }
4008 
4009   // code for comparing 16 characters of two strings, one Latin1- and one UTF-16-encoded
4010   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4011       Label &DIFF2) {
4012     Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12;
4013     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
4014 
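         // vtmpZ is pre-zeroed by the caller; zip1/zip2 interleave the Latin1 bytes
         // with zero bytes, widening them to UTF-16 so that 8-byte chunks can be
         // compared directly against the UTF-16 string loaded through cnt1.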
4015     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4016     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4017     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4018     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4019 
4020     __ fmovd(tmpL, vtmp3);
4021     __ eor(rscratch2, tmp3, tmpL);
4022     __ cbnz(rscratch2, DIFF2);
4023 
4024     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4025     __ umov(tmpL, vtmp3, __ D, 1);
4026     __ eor(rscratch2, tmpU, tmpL);
4027     __ cbnz(rscratch2, DIFF1);
4028 
4029     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4030     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4031     __ fmovd(tmpL, vtmp);
4032     __ eor(rscratch2, tmp3, tmpL);
4033     __ cbnz(rscratch2, DIFF2);
4034 
4035     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4036     __ umov(tmpL, vtmp, __ D, 1);
4037     __ eor(rscratch2, tmpU, tmpL);
4038     __ cbnz(rscratch2, DIFF1);
4039   }
4040 
4041   // r0  = result
4042   // r1  = str1
4043   // r2  = cnt1
4044   // r3  = str2
4045   // r4  = cnt2
4046   // r10 = tmp1
4047   // r11 = tmp2
4048   address generate_compare_long_string_different_encoding(bool isLU) {
4049     __ align(CodeEntryAlignment);
4050     StubCodeMark mark(this, "StubRoutines", isLU
4051         ? "compare_long_string_different_encoding LU"
4052         : "compare_long_string_different_encoding UL");
4053     address entry = __ pc();
4054     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4055         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, SMALL_LOOP_ENTER,
4056         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4057     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4058         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4059     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4060     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4061 
4062     int prefetchLoopExitCondition = MAX(32, SoftwarePrefetchHintDistance/2);
4063 
4064     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4065     // cnt2 == number of characters left to compare
4066     // Check the already-loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
4067     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4068     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4069     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4070     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4071     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols; the last 4 are a special case.
4072     __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1);
4073     __ eor(rscratch2, tmp1, tmp2);
4074     __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0);
4075     __ mov(rscratch1, tmp2);
4076     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4077     Register strU = isLU ? str2 : str1,
4078              strL = isLU ? str1 : str2,
4079              tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4080              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4081     __ push(spilled_regs, sp);
4082     __ sub(tmp2, strL, cnt2); // strL pointer to load from
4083     __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from
4084 
4085     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4086 
4087     if (SoftwarePrefetchHintDistance >= 0) {
4088       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4089       __ br(__ LT, SMALL_LOOP);
4090       __ bind(LARGE_LOOP_PREFETCH);
4091         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4092         __ mov(tmp4, 2);
4093         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4094         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4095           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4096           __ subs(tmp4, tmp4, 1);
4097           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4098           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4099           __ mov(tmp4, 2);
4100         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4101           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4102           __ subs(tmp4, tmp4, 1);
4103           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4104           __ sub(cnt2, cnt2, 64);
4105           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4106           __ br(__ GE, LARGE_LOOP_PREFETCH);
4107     }
4108     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4109     __ subs(cnt2, cnt2, 16);
4110     __ br(__ LT, TAIL);
4111     __ b(SMALL_LOOP_ENTER);
4112     __ bind(SMALL_LOOP); // smaller loop
4113       __ subs(cnt2, cnt2, 16);
4114     __ bind(SMALL_LOOP_ENTER);
4115       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4116       __ br(__ GE, SMALL_LOOP);
4117       __ cbz(cnt2, LOAD_LAST);
4118     __ bind(TAIL); // 1..15 characters left
4119       __ subs(zr, cnt2, -8);
4120       __ br(__ GT, TAIL_LOAD_16);
4121       __ ldrd(vtmp, Address(tmp2));
4122       __ zip1(vtmp3, __ T8B, vtmp, vtmpZ);
4123 
4124       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4125       __ fmovd(tmpL, vtmp3);
4126       __ eor(rscratch2, tmp3, tmpL);
4127       __ cbnz(rscratch2, DIFF2);
4128       __ umov(tmpL, vtmp3, __ D, 1);
4129       __ eor(rscratch2, tmpU, tmpL);
4130       __ cbnz(rscratch2, DIFF1);
4131       __ b(LOAD_LAST);
4132     __ bind(TAIL_LOAD_16);
4133       __ ldrq(vtmp, Address(tmp2));
4134       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4135       __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4136       __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4137       __ fmovd(tmpL, vtmp3);
4138       __ eor(rscratch2, tmp3, tmpL);
4139       __ cbnz(rscratch2, DIFF2);
4140 
4141       __ ldr(tmp3, Address(__ post(cnt1, 8)));
4142       __ umov(tmpL, vtmp3, __ D, 1);
4143       __ eor(rscratch2, tmpU, tmpL);
4144       __ cbnz(rscratch2, DIFF1);
4145 
4146       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4147       __ fmovd(tmpL, vtmp);
4148       __ eor(rscratch2, tmp3, tmpL);
4149       __ cbnz(rscratch2, DIFF2);
4150 
4151       __ umov(tmpL, vtmp, __ D, 1);
4152       __ eor(rscratch2, tmpU, tmpL);
4153       __ cbnz(rscratch2, DIFF1);
4154       __ b(LOAD_LAST);
4155     __ bind(DIFF2);
4156       __ mov(tmpU, tmp3);
4157     __ bind(DIFF1);
4158       __ pop(spilled_regs, sp);
4159       __ b(CALCULATE_DIFFERENCE);
4160     __ bind(LOAD_LAST);
4161       __ pop(spilled_regs, sp);
4162 
4163       __ ldrs(vtmp, Address(strL));
4164       __ ldr(tmpU, Address(strU));
4165       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4166       __ fmovd(tmpL, vtmp);
4167 
4168       __ eor(rscratch2, tmpU, tmpL);
4169       __ cbz(rscratch2, DONE);
4170 
4171     // Find the first different characters in the longwords and
4172     // compute their difference.
4173     __ bind(CALCULATE_DIFFERENCE);
4174       __ rev(rscratch2, rscratch2);
4175       __ clz(rscratch2, rscratch2);
4176       __ andr(rscratch2, rscratch2, -16);
4177       __ lsrv(tmp1, tmp1, rscratch2);
4178       __ uxthw(tmp1, tmp1);
4179       __ lsrv(rscratch1, rscratch1, rscratch2);
4180       __ uxthw(rscratch1, rscratch1);
4181       __ subw(result, tmp1, rscratch1);
4182     __ bind(DONE);
4183       __ ret(lr);
4184     return entry;
4185   }
4186 
4187   // r0  = result
4188   // r1  = str1
4189   // r2  = cnt1
4190   // r3  = str2
4191   // r4  = cnt2
4192   // r10 = tmp1
4193   // r11 = tmp2
4194   address generate_compare_long_string_same_encoding(bool isLL) {
4195     __ align(CodeEntryAlignment);
4196     StubCodeMark mark(this, "StubRoutines", isLL
4197         ? "compare_long_string_same_encoding LL"
4198         : "compare_long_string_same_encoding UU");
4199     address entry = __ pc();
4200     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4201         tmp1 = r10, tmp2 = r11;
4202     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4203         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4204         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4205     // Exit the large loop when fewer than 64 bytes are left to read or we are about
4206     // to prefetch memory beyond the array boundary.
4207     int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4208     // cnt1/cnt2 contain the number of characters to compare; cnt1 can be re-used.
4209     // Update the cnt2 counter to account for the 8 bytes already loaded.
4210     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4211     // update pointers, because of previous read
4212     __ add(str1, str1, wordSize);
4213     __ add(str2, str2, wordSize);
4214     if (SoftwarePrefetchHintDistance >= 0) {
4215       __ bind(LARGE_LOOP_PREFETCH);
4216         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4217         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4218         compare_string_16_bytes_same(DIFF, DIFF2);
4219         compare_string_16_bytes_same(DIFF, DIFF2);
4220         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4221         compare_string_16_bytes_same(DIFF, DIFF2);
4222         __ subs(rscratch2, cnt2, largeLoopExitCondition);
4223         compare_string_16_bytes_same(DIFF, DIFF2);
4224         __ br(__ GT, LARGE_LOOP_PREFETCH);
4225         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4226         // less than 16 bytes left?
4227         __ subs(cnt2, cnt2, isLL ? 16 : 8);
4228         __ br(__ LT, TAIL);
4229     }
4230     __ bind(SMALL_LOOP);
4231       compare_string_16_bytes_same(DIFF, DIFF2);
4232       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4233       __ br(__ GE, SMALL_LOOP);
4234     __ bind(TAIL);
4235       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4236       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4237       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4238       __ br(__ LE, CHECK_LAST);
4239       __ eor(rscratch2, tmp1, tmp2);
4240       __ cbnz(rscratch2, DIFF);
4241       __ ldr(tmp1, Address(__ post(str1, 8)));
4242       __ ldr(tmp2, Address(__ post(str2, 8)));
4243       __ sub(cnt2, cnt2, isLL ? 8 : 4);
4244     __ bind(CHECK_LAST);
4245       if (!isLL) {
4246         __ add(cnt2, cnt2, cnt2); // now in bytes
4247       }
4248       __ eor(rscratch2, tmp1, tmp2);
4249       __ cbnz(rscratch2, DIFF);
4250       __ ldr(rscratch1, Address(str1, cnt2));
4251       __ ldr(cnt1, Address(str2, cnt2));
4252       __ eor(rscratch2, rscratch1, cnt1);
4253       __ cbz(rscratch2, LENGTH_DIFF);
4254       // Find the first different characters in the longwords and
4255       // compute their difference.
4256     __ bind(DIFF2);
4257       __ rev(rscratch2, rscratch2);
4258       __ clz(rscratch2, rscratch2);
4259       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4260       __ lsrv(rscratch1, rscratch1, rscratch2);
4261       if (isLL) {
4262         __ lsrv(cnt1, cnt1, rscratch2);
4263         __ uxtbw(rscratch1, rscratch1);
4264         __ uxtbw(cnt1, cnt1);
4265       } else {
4266         __ lsrv(cnt1, cnt1, rscratch2);
4267         __ uxthw(rscratch1, rscratch1);
4268         __ uxthw(cnt1, cnt1);
4269       }
4270       __ subw(result, rscratch1, cnt1);
4271       __ b(LENGTH_DIFF);
4272     __ bind(DIFF);
4273       __ rev(rscratch2, rscratch2);
4274       __ clz(rscratch2, rscratch2);
4275       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4276       __ lsrv(tmp1, tmp1, rscratch2);
4277       if (isLL) {
4278         __ lsrv(tmp2, tmp2, rscratch2);
4279         __ uxtbw(tmp1, tmp1);
4280         __ uxtbw(tmp2, tmp2);
4281       } else {
4282         __ lsrv(tmp2, tmp2, rscratch2);
4283         __ uxthw(tmp1, tmp1);
4284         __ uxthw(tmp2, tmp2);
4285       }
4286       __ subw(result, tmp1, tmp2);
4287       __ b(LENGTH_DIFF);
4288     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4289       __ eor(rscratch2, tmp1, tmp2);
4290       __ cbnz(rscratch2, DIFF);
4291     __ bind(LENGTH_DIFF);
4292       __ ret(lr);
4293     return entry;
4294   }
4295 
4296   void generate_compare_long_strings() {
4297       StubRoutines::aarch64::_compare_long_string_LL
4298           = generate_compare_long_string_same_encoding(true);
4299       StubRoutines::aarch64::_compare_long_string_UU
4300           = generate_compare_long_string_same_encoding(false);
4301       StubRoutines::aarch64::_compare_long_string_LU
4302           = generate_compare_long_string_different_encoding(true);
4303       StubRoutines::aarch64::_compare_long_string_UL
4304           = generate_compare_long_string_different_encoding(false);
4305   }
4306 
4307   // R0 = result
4308   // R1 = str2
4309   // R2 = cnt1
4310   // R3 = str1
4311   // R4 = cnt2
4312   // This generic linear code uses a few additional ideas that make it faster:
4313   // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
4314   // in order to skip initial loading (helps on systems with 1 load pipeline)
4315   // 2) we can use a "fast" algorithm to find the first occurrence of the pattern's
4316   // 1st character with fewer branches (1 branch per loaded register instead of a
4317   // branch per symbol); this is where constants like 0x0101...01,
4318   // 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from (sketched below)
4319   // 3) after loading and analyzing the 1st register of the source string, it can be
4320   // used to search for every occurrence of the 1st character, saving a few loads
4321   // compared with a simpler-but-slower implementation
4322   // 4) in order to avoid lots of push/pop operations, the code below heavily
4323   // re-uses/re-initializes/compresses register values, which makes the code
4324   // larger and a bit less readable; however, most of the extra operations are
4325   // issued during loads or branches, so the penalty is minimal
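       // A minimal sketch of idea 2, in C, approximately (LL case shown; the UU case
       // uses 16-bit lanes with 0x0001...0001 / 0x7fff...7fff). 'first' is the 1st
       // pattern character replicated into every lane, as done below via mul:
       //   uint64_t chunk = load_8_bytes(str2);   // one register of the source
       //   uint64_t x     = chunk ^ first;        // lanes equal to the 1st char become 0
       //   uint64_t hits  = (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
       // 'hits' has the top bit set in every lane where the 1st character matched. The
       // code folds ~x & 0x80...80 into ~(x | 0x7f...7f) so a single BICS both masks
       // and sets the flags.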
4326   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4327     const char* stubName = str1_isL
4328         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4329         : "indexof_linear_uu";
4330     __ align(CodeEntryAlignment);
4331     StubCodeMark mark(this, "StubRoutines", stubName);
4332     address entry = __ pc();
4333 
4334     int str1_chr_size = str1_isL ? 1 : 2;
4335     int str2_chr_size = str2_isL ? 1 : 2;
4336     int str1_chr_shift = str1_isL ? 0 : 1;
4337     int str2_chr_shift = str2_isL ? 0 : 1;
4338     bool isL = str1_isL && str2_isL;
4339     // parameters
4340     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4341     // temporary registers
4342     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4343     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4344     // redefinitions
4345     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4346 
4347     __ push(spilled_regs, sp);
4348     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
4349         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4350         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4351         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4352         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4353         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
4354     // Read whole register from str1. It is safe, because length >=8 here
4355     __ ldr(ch1, Address(str1));
4356     // Read whole register from str2. It is safe, because length >=8 here
4357     __ ldr(ch2, Address(str2));
4358     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4359     if (str1_isL != str2_isL) {
4360       __ eor(v0, __ T16B, v0, v0);
4361     }
4362     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4363     __ mul(first, first, tmp1);
4364     // check if we have less than 1 register to check
4365     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4366     if (str1_isL != str2_isL) {
4367       __ fmovd(v1, ch1);
4368     }
4369     __ br(__ LE, L_SMALL);
4370     __ eor(ch2, first, ch2);
4371     if (str1_isL != str2_isL) {
4372       __ zip1(v1, __ T16B, v1, v0);
4373     }
4374     __ sub(tmp2, ch2, tmp1);
4375     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4376     __ bics(tmp2, tmp2, ch2);
4377     if (str1_isL != str2_isL) {
4378       __ fmovd(ch1, v1);
4379     }
4380     __ br(__ NE, L_HAS_ZERO);
4381     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4382     __ add(result, result, wordSize/str2_chr_size);
4383     __ add(str2, str2, wordSize);
4384     __ br(__ LT, L_POST_LOOP);
4385     __ BIND(L_LOOP);
4386       __ ldr(ch2, Address(str2));
4387       __ eor(ch2, first, ch2);
4388       __ sub(tmp2, ch2, tmp1);
4389       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4390       __ bics(tmp2, tmp2, ch2);
4391       __ br(__ NE, L_HAS_ZERO);
4392     __ BIND(L_LOOP_PROCEED);
4393       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4394       __ add(str2, str2, wordSize);
4395       __ add(result, result, wordSize/str2_chr_size);
4396       __ br(__ GE, L_LOOP);
4397     __ BIND(L_POST_LOOP);
4398       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
4399       __ br(__ LE, NOMATCH);
4400       __ ldr(ch2, Address(str2));
4401       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4402       __ eor(ch2, first, ch2);
4403       __ sub(tmp2, ch2, tmp1);
4404       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4405       __ mov(tmp4, -1); // all bits set
4406       __ b(L_SMALL_PROCEED);
4407     __ align(OptoLoopAlignment);
4408     __ BIND(L_SMALL);
4409       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4410       __ eor(ch2, first, ch2);
4411       if (str1_isL != str2_isL) {
4412         __ zip1(v1, __ T16B, v1, v0);
4413       }
4414       __ sub(tmp2, ch2, tmp1);
4415       __ mov(tmp4, -1); // all bits set
4416       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4417       if (str1_isL != str2_isL) {
4418         __ fmovd(ch1, v1); // move converted 4 symbols
4419       }
4420     __ BIND(L_SMALL_PROCEED);
4421       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
4422       __ bic(tmp2, tmp2, ch2);
4423       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
4424       __ rbit(tmp2, tmp2);
4425       __ br(__ EQ, NOMATCH);
4426     __ BIND(L_SMALL_HAS_ZERO_LOOP);
4427       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
4428       __ cmp(cnt1, u1(wordSize/str2_chr_size));
4429       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4430       if (str2_isL) { // LL
4431         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4432         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4433         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4434         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4435         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4436       } else {
4437         __ mov(ch2, 0xE); // mask to round the byte offset down to a char (2-byte) boundary
4438         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4439         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4440         __ lslv(tmp2, tmp2, tmp4);
4441         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4442         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4443         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4444         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4445       }
4446       __ cmp(ch1, ch2);
4447       __ mov(tmp4, wordSize/str2_chr_size);
4448       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4449     __ BIND(L_SMALL_CMP_LOOP);
4450       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4451                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4452       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4453                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4454       __ add(tmp4, tmp4, 1);
4455       __ cmp(tmp4, cnt1);
4456       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4457       __ cmp(first, ch2);
4458       __ br(__ EQ, L_SMALL_CMP_LOOP);
4459     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4460       __ cbz(tmp2, NOMATCH); // no more matches. exit
4461       __ clz(tmp4, tmp2);
4462       __ add(result, result, 1); // advance index
4463       __ add(str2, str2, str2_chr_size); // advance pointer
4464       __ b(L_SMALL_HAS_ZERO_LOOP);
4465     __ align(OptoLoopAlignment);
4466     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4467       __ cmp(first, ch2);
4468       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4469       __ b(DONE);
4470     __ align(OptoLoopAlignment);
4471     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4472       if (str2_isL) { // LL
4473         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4474         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4475         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4476         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4477         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4478       } else {
4479         __ mov(ch2, 0xE); // mask to round the byte offset down to a char (2-byte) boundary
4480         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4481         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4482         __ lslv(tmp2, tmp2, tmp4);
4483         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4484         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4485         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4486         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4487       }
4488       __ cmp(ch1, ch2);
4489       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4490       __ b(DONE);
4491     __ align(OptoLoopAlignment);
4492     __ BIND(L_HAS_ZERO);
4493       __ rbit(tmp2, tmp2);
4494       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
4495       // Now, compress both counters (cnt2 and cnt1) into one register. This is
4496       // fine because both counters are 32-bit and are not changed in this loop;
4497       // they are just restored on exit. So, cnt1 can be re-used in this loop.
4498       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
4499       __ sub(result, result, 1);
4500     __ BIND(L_HAS_ZERO_LOOP);
4501       __ mov(cnt1, wordSize/str2_chr_size);
4502       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4503       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4504       if (str2_isL) {
4505         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4506         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4507         __ lslv(tmp2, tmp2, tmp4);
4508         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4509         __ add(tmp4, tmp4, 1);
4510         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4511         __ lsl(tmp2, tmp2, 1);
4512         __ mov(tmp4, wordSize/str2_chr_size);
4513       } else {
4514         __ mov(ch2, 0xE); // mask to round the byte offset down to a char (2-byte) boundary
4515         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4516         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4517         __ lslv(tmp2, tmp2, tmp4);
4518         __ add(tmp4, tmp4, 1);
4519         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4520         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4521         __ lsl(tmp2, tmp2, 1);
4522         __ mov(tmp4, wordSize/str2_chr_size);
4523         __ sub(str2, str2, str2_chr_size);
4524       }
4525       __ cmp(ch1, ch2);
4526       __ mov(tmp4, wordSize/str2_chr_size);
4527       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4528     __ BIND(L_CMP_LOOP);
4529       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4530                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4531       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4532                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4533       __ add(tmp4, tmp4, 1);
4534       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4535       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4536       __ cmp(cnt1, ch2);
4537       __ br(__ EQ, L_CMP_LOOP);
4538     __ BIND(L_CMP_LOOP_NOMATCH);
4539       // here we're not matched
4540       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4541       __ clz(tmp4, tmp2);
4542       __ add(str2, str2, str2_chr_size); // advance pointer
4543       __ b(L_HAS_ZERO_LOOP);
4544     __ align(OptoLoopAlignment);
4545     __ BIND(L_CMP_LOOP_LAST_CMP);
4546       __ cmp(cnt1, ch2);
4547       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4548       __ b(DONE);
4549     __ align(OptoLoopAlignment);
4550     __ BIND(L_CMP_LOOP_LAST_CMP2);
4551       if (str2_isL) {
4552         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4553         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4554         __ lslv(tmp2, tmp2, tmp4);
4555         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4556         __ add(tmp4, tmp4, 1);
4557         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4558         __ lsl(tmp2, tmp2, 1);
4559       } else {
4560         __ mov(ch2, 0xE); // mask to round the byte offset down to a char (2-byte) boundary
4561         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4562         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4563         __ lslv(tmp2, tmp2, tmp4);
4564         __ add(tmp4, tmp4, 1);
4565         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4566         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4567         __ lsl(tmp2, tmp2, 1);
4568         __ sub(str2, str2, str2_chr_size);
4569       }
4570       __ cmp(ch1, ch2);
4571       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4572       __ b(DONE);
4573     __ align(OptoLoopAlignment);
4574     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
4575       // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
4576       // until the L_HAS_ZERO block. A byte octet was analyzed in L_HAS_ZERO_LOOP,
4577       // so result was increased by at most wordSize/str2_chr_size - 1 and the
4578       // respective high bits weren't changed. L_LOOP_PROCEED will increase
4579       // result by the number of analyzed characters, so we can just reset the
4580       // lower bits of result here: clear 2 lower bits for UU/UL and 3 bits for LL.
4581       // 2) Restore cnt1 and cnt2 values from the "compressed" cnt2.
4582       // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the index
4583       // of the last analyzed substring inside the current octet, so str2 is at the
4584       // respective start address; we need to advance it to the next octet.
4585       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4586       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4587       __ bfm(result, zr, 0, 2 - str2_chr_shift);
4588       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4589       __ movw(cnt2, cnt2);
4590       __ b(L_LOOP_PROCEED);
4591     __ align(OptoLoopAlignment);
4592     __ BIND(NOMATCH);
4593       __ mov(result, -1);
4594     __ BIND(DONE);
4595       __ pop(spilled_regs, sp);
4596       __ ret(lr);
4597     return entry;
4598   }
4599 
4600   void generate_string_indexof_stubs() {
4601     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
4602     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
4603     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
4604   }
4605 
4606   void inflate_and_store_2_fp_registers(bool generatePrfm,
4607       FloatRegister src1, FloatRegister src2) {
4608     Register dst = r1;
4609     __ zip1(v1, __ T16B, src1, v0);
4610     __ zip2(v2, __ T16B, src1, v0);
4611     if (generatePrfm) {
4612       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4613     }
4614     __ zip1(v3, __ T16B, src2, v0);
4615     __ zip2(v4, __ T16B, src2, v0);
4616     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4617   }
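       // In C, approximately, inflate_and_store_2_fp_registers() performs, per 16-byte
       // source register (little-endian lanes):
       //   for (int i = 0; i < 16; i++) dst16[i] = (uint16_t)src8[i];
       // zip1/zip2 against the zero register v0 interleave a zero byte after each
       // source byte, which is exactly latin-1 -> UTF-16LE inflation.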
4618 
4619   // R0 = src
4620   // R1 = dst
4621   // R2 = len
4622   // R3 = len >> 3
4623   // V0 = 0
4624   // v1 = loaded 8 bytes
4625   address generate_large_byte_array_inflate() {
4626     __ align(CodeEntryAlignment);
4627     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4628     address entry = __ pc();
4629     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4630     Register src = r0, dst = r1, len = r2, octetCounter = r3;
4631     const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
4632 
4633     // do one more 8-byte read so that the address is 16-byte aligned in most
4634     // cases; this also lets us use a single store instruction
4635     __ ldrd(v2, __ post(src, 8));
4636     __ sub(octetCounter, octetCounter, 2);
4637     __ zip1(v1, __ T16B, v1, v0);
4638     __ zip1(v2, __ T16B, v2, v0);
4639     __ st1(v1, v2, __ T16B, __ post(dst, 32));
4640     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4641     __ subs(rscratch1, octetCounter, large_loop_threshold);
4642     __ br(__ LE, LOOP_START);
4643     __ b(LOOP_PRFM_START);
4644     __ bind(LOOP_PRFM);
4645       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4646     __ bind(LOOP_PRFM_START);
4647       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4648       __ sub(octetCounter, octetCounter, 8);
4649       __ subs(rscratch1, octetCounter, large_loop_threshold);
4650       inflate_and_store_2_fp_registers(true, v3, v4);
4651       inflate_and_store_2_fp_registers(true, v5, v6);
4652       __ br(__ GT, LOOP_PRFM);
4653       __ cmp(octetCounter, (u1)8);
4654       __ br(__ LT, DONE);
4655     __ bind(LOOP);
4656       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4657       __ bind(LOOP_START);
4658       __ sub(octetCounter, octetCounter, 8);
4659       __ cmp(octetCounter, (u1)8);
4660       inflate_and_store_2_fp_registers(false, v3, v4);
4661       inflate_and_store_2_fp_registers(false, v5, v6);
4662       __ br(__ GE, LOOP);
4663     __ bind(DONE);
4664       __ ret(lr);
4665     return entry;
4666   }
4667 
4668   /**
4669    *  Arguments:
4670    *
4671    *  Input:
4672    *  c_rarg0   - current state address
4673    *  c_rarg1   - H key address
4674    *  c_rarg2   - data address
4675    *  c_rarg3   - number of blocks
4676    *
4677    *  Output:
4678    *  Updated state at c_rarg0
4679    */
4680   address generate_ghash_processBlocks() {
4681     // Bafflingly, GCM uses little-endian for the byte order, but
4682     // big-endian for the bit order.  For example, the polynomial 1 is
4683     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4684     //
4685     // So, we must either reverse the bytes in each word and do
4686     // everything big-endian or reverse the bits in each byte and do
4687     // it little-endian.  On AArch64 it's more idiomatic to reverse
4688     // the bits in each byte (we have an instruction, RBIT, to do
4689     // that) and keep the data in little-endian bit order throughout the
4690     // calculation, bit-reversing the inputs and outputs.
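         // For example, the byte 0x80 bit-reverses to 0x01, so the GCM encoding of
         // the polynomial 1 (80 00 ... 00) becomes the ordinary little-endian
         // integer 1 after the per-byte bit reversal.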
4691 
4692     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4693     __ align(wordSize * 2);
4694     address p = __ pc();
4695     __ emit_int64(0x87);  // The low-order bits of the field
4696                           // polynomial (i.e. p = z^7+z^2+z+1)
4697                           // repeated in the low and high parts of a
4698                           // 128-bit vector
4699     __ emit_int64(0x87);
4700 
4701     __ align(CodeEntryAlignment);
4702     address start = __ pc();
4703 
4704     Register state   = c_rarg0;
4705     Register subkeyH = c_rarg1;
4706     Register data    = c_rarg2;
4707     Register blocks  = c_rarg3;
4708 
4709     FloatRegister vzr = v30;
4710     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4711 
4712     __ ldrq(v0, Address(state));
4713     __ ldrq(v1, Address(subkeyH));
4714 
4715     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4716     __ rbit(v0, __ T16B, v0);
4717     __ rev64(v1, __ T16B, v1);
4718     __ rbit(v1, __ T16B, v1);
4719 
4720     __ ldrq(v26, p);
4721 
4722     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4723     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4724 
4725     {
4726       Label L_ghash_loop;
4727       __ bind(L_ghash_loop);
4728 
4729       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4730                                                  // reversing each byte
4731       __ rbit(v2, __ T16B, v2);
4732       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4733 
4734       // Multiply state in v2 by subkey in v1
4735       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4736                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4737                      /*temps*/v6, v20, v18, v21);
4738       // Reduce v7:v5 by the field polynomial
4739       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4740 
4741       __ sub(blocks, blocks, 1);
4742       __ cbnz(blocks, L_ghash_loop);
4743     }
4744 
4745     // The bit-reversed result is at this point in v0
4746     __ rev64(v1, __ T16B, v0);
4747     __ rbit(v1, __ T16B, v1);
4748 
4749     __ st1(v1, __ T16B, state);
4750     __ ret(lr);
4751 
4752     return start;
4753   }
4754 
4755   // Continuation point for throwing of implicit exceptions that are
4756   // not handled in the current activation. Fabricates an exception
4757   // oop and initiates normal exception dispatching in this
4758   // frame. Since we need to preserve callee-saved values (currently
4759   // only for C2, but done for C1 as well) we need a callee-saved oop
4760   // map and therefore have to make these stubs into RuntimeStubs
4761   // rather than BufferBlobs.  If the compiler needs all registers to
4762   // be preserved between the fault point and the exception handler
4763   // then it must assume responsibility for that in
4764   // AbstractCompiler::continuation_for_implicit_null_exception or
4765   // continuation_for_implicit_division_by_zero_exception. All other
4766   // implicit exceptions (e.g., NullPointerException or
4767   // AbstractMethodError on entry) are either at call sites or
4768   // otherwise assume that stack unwinding will be initiated, so
4769   // caller saved registers were assumed volatile in the compiler.
4770 
4771 #undef __
4772 #define __ masm->
4773 
4774   address generate_throw_exception(const char* name,
4775                                    address runtime_entry,
4776                                    Register arg1 = noreg,
4777                                    Register arg2 = noreg) {
4778     // Information about frame layout at time of blocking runtime call.
4779     // Note that we only have to preserve callee-saved registers since
4780     // the compilers are responsible for supplying a continuation point
4781     // if they expect all registers to be preserved.
4782     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4783     enum layout {
4784       rfp_off = 0,
4785       rfp_off2,
4786       return_off,
4787       return_off2,
4788       framesize // inclusive of return address
4789     };
4790 
4791     int insts_size = 512;
4792     int locs_size  = 64;
4793 
4794     CodeBuffer code(name, insts_size, locs_size);
4795     OopMapSet* oop_maps  = new OopMapSet();
4796     MacroAssembler* masm = new MacroAssembler(&code);
4797 
4798     address start = __ pc();
4799 
4800     // This is an inlined and slightly modified version of call_VM
4801     // which has the ability to fetch the return PC out of
4802     // thread-local storage and also sets up last_Java_sp slightly
4803     // differently than the real call_VM
4804 
4805     __ enter(); // Save FP and LR before call
4806 
4807     assert(is_even(framesize/2), "sp not 16-byte aligned");
4808 
4809     // lr and fp are already in place
4810     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4811 
4812     int frame_complete = __ pc() - start;
4813 
4814     // Set up last_Java_sp and last_Java_fp
4815     address the_pc = __ pc();
4816     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
4817 
4818     // Call runtime
4819     if (arg1 != noreg) {
4820       assert(arg2 != c_rarg1, "clobbered");
4821       __ mov(c_rarg1, arg1);
4822     }
4823     if (arg2 != noreg) {
4824       __ mov(c_rarg2, arg2);
4825     }
4826     __ mov(c_rarg0, rthread);
4827     BLOCK_COMMENT("call runtime_entry");
4828     __ mov(rscratch1, runtime_entry);
4829     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
4830 
4831     // Generate oop map
4832     OopMap* map = new OopMap(framesize, 0);
4833 
4834     oop_maps->add_gc_map(the_pc - start, map);
4835 
4836     __ reset_last_Java_frame(true);
4837     __ maybe_isb();
4838 
4839     __ leave();
4840 
4841     // check for pending exceptions
4842 #ifdef ASSERT
4843     Label L;
4844     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4845     __ cbnz(rscratch1, L);
4846     __ should_not_reach_here();
4847     __ bind(L);
4848 #endif // ASSERT
4849     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4850 
4851 
4852     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4853     RuntimeStub* stub =
4854       RuntimeStub::new_runtime_stub(name,
4855                                     &code,
4856                                     frame_complete,
4857                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4858                                     oop_maps, false);
4859     return stub->entry_point();
4860   }
4861 
4862   class MontgomeryMultiplyGenerator : public MacroAssembler {
4863 
4864     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4865       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4866 
4867     RegSet _toSave;
4868     bool _squaring;
4869 
4870   public:
4871     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4872       : MacroAssembler(as->code()), _squaring(squaring) {
4873 
4874       // Register allocation
4875 
4876       Register reg = c_rarg0;
4877       Pa_base = reg;       // Argument registers
4878       if (squaring)
4879         Pb_base = Pa_base;
4880       else
4881         Pb_base = ++reg;
4882       Pn_base = ++reg;
4883       Rlen= ++reg;
4884       inv = ++reg;
4885       Pm_base = ++reg;
4886 
4887                           // Working registers:
4888       Ra =  ++reg;        // The current digit of a, b, n, and m.
4889       Rb =  ++reg;
4890       Rm =  ++reg;
4891       Rn =  ++reg;
4892 
4893       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4894       Pb =  ++reg;
4895       Pm =  ++reg;
4896       Pn =  ++reg;
4897 
4898       t0 =  ++reg;        // Three registers which form a
4899       t1 =  ++reg;        // triple-precision accumulator.
4900       t2 =  ++reg;
4901 
4902       Ri =  ++reg;        // Inner and outer loop indexes.
4903       Rj =  ++reg;
4904 
4905       Rhi_ab = ++reg;     // Product registers: low and high parts
4906       Rlo_ab = ++reg;     // of a*b and m*n.
4907       Rhi_mn = ++reg;
4908       Rlo_mn = ++reg;
4909 
4910       // r19 and up are callee-saved.
4911       _toSave = RegSet::range(r19, reg) + Pm_base;
4912     }
4913 
4914   private:
4915     void save_regs() {
4916       push(_toSave, sp);
4917     }
4918 
4919     void restore_regs() {
4920       pop(_toSave, sp);
4921     }
4922 
4923     template <typename T>
4924     void unroll_2(Register count, T block) {
4925       Label loop, end, odd;
4926       tbnz(count, 0, odd);
4927       cbz(count, end);
4928       align(16);
4929       bind(loop);
4930       (this->*block)();
4931       bind(odd);
4932       (this->*block)();
4933       subs(count, count, 2);
4934       br(Assembler::GT, loop);
4935       bind(end);
4936     }
4937 
4938     template <typename T>
4939     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4940       Label loop, end, odd;
4941       tbnz(count, 0, odd);
4942       cbz(count, end);
4943       align(16);
4944       bind(loop);
4945       (this->*block)(d, s, tmp);
4946       bind(odd);
4947       (this->*block)(d, s, tmp);
4948       subs(count, count, 2);
4949       br(Assembler::GT, loop);
4950       bind(end);
4951     }
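         // In C, approximately, both unroll_2 overloads run 'block' count times,
         // unrolled by two, with a peeled entry for odd counts:
         //   if (count & 1) goto odd;
         //   if (count == 0) goto end;
         //   loop: block();
         //   odd:  block();
         //         count -= 2;
         //         if ((long)count > 0) goto loop;   // subs/br(GT) is a signed test
         //   end:  ;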
4952 
4953     void pre1(RegisterOrConstant i) {
4954       block_comment("pre1");
4955       // Pa = Pa_base;
4956       // Pb = Pb_base + i;
4957       // Pm = Pm_base;
4958       // Pn = Pn_base + i;
4959       // Ra = *Pa;
4960       // Rb = *Pb;
4961       // Rm = *Pm;
4962       // Rn = *Pn;
4963       ldr(Ra, Address(Pa_base));
4964       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4965       ldr(Rm, Address(Pm_base));
4966       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4967       lea(Pa, Address(Pa_base));
4968       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4969       lea(Pm, Address(Pm_base));
4970       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4971 
4972       // Zero the m*n result.
4973       mov(Rhi_mn, zr);
4974       mov(Rlo_mn, zr);
4975     }
4976 
4977     // The core multiply-accumulate step of a Montgomery
4978     // multiplication.  The idea is to schedule operations as a
4979     // pipeline so that instructions with long latencies (loads and
4980     // multiplies) have time to complete before their results are
4981     // used.  This most benefits in-order implementations of the
4982     // architecture but out-of-order ones also benefit.
4983     void step() {
4984       block_comment("step");
4985       // MACC(Ra, Rb, t0, t1, t2);
4986       // Ra = *++Pa;
4987       // Rb = *--Pb;
4988       umulh(Rhi_ab, Ra, Rb);
4989       mul(Rlo_ab, Ra, Rb);
4990       ldr(Ra, pre(Pa, wordSize));
4991       ldr(Rb, pre(Pb, -wordSize));
4992       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
4993                                        // previous iteration.
4994       // MACC(Rm, Rn, t0, t1, t2);
4995       // Rm = *++Pm;
4996       // Rn = *--Pn;
4997       umulh(Rhi_mn, Rm, Rn);
4998       mul(Rlo_mn, Rm, Rn);
4999       ldr(Rm, pre(Pm, wordSize));
5000       ldr(Rn, pre(Pn, -wordSize));
5001       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5002     }
5003 
5004     void post1() {
5005       block_comment("post1");
5006 
5007       // MACC(Ra, Rb, t0, t1, t2);
5008       // Ra = *++Pa;
5009       // Rb = *--Pb;
5010       umulh(Rhi_ab, Ra, Rb);
5011       mul(Rlo_ab, Ra, Rb);
5012       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5013       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5014 
5015       // *Pm = Rm = t0 * inv;
5016       mul(Rm, t0, inv);
5017       str(Rm, Address(Pm));
5018 
5019       // MACC(Rm, Rn, t0, t1, t2);
5020       // t0 = t1; t1 = t2; t2 = 0;
5021       umulh(Rhi_mn, Rm, Rn);
5022 
5023 #ifndef PRODUCT
5024       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5025       {
5026         mul(Rlo_mn, Rm, Rn);
5027         add(Rlo_mn, t0, Rlo_mn);
5028         Label ok;
5029         cbz(Rlo_mn, ok); {
5030           stop("broken Montgomery multiply");
5031         } bind(ok);
5032       }
5033 #endif
5034       // We have very carefully set things up so that
5035       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5036       // the lower half of Rm * Rn because we know the result already:
5037       // it must be -t0.  t0 + (-t0) must generate a carry iff
5038       // t0 != 0.  So, rather than do a mul and an adds we just set
5039       // the carry flag iff t0 is nonzero.
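           // (On AArch64, SUBS sets the carry flag when the subtraction does not
           // borrow, so "subs zr, t0, 1" sets C exactly when t0 >= 1, i.e. t0 != 0.)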
5040       //
5041       // mul(Rlo_mn, Rm, Rn);
5042       // adds(zr, t0, Rlo_mn);
5043       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5044       adcs(t0, t1, Rhi_mn);
5045       adc(t1, t2, zr);
5046       mov(t2, zr);
5047     }
5048 
5049     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
5050       block_comment("pre2");
5051       // Pa = Pa_base + i-len;
5052       // Pb = Pb_base + len;
5053       // Pm = Pm_base + i-len;
5054       // Pn = Pn_base + len;
5055 
5056       if (i.is_register()) {
5057         sub(Rj, i.as_register(), len);
5058       } else {
5059         mov(Rj, i.as_constant());
5060         sub(Rj, Rj, len);
5061       }
5062       // Rj == i-len
5063 
5064       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5065       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5066       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5067       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5068 
5069       // Ra = *++Pa;
5070       // Rb = *--Pb;
5071       // Rm = *++Pm;
5072       // Rn = *--Pn;
5073       ldr(Ra, pre(Pa, wordSize));
5074       ldr(Rb, pre(Pb, -wordSize));
5075       ldr(Rm, pre(Pm, wordSize));
5076       ldr(Rn, pre(Pn, -wordSize));
5077 
5078       mov(Rhi_mn, zr);
5079       mov(Rlo_mn, zr);
5080     }
5081 
5082     void post2(RegisterOrConstant i, RegisterOrConstant len) {
5083       block_comment("post2");
5084       if (i.is_constant()) {
5085         mov(Rj, i.as_constant()-len.as_constant());
5086       } else {
5087         sub(Rj, i.as_register(), len);
5088       }
5089 
5090       adds(t0, t0, Rlo_mn); // The pending m*n, low part
5091 
5092       // As soon as we know the least significant digit of our result,
5093       // store it.
5094       // Pm_base[i-len] = t0;
5095       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5096 
5097       // t0 = t1; t1 = t2; t2 = 0;
5098       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5099       adc(t1, t2, zr);
5100       mov(t2, zr);
5101     }
5102 
5103     // A carry in t0 after Montgomery multiplication means that we
5104     // should subtract multiples of n from our result in m.  We'll
5105     // keep doing that until there is no carry.
5106     void normalize(RegisterOrConstant len) {
5107       block_comment("normalize");
5108       // while (t0)
5109       //   t0 = sub(Pm_base, Pn_base, t0, len);
5110       Label loop, post, again;
5111       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5112       cbz(t0, post); {
5113         bind(again); {
5114           mov(i, zr);
5115           mov(cnt, len);
5116           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5117           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5118           subs(zr, zr, zr); // set carry flag, i.e. no borrow
5119           align(16);
5120           bind(loop); {
5121             sbcs(Rm, Rm, Rn);
5122             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5123             add(i, i, 1);
5124             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5125             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5126             sub(cnt, cnt, 1);
5127           } cbnz(cnt, loop);
5128           sbc(t0, t0, zr);
5129         } cbnz(t0, again);
5130       } bind(post);
5131     }
5132 
5133     // Move memory at s to d, reversing words.
5134     //    Increments d to end of copied memory
5135     //    Destroys tmp1, tmp2
5136     //    Preserves len
5137     //    Leaves s pointing to the address which was in d at start
5138     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
5139       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5140 
5141       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
5142       mov(tmp1, len);
5143       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
5144       sub(s, d, len, ext::uxtw, LogBytesPerWord);
5145     }
5146     // where
5147     void reverse1(Register d, Register s, Register tmp) {
5148       ldr(tmp, pre(s, -wordSize));
5149       ror(tmp, tmp, 32);
5150       str(tmp, post(d, wordSize));
5151     }
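         // In C, approximately (len counted in 64-bit words; rotr64 is a 64-bit
         // rotate-right, i.e. the ROR above, which swaps the two 32-bit halves of
         // each word so that, together with the word reversal, the 32-bit digits
         // are fully reversed):
         //   uint64_t *end = s + len;
         //   for (int i = 0; i < len; i++) { *d++ = rotr64(*--end, 32); }
         //   s = d - len;   // == the value d had on entry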
5152 
5153     void step_squaring() {
5154       // An extra ACC
5155       step();
5156       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5157     }
5158 
5159     void last_squaring(RegisterOrConstant i) {
5160       Label dont;
5161       // if ((i & 1) == 0) {
5162       tbnz(i.as_register(), 0, dont); {
5163         // MACC(Ra, Rb, t0, t1, t2);
5164         // Ra = *++Pa;
5165         // Rb = *--Pb;
5166         umulh(Rhi_ab, Ra, Rb);
5167         mul(Rlo_ab, Ra, Rb);
5168         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5169       } bind(dont);
5170     }
5171 
5172     void extra_step_squaring() {
5173       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5174 
5175       // MACC(Rm, Rn, t0, t1, t2);
5176       // Rm = *++Pm;
5177       // Rn = *--Pn;
5178       umulh(Rhi_mn, Rm, Rn);
5179       mul(Rlo_mn, Rm, Rn);
5180       ldr(Rm, pre(Pm, wordSize));
5181       ldr(Rn, pre(Pn, -wordSize));
5182     }
5183 
5184     void post1_squaring() {
5185       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5186 
5187       // *Pm = Rm = t0 * inv;
5188       mul(Rm, t0, inv);
5189       str(Rm, Address(Pm));
5190 
5191       // MACC(Rm, Rn, t0, t1, t2);
5192       // t0 = t1; t1 = t2; t2 = 0;
5193       umulh(Rhi_mn, Rm, Rn);
5194 
5195 #ifndef PRODUCT
5196       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5197       {
5198         mul(Rlo_mn, Rm, Rn);
5199         add(Rlo_mn, t0, Rlo_mn);
5200         Label ok;
5201         cbz(Rlo_mn, ok); {
5202           stop("broken Montgomery multiply");
5203         } bind(ok);
5204       }
5205 #endif
5206       // We have very carefully set things up so that
5207       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5208       // the lower half of Rm * Rn because we know the result already:
5209       // it must be -t0.  t0 + (-t0) must generate a carry iff
5210       // t0 != 0.  So, rather than do a mul and an adds we just set
5211       // the carry flag iff t0 is nonzero.
5212       //
5213       // mul(Rlo_mn, Rm, Rn);
5214       // adds(zr, t0, Rlo_mn);
5215       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5216       adcs(t0, t1, Rhi_mn);
5217       adc(t1, t2, zr);
5218       mov(t2, zr);
5219     }
5220 
5221     void acc(Register Rhi, Register Rlo,
5222              Register t0, Register t1, Register t2) {
5223       adds(t0, t0, Rlo);
5224       adcs(t1, t1, Rhi);
5225       adc(t2, t2, zr);
5226     }
5227 
5228   public:
5229     /**
5230      * Fast Montgomery multiplication.  The derivation of the
5231      * algorithm is in A Cryptographic Library for the Motorola
5232      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5233      *
5234      * Arguments:
5235      *
5236      * Inputs for multiplication:
5237      *   c_rarg0   - int array elements a
5238      *   c_rarg1   - int array elements b
5239      *   c_rarg2   - int array elements n (the modulus)
5240      *   c_rarg3   - int length
5241      *   c_rarg4   - int inv
5242      *   c_rarg5   - int array elements m (the result)
5243      *
5244      * Inputs for squaring:
5245      *   c_rarg0   - int array elements a
5246      *   c_rarg1   - int array elements n (the modulus)
5247      *   c_rarg2   - int length
5248      *   c_rarg3   - int inv
5249      *   c_rarg4   - int array elements m (the result)
5250      *
5251      */
5252     address generate_multiply() {
5253       Label argh, nothing;
5254       bind(argh);
5255       stop("MontgomeryMultiply total_allocation must be <= 8192");
5256 
5257       align(CodeEntryAlignment);
5258       address entry = pc();
5259 
5260       cbzw(Rlen, nothing);
5261 
5262       enter();
5263 
5264       // Make room.
5265       cmpw(Rlen, 512);
5266       br(Assembler::HI, argh);
5267       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5268       andr(sp, Ra, -2 * wordSize);
5269 
5270       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5271 
5272       {
5273         // Copy input args, reversing as we go.  We use Ra as a
5274         // temporary variable.
5275         reverse(Ra, Pa_base, Rlen, t0, t1);
5276         if (!_squaring)
5277           reverse(Ra, Pb_base, Rlen, t0, t1);
5278         reverse(Ra, Pn_base, Rlen, t0, t1);
5279       }
5280 
5281       // Push all call-saved registers and also Pm_base which we'll need
5282       // at the end.
5283       save_regs();
5284 
5285 #ifndef PRODUCT
5286       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
5287       {
5288         ldr(Rn, Address(Pn_base, 0));
5289         mul(Rlo_mn, Rn, inv);
5290         subs(zr, Rlo_mn, -1);
5291         Label ok;
5292         br(EQ, ok); {
5293           stop("broken inverse in Montgomery multiply");
5294         } bind(ok);
5295       }
5296 #endif
5297 
5298       mov(Pm_base, Ra);
5299 
5300       mov(t0, zr);
5301       mov(t1, zr);
5302       mov(t2, zr);
5303 
5304       block_comment("for (int i = 0; i < len; i++) {");
5305       mov(Ri, zr); {
5306         Label loop, end;
5307         cmpw(Ri, Rlen);
5308         br(Assembler::GE, end);
5309 
5310         bind(loop);
5311         pre1(Ri);
5312 
5313         block_comment("  for (j = i; j; j--) {"); {
5314           movw(Rj, Ri);
5315           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5316         } block_comment("  } // j");
5317 
5318         post1();
5319         addw(Ri, Ri, 1);
5320         cmpw(Ri, Rlen);
5321         br(Assembler::LT, loop);
5322         bind(end);
5323         block_comment("} // i");
5324       }
5325 
5326       block_comment("for (int i = len; i < 2*len; i++) {");
5327       mov(Ri, Rlen); {
5328         Label loop, end;
5329         cmpw(Ri, Rlen, Assembler::LSL, 1);
5330         br(Assembler::GE, end);
5331 
5332         bind(loop);
5333         pre2(Ri, Rlen);
5334 
5335         block_comment("  for (j = len*2-i-1; j; j--) {"); {
5336           lslw(Rj, Rlen, 1);
5337           subw(Rj, Rj, Ri);
5338           subw(Rj, Rj, 1);
5339           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5340         } block_comment("  } // j");
5341 
5342         post2(Ri, Rlen);
5343         addw(Ri, Ri, 1);
5344         cmpw(Ri, Rlen, Assembler::LSL, 1);
5345         br(Assembler::LT, loop);
5346         bind(end);
5347       }
5348       block_comment("} // i");
5349 
5350       normalize(Rlen);
5351 
5352       mov(Ra, Pm_base);  // Save Pm_base in Ra
5353       restore_regs();  // Restore caller's Pm_base
5354 
5355       // Copy our result into caller's Pm_base
5356       reverse(Pm_base, Ra, Rlen, t0, t1);
5357 
5358       leave();
5359       bind(nothing);
5360       ret(lr);
5361 
5362       return entry;
5363     }
5364     // In C, approximately:
5365 
5366     // void
5367     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
5368     //                     unsigned long Pn_base[], unsigned long Pm_base[],
5369     //                     unsigned long inv, int len) {
5370     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5371     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5372     //   unsigned long Ra, Rb, Rn, Rm;
5373 
5374     //   int i;
5375 
5376     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5377 
5378     //   for (i = 0; i < len; i++) {
5379     //     int j;
5380 
5381     //     Pa = Pa_base;
5382     //     Pb = Pb_base + i;
5383     //     Pm = Pm_base;
5384     //     Pn = Pn_base + i;
5385 
5386     //     Ra = *Pa;
5387     //     Rb = *Pb;
5388     //     Rm = *Pm;
5389     //     Rn = *Pn;
5390 
5391     //     int iters = i;
5392     //     for (j = 0; iters--; j++) {
5393     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5394     //       MACC(Ra, Rb, t0, t1, t2);
5395     //       Ra = *++Pa;
5396     //       Rb = *--Pb;
5397     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5398     //       MACC(Rm, Rn, t0, t1, t2);
5399     //       Rm = *++Pm;
5400     //       Rn = *--Pn;
5401     //     }
5402 
5403     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
5404     //     MACC(Ra, Rb, t0, t1, t2);
5405     //     *Pm = Rm = t0 * inv;
5406     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5407     //     MACC(Rm, Rn, t0, t1, t2);
5408 
5409     //     assert(t0 == 0, "broken Montgomery multiply");
5410 
5411     //     t0 = t1; t1 = t2; t2 = 0;
5412     //   }
5413 
5414     //   for (i = len; i < 2*len; i++) {
5415     //     int j;
5416 
5417     //     Pa = Pa_base + i-len;
5418     //     Pb = Pb_base + len;
5419     //     Pm = Pm_base + i-len;
5420     //     Pn = Pn_base + len;
5421 
5422     //     Ra = *++Pa;
5423     //     Rb = *--Pb;
5424     //     Rm = *++Pm;
5425     //     Rn = *--Pn;
5426 
5427     //     int iters = len*2-i-1;
5428     //     for (j = i-len+1; iters--; j++) {
5429     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5430     //       MACC(Ra, Rb, t0, t1, t2);
5431     //       Ra = *++Pa;
5432     //       Rb = *--Pb;
5433     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5434     //       MACC(Rm, Rn, t0, t1, t2);
5435     //       Rm = *++Pm;
5436     //       Rn = *--Pn;
5437     //     }
5438 
5439     //     Pm_base[i-len] = t0;
5440     //     t0 = t1; t1 = t2; t2 = 0;
5441     //   }
5442 
5443     //   while (t0)
5444     //     t0 = sub(Pm_base, Pn_base, t0, len);
5445     // }
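         // where, approximately (MACC/MACC2 are not defined in this file; this
         // reading follows step(), acc() and step_squaring() above):
         //   MACC(A, B, t0, t1, t2):  (t2:t1:t0) += (unsigned __int128)(A) * (B)
         //   MACC2(A, B, t0, t1, t2): (t2:t1:t0) += 2 * (unsigned __int128)(A) * (B)
         // i.e. a 64x64->128-bit multiply accumulated, with carry propagation, into
         // the triple-precision accumulator t2:t1:t0.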
5446 
5447     /**
5448      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
5449      * multiplies than Montgomery multiplication so it should be up to
5450      * 25% faster.  However, its loop control is more complex and it
5451      * may actually run slower on some machines.
5452      *
5453      * Arguments:
5454      *
5455      * Inputs:
5456      *   c_rarg0   - int array elements a
5457      *   c_rarg1   - int array elements n (the modulus)
5458      *   c_rarg2   - int length
5459      *   c_rarg3   - int inv
5460      *   c_rarg4   - int array elements m (the result)
5461      *
5462      */
5463     address generate_square() {
5464       Label argh;
5465       bind(argh);
5466       stop("MontgomeryMultiply total_allocation must be <= 8192");
5467 
5468       align(CodeEntryAlignment);
5469       address entry = pc();
5470 
5471       enter();
5472 
5473       // Make room.
5474       cmpw(Rlen, 512);
5475       br(Assembler::HI, argh);
5476       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5477       andr(sp, Ra, -2 * wordSize);
5478 
5479       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5480 
5481       {
5482         // Copy input args, reversing as we go.  We use Ra as a
5483         // temporary variable.
5484         reverse(Ra, Pa_base, Rlen, t0, t1);
5485         reverse(Ra, Pn_base, Rlen, t0, t1);
5486       }
5487 
5488       // Push all call-saved registers and also Pm_base which we'll need
5489       // at the end.
5490       save_regs();
5491 
5492       mov(Pm_base, Ra);
5493 
5494       mov(t0, zr);
5495       mov(t1, zr);
5496       mov(t2, zr);
5497 
5498       block_comment("for (int i = 0; i < len; i++) {");
5499       mov(Ri, zr); {
5500         Label loop, end;
5501         bind(loop);
5502         cmp(Ri, Rlen);
5503         br(Assembler::GE, end);
5504 
5505         pre1(Ri);
5506 
5507         block_comment("for (j = (i+1)/2; j; j--) {"); {
5508           add(Rj, Ri, 1);
5509           lsr(Rj, Rj, 1);
5510           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5511         } block_comment("  } // j");
5512 
5513         last_squaring(Ri);
5514 
5515         block_comment("  for (j = i/2; j; j--) {"); {
5516           lsr(Rj, Ri, 1);
5517           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5518         } block_comment("  } // j");
5519 
5520         post1_squaring();
5521         add(Ri, Ri, 1);
5522         cmp(Ri, Rlen);
5523         br(Assembler::LT, loop);
5524 
5525         bind(end);
5526         block_comment("} // i");
5527       }
5528 
5529       block_comment("for (int i = len; i < 2*len; i++) {");
5530       mov(Ri, Rlen); {
5531         Label loop, end;
5532         bind(loop);
5533         cmp(Ri, Rlen, Assembler::LSL, 1);
5534         br(Assembler::GE, end);
5535 
5536         pre2(Ri, Rlen);
5537 
5538         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
5539           lsl(Rj, Rlen, 1);
5540           sub(Rj, Rj, Ri);
5541           sub(Rj, Rj, 1);
5542           lsr(Rj, Rj, 1);
5543           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5544         } block_comment("  } // j");
5545 
5546         last_squaring(Ri);
5547 
5548         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
5549           lsl(Rj, Rlen, 1);
5550           sub(Rj, Rj, Ri);
5551           lsr(Rj, Rj, 1);
5552           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5553         } block_comment("  } // j");
5554 
5555         post2(Ri, Rlen);
5556         add(Ri, Ri, 1);
5557         cmp(Ri, Rlen, Assembler::LSL, 1);
5558 
5559         br(Assembler::LT, loop);
5560         bind(end);
5561         block_comment("} // i");
5562       }
5563 
5564       normalize(Rlen);
5565 
5566       mov(Ra, Pm_base);  // Save Pm_base in Ra
5567       restore_regs();  // Restore caller's Pm_base
5568 
5569       // Copy our result into caller's Pm_base
5570       reverse(Pm_base, Ra, Rlen, t0, t1);
5571 
5572       leave();
5573       ret(lr);
5574 
5575       return entry;
5576     }
5577     // In C, approximately:
5578 
5579     // void
5580     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
5581     //                   unsigned long Pm_base[], unsigned long inv, int len) {
5582     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5583     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5584     //   unsigned long Ra, Rb, Rn, Rm;
5585 
5586     //   int i;
5587 
5588     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5589 
5590     //   for (i = 0; i < len; i++) {
5591     //     int j;
5592 
5593     //     Pa = Pa_base;
5594     //     Pb = Pa_base + i;
5595     //     Pm = Pm_base;
5596     //     Pn = Pn_base + i;
5597 
5598     //     Ra = *Pa;
5599     //     Rb = *Pb;
5600     //     Rm = *Pm;
5601     //     Rn = *Pn;
5602 
5603     //     int iters = (i+1)/2;
5604     //     for (j = 0; iters--; j++) {
5605     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5606     //       MACC2(Ra, Rb, t0, t1, t2);
5607     //       Ra = *++Pa;
5608     //       Rb = *--Pb;
5609     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5610     //       MACC(Rm, Rn, t0, t1, t2);
5611     //       Rm = *++Pm;
5612     //       Rn = *--Pn;
5613     //     }
5614     //     if ((i & 1) == 0) {
5615     //       assert(Ra == Pa_base[j], "must be");
5616     //       MACC(Ra, Ra, t0, t1, t2);
5617     //     }
5618     //     iters = i/2;
5619     //     assert(iters == i-j, "must be");
5620     //     for (; iters--; j++) {
5621     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5622     //       MACC(Rm, Rn, t0, t1, t2);
5623     //       Rm = *++Pm;
5624     //       Rn = *--Pn;
5625     //     }
5626 
5627     //     *Pm = Rm = t0 * inv;
5628     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5629     //     MACC(Rm, Rn, t0, t1, t2);
5630 
5631     //     assert(t0 == 0, "broken Montgomery multiply");
5632 
5633     //     t0 = t1; t1 = t2; t2 = 0;
5634     //   }
5635 
5636     //   for (i = len; i < 2*len; i++) {
5637     //     int start = i-len+1;
5638     //     int end = start + (len - start)/2;
5639     //     int j;
5640 
5641     //     Pa = Pa_base + i-len;
5642     //     Pb = Pa_base + len;
5643     //     Pm = Pm_base + i-len;
5644     //     Pn = Pn_base + len;
5645 
5646     //     Ra = *++Pa;
5647     //     Rb = *--Pb;
5648     //     Rm = *++Pm;
5649     //     Rn = *--Pn;
5650 
5651     //     int iters = (2*len-i-1)/2;
5652     //     assert(iters == end-start, "must be");
5653     //     for (j = start; iters--; j++) {
5654     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5655     //       MACC2(Ra, Rb, t0, t1, t2);
5656     //       Ra = *++Pa;
5657     //       Rb = *--Pb;
5658     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5659     //       MACC(Rm, Rn, t0, t1, t2);
5660     //       Rm = *++Pm;
5661     //       Rn = *--Pn;
5662     //     }
5663     //     if ((i & 1) == 0) {
5664     //       assert(Ra == Pa_base[j], "must be");
5665     //       MACC(Ra, Ra, t0, t1, t2);
5666     //     }
5667     //     iters =  (2*len-i)/2;
5668     //     assert(iters == len-j, "must be");
5669     //     for (; iters--; j++) {
5670     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5671     //       MACC(Rm, Rn, t0, t1, t2);
5672     //       Rm = *++Pm;
5673     //       Rn = *--Pn;
5674     //     }
5675     //     Pm_base[i-len] = t0;
5676     //     t0 = t1; t1 = t2; t2 = 0;
5677     //   }
5678 
5679     //   while (t0)
5680     //     t0 = sub(Pm_base, Pn_base, t0, len);
5681     // }
5682   };
5683 
5684 
5685   // Initialization
5686   void generate_initial() {
5687     // Generate initial stubs and initialize the entry points
5688 
5689     // Entry points that exist on all platforms. Note: This is code
5690     // that could be shared among different platforms - however the
5691     // benefit seems to be smaller than the disadvantage of having a
5692     // much more complicated generator structure. See also comment in
5693     // stubRoutines.hpp.
5694 
5695     StubRoutines::_forward_exception_entry = generate_forward_exception();
5696 
5697     StubRoutines::_call_stub_entry =
5698       generate_call_stub(StubRoutines::_call_stub_return_address);
5699 
5700     // is referenced by megamorphic call
5701     StubRoutines::_catch_exception_entry = generate_catch_exception();
5702 
5703     // Build this early so it's available for the interpreter.
5704     StubRoutines::_throw_StackOverflowError_entry =
5705       generate_throw_exception("StackOverflowError throw_exception",
5706                                CAST_FROM_FN_PTR(address,
5707                                                 SharedRuntime::throw_StackOverflowError));
5708     StubRoutines::_throw_delayed_StackOverflowError_entry =
5709       generate_throw_exception("delayed StackOverflowError throw_exception",
5710                                CAST_FROM_FN_PTR(address,
5711                                                 SharedRuntime::throw_delayed_StackOverflowError));
5712     if (UseCRC32Intrinsics) {
5713       // set table address before generating stubs that use it
5714       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5715       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5716     }
5717 
5718     if (UseCRC32CIntrinsics) {
5719       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5720     }
5721 
5722     // Disabled until JDK-8210858 is fixed
5723     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5724     //   StubRoutines::_dlog = generate_dlog();
5725     // }
5726 
5727     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5728       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5729     }
5730 
5731     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5732       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5733     }
5734   }

  void generate_all() {
    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    // has_negatives stub for large arrays.
    StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    generate_compare_long_strings();

    generate_string_indexof_stubs();

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }
#endif // COMPILER2

#ifndef BUILTIN_SIM
    // generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

    // SafeFetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
                                                       &StubRoutines::_safefetch32_fault_pc,
                                                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                                                       &StubRoutines::_safefetchN_fault_pc,
                                                       &StubRoutines::_safefetchN_continuation_pc);
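
    // Illustration only (an assumption about the shared wrappers declared in
    // stubRoutines.hpp, not code generated here): callers probe possibly
    // unmapped memory via SafeFetch32/SafeFetchN, which return the supplied
    // error value instead of crashing when the load at 'adr' faults, e.g.
    //
    //   int probe(int* adr) {
    //     const int errval = -1;
    //     return CanUseSafeFetch32() ? SafeFetch32(adr, errval) : errval;
    //   }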
#endif
    StubRoutines::aarch64::set_completed();
  }

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}
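
// For orientation only: a rough sketch (an assumption about the shared driver
// in runtime/stubRoutines.cpp) of how the two generation phases are invoked.
// generate_initial() runs early in VM startup; generate_all() runs later,
// once the universe is initialized (see the verify_oop comment above).
//
//   void StubRoutines::initialize1() {
//     CodeBuffer buffer(_code1);
//     StubGenerator_generate(&buffer, false);   // -> generate_initial()
//   }
//
//   void StubRoutines::initialize2() {
//     CodeBuffer buffer(_code2);
//     StubGenerator_generate(&buffer, true);    // -> generate_all()
//   }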