1 /*
   2  * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "memory/universe.hpp"
  33 #include "nativeInst_aarch64.hpp"
  34 #include "oops/instanceOop.hpp"
  35 #include "oops/method.hpp"
  36 #include "oops/objArrayKlass.hpp"
  37 #include "oops/oop.inline.hpp"
  38 #include "prims/methodHandles.hpp"
  39 #include "runtime/frame.inline.hpp"
  40 #include "runtime/handles.inline.hpp"
  41 #include "runtime/sharedRuntime.hpp"
  42 #include "runtime/stubCodeGenerator.hpp"
  43 #include "runtime/stubRoutines.hpp"
  44 #include "runtime/thread.inline.hpp"
  45 #include "utilities/align.hpp"
  46 #include "utilities/powerOfTwo.hpp"
  47 #ifdef COMPILER2
  48 #include "opto/runtime.hpp"
  49 #endif
  50 #if INCLUDE_ZGC
  51 #include "gc/z/zThreadLocalData.hpp"
  52 #endif
  53 
  54 // Declaration and definition of StubGenerator (no .hpp file).
  55 // For a more detailed description of the stub routine structure
  56 // see the comment in stubRoutines.hpp
  57 
  58 #undef __
  59 #define __ _masm->
  60 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
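     // n.b. TIMES_OOP yields the scaled-index addressing mode for an oop array
     // element: the index register is sign-extended (sxtw) and shifted left by
     // log2 of the oop size, i.e. 2 with compressed oops and 3 otherwise.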
  61 
  62 #ifdef PRODUCT
  63 #define BLOCK_COMMENT(str) /* nothing */
  64 #else
  65 #define BLOCK_COMMENT(str) __ block_comment(str)
  66 #endif
  67 
  68 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  69 
  70 // Stub Code definitions
  71 
  72 class StubGenerator: public StubCodeGenerator {
  73  private:
  74 
  75 #ifdef PRODUCT
  76 #define inc_counter_np(counter) ((void)0)
  77 #else
  78   void inc_counter_np_(int& counter) {
  79     __ lea(rscratch2, ExternalAddress((address)&counter));
  80     __ ldrw(rscratch1, Address(rscratch2));
  81     __ addw(rscratch1, rscratch1, 1);
  82     __ strw(rscratch1, Address(rscratch2));
  83   }
  84 #define inc_counter_np(counter) \
  85   BLOCK_COMMENT("inc_counter " #counter); \
  86   inc_counter_np_(counter);
  87 #endif
  88 
  89   // Call stubs are used to call Java from C
  90   //
  91   // Arguments:
  92   //    c_rarg0:   call wrapper address                   address
  93   //    c_rarg1:   result                                 address
  94   //    c_rarg2:   result type                            BasicType
  95   //    c_rarg3:   method                                 Method*
  96   //    c_rarg4:   (interpreter) entry point              address
  97   //    c_rarg5:   parameters                             intptr_t*
  98   //    c_rarg6:   parameter size (in words)              int
  99   //    c_rarg7:   thread                                 Thread*
 100   //
 101   // There is no return from the stub itself as any Java result
 102   // is written to result
 103   //
 104   // we save r30 (lr) as the return PC at the base of the frame and
 105   // link r29 (fp) below it as the frame pointer installing sp (r31)
 106   // into fp.
 107   //
 108   // we save r0-r7, which accounts for all the c arguments.
 109   //
 110   // TODO: strictly do we need to save them all? they are treated as
 111   // volatile by C so could we omit saving the ones we are going to
 112   // place in global registers (thread? method?) or those we only use
 113   // during setup of the Java call?
 114   //
 115   // we don't need to save r8 which C uses as an indirect result location
 116   // return register.
 117   //
 118   // we don't need to save r9-r15 which both C and Java treat as
 119   // volatile
 120   //
 121   // we don't need to save r16-18 because Java does not use them
 122   //
 123   // we save r19-r28 which Java uses as scratch registers and C
 124   // expects to be callee-save
 125   //
 126   // we save the bottom 64 bits of each value stored in v8-v15; it is
 127   // the responsibility of the caller to preserve larger values.
 128   //
 129   // so the stub frame looks like this when we enter Java code
 130   //
 131   //     [ return_from_Java     ] <--- sp
 132   //     [ argument word n      ]
 133   //      ...
 134   // -27 [ argument word 1      ]
 135   // -26 [ saved v15            ] <--- sp_after_call
 136   // -25 [ saved v14            ]
 137   // -24 [ saved v13            ]
 138   // -23 [ saved v12            ]
 139   // -22 [ saved v11            ]
 140   // -21 [ saved v10            ]
 141   // -20 [ saved v9             ]
 142   // -19 [ saved v8             ]
 143   // -18 [ saved r28            ]
 144   // -17 [ saved r27            ]
 145   // -16 [ saved r26            ]
 146   // -15 [ saved r25            ]
 147   // -14 [ saved r24            ]
 148   // -13 [ saved r23            ]
 149   // -12 [ saved r22            ]
 150   // -11 [ saved r21            ]
 151   // -10 [ saved r20            ]
 152   //  -9 [ saved r19            ]
 153   //  -8 [ call wrapper    (r0) ]
 154   //  -7 [ result          (r1) ]
 155   //  -6 [ result type     (r2) ]
 156   //  -5 [ method          (r3) ]
 157   //  -4 [ entry point     (r4) ]
 158   //  -3 [ parameters      (r5) ]
 159   //  -2 [ parameter size  (r6) ]
 160   //  -1 [ thread          (r7) ]
 161   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 162   //   1 [ saved lr       (r30) ]
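       //
       // As a rough illustration only (not part of this stub): the VM reaches
       // this code from C++ through the CallStub function pointer declared in
       // stubRoutines.hpp, e.g. from JavaCalls::call_helper, approximately:
       //
       //   StubRoutines::call_stub()(
       //     (address)&link,               // c_rarg0: call wrapper
       //     result_val_address,           // c_rarg1: result
       //     result_type,                  // c_rarg2: BasicType
       //     method(),                     // c_rarg3: Method*
       //     entry_point,                  // c_rarg4: entry point
       //     args->parameters(),           // c_rarg5: parameters
       //     args->size_of_parameters(),   // c_rarg6: parameter size (words)
       //     CHECK);                       // c_rarg7: thread, via TRAPS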
 163 
 164   // Call stub stack layout word offsets from fp
 165   enum call_stub_layout {
 166     sp_after_call_off = -26,
 167 
 168     d15_off            = -26,
 169     d13_off            = -24,
 170     d11_off            = -22,
 171     d9_off             = -20,
 172 
 173     r28_off            = -18,
 174     r26_off            = -16,
 175     r24_off            = -14,
 176     r22_off            = -12,
 177     r20_off            = -10,
 178     call_wrapper_off   =  -8,
 179     result_off         =  -7,
 180     result_type_off    =  -6,
 181     method_off         =  -5,
 182     entry_point_off    =  -4,
 183     parameter_size_off =  -2,
 184     thread_off         =  -1,
 185     fp_f               =   0,
 186     retaddr_off        =   1,
 187   };
 188 
 189   address generate_call_stub(address& return_address) {
 190     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 191            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 192            "adjust this code");
 193 
 194     StubCodeMark mark(this, "StubRoutines", "call_stub");
 195     address start = __ pc();
 196 
 197     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 198 
 199     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 200     const Address result        (rfp, result_off         * wordSize);
 201     const Address result_type   (rfp, result_type_off    * wordSize);
 202     const Address method        (rfp, method_off         * wordSize);
 203     const Address entry_point   (rfp, entry_point_off    * wordSize);
 204     const Address parameter_size(rfp, parameter_size_off * wordSize);
 205 
 206     const Address thread        (rfp, thread_off         * wordSize);
 207 
 208     const Address d15_save      (rfp, d15_off * wordSize);
 209     const Address d13_save      (rfp, d13_off * wordSize);
 210     const Address d11_save      (rfp, d11_off * wordSize);
 211     const Address d9_save       (rfp, d9_off * wordSize);
 212 
 213     const Address r28_save      (rfp, r28_off * wordSize);
 214     const Address r26_save      (rfp, r26_off * wordSize);
 215     const Address r24_save      (rfp, r24_off * wordSize);
 216     const Address r22_save      (rfp, r22_off * wordSize);
 217     const Address r20_save      (rfp, r20_off * wordSize);
 218 
 219     // stub code
 220 
 221     address aarch64_entry = __ pc();
 222 
 223     // set up frame and move sp to end of save area
 224     __ enter();
 225     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 226 
 227     // save register parameters and Java scratch/global registers
 228     // n.b. we save thread even though it gets installed in
 229     // rthread because we want to sanity check rthread later
 230     __ str(c_rarg7,  thread);
 231     __ strw(c_rarg6, parameter_size);
 232     __ stp(c_rarg4, c_rarg5,  entry_point);
 233     __ stp(c_rarg2, c_rarg3,  result_type);
 234     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 235 
 236     __ stp(r20, r19,   r20_save);
 237     __ stp(r22, r21,   r22_save);
 238     __ stp(r24, r23,   r24_save);
 239     __ stp(r26, r25,   r26_save);
 240     __ stp(r28, r27,   r28_save);
 241 
 242     __ stpd(v9,  v8,   d9_save);
 243     __ stpd(v11, v10,  d11_save);
 244     __ stpd(v13, v12,  d13_save);
 245     __ stpd(v15, v14,  d15_save);
 246 
 247     // install Java thread in global register now that we have saved
 248     // whatever value it held
 249     __ mov(rthread, c_rarg7);
 250     // And method
 251     __ mov(rmethod, c_rarg3);
 252 
 253     // set up the heapbase register
 254     __ reinit_heapbase();
 255 
 256 #ifdef ASSERT
 257     // make sure we have no pending exceptions
 258     {
 259       Label L;
 260       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 261       __ cmp(rscratch1, (u1)NULL_WORD);
 262       __ br(Assembler::EQ, L);
 263       __ stop("StubRoutines::call_stub: entered with pending exception");
 264       __ BIND(L);
 265     }
 266 #endif
 267     // pass parameters if any
 268     __ mov(esp, sp);
 269     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 270     __ andr(sp, rscratch1, -2 * wordSize);
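         // n.b. the andr above rounds sp down to a 2 * wordSize boundary,
         // keeping it 16-byte aligned as AArch64 expects whenever sp is used
         // to access memory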
 271 
 272     BLOCK_COMMENT("pass parameters if any");
 273     Label parameters_done;
 274     // parameter count is still in c_rarg6
 275     // and parameter pointer identifying param 1 is in c_rarg5
 276     __ cbzw(c_rarg6, parameters_done);
 277 
 278     address loop = __ pc();
 279     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 280     __ subsw(c_rarg6, c_rarg6, 1);
 281     __ push(rscratch1);
 282     __ br(Assembler::GT, loop);
 283 
 284     __ BIND(parameters_done);
 285 
 286     // call Java entry -- passing Method* and current sp
 287     //      rmethod: Method*
 288     //      r13: sender sp
 289     BLOCK_COMMENT("call Java function");
 290     __ mov(r13, sp);
 291     __ blr(c_rarg4);
 292 
 293     // we do this here because the notify will already have been done
 294     // if we get to the next instruction via an exception
 295     //
 296     // n.b. adding this instruction here affects the calculation of
 297     // whether or not a routine returns to the call stub (used when
 298     // doing stack walks) since the normal test is to check the return
 299     // pc against the address saved below. so we may need to allow for
 300     // this extra instruction in the check.
 301 
 302     // save current address for use by exception handling code
 303 
 304     return_address = __ pc();
 305 
 306     // store result depending on type (everything that is not
 307     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 308     // n.b. this assumes Java returns an integral result in r0
 309     // and a floating result in j_farg0
 310     __ ldr(j_rarg2, result);
 311     Label is_long, is_float, is_double, exit;
 312     __ ldr(j_rarg1, result_type);
 313     __ cmp(j_rarg1, (u1)T_OBJECT);
 314     __ br(Assembler::EQ, is_long);
 315     __ cmp(j_rarg1, (u1)T_LONG);
 316     __ br(Assembler::EQ, is_long);
 317     __ cmp(j_rarg1, (u1)T_FLOAT);
 318     __ br(Assembler::EQ, is_float);
 319     __ cmp(j_rarg1, (u1)T_DOUBLE);
 320     __ br(Assembler::EQ, is_double);
 321 
 322     // handle T_INT case
 323     __ strw(r0, Address(j_rarg2));
 324 
 325     __ BIND(exit);
 326 
 327     // pop parameters
 328     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 329 
 330 #ifdef ASSERT
 331     // verify that threads correspond
 332     {
 333       Label L, S;
 334       __ ldr(rscratch1, thread);
 335       __ cmp(rthread, rscratch1);
 336       __ br(Assembler::NE, S);
 337       __ get_thread(rscratch1);
 338       __ cmp(rthread, rscratch1);
 339       __ br(Assembler::EQ, L);
 340       __ BIND(S);
 341       __ stop("StubRoutines::call_stub: threads must correspond");
 342       __ BIND(L);
 343     }
 344 #endif
 345 
 346     // restore callee-save registers
 347     __ ldpd(v15, v14,  d15_save);
 348     __ ldpd(v13, v12,  d13_save);
 349     __ ldpd(v11, v10,  d11_save);
 350     __ ldpd(v9,  v8,   d9_save);
 351 
 352     __ ldp(r28, r27,   r28_save);
 353     __ ldp(r26, r25,   r26_save);
 354     __ ldp(r24, r23,   r24_save);
 355     __ ldp(r22, r21,   r22_save);
 356     __ ldp(r20, r19,   r20_save);
 357 
 358     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 359     __ ldrw(c_rarg2, result_type);
 360     __ ldr(c_rarg3,  method);
 361     __ ldp(c_rarg4, c_rarg5,  entry_point);
 362     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 363 
 364     // leave frame and return to caller
 365     __ leave();
 366     __ ret(lr);
 367 
 368     // handle return types different from T_INT
 369 
 370     __ BIND(is_long);
 371     __ str(r0, Address(j_rarg2, 0));
 372     __ br(Assembler::AL, exit);
 373 
 374     __ BIND(is_float);
 375     __ strs(j_farg0, Address(j_rarg2, 0));
 376     __ br(Assembler::AL, exit);
 377 
 378     __ BIND(is_double);
 379     __ strd(j_farg0, Address(j_rarg2, 0));
 380     __ br(Assembler::AL, exit);
 381 
 382     return start;
 383   }
 384 
 385   // Return point for a Java call if there's an exception thrown in
 386   // Java code.  The exception is caught and transformed into a
 387   // pending exception stored in JavaThread that can be tested from
 388   // within the VM.
 389   //
 390   // Note: Usually the parameters are removed by the callee. In case
 391   // of an exception crossing an activation frame boundary, that is
 392   // not the case if the callee is compiled code => need to set up the
 393   // sp.
 394   //
 395   // r0: exception oop
 396 
 397   address generate_catch_exception() {
 398     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 399     address start = __ pc();
 400 
 401     // same as in generate_call_stub():
 402     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 403     const Address thread        (rfp, thread_off         * wordSize);
 404 
 405 #ifdef ASSERT
 406     // verify that threads correspond
 407     {
 408       Label L, S;
 409       __ ldr(rscratch1, thread);
 410       __ cmp(rthread, rscratch1);
 411       __ br(Assembler::NE, S);
 412       __ get_thread(rscratch1);
 413       __ cmp(rthread, rscratch1);
 414       __ br(Assembler::EQ, L);
 415       __ bind(S);
 416       __ stop("StubRoutines::catch_exception: threads must correspond");
 417       __ bind(L);
 418     }
 419 #endif
 420 
 421     // set pending exception
 422     __ verify_oop(r0);
 423 
 424     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 425     __ mov(rscratch1, (address)__FILE__);
 426     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 427     __ movw(rscratch1, (int)__LINE__);
 428     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 429 
 430     // complete return to VM
 431     assert(StubRoutines::_call_stub_return_address != NULL,
 432            "_call_stub_return_address must have been generated before");
 433     __ b(StubRoutines::_call_stub_return_address);
 434 
 435     return start;
 436   }
 437 
 438   // Continuation point for runtime calls returning with a pending
 439   // exception.  The pending exception check happened in the runtime
 440   // or native call stub.  The pending exception in Thread is
 441   // converted into a Java-level exception.
 442   //
 443   // Contract with Java-level exception handlers:
 444   // r0: exception
 445   // r3: throwing pc
 446   //
 447   // NOTE: At entry of this stub, exception-pc must be in LR !!
 448 
 449   // NOTE: this is always used as a jump target within generated code
 450   // so it just needs to be generated code with no prolog
 451 
 452   address generate_forward_exception() {
 453     StubCodeMark mark(this, "StubRoutines", "forward exception");
 454     address start = __ pc();
 455 
 456     // Upon entry, LR points to the return address returning into
 457     // Java (interpreted or compiled) code; i.e., the return address
 458     // becomes the throwing pc.
 459     //
 460     // Arguments pushed before the runtime call are still on the stack
 461     // but the exception handler will reset the stack pointer ->
 462     // ignore them.  A potential result in registers can be ignored as
 463     // well.
 464 
 465 #ifdef ASSERT
 466     // make sure this code is only executed if there is a pending exception
 467     {
 468       Label L;
 469       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 470       __ cbnz(rscratch1, L);
 471       __ stop("StubRoutines::forward exception: no pending exception (1)");
 472       __ bind(L);
 473     }
 474 #endif
 475 
 476     // compute exception handler into r19
 477 
 478     // call the VM to find the handler address associated with the
 479     // caller address. pass thread in r0 and caller pc (ret address)
 480     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 481     // the stack.
 482     __ mov(c_rarg1, lr);
 483     // lr will be trashed by the VM call so we move it to R19
 484     // (callee-saved) because we also need to pass it to the handler
 485     // returned by this call.
 486     __ mov(r19, lr);
 487     BLOCK_COMMENT("call exception_handler_for_return_address");
 488     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 489                          SharedRuntime::exception_handler_for_return_address),
 490                     rthread, c_rarg1);
 491     // we should not really care that lr is no longer the callee
 492     // address. we saved the value the handler needs in r19 so we can
 493     // just copy it to r3. however, the C2 handler will push its own
 494     // frame and then call into the VM, and the VM code asserts that
 495     // the PC for the frame above the handler belongs to a compiled
 496     // Java method. So, we restore lr here to satisfy that assert.
 497     __ mov(lr, r19);
 498     // setup r0 & r3 & clear pending exception
 499     __ mov(r3, r19);
 500     __ mov(r19, r0);
 501     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 502     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 503 
 504 #ifdef ASSERT
 505     // make sure exception is set
 506     {
 507       Label L;
 508       __ cbnz(r0, L);
 509       __ stop("StubRoutines::forward exception: no pending exception (2)");
 510       __ bind(L);
 511     }
 512 #endif
 513 
 514     // continue at exception handler
 515     // r0: exception
 516     // r3: throwing pc
 517     // r19: exception handler
 518     __ verify_oop(r0);
 519     __ br(r19);
 520 
 521     return start;
 522   }
 523 
 524   // Non-destructive plausibility checks for oops
 525   //
 526   // Arguments:
 527   //    r0: oop to verify
 528   //    rscratch1: error message
 529   //
 530   // Stack after saving c_rarg3:
 531   //    [tos + 0]: saved c_rarg3
 532   //    [tos + 1]: saved c_rarg2
 533   //    [tos + 2]: saved lr
 534   //    [tos + 3]: saved rscratch2
 535   //    [tos + 4]: saved r0
 536   //    [tos + 5]: saved rscratch1
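       //
       // In practice this stub is reached via MacroAssembler::verify_oop(),
       // which is expected to have placed the oop in r0 and the address of
       // the failure message in rscratch1 before calling here.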
 537   address generate_verify_oop() {
 538 
 539     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 540     address start = __ pc();
 541 
 542     Label exit, error;
 543 
 544     // save c_rarg2 and c_rarg3
 545     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 546 
 547     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 548     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 549     __ ldr(c_rarg3, Address(c_rarg2));
 550     __ add(c_rarg3, c_rarg3, 1);
 551     __ str(c_rarg3, Address(c_rarg2));
 552 
 553     // object is in r0
 554     // make sure object is 'reasonable'
 555     __ cbz(r0, exit); // if obj is NULL it is OK
 556 
 557 #if INCLUDE_ZGC
 558     if (UseZGC) {
 559       // Check if mask is good.
 560       // verifies that ZAddressBadMask & r0 == 0
 561       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 562       __ andr(c_rarg2, r0, c_rarg3);
 563       __ cbnz(c_rarg2, error);
 564     }
 565 #endif
 566 
 567     // Check if the oop is in the right area of memory
 568     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 569     __ andr(c_rarg2, r0, c_rarg3);
 570     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 571 
 572     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 573     // instruction here because the flags register is live.
 574     __ eor(c_rarg2, c_rarg2, c_rarg3);
 575     __ cbnz(c_rarg2, error);
 576 
 577     // make sure klass is 'reasonable', which is not zero.
 578     __ load_klass(r0, r0);  // get klass
 579     __ cbz(r0, error);      // if klass is NULL it is broken
 580 
 581     // return if everything seems ok
 582     __ bind(exit);
 583 
 584     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 585     __ ret(lr);
 586 
 587     // handle errors
 588     __ bind(error);
 589     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 590 
 591     __ push(RegSet::range(r0, r29), sp);
 592     // debug(char* msg, int64_t pc, int64_t regs[])
 593     __ mov(c_rarg0, rscratch1);      // pass address of error message
 594     __ mov(c_rarg1, lr);             // pass return address
 595     __ mov(c_rarg2, sp);             // pass address of regs on stack
 596 #ifndef PRODUCT
 597     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 598 #endif
 599     BLOCK_COMMENT("call MacroAssembler::debug");
 600     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 601     __ blr(rscratch1);
 602     __ hlt(0);
 603 
 604     return start;
 605   }
 606 
 607   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 608 
 609   // The inner part of zero_words().  This is the bulk operation,
 610   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 611   // caller is responsible for zeroing the last few words.
 612   //
 613   // Inputs:
 614   // r10: the HeapWord-aligned base address of an array to zero.
 615   // r11: the count in HeapWords, r11 > 0.
 616   //
 617   // Returns r10 and r11, adjusted for the caller to clear.
 618   // r10: the base address of the tail of words left to clear.
 619   // r11: the number of words in the tail.
 620   //      r11 < MacroAssembler::zero_words_block_size.
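       //
       // n.b. the expected caller is MacroAssembler::zero_words(), which
       // branches here only for sufficiently large fills and then clears
       // the tail words reported back in r10/r11 itself.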
 621 
 622   address generate_zero_blocks() {
 623     Label done;
 624     Label base_aligned;
 625 
 626     Register base = r10, cnt = r11;
 627 
 628     __ align(CodeEntryAlignment);
 629     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 630     address start = __ pc();
 631 
 632     if (UseBlockZeroing) {
 633       int zva_length = VM_Version::zva_length();
 634 
 635       // Ensure ZVA length can be divided by 16. This is required by
 636       // the subsequent operations.
 637       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 638 
 639       __ tbz(base, 3, base_aligned);
 640       __ str(zr, Address(__ post(base, 8)));
 641       __ sub(cnt, cnt, 1);
 642       __ bind(base_aligned);
 643 
 644       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 645       // alignment.
 646       Label small;
 647       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
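           // low_limit is in bytes while cnt is in words, hence the shift
           // right by 3 (LogBytesPerWord) in the comparison below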
 648       __ subs(rscratch1, cnt, low_limit >> 3);
 649       __ br(Assembler::LT, small);
 650       __ zero_dcache_blocks(base, cnt);
 651       __ bind(small);
 652     }
 653 
 654     {
 655       // Number of stp instructions we'll unroll
 656       const int unroll =
 657         MacroAssembler::zero_words_block_size / 2;
 658       // Clear the remaining blocks.
 659       Label loop;
 660       __ subs(cnt, cnt, unroll * 2);
 661       __ br(Assembler::LT, done);
 662       __ bind(loop);
 663       for (int i = 0; i < unroll; i++)
 664         __ stp(zr, zr, __ post(base, 16));
 665       __ subs(cnt, cnt, unroll * 2);
 666       __ br(Assembler::GE, loop);
 667       __ bind(done);
 668       __ add(cnt, cnt, unroll * 2);
 669     }
 670 
 671     __ ret(lr);
 672 
 673     return start;
 674   }
 675 
 676 
 677   typedef enum {
 678     copy_forwards = 1,
 679     copy_backwards = -1
 680   } copy_direction;
 681 
 682   // Bulk copy of blocks of 8 words.
 683   //
 684   // count is a count of words.
 685   //
 686   // Precondition: count >= 8
 687   //
 688   // Postconditions:
 689   //
 690   // The least significant bit of count contains the remaining count
 691   // of words to copy.  The rest of count is trash.
 692   //
 693   // s and d are adjusted to point to the remaining words to copy
 694   //
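       // The main loop below is software pipelined: each iteration stores the
       // eight words loaded by the previous iteration while loading the next
       // eight, and the 'drain' step flushes the final block still held in
       // registers once the count is exhausted.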
 695   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 696                            copy_direction direction) {
 697     int unit = wordSize * direction;
 698     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 699 
 700     int offset;
 701     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 702       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 703     const Register stride = r13;
 704 
 705     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 706     assert_different_registers(s, d, count, rscratch1);
 707 
 708     Label again, drain;
 709     const char *stub_name;
 710     if (direction == copy_forwards)
 711       stub_name = "forward_copy_longs";
 712     else
 713       stub_name = "backward_copy_longs";
 714 
 715     __ align(CodeEntryAlignment);
 716 
 717     StubCodeMark mark(this, "StubRoutines", stub_name);
 718 
 719     __ bind(start);
 720 
 721     Label unaligned_copy_long;
 722     if (AvoidUnalignedAccesses) {
 723       __ tbnz(d, 3, unaligned_copy_long);
 724     }
 725 
 726     if (direction == copy_forwards) {
 727       __ sub(s, s, bias);
 728       __ sub(d, d, bias);
 729     }
 730 
 731 #ifdef ASSERT
 732     // Make sure we are never given < 8 words
 733     {
 734       Label L;
 735       __ cmp(count, (u1)8);
 736       __ br(Assembler::GE, L);
 737       __ stop("generate_copy_longs called with < 8 words");
 738       __ bind(L);
 739     }
 740 #endif
 741 
 742     // Fill 8 registers
 743     if (UseSIMDForMemoryOps) {
 744       __ ldpq(v0, v1, Address(s, 4 * unit));
 745       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 746     } else {
 747       __ ldp(t0, t1, Address(s, 2 * unit));
 748       __ ldp(t2, t3, Address(s, 4 * unit));
 749       __ ldp(t4, t5, Address(s, 6 * unit));
 750       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 751     }
 752 
 753     __ subs(count, count, 16);
 754     __ br(Assembler::LO, drain);
 755 
 756     int prefetch = PrefetchCopyIntervalInBytes;
 757     bool use_stride = false;
 758     if (direction == copy_backwards) {
 759        use_stride = prefetch > 256;
 760        prefetch = -prefetch;
 761        if (use_stride) __ mov(stride, prefetch);
 762     }
 763 
 764     __ bind(again);
 765 
 766     if (PrefetchCopyIntervalInBytes > 0)
 767       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 768 
 769     if (UseSIMDForMemoryOps) {
 770       __ stpq(v0, v1, Address(d, 4 * unit));
 771       __ ldpq(v0, v1, Address(s, 4 * unit));
 772       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 773       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 774     } else {
 775       __ stp(t0, t1, Address(d, 2 * unit));
 776       __ ldp(t0, t1, Address(s, 2 * unit));
 777       __ stp(t2, t3, Address(d, 4 * unit));
 778       __ ldp(t2, t3, Address(s, 4 * unit));
 779       __ stp(t4, t5, Address(d, 6 * unit));
 780       __ ldp(t4, t5, Address(s, 6 * unit));
 781       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 782       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 783     }
 784 
 785     __ subs(count, count, 8);
 786     __ br(Assembler::HS, again);
 787 
 788     // Drain
 789     __ bind(drain);
 790     if (UseSIMDForMemoryOps) {
 791       __ stpq(v0, v1, Address(d, 4 * unit));
 792       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 793     } else {
 794       __ stp(t0, t1, Address(d, 2 * unit));
 795       __ stp(t2, t3, Address(d, 4 * unit));
 796       __ stp(t4, t5, Address(d, 6 * unit));
 797       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 798     }
 799 
 800     {
 801       Label L1, L2;
 802       __ tbz(count, exact_log2(4), L1);
 803       if (UseSIMDForMemoryOps) {
 804         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 805         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 806       } else {
 807         __ ldp(t0, t1, Address(s, 2 * unit));
 808         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 809         __ stp(t0, t1, Address(d, 2 * unit));
 810         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 811       }
 812       __ bind(L1);
 813 
 814       if (direction == copy_forwards) {
 815         __ add(s, s, bias);
 816         __ add(d, d, bias);
 817       }
 818 
 819       __ tbz(count, 1, L2);
 820       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 821       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 822       __ bind(L2);
 823     }
 824 
 825     __ ret(lr);
 826 
 827     if (AvoidUnalignedAccesses) {
 828       Label drain, again;
 829       // Register order for storing. Order is different for backward copy.
 830 
 831       __ bind(unaligned_copy_long);
 832 
 833       // source address is even aligned, target odd aligned
 834       //
 835       // when forward copying word pairs we read long pairs at offsets
 836       // {0, 2, 4, 6} (in long words). when backwards copying we read
 837       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 838       // address by -2 in the forwards case so we can compute the
 839       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 840       // or -1.
 841       //
 842       // when forward copying we need to store 1 word, 3 pairs and
 843       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 844       // zero offset we adjust the destination by -1 which means we
 845       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 846       //
 847       // When backwards copying we need to store 1 word, 3 pairs and
 848       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 849       // offsets {1, 3, 5, 7, 8} * unit.
 850 
 851       if (direction == copy_forwards) {
 852         __ sub(s, s, 16);
 853         __ sub(d, d, 8);
 854       }
 855 
 856       // Fill 8 registers
 857       //
 858       // for forwards copy s was offset by -16 from the original input
 859       // value of s so the register contents are at these offsets
 860       // relative to the 64 byte block addressed by that original input
 861       // and so on for each successive 64 byte block when s is updated
 862       //
 863       // t0 at offset 0,  t1 at offset 8
 864       // t2 at offset 16, t3 at offset 24
 865       // t4 at offset 32, t5 at offset 40
 866       // t6 at offset 48, t7 at offset 56
 867 
 868       // for backwards copy s was not offset so the register contents
 869       // are at these offsets into the preceding 64 byte block
 870       // relative to that original input and so on for each successive
 871       // preceding 64 byte block when s is updated. this explains the
 872       // slightly counter-intuitive looking pattern of register usage
 873       // in the stp instructions for backwards copy.
 874       //
 875       // t0 at offset -16, t1 at offset -8
 876       // t2 at offset -32, t3 at offset -24
 877       // t4 at offset -48, t5 at offset -40
 878       // t6 at offset -64, t7 at offset -56
 879 
 880       __ ldp(t0, t1, Address(s, 2 * unit));
 881       __ ldp(t2, t3, Address(s, 4 * unit));
 882       __ ldp(t4, t5, Address(s, 6 * unit));
 883       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 884 
 885       __ subs(count, count, 16);
 886       __ br(Assembler::LO, drain);
 887 
 888       int prefetch = PrefetchCopyIntervalInBytes;
 889       bool use_stride = false;
 890       if (direction == copy_backwards) {
 891          use_stride = prefetch > 256;
 892          prefetch = -prefetch;
 893          if (use_stride) __ mov(stride, prefetch);
 894       }
 895 
 896       __ bind(again);
 897 
 898       if (PrefetchCopyIntervalInBytes > 0)
 899         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 900 
 901       if (direction == copy_forwards) {
 902        // allowing for the offset of -8 the store instructions place
 903        // registers into the target 64 byte block at the following
 904        // offsets
 905        //
 906        // t0 at offset 0
 907        // t1 at offset 8,  t2 at offset 16
 908        // t3 at offset 24, t4 at offset 32
 909        // t5 at offset 40, t6 at offset 48
 910        // t7 at offset 56
 911 
 912         __ str(t0, Address(d, 1 * unit));
 913         __ stp(t1, t2, Address(d, 2 * unit));
 914         __ ldp(t0, t1, Address(s, 2 * unit));
 915         __ stp(t3, t4, Address(d, 4 * unit));
 916         __ ldp(t2, t3, Address(s, 4 * unit));
 917         __ stp(t5, t6, Address(d, 6 * unit));
 918         __ ldp(t4, t5, Address(s, 6 * unit));
 919         __ str(t7, Address(__ pre(d, 8 * unit)));
 920         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 921       } else {
 922        // d was not offset when we started so the registers are
 923        // written into the 64 byte block preceding d with the following
 924        // offsets
 925        //
 926        // t1 at offset -8
 927        // t3 at offset -24, t0 at offset -16
 928        // t5 at offset -40, t2 at offset -32
 929        // t7 at offset -56, t4 at offset -48
 930        //                   t6 at offset -64
 931        //
 932        // note that this matches the offsets previously noted for the
 933        // loads
 934 
 935         __ str(t1, Address(d, 1 * unit));
 936         __ stp(t3, t0, Address(d, 3 * unit));
 937         __ ldp(t0, t1, Address(s, 2 * unit));
 938         __ stp(t5, t2, Address(d, 5 * unit));
 939         __ ldp(t2, t3, Address(s, 4 * unit));
 940         __ stp(t7, t4, Address(d, 7 * unit));
 941         __ ldp(t4, t5, Address(s, 6 * unit));
 942         __ str(t6, Address(__ pre(d, 8 * unit)));
 943         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 944       }
 945 
 946       __ subs(count, count, 8);
 947       __ br(Assembler::HS, again);
 948 
 949       // Drain
 950       //
 951       // this uses the same pattern of offsets and register arguments
 952       // as above
 953       __ bind(drain);
 954       if (direction == copy_forwards) {
 955         __ str(t0, Address(d, 1 * unit));
 956         __ stp(t1, t2, Address(d, 2 * unit));
 957         __ stp(t3, t4, Address(d, 4 * unit));
 958         __ stp(t5, t6, Address(d, 6 * unit));
 959         __ str(t7, Address(__ pre(d, 8 * unit)));
 960       } else {
 961         __ str(t1, Address(d, 1 * unit));
 962         __ stp(t3, t0, Address(d, 3 * unit));
 963         __ stp(t5, t2, Address(d, 5 * unit));
 964         __ stp(t7, t4, Address(d, 7 * unit));
 965         __ str(t6, Address(__ pre(d, 8 * unit)));
 966       }
 967       // now we need to copy any remaining part block which may
 968       // include a 4 word subblock and/or a 2 word subblock.
 969       // bits 2 and 1 in the count are the tell-tale for whether we
 970       // have each such subblock
 971       {
 972         Label L1, L2;
 973         __ tbz(count, exact_log2(4), L1);
 974        // this is the same as above but copying only 4 longs hence
 975        // with only one intervening stp between the str instructions
 976        // but note that the offsets and registers still follow the
 977        // same pattern
 978         __ ldp(t0, t1, Address(s, 2 * unit));
 979         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 980         if (direction == copy_forwards) {
 981           __ str(t0, Address(d, 1 * unit));
 982           __ stp(t1, t2, Address(d, 2 * unit));
 983           __ str(t3, Address(__ pre(d, 4 * unit)));
 984         } else {
 985           __ str(t1, Address(d, 1 * unit));
 986           __ stp(t3, t0, Address(d, 3 * unit));
 987           __ str(t2, Address(__ pre(d, 4 * unit)));
 988         }
 989         __ bind(L1);
 990 
 991         __ tbz(count, 1, L2);
 992        // this is the same as above but copying only 2 longs hence
 993        // there is no intervening stp between the str instructions
 994        // but note that the offset and register patterns are still
 995        // the same
 996         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
 997         if (direction == copy_forwards) {
 998           __ str(t0, Address(d, 1 * unit));
 999           __ str(t1, Address(__ pre(d, 2 * unit)));
1000         } else {
1001           __ str(t1, Address(d, 1 * unit));
1002           __ str(t0, Address(__ pre(d, 2 * unit)));
1003         }
1004         __ bind(L2);
1005 
1006        // for forwards copy we need to re-adjust the offsets we
1007        // applied so that s and d follow the last words written
1008 
1009        if (direction == copy_forwards) {
1010          __ add(s, s, 16);
1011          __ add(d, d, 8);
1012        }
1013 
1014       }
1015 
1016       __ ret(lr);
1017       }
1018   }
1019 
1020   // Small copy: less than 16 bytes.
1021   //
1022   // NB: Ignores all of the bits of count which represent more than 15
1023   // bytes, so a caller doesn't have to mask them.
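       //
       // e.g. for a byte copy (granularity == 1) a residual count of 13
       // (0b1101) moves 8 + 4 + 1 bytes, one test-and-copy per set bit.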
1024 
1025   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1026     bool is_backwards = step < 0;
1027     size_t granularity = uabs(step);
1028     int direction = is_backwards ? -1 : 1;
1029     int unit = wordSize * direction;
1030 
1031     Label Lword, Lint, Lshort, Lbyte;
1032 
1033     assert(granularity
1034            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1035 
1036     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1037 
1038     // ??? I don't know if this bit-test-and-branch is the right thing
1039     // to do.  It does a lot of jumping, resulting in several
1040     // mispredicted branches.  It might make more sense to do this
1041     // with something like Duff's device with a single computed branch.
1042 
1043     __ tbz(count, 3 - exact_log2(granularity), Lword);
1044     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1045     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1046     __ bind(Lword);
1047 
1048     if (granularity <= sizeof (jint)) {
1049       __ tbz(count, 2 - exact_log2(granularity), Lint);
1050       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1051       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1052       __ bind(Lint);
1053     }
1054 
1055     if (granularity <= sizeof (jshort)) {
1056       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1057       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1058       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1059       __ bind(Lshort);
1060     }
1061 
1062     if (granularity <= sizeof (jbyte)) {
1063       __ tbz(count, 0, Lbyte);
1064       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1065       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1066       __ bind(Lbyte);
1067     }
1068   }
1069 
1070   Label copy_f, copy_b;
1071 
1072   // All-singing all-dancing memory copy.
1073   //
1074   // Copy count units of memory from s to d.  The size of a unit is
1075   // step, which can be positive or negative depending on the direction
1076   // of copy.  If is_aligned is false, we align the source address.
1077   //
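       // In outline: copies of at most 80 bytes (96 with SIMD) are done inline
       // with loads/stores anchored at both ends of the range, so they may
       // overlap; anything larger aligns s to a 2-word boundary, bulk-copies
       // whole 8-word blocks via the copy_f/copy_b stubs and finishes the
       // remainder with copy_memory_small().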
1078 
1079   void copy_memory(bool is_aligned, Register s, Register d,
1080                    Register count, Register tmp, int step) {
1081     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1082     bool is_backwards = step < 0;
1083     int granularity = uabs(step);
1084     const Register t0 = r3, t1 = r4;
1085 
1086     // <= 96 bytes do inline. Direction doesn't matter because we always
1087     // load all the data before writing anything
1088     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1089     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1090     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1091     const Register send = r17, dend = r18;
1092 
1093     if (PrefetchCopyIntervalInBytes > 0)
1094       __ prfm(Address(s, 0), PLDL1KEEP);
1095     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1096     __ br(Assembler::HI, copy_big);
1097 
1098     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1099     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1100 
1101     __ cmp(count, u1(16/granularity));
1102     __ br(Assembler::LS, copy16);
1103 
1104     __ cmp(count, u1(64/granularity));
1105     __ br(Assembler::HI, copy80);
1106 
1107     __ cmp(count, u1(32/granularity));
1108     __ br(Assembler::LS, copy32);
1109 
1110     // 33..64 bytes
1111     if (UseSIMDForMemoryOps) {
1112       __ ldpq(v0, v1, Address(s, 0));
1113       __ ldpq(v2, v3, Address(send, -32));
1114       __ stpq(v0, v1, Address(d, 0));
1115       __ stpq(v2, v3, Address(dend, -32));
1116     } else {
1117       __ ldp(t0, t1, Address(s, 0));
1118       __ ldp(t2, t3, Address(s, 16));
1119       __ ldp(t4, t5, Address(send, -32));
1120       __ ldp(t6, t7, Address(send, -16));
1121 
1122       __ stp(t0, t1, Address(d, 0));
1123       __ stp(t2, t3, Address(d, 16));
1124       __ stp(t4, t5, Address(dend, -32));
1125       __ stp(t6, t7, Address(dend, -16));
1126     }
1127     __ b(finish);
1128 
1129     // 17..32 bytes
1130     __ bind(copy32);
1131     __ ldp(t0, t1, Address(s, 0));
1132     __ ldp(t2, t3, Address(send, -16));
1133     __ stp(t0, t1, Address(d, 0));
1134     __ stp(t2, t3, Address(dend, -16));
1135     __ b(finish);
1136 
1137     // 65..80/96 bytes
1138     // (96 bytes if SIMD because we do 32 bytes per instruction)
1139     __ bind(copy80);
1140     if (UseSIMDForMemoryOps) {
1141       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1142       __ ldpq(v4, v5, Address(send, -32));
1143       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1144       __ stpq(v4, v5, Address(dend, -32));
1145     } else {
1146       __ ldp(t0, t1, Address(s, 0));
1147       __ ldp(t2, t3, Address(s, 16));
1148       __ ldp(t4, t5, Address(s, 32));
1149       __ ldp(t6, t7, Address(s, 48));
1150       __ ldp(t8, t9, Address(send, -16));
1151 
1152       __ stp(t0, t1, Address(d, 0));
1153       __ stp(t2, t3, Address(d, 16));
1154       __ stp(t4, t5, Address(d, 32));
1155       __ stp(t6, t7, Address(d, 48));
1156       __ stp(t8, t9, Address(dend, -16));
1157     }
1158     __ b(finish);
1159 
1160     // 0..16 bytes
1161     __ bind(copy16);
1162     __ cmp(count, u1(8/granularity));
1163     __ br(Assembler::LO, copy8);
1164 
1165     // 8..16 bytes
1166     __ ldr(t0, Address(s, 0));
1167     __ ldr(t1, Address(send, -8));
1168     __ str(t0, Address(d, 0));
1169     __ str(t1, Address(dend, -8));
1170     __ b(finish);
1171 
1172     if (granularity < 8) {
1173       // 4..7 bytes
1174       __ bind(copy8);
1175       __ tbz(count, 2 - exact_log2(granularity), copy4);
1176       __ ldrw(t0, Address(s, 0));
1177       __ ldrw(t1, Address(send, -4));
1178       __ strw(t0, Address(d, 0));
1179       __ strw(t1, Address(dend, -4));
1180       __ b(finish);
1181       if (granularity < 4) {
1182         // 0..3 bytes
1183         __ bind(copy4);
1184         __ cbz(count, finish); // get rid of 0 case
1185         if (granularity == 2) {
1186           __ ldrh(t0, Address(s, 0));
1187           __ strh(t0, Address(d, 0));
1188         } else { // granularity == 1
1189           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1190           // the first and last byte.
1191           // Handle the 3 byte case by loading and storing base + count/2
1192           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1193           // This means in the 1 byte case we load/store the same
1194           // byte 3 times.
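               // e.g. count == 3: the first/last loads cover bytes 0 and 2,
               // and the (s + count/2) load covers byte 1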
1195           __ lsr(count, count, 1);
1196           __ ldrb(t0, Address(s, 0));
1197           __ ldrb(t1, Address(send, -1));
1198           __ ldrb(t2, Address(s, count));
1199           __ strb(t0, Address(d, 0));
1200           __ strb(t1, Address(dend, -1));
1201           __ strb(t2, Address(d, count));
1202         }
1203         __ b(finish);
1204       }
1205     }
1206 
1207     __ bind(copy_big);
1208     if (is_backwards) {
1209       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1210       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1211     }
1212 
1213     // Now we've got the small case out of the way we can align the
1214     // source address on a 2-word boundary.
1215 
1216     Label aligned;
1217 
1218     if (is_aligned) {
1219       // We may have to adjust by 1 word to get s 2-word-aligned.
1220       __ tbz(s, exact_log2(wordSize), aligned);
1221       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1222       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1223       __ sub(count, count, wordSize/granularity);
1224     } else {
1225       if (is_backwards) {
1226         __ andr(rscratch2, s, 2 * wordSize - 1);
1227       } else {
1228         __ neg(rscratch2, s);
1229         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1230       }
1231       // rscratch2 is the byte adjustment needed to align s.
1232       __ cbz(rscratch2, aligned);
1233       int shift = exact_log2(granularity);
1234       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1235       __ sub(count, count, rscratch2);
1236 
1237 #if 0
1238       // ?? This code is only correct for a disjoint copy.  It may or
1239       // may not make sense to use it in that case.
1240 
1241       // Copy the first pair; s and d may not be aligned.
1242       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1243       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1244 
1245       // Align s and d, adjust count
1246       if (is_backwards) {
1247         __ sub(s, s, rscratch2);
1248         __ sub(d, d, rscratch2);
1249       } else {
1250         __ add(s, s, rscratch2);
1251         __ add(d, d, rscratch2);
1252       }
1253 #else
1254       copy_memory_small(s, d, rscratch2, rscratch1, step);
1255 #endif
1256     }
1257 
1258     __ bind(aligned);
1259 
1260     // s is now 2-word-aligned.
1261 
1262     // We have a count of units and some trailing bytes.  Adjust the
1263     // count and do a bulk copy of words.
1264     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1265     if (direction == copy_forwards)
1266       __ bl(copy_f);
1267     else
1268       __ bl(copy_b);
1269 
1270     // And the tail.
1271     copy_memory_small(s, d, count, tmp, step);
1272 
1273     if (granularity >= 8) __ bind(copy8);
1274     if (granularity >= 4) __ bind(copy4);
1275     __ bind(finish);
1276   }
1277 
1278 
1279   void clobber_registers() {
1280 #ifdef ASSERT
1281     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1282     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1283     for (Register r = r3; r <= r18; r++)
1284       if (r != rscratch1) __ mov(r, rscratch1);
1285 #endif
1286   }
1287 
1288   // Scan over array at a for count oops, verifying each one.
1289   // Preserves a and count, clobbers rscratch1 and rscratch2.
1290   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1291     Label loop, end;
1292     __ mov(rscratch1, a);
1293     __ mov(rscratch2, zr);
1294     __ bind(loop);
1295     __ cmp(rscratch2, count);
1296     __ br(Assembler::HS, end);
1297     if (size == (size_t)wordSize) {
1298       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1299       __ verify_oop(temp);
1300     } else {
1301       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1302       __ decode_heap_oop(temp); // calls verify_oop
1303     }
1304     __ add(rscratch2, rscratch2, size);
1305     __ b(loop);
1306     __ bind(end);
1307   }
1308 
1309   // Arguments:
1310   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1311   //             ignored
1312   //   is_oop  - true => oop array, so generate store check code
1313   //   name    - stub name string
1314   //
1315   // Inputs:
1316   //   c_rarg0   - source array address
1317   //   c_rarg1   - destination array address
1318   //   c_rarg2   - element count, treated as ssize_t, can be zero
1319   //
1320   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1321   // the hardware handle it.  The two dwords within qwords that span
1322   // cache line boundaries will still be loaded and stored atomically.
1323   //
1324   // Side Effects:
1325   //   disjoint_int_copy_entry is set to the no-overlap entry point
1326   //   used by generate_conjoint_int_oop_copy().
1327   //
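       // n.b. this generic generator is shared by the element-size-specific
       // wrappers below (e.g. generate_disjoint_byte_copy); 'entry' records a
       // no-overlap entry point, just past the frame setup, that the matching
       // conjoint stub branches to when the regions do not overlap.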
1328   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1329                                   const char *name, bool dest_uninitialized = false) {
1330     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1331     RegSet saved_reg = RegSet::of(s, d, count);
1332     __ align(CodeEntryAlignment);
1333     StubCodeMark mark(this, "StubRoutines", name);
1334     address start = __ pc();
1335     __ enter();
1336 
1337     if (entry != NULL) {
1338       *entry = __ pc();
1339       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1340       BLOCK_COMMENT("Entry:");
1341     }
1342 
1343     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1344     if (dest_uninitialized) {
1345       decorators |= IS_DEST_UNINITIALIZED;
1346     }
1347     if (aligned) {
1348       decorators |= ARRAYCOPY_ALIGNED;
1349     }
1350 
1351     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1352     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1353 
1354     if (is_oop) {
1355       // save regs before copy_memory
1356       __ push(RegSet::of(d, count), sp);
1357     }
1358     {
1359       // UnsafeCopyMemory page error: continue after ucm
1360       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1361       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1362       copy_memory(aligned, s, d, count, rscratch1, size);
1363     }
1364 
1365     if (is_oop) {
1366       __ pop(RegSet::of(d, count), sp);
1367       if (VerifyOops)
1368         verify_oop_array(size, d, count, r16);
1369     }
1370 
1371     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1372 
1373     __ leave();
1374     __ mov(r0, zr); // return 0
1375     __ ret(lr);
1376     return start;
1377   }
1378 
1379   // Arguments:
1380   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1381   //             ignored
1382   //   is_oop  - true => oop array, so generate store check code
1383   //   name    - stub name string
1384   //
1385   // Inputs:
1386   //   c_rarg0   - source array address
1387   //   c_rarg1   - destination array address
1388   //   c_rarg2   - element count, treated as ssize_t, can be zero
1389   //
1390   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1391   // the hardware handle it.  The two dwords within qwords that span
1392   // cache line boundaries will still be loaded and stored atomically.
1393   //
1394   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1395                                  address *entry, const char *name,
1396                                  bool dest_uninitialized = false) {
1397     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1398     RegSet saved_regs = RegSet::of(s, d, count);
1399     StubCodeMark mark(this, "StubRoutines", name);
1400     address start = __ pc();
1401     __ enter();
1402 
1403     if (entry != NULL) {
1404       *entry = __ pc();
1405       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1406       BLOCK_COMMENT("Entry:");
1407     }
1408 
1409     // use fwd copy when (d-s) above_equal (count*size)
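         // (the comparison is unsigned, so if d is below s the subtraction
         // wraps and the branch is also taken: a forward copy is always safe
         // when the destination starts below the source)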
1410     __ sub(rscratch1, d, s);
1411     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1412     __ br(Assembler::HS, nooverlap_target);
1413 
1414     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1415     if (dest_uninitialized) {
1416       decorators |= IS_DEST_UNINITIALIZED;
1417     }
1418     if (aligned) {
1419       decorators |= ARRAYCOPY_ALIGNED;
1420     }
1421 
1422     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1423     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1424 
1425     if (is_oop) {
1426       // save regs before copy_memory
1427       __ push(RegSet::of(d, count), sp);
1428     }
1429     {
1430       // UnsafeCopyMemory page error: continue after ucm
1431       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1432       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1433       copy_memory(aligned, s, d, count, rscratch1, -size);
1434     }
1435     if (is_oop) {
1436       __ pop(RegSet::of(d, count), sp);
1437       if (VerifyOops)
1438         verify_oop_array(size, d, count, r16);
1439     }
1440     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1441     __ leave();
1442     __ mov(r0, zr); // return 0
1443     __ ret(lr);
1444     return start;
1445   }
1446 
1447   // Arguments:
1448   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1449   //             ignored
1450   //   name    - stub name string
1451   //
1452   // Inputs:
1453   //   c_rarg0   - source array address
1454   //   c_rarg1   - destination array address
1455   //   c_rarg2   - element count, treated as ssize_t, can be zero
1456   //
1457   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1458   // we let the hardware handle it.  The one to eight bytes within words,
1459   // dwords or qwords that span cache line boundaries will still be loaded
1460   // and stored atomically.
1461   //
1469   // Side Effects:
1470   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1471   //   used by generate_conjoint_byte_copy().
1472   //
1473   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1474     const bool not_oop = false;
1475     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1476   }
1477 
1478   // Arguments:
1479   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1480   //             ignored
1481   //   name    - stub name string
1482   //
1483   // Inputs:
1484   //   c_rarg0   - source array address
1485   //   c_rarg1   - destination array address
1486   //   c_rarg2   - element count, treated as ssize_t, can be zero
1487   //
1488   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1489   // we let the hardware handle it.  The one to eight bytes within words,
1490   // dwords or qwords that span cache line boundaries will still be loaded
1491   // and stored atomically.
1492   //
1493   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1494                                       address* entry, const char *name) {
1495     const bool not_oop = false;
1496     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1497   }
1498 
1499   // Arguments:
1500   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1501   //             ignored
1502   //   name    - stub name string
1503   //
1504   // Inputs:
1505   //   c_rarg0   - source array address
1506   //   c_rarg1   - destination array address
1507   //   c_rarg2   - element count, treated as ssize_t, can be zero
1508   //
1509   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1510   // let the hardware handle it.  The two or four words within dwords
1511   // or qwords that span cache line boundaries will still be loaded
1512   // and stored atomically.
1513   //
1514   // Side Effects:
1515   //   disjoint_short_copy_entry is set to the no-overlap entry point
1516   //   used by generate_conjoint_short_copy().
1517   //
1518   address generate_disjoint_short_copy(bool aligned,
1519                                        address* entry, const char *name) {
1520     const bool not_oop = false;
1521     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1522   }
1523 
1524   // Arguments:
1525   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1526   //             ignored
1527   //   name    - stub name string
1528   //
1529   // Inputs:
1530   //   c_rarg0   - source array address
1531   //   c_rarg1   - destination array address
1532   //   c_rarg2   - element count, treated as ssize_t, can be zero
1533   //
1534   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1535   // let the hardware handle it.  The two or four words within dwords
1536   // or qwords that span cache line boundaries will still be loaded
1537   // and stored atomically.
1538   //
1539   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1540                                        address *entry, const char *name) {
1541     const bool not_oop = false;
1542     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1543   }
1544 
1545   // Arguments:
1546   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1547   //             ignored
1548   //   name    - stub name string
1549   //
1550   // Inputs:
1551   //   c_rarg0   - source array address
1552   //   c_rarg1   - destination array address
1553   //   c_rarg2   - element count, treated as ssize_t, can be zero
1554   //
1555   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1556   // the hardware handle it.  The two dwords within qwords that span
1557   // cache line boundaries will still be loaded and stored atomically.
1558   //
1559   // Side Effects:
1560   //   disjoint_int_copy_entry is set to the no-overlap entry point
1561   //   used by generate_conjoint_int_oop_copy().
1562   //
1563   address generate_disjoint_int_copy(bool aligned, address *entry,
1564                                          const char *name, bool dest_uninitialized = false) {
1565     const bool not_oop = false;
1566     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1567   }
1568 
1569   // Arguments:
1570   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1571   //             ignored
1572   //   name    - stub name string
1573   //
1574   // Inputs:
1575   //   c_rarg0   - source array address
1576   //   c_rarg1   - destination array address
1577   //   c_rarg2   - element count, treated as ssize_t, can be zero
1578   //
1579   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1580   // the hardware handle it.  The two dwords within qwords that span
1581   // cache line boundaries will still be loaded and stored atomically.
1582   //
1583   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1584                                      address *entry, const char *name,
1585                                      bool dest_uninitialized = false) {
1586     const bool not_oop = false;
1587     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1588   }
1589 
1590 
1591   // Arguments:
1592   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1593   //             ignored
1594   //   name    - stub name string
1595   //
1596   // Inputs:
1597   //   c_rarg0   - source array address
1598   //   c_rarg1   - destination array address
1599   //   c_rarg2   - element count, treated as size_t, can be zero
1600   //
1601   // Side Effects:
1602   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1603   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1604   //
1605   address generate_disjoint_long_copy(bool aligned, address *entry,
1606                                           const char *name, bool dest_uninitialized = false) {
1607     const bool not_oop = false;
1608     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1609   }
1610 
1611   // Arguments:
1612   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1613   //             ignored
1614   //   name    - stub name string
1615   //
1616   // Inputs:
1617   //   c_rarg0   - source array address
1618   //   c_rarg1   - destination array address
1619   //   c_rarg2   - element count, treated as size_t, can be zero
1620   //
1621   address generate_conjoint_long_copy(bool aligned,
1622                                       address nooverlap_target, address *entry,
1623                                       const char *name, bool dest_uninitialized = false) {
1624     const bool not_oop = false;
1625     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1626   }
1627 
1628   // Arguments:
1629   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1630   //             ignored
1631   //   name    - stub name string
1632   //
1633   // Inputs:
1634   //   c_rarg0   - source array address
1635   //   c_rarg1   - destination array address
1636   //   c_rarg2   - element count, treated as size_t, can be zero
1637   //
1638   // Side Effects:
1639   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1640   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1641   //
1642   address generate_disjoint_oop_copy(bool aligned, address *entry,
1643                                      const char *name, bool dest_uninitialized) {
1644     const bool is_oop = true;
1645     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1646     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1647   }
1648 
1649   // Arguments:
1650   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1651   //             ignored
1652   //   name    - stub name string
1653   //
1654   // Inputs:
1655   //   c_rarg0   - source array address
1656   //   c_rarg1   - destination array address
1657   //   c_rarg2   - element count, treated as size_t, can be zero
1658   //
1659   address generate_conjoint_oop_copy(bool aligned,
1660                                      address nooverlap_target, address *entry,
1661                                      const char *name, bool dest_uninitialized) {
1662     const bool is_oop = true;
1663     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1664     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1665                                   name, dest_uninitialized);
1666   }
1667 
1668 
1669   // Helper for generating a dynamic type check.
1670   // Smashes rscratch1, rscratch2.
1671   void generate_type_check(Register sub_klass,
1672                            Register super_check_offset,
1673                            Register super_klass,
1674                            Label& L_success) {
1675     assert_different_registers(sub_klass, super_check_offset, super_klass);
1676 
1677     BLOCK_COMMENT("type_check:");
1678 
1679     Label L_miss;
1680 
1681     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1682                                      super_check_offset);
1683     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1684 
1685     // Fall through on failure!
1686     __ BIND(L_miss);
1687   }
1688 
1689   //
1690   //  Generate checkcasting array copy stub
1691   //
1692   //  Input:
1693   //    c_rarg0   - source array address
1694   //    c_rarg1   - destination array address
1695   //    c_rarg2   - element count, treated as ssize_t, can be zero
1696   //    c_rarg3   - size_t ckoff (super_check_offset)
1697   //    c_rarg4   - oop ckval (super_klass)
1698   //
1699   //  Output:
1700   //    r0 ==  0  -  success
1701   //    r0 == -1^K - failure, where K is partial transfer count
1702   //
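  //  A note on the failure value: -1^K is the bitwise complement ~K, so a caller
  //  that sees a negative result can recover the number of elements already
  //  copied as K = ~r0 (illustrative C: size_t copied = ~(size_t)result;) and
  //  deal with the remaining elements itself.
  //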
1703   address generate_checkcast_copy(const char *name, address *entry,
1704                                   bool dest_uninitialized = false) {
1705 
1706     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1707 
1708     // Input registers (after setup_arg_regs)
1709     const Register from        = c_rarg0;   // source array address
1710     const Register to          = c_rarg1;   // destination array address
1711     const Register count       = c_rarg2;   // elements count
1712     const Register ckoff       = c_rarg3;   // super_check_offset
1713     const Register ckval       = c_rarg4;   // super_klass
1714 
1715     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1716     RegSet wb_post_saved_regs = RegSet::of(count);
1717 
1718     // Registers used as temps (r18, r19, r20 are save-on-entry)
1719     const Register count_save  = r21;       // orig elements count
1720     const Register start_to    = r20;       // destination array start address
1721     const Register copied_oop  = r18;       // actual oop copied
1722     const Register r19_klass   = r19;       // oop._klass
1723 
1724     //---------------------------------------------------------------
1725     // Assembler stub will be used for this call to arraycopy
1726     // if the two arrays are subtypes of Object[] but the
1727     // destination array type is not equal to or a supertype
1728     // of the source type.  Each element must be separately
1729     // checked.
1730 
1731     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1732                                copied_oop, r19_klass, count_save);
1733 
1734     __ align(CodeEntryAlignment);
1735     StubCodeMark mark(this, "StubRoutines", name);
1736     address start = __ pc();
1737 
1738     __ enter(); // required for proper stackwalking of RuntimeStub frame
1739 
1740 #ifdef ASSERT
1741     // caller guarantees that the arrays really are different
1742     // otherwise, we would have to make conjoint checks
1743     { Label L;
1744       array_overlap_test(L, TIMES_OOP);
1745       __ stop("checkcast_copy within a single array");
1746       __ bind(L);
1747     }
1748 #endif //ASSERT
1749 
1750     // Caller of this entry point must set up the argument registers.
1751     if (entry != NULL) {
1752       *entry = __ pc();
1753       BLOCK_COMMENT("Entry:");
1754     }
1755 
1756     // Empty array:  Nothing to do.
1757     __ cbz(count, L_done);
1758 
1759     __ push(RegSet::of(r18, r19, r20, r21), sp);
1760 
1761 #ifdef ASSERT
1762     BLOCK_COMMENT("assert consistent ckoff/ckval");
1763     // The ckoff and ckval must be mutually consistent,
1764     // even though caller generates both.
1765     { Label L;
1766       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1767       __ ldrw(start_to, Address(ckval, sco_offset));
1768       __ cmpw(ckoff, start_to);
1769       __ br(Assembler::EQ, L);
1770       __ stop("super_check_offset inconsistent");
1771       __ bind(L);
1772     }
1773 #endif //ASSERT
1774 
1775     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1776     bool is_oop = true;
1777     if (dest_uninitialized) {
1778       decorators |= IS_DEST_UNINITIALIZED;
1779     }
1780 
1781     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1782     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1783 
1784     // save the original count
1785     __ mov(count_save, count);
1786 
1787     // Copy from low to high addresses
1788     __ mov(start_to, to);              // Save destination array start address
1789     __ b(L_load_element);
1790 
1791     // ======== begin loop ========
1792     // (Loop is rotated; its entry is L_load_element.)
1793     // Loop control:
1794     //   for (; count != 0; count--) {
1795     //     copied_oop = load_heap_oop(from++);
1796     //     ... generate_type_check ...;
1797     //     store_heap_oop(to++, copied_oop);
1798     //   }
1799     __ align(OptoLoopAlignment);
1800 
1801     __ BIND(L_store_element);
1802     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1803     __ sub(count, count, 1);
1804     __ cbz(count, L_do_card_marks);
1805 
1806     // ======== loop entry is here ========
1807     __ BIND(L_load_element);
1808     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1809     __ cbz(copied_oop, L_store_element);
1810 
1811     __ load_klass(r19_klass, copied_oop);// query the object klass
1812     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1813     // ======== end loop ========
1814 
1815     // It was a real error; we must depend on the caller to finish the job.
1816     // Register count = remaining oops, count_orig = total oops.
1817     // Emit GC store barriers for the oops we have copied and report
1818     // their number to the caller.
1819 
1820     __ subs(count, count_save, count);     // K = partially copied oop count
1821     __ eon(count, count, zr);                   // report (-1^K) to caller
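    // eon with zr xors against all ones, i.e. a bitwise NOT: count now holds ~K,
    // the -1^K failure value documented in the stub header above.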
1822     __ br(Assembler::EQ, L_done_pop);
1823 
1824     __ BIND(L_do_card_marks);
1825     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1826 
1827     __ bind(L_done_pop);
1828     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1829     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1830 
1831     __ bind(L_done);
1832     __ mov(r0, count);
1833     __ leave();
1834     __ ret(lr);
1835 
1836     return start;
1837   }
1838 
1839   // Perform range checks on the proposed arraycopy.
1840   // Kills temp, but nothing else.
1841   // Also, clean the sign bits of src_pos and dst_pos.
1842   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1843                               Register src_pos, // source position (c_rarg1)
1844                               Register dst,     // destination array oop (c_rarg2)
1845                               Register dst_pos, // destination position (c_rarg3)
1846                               Register length,
1847                               Register temp,
1848                               Label& L_failed) {
1849     BLOCK_COMMENT("arraycopy_range_checks:");
1850 
1851     assert_different_registers(rscratch1, temp);
1852 
1853     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1854     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1855     __ addw(temp, length, src_pos);
1856     __ cmpw(temp, rscratch1);
1857     __ br(Assembler::HI, L_failed);
1858 
1859     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1860     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1861     __ addw(temp, length, dst_pos);
1862     __ cmpw(temp, rscratch1);
1863     __ br(Assembler::HI, L_failed);
1864 
1865     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1866     __ movw(src_pos, src_pos);
1867     __ movw(dst_pos, dst_pos);
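    // Writing the 32-bit (w) form of a register zero-extends into the upper half,
    // so the two moves above are effectively src_pos &= 0xffffffff and
    // dst_pos &= 0xffffffff.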
1868 
1869     BLOCK_COMMENT("arraycopy_range_checks done");
1870   }
1871 
1872   // These stubs get called from some dumb test routine.
1873   // I'll write them properly when they're called from
1874   // something that's actually doing something.
1875   static void fake_arraycopy_stub(address src, address dst, int count) {
1876     assert(count == 0, "huh?");
1877   }
1878 
1879 
1880   //
1881   //  Generate 'unsafe' array copy stub
1882   //  Though just as safe as the other stubs, it takes an unscaled
1883   //  size_t argument instead of an element count.
1884   //
1885   //  Input:
1886   //    c_rarg0   - source array address
1887   //    c_rarg1   - destination array address
1888   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1889   //
1890   // Examines the alignment of the operands and dispatches
1891   // to a long, int, short, or byte copy loop.
1892   //
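  // A hedged C sketch of the dispatch implemented below (names are illustrative
  // only): or'ing the two addresses and the byte count together lets a single
  // low-bit test establish the common alignment of all three.
  //
  //   size_t bits = (size_t)s | (size_t)d | (size_t)count;
  //   if      ((bits & 7) == 0) long_copy (s, d, count >> 3);
  //   else if ((bits & 3) == 0) int_copy  (s, d, count >> 2);
  //   else if ((bits & 1) == 0) short_copy(s, d, count >> 1);
  //   else                      byte_copy (s, d, count);
  //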
1893   address generate_unsafe_copy(const char *name,
1894                                address byte_copy_entry,
1895                                address short_copy_entry,
1896                                address int_copy_entry,
1897                                address long_copy_entry) {
1898     Label L_long_aligned, L_int_aligned, L_short_aligned;
1899     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1900 
1901     __ align(CodeEntryAlignment);
1902     StubCodeMark mark(this, "StubRoutines", name);
1903     address start = __ pc();
1904     __ enter(); // required for proper stackwalking of RuntimeStub frame
1905 
1906     // bump this on entry, not on exit:
1907     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1908 
1909     __ orr(rscratch1, s, d);
1910     __ orr(rscratch1, rscratch1, count);
1911 
1912     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1913     __ cbz(rscratch1, L_long_aligned);
1914     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1915     __ cbz(rscratch1, L_int_aligned);
1916     __ tbz(rscratch1, 0, L_short_aligned);
1917     __ b(RuntimeAddress(byte_copy_entry));
1918 
1919     __ BIND(L_short_aligned);
1920     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1921     __ b(RuntimeAddress(short_copy_entry));
1922     __ BIND(L_int_aligned);
1923     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1924     __ b(RuntimeAddress(int_copy_entry));
1925     __ BIND(L_long_aligned);
1926     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1927     __ b(RuntimeAddress(long_copy_entry));
1928 
1929     return start;
1930   }
1931 
1932   //
1933   //  Generate generic array copy stubs
1934   //
1935   //  Input:
1936   //    c_rarg0    -  src oop
1937   //    c_rarg1    -  src_pos (32-bits)
1938   //    c_rarg2    -  dst oop
1939   //    c_rarg3    -  dst_pos (32-bits)
1940   //    c_rarg4    -  element count (32-bits)
1941   //
1942   //  Output:
1943   //    r0 ==  0  -  success
1944   //    r0 == -1^K - failure, where K is partial transfer count
1945   //
1946   address generate_generic_copy(const char *name,
1947                                 address byte_copy_entry, address short_copy_entry,
1948                                 address int_copy_entry, address oop_copy_entry,
1949                                 address long_copy_entry, address checkcast_copy_entry) {
1950 
1951     Label L_failed, L_objArray;
1952     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1953 
1954     // Input registers
1955     const Register src        = c_rarg0;  // source array oop
1956     const Register src_pos    = c_rarg1;  // source position
1957     const Register dst        = c_rarg2;  // destination array oop
1958     const Register dst_pos    = c_rarg3;  // destination position
1959     const Register length     = c_rarg4;
1960 
1961 
1962     // Registers used as temps
1963     const Register dst_klass  = c_rarg5;
1964 
1965     __ align(CodeEntryAlignment);
1966 
1967     StubCodeMark mark(this, "StubRoutines", name);
1968 
1969     address start = __ pc();
1970 
1971     __ enter(); // required for proper stackwalking of RuntimeStub frame
1972 
1973     // bump this on entry, not on exit:
1974     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1975 
1976     //-----------------------------------------------------------------------
1977     // Assembler stub will be used for this call to arraycopy
1978     // if the following conditions are met:
1979     //
1980     // (1) src and dst must not be null.
1981     // (2) src_pos must not be negative.
1982     // (3) dst_pos must not be negative.
1983     // (4) length  must not be negative.
1984     // (5) src klass and dst klass should be the same and not NULL.
1985     // (6) src and dst should be arrays.
1986     // (7) src_pos + length must not exceed length of src.
1987     // (8) dst_pos + length must not exceed length of dst.
1988     //
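    // If any of these checks fails, the code below returns -1 in r0 (i.e. -1^0:
    // nothing was copied) and leaves it to the caller to fall back to a slower
    // runtime path that can handle or report the failure.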
1989 
1990     //  if (src == NULL) return -1;
1991     __ cbz(src, L_failed);
1992 
1993     //  if (src_pos < 0) return -1;
1994     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
1995 
1996     //  if (dst == NULL) return -1;
1997     __ cbz(dst, L_failed);
1998 
1999     //  if (dst_pos < 0) return -1;
2000     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2001 
2002     // registers used as temp
2003     const Register scratch_length    = r16; // elements count to copy
2004     const Register scratch_src_klass = r17; // array klass
2005     const Register lh                = r18; // layout helper
2006 
2007     //  if (length < 0) return -1;
2008     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2009     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2010 
2011     __ load_klass(scratch_src_klass, src);
2012 #ifdef ASSERT
2013     //  assert(src->klass() != NULL);
2014     {
2015       BLOCK_COMMENT("assert klasses not null {");
2016       Label L1, L2;
2017       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2018       __ bind(L1);
2019       __ stop("broken null klass");
2020       __ bind(L2);
2021       __ load_klass(rscratch1, dst);
2022       __ cbz(rscratch1, L1);     // this would be broken also
2023       BLOCK_COMMENT("} assert klasses not null done");
2024     }
2025 #endif
2026 
2027     // Load layout helper (32-bits)
2028     //
2029     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2030     // 32        30    24            16              8     2                 0
2031     //
2032     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2033     //
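    // A hedged C sketch of how the fields drawn above are unpacked, using the
    // Klass constants this stub also relies on further down:
    //
    //   int tag         = lh >> Klass::_lh_array_tag_shift;          // 0x3 typeArray, 0x2 objArray
    //   int header_size = (lh >> Klass::_lh_header_size_shift)
    //                     & Klass::_lh_header_size_mask;             // bytes
    //   int log2_esize  = lh & Klass::_lh_log2_element_size_mask;    // 0..3 for primitive arrays
    //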
2034 
2035     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2036 
2037     // Handle objArrays completely differently...
2038     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2039     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2040     __ movw(rscratch1, objArray_lh);
2041     __ eorw(rscratch2, lh, rscratch1);
2042     __ cbzw(rscratch2, L_objArray);
2043 
2044     //  if (src->klass() != dst->klass()) return -1;
2045     __ load_klass(rscratch2, dst);
2046     __ eor(rscratch2, rscratch2, scratch_src_klass);
2047     __ cbnz(rscratch2, L_failed);
2048 
2049     //  if (!src->is_Array()) return -1;
2050     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2051 
2052     // At this point, it is known to be a typeArray (array_tag 0x3).
2053 #ifdef ASSERT
2054     {
2055       BLOCK_COMMENT("assert primitive array {");
2056       Label L;
2057       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2058       __ cmpw(lh, rscratch2);
2059       __ br(Assembler::GE, L);
2060       __ stop("must be a primitive array");
2061       __ bind(L);
2062       BLOCK_COMMENT("} assert primitive array done");
2063     }
2064 #endif
2065 
2066     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2067                            rscratch2, L_failed);
2068 
2069     // TypeArrayKlass
2070     //
2071     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2072     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2073     //
2074 
2075     const Register rscratch1_offset = rscratch1;    // array offset
2076     const Register r18_elsize = lh; // element size
2077 
2078     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2079            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2080     __ add(src, src, rscratch1_offset);           // src array offset
2081     __ add(dst, dst, rscratch1_offset);           // dst array offset
2082     BLOCK_COMMENT("choose copy loop based on element size");
2083 
2084     // next registers should be set before the jump to corresponding stub
2085     const Register from     = c_rarg0;  // source array address
2086     const Register to       = c_rarg1;  // destination array address
2087     const Register count    = c_rarg2;  // elements count
2088 
2089     // 'from', 'to' and 'count' must be set in this order, because they
2090     // alias the incoming 'src', 'src_pos' and 'dst' argument registers.
2091 
2092     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2093 
2094     // The possible values of elsize are 0-3, i.e. exact_log2(element
2095     // size in bytes).  We do a simple bitwise binary search.
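    // The two tbnz tests below implement this tree on the low two bits of the
    // layout helper (which, per the assert above, hold the log2 element size):
    //   bit 1 clear, bit 0 clear -> elsize 0, byte copy
    //   bit 1 clear, bit 0 set   -> elsize 1, short copy
    //   bit 1 set,   bit 0 clear -> elsize 2, int copy
    //   bit 1 set,   bit 0 set   -> elsize 3, long copy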
2096   __ BIND(L_copy_bytes);
2097     __ tbnz(r18_elsize, 1, L_copy_ints);
2098     __ tbnz(r18_elsize, 0, L_copy_shorts);
2099     __ lea(from, Address(src, src_pos));// src_addr
2100     __ lea(to,   Address(dst, dst_pos));// dst_addr
2101     __ movw(count, scratch_length); // length
2102     __ b(RuntimeAddress(byte_copy_entry));
2103 
2104   __ BIND(L_copy_shorts);
2105     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2106     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2107     __ movw(count, scratch_length); // length
2108     __ b(RuntimeAddress(short_copy_entry));
2109 
2110   __ BIND(L_copy_ints);
2111     __ tbnz(r18_elsize, 0, L_copy_longs);
2112     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2113     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2114     __ movw(count, scratch_length); // length
2115     __ b(RuntimeAddress(int_copy_entry));
2116 
2117   __ BIND(L_copy_longs);
2118 #ifdef ASSERT
2119     {
2120       BLOCK_COMMENT("assert long copy {");
2121       Label L;
2122       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2123       __ cmpw(r18_elsize, LogBytesPerLong);
2124       __ br(Assembler::EQ, L);
2125       __ stop("must be long copy, but elsize is wrong");
2126       __ bind(L);
2127       BLOCK_COMMENT("} assert long copy done");
2128     }
2129 #endif
2130     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2131     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2132     __ movw(count, scratch_length); // length
2133     __ b(RuntimeAddress(long_copy_entry));
2134 
2135     // ObjArrayKlass
2136   __ BIND(L_objArray);
2137     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2138 
2139     Label L_plain_copy, L_checkcast_copy;
2140     //  test array classes for subtyping
2141     __ load_klass(r18, dst);
2142     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2143     __ br(Assembler::NE, L_checkcast_copy);
2144 
2145     // Identically typed arrays can be copied without element-wise checks.
2146     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2147                            rscratch2, L_failed);
2148 
2149     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2150     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2151     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2152     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2153     __ movw(count, scratch_length); // length
2154   __ BIND(L_plain_copy);
2155     __ b(RuntimeAddress(oop_copy_entry));
2156 
2157   __ BIND(L_checkcast_copy);
2158     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2159     {
2160       // Before looking at dst.length, make sure dst is also an objArray.
2161       __ ldrw(rscratch1, Address(r18, lh_offset));
2162       __ movw(rscratch2, objArray_lh);
2163       __ eorw(rscratch1, rscratch1, rscratch2);
2164       __ cbnzw(rscratch1, L_failed);
2165 
2166       // It is safe to examine both src.length and dst.length.
2167       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2168                              r18, L_failed);
2169 
2170       __ load_klass(dst_klass, dst); // reload
2171 
2172       // Marshal the base address arguments now, freeing registers.
2173       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2174       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2175       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2176       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2177       __ movw(count, length);           // length (reloaded)
2178       Register sco_temp = c_rarg3;      // this register is free now
2179       assert_different_registers(from, to, count, sco_temp,
2180                                  dst_klass, scratch_src_klass);
2181       // assert_clean_int(count, sco_temp);
2182 
2183       // Generate the type check.
2184       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2185       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2186 
2187       // Smashes rscratch1, rscratch2
2188       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2189 
2190       // Fetch destination element klass from the ObjArrayKlass header.
2191       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2192       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2193       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2194 
2195       // the checkcast_copy loop needs two extra arguments:
2196       assert(c_rarg3 == sco_temp, "#3 already in place");
2197       // Set up arguments for checkcast_copy_entry.
2198       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2199       __ b(RuntimeAddress(checkcast_copy_entry));
2200     }
2201 
2202   __ BIND(L_failed);
2203     __ mov(r0, -1);
2204     __ leave();   // required for proper stackwalking of RuntimeStub frame
2205     __ ret(lr);
2206 
2207     return start;
2208   }
2209 
2210   //
2211   // Generate stub for array fill. If "aligned" is true, the
2212   // "to" address is assumed to be heapword aligned.
2213   //
2214   // Arguments for generated stub:
2215   //   to:    c_rarg0
2216   //   value: c_rarg1
2217   //   count: c_rarg2 treated as signed
2218   //
2219   address generate_fill(BasicType t, bool aligned, const char *name) {
2220     __ align(CodeEntryAlignment);
2221     StubCodeMark mark(this, "StubRoutines", name);
2222     address start = __ pc();
2223 
2224     BLOCK_COMMENT("Entry:");
2225 
2226     const Register to        = c_rarg0;  // destination array address
2227     const Register value     = c_rarg1;  // value
2228     const Register count     = c_rarg2;  // elements count
2229 
2230     const Register bz_base = r10;        // base for block_zero routine
2231     const Register cnt_words = r11;      // temp register
2232 
2233     __ enter();
2234 
2235     Label L_fill_elements, L_exit1;
2236 
2237     int shift = -1;
2238     switch (t) {
2239       case T_BYTE:
2240         shift = 0;
2241         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2242         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2243         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2244         __ br(Assembler::LO, L_fill_elements);
2245         break;
2246       case T_SHORT:
2247         shift = 1;
2248         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2249         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2250         __ br(Assembler::LO, L_fill_elements);
2251         break;
2252       case T_INT:
2253         shift = 2;
2254         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2255         __ br(Assembler::LO, L_fill_elements);
2256         break;
2257       default: ShouldNotReachHere();
2258     }
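    // Worked example of the widening above for a T_BYTE fill value of 0xAB:
    // bfi(value, value, 8, 8) yields 0xABAB, bfi(value, value, 16, 16) yields
    // 0xABABABAB, and the bfi(value, value, 32, 32) in the large-chunk path
    // below produces the full 64-bit pattern 0xABABABABABABABAB.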
2259 
2260     // Align source address at 8 bytes address boundary.
2261     Label L_skip_align1, L_skip_align2, L_skip_align4;
2262     if (!aligned) {
2263       switch (t) {
2264         case T_BYTE:
2265           // One byte misalignment happens only for byte arrays.
2266           __ tbz(to, 0, L_skip_align1);
2267           __ strb(value, Address(__ post(to, 1)));
2268           __ subw(count, count, 1);
2269           __ bind(L_skip_align1);
2270           // Fallthrough
2271         case T_SHORT:
2272           // Two bytes misalignment happens only for byte and short (char) arrays.
2273           __ tbz(to, 1, L_skip_align2);
2274           __ strh(value, Address(__ post(to, 2)));
2275           __ subw(count, count, 2 >> shift);
2276           __ bind(L_skip_align2);
2277           // Fallthrough
2278         case T_INT:
2279           // Align to 8 bytes, we know we are 4 byte aligned to start.
2280           __ tbz(to, 2, L_skip_align4);
2281           __ strw(value, Address(__ post(to, 4)));
2282           __ subw(count, count, 4 >> shift);
2283           __ bind(L_skip_align4);
2284           break;
2285         default: ShouldNotReachHere();
2286       }
2287     }
2288 
2289     //
2290     //  Fill large chunks
2291     //
2292     __ lsrw(cnt_words, count, 3 - shift); // number of words
2293     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2294     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2295     if (UseBlockZeroing) {
2296       Label non_block_zeroing, rest;
2297       // If the fill value is zero we can use the fast zero_words().
2298       __ cbnz(value, non_block_zeroing);
2299       __ mov(bz_base, to);
2300       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2301       __ zero_words(bz_base, cnt_words);
2302       __ b(rest);
2303       __ bind(non_block_zeroing);
2304       __ fill_words(to, cnt_words, value);
2305       __ bind(rest);
2306     } else {
2307       __ fill_words(to, cnt_words, value);
2308     }
2309 
2310     // Remaining count is less than 8 bytes. Fill it by a single store.
2311     // Note that the total length is no less than 8 bytes.
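    // Worked example (ignoring the alignment prologue): a byte fill with
    // count == 13 fills one 8-byte word above, leaves count == 5, and the
    // 8-byte store below then writes bytes 5..12, harmlessly re-writing
    // bytes 5..7 that are already filled.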
2312     if (t == T_BYTE || t == T_SHORT) {
2313       Label L_exit1;
2314       __ cbzw(count, L_exit1);
2315       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2316       __ str(value, Address(to, -8));    // overwrite some elements
2317       __ bind(L_exit1);
2318       __ leave();
2319       __ ret(lr);
2320     }
2321 
2322     // Handle fills of less than 8 bytes.
2323     Label L_fill_2, L_fill_4, L_exit2;
2324     __ bind(L_fill_elements);
2325     switch (t) {
2326       case T_BYTE:
2327         __ tbz(count, 0, L_fill_2);
2328         __ strb(value, Address(__ post(to, 1)));
2329         __ bind(L_fill_2);
2330         __ tbz(count, 1, L_fill_4);
2331         __ strh(value, Address(__ post(to, 2)));
2332         __ bind(L_fill_4);
2333         __ tbz(count, 2, L_exit2);
2334         __ strw(value, Address(to));
2335         break;
2336       case T_SHORT:
2337         __ tbz(count, 0, L_fill_4);
2338         __ strh(value, Address(__ post(to, 2)));
2339         __ bind(L_fill_4);
2340         __ tbz(count, 1, L_exit2);
2341         __ strw(value, Address(to));
2342         break;
2343       case T_INT:
2344         __ cbzw(count, L_exit2);
2345         __ strw(value, Address(to));
2346         break;
2347       default: ShouldNotReachHere();
2348     }
2349     __ bind(L_exit2);
2350     __ leave();
2351     __ ret(lr);
2352     return start;
2353   }
2354 
2355   address generate_data_cache_writeback() {
2356     const Register line        = c_rarg0;  // address of line to write back
2357 
2358     __ align(CodeEntryAlignment);
2359 
2360     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2361 
2362     address start = __ pc();
2363     __ enter();
2364     __ cache_wb(Address(line, 0));
2365     __ leave();
2366     __ ret(lr);
2367 
2368     return start;
2369   }
2370 
2371   address generate_data_cache_writeback_sync() {
2372     const Register is_pre     = c_rarg0;  // pre or post sync
2373 
2374     __ align(CodeEntryAlignment);
2375 
2376     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2377 
2378     // pre wbsync is a no-op
2379     // post wbsync emits a memory barrier (the role an sfence plays on x86)
2380 
2381     Label skip;
2382     address start = __ pc();
2383     __ enter();
2384     __ cbnz(is_pre, skip);
2385     __ cache_wbsync(false);
2386     __ bind(skip);
2387     __ leave();
2388     __ ret(lr);
2389 
2390     return start;
2391   }
2392 
2393   void generate_arraycopy_stubs() {
2394     address entry;
2395     address entry_jbyte_arraycopy;
2396     address entry_jshort_arraycopy;
2397     address entry_jint_arraycopy;
2398     address entry_oop_arraycopy;
2399     address entry_jlong_arraycopy;
2400     address entry_checkcast_arraycopy;
2401 
2402     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2403     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2404 
2405     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2406 
2407     //*** jbyte
2408     // Always need aligned and unaligned versions
2409     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2410                                                                                   "jbyte_disjoint_arraycopy");
2411     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2412                                                                                   &entry_jbyte_arraycopy,
2413                                                                                   "jbyte_arraycopy");
2414     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2415                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2416     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2417                                                                                   "arrayof_jbyte_arraycopy");
2418 
2419     //*** jshort
2420     // Always need aligned and unaligned versions
2421     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2422                                                                                     "jshort_disjoint_arraycopy");
2423     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2424                                                                                     &entry_jshort_arraycopy,
2425                                                                                     "jshort_arraycopy");
2426     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2427                                                                                     "arrayof_jshort_disjoint_arraycopy");
2428     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2429                                                                                     "arrayof_jshort_arraycopy");
2430 
2431     //*** jint
2432     // Aligned versions
2433     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2434                                                                                 "arrayof_jint_disjoint_arraycopy");
2435     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2436                                                                                 "arrayof_jint_arraycopy");
2437     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2438     // entry_jint_arraycopy always points to the unaligned version
2439     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2440                                                                                 "jint_disjoint_arraycopy");
2441     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2442                                                                                 &entry_jint_arraycopy,
2443                                                                                 "jint_arraycopy");
2444 
2445     //*** jlong
2446     // It is always aligned
2447     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2448                                                                                   "arrayof_jlong_disjoint_arraycopy");
2449     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2450                                                                                   "arrayof_jlong_arraycopy");
2451     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2452     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2453 
2454     //*** oops
2455     {
2456       // With compressed oops we need unaligned versions; notice that
2457       // we overwrite entry_oop_arraycopy.
2458       bool aligned = !UseCompressedOops;
2459 
2460       StubRoutines::_arrayof_oop_disjoint_arraycopy
2461         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2462                                      /*dest_uninitialized*/false);
2463       StubRoutines::_arrayof_oop_arraycopy
2464         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2465                                      /*dest_uninitialized*/false);
2466       // Aligned versions without pre-barriers
2467       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2468         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2469                                      /*dest_uninitialized*/true);
2470       StubRoutines::_arrayof_oop_arraycopy_uninit
2471         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2472                                      /*dest_uninitialized*/true);
2473     }
2474 
2475     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2476     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2477     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2478     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2479 
2480     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2481     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2482                                                                         /*dest_uninitialized*/true);
2483 
2484     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2485                                                               entry_jbyte_arraycopy,
2486                                                               entry_jshort_arraycopy,
2487                                                               entry_jint_arraycopy,
2488                                                               entry_jlong_arraycopy);
2489 
2490     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2491                                                                entry_jbyte_arraycopy,
2492                                                                entry_jshort_arraycopy,
2493                                                                entry_jint_arraycopy,
2494                                                                entry_oop_arraycopy,
2495                                                                entry_jlong_arraycopy,
2496                                                                entry_checkcast_arraycopy);
2497 
2498     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2499     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2500     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2501     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2502     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2503     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2504   }
2505 
2506   void generate_math_stubs() { Unimplemented(); }
2507 
2508   // Arguments:
2509   //
2510   // Inputs:
2511   //   c_rarg0   - source byte array address
2512   //   c_rarg1   - destination byte array address
2513   //   c_rarg2   - K (key) in little endian int array
2514   //
2515   address generate_aescrypt_encryptBlock() {
2516     __ align(CodeEntryAlignment);
2517     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2518 
2519     Label L_doLast;
2520 
2521     const Register from        = c_rarg0;  // source array address
2522     const Register to          = c_rarg1;  // destination array address
2523     const Register key         = c_rarg2;  // key array address
2524     const Register keylen      = rscratch1;
2525 
2526     address start = __ pc();
2527     __ enter();
2528 
2529     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
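    // keylen is the length of the expanded key array in ints: 44 for AES-128,
    // 52 for AES-192 and 60 for AES-256 (4 * (rounds + 1)), which is why the
    // code below branches on the values 44 and 52.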
2530 
2531     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2532 
2533     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2534     __ rev32(v1, __ T16B, v1);
2535     __ rev32(v2, __ T16B, v2);
2536     __ rev32(v3, __ T16B, v3);
2537     __ rev32(v4, __ T16B, v4);
2538     __ aese(v0, v1);
2539     __ aesmc(v0, v0);
2540     __ aese(v0, v2);
2541     __ aesmc(v0, v0);
2542     __ aese(v0, v3);
2543     __ aesmc(v0, v0);
2544     __ aese(v0, v4);
2545     __ aesmc(v0, v0);
2546 
2547     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2548     __ rev32(v1, __ T16B, v1);
2549     __ rev32(v2, __ T16B, v2);
2550     __ rev32(v3, __ T16B, v3);
2551     __ rev32(v4, __ T16B, v4);
2552     __ aese(v0, v1);
2553     __ aesmc(v0, v0);
2554     __ aese(v0, v2);
2555     __ aesmc(v0, v0);
2556     __ aese(v0, v3);
2557     __ aesmc(v0, v0);
2558     __ aese(v0, v4);
2559     __ aesmc(v0, v0);
2560 
2561     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2562     __ rev32(v1, __ T16B, v1);
2563     __ rev32(v2, __ T16B, v2);
2564 
2565     __ cmpw(keylen, 44);
2566     __ br(Assembler::EQ, L_doLast);
2567 
2568     __ aese(v0, v1);
2569     __ aesmc(v0, v0);
2570     __ aese(v0, v2);
2571     __ aesmc(v0, v0);
2572 
2573     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2574     __ rev32(v1, __ T16B, v1);
2575     __ rev32(v2, __ T16B, v2);
2576 
2577     __ cmpw(keylen, 52);
2578     __ br(Assembler::EQ, L_doLast);
2579 
2580     __ aese(v0, v1);
2581     __ aesmc(v0, v0);
2582     __ aese(v0, v2);
2583     __ aesmc(v0, v0);
2584 
2585     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2586     __ rev32(v1, __ T16B, v1);
2587     __ rev32(v2, __ T16B, v2);
2588 
2589     __ BIND(L_doLast);
2590 
2591     __ aese(v0, v1);
2592     __ aesmc(v0, v0);
2593     __ aese(v0, v2);
2594 
2595     __ ld1(v1, __ T16B, key);
2596     __ rev32(v1, __ T16B, v1);
2597     __ eor(v0, __ T16B, v0, v1);
2598 
2599     __ st1(v0, __ T16B, to);
2600 
2601     __ mov(r0, 0);
2602 
2603     __ leave();
2604     __ ret(lr);
2605 
2606     return start;
2607   }
2608 
2609   // Arguments:
2610   //
2611   // Inputs:
2612   //   c_rarg0   - source byte array address
2613   //   c_rarg1   - destination byte array address
2614   //   c_rarg2   - K (key) in little endian int array
2615   //
2616   address generate_aescrypt_decryptBlock() {
2617     assert(UseAES, "need AES instruction support");
2618     __ align(CodeEntryAlignment);
2619     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2620     Label L_doLast;
2621 
2622     const Register from        = c_rarg0;  // source array address
2623     const Register to          = c_rarg1;  // destination array address
2624     const Register key         = c_rarg2;  // key array address
2625     const Register keylen      = rscratch1;
2626 
2627     address start = __ pc();
2628     __ enter(); // required for proper stackwalking of RuntimeStub frame
2629 
2630     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2631 
2632     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2633 
2634     __ ld1(v5, __ T16B, __ post(key, 16));
2635     __ rev32(v5, __ T16B, v5);
2636 
2637     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2638     __ rev32(v1, __ T16B, v1);
2639     __ rev32(v2, __ T16B, v2);
2640     __ rev32(v3, __ T16B, v3);
2641     __ rev32(v4, __ T16B, v4);
2642     __ aesd(v0, v1);
2643     __ aesimc(v0, v0);
2644     __ aesd(v0, v2);
2645     __ aesimc(v0, v0);
2646     __ aesd(v0, v3);
2647     __ aesimc(v0, v0);
2648     __ aesd(v0, v4);
2649     __ aesimc(v0, v0);
2650 
2651     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2652     __ rev32(v1, __ T16B, v1);
2653     __ rev32(v2, __ T16B, v2);
2654     __ rev32(v3, __ T16B, v3);
2655     __ rev32(v4, __ T16B, v4);
2656     __ aesd(v0, v1);
2657     __ aesimc(v0, v0);
2658     __ aesd(v0, v2);
2659     __ aesimc(v0, v0);
2660     __ aesd(v0, v3);
2661     __ aesimc(v0, v0);
2662     __ aesd(v0, v4);
2663     __ aesimc(v0, v0);
2664 
2665     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2666     __ rev32(v1, __ T16B, v1);
2667     __ rev32(v2, __ T16B, v2);
2668 
2669     __ cmpw(keylen, 44);
2670     __ br(Assembler::EQ, L_doLast);
2671 
2672     __ aesd(v0, v1);
2673     __ aesimc(v0, v0);
2674     __ aesd(v0, v2);
2675     __ aesimc(v0, v0);
2676 
2677     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2678     __ rev32(v1, __ T16B, v1);
2679     __ rev32(v2, __ T16B, v2);
2680 
2681     __ cmpw(keylen, 52);
2682     __ br(Assembler::EQ, L_doLast);
2683 
2684     __ aesd(v0, v1);
2685     __ aesimc(v0, v0);
2686     __ aesd(v0, v2);
2687     __ aesimc(v0, v0);
2688 
2689     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2690     __ rev32(v1, __ T16B, v1);
2691     __ rev32(v2, __ T16B, v2);
2692 
2693     __ BIND(L_doLast);
2694 
2695     __ aesd(v0, v1);
2696     __ aesimc(v0, v0);
2697     __ aesd(v0, v2);
2698 
2699     __ eor(v0, __ T16B, v0, v5);
2700 
2701     __ st1(v0, __ T16B, to);
2702 
2703     __ mov(r0, 0);
2704 
2705     __ leave();
2706     __ ret(lr);
2707 
2708     return start;
2709   }
2710 
2711   // Arguments:
2712   //
2713   // Inputs:
2714   //   c_rarg0   - source byte array address
2715   //   c_rarg1   - destination byte array address
2716   //   c_rarg2   - K (key) in little endian int array
2717   //   c_rarg3   - r vector byte array address
2718   //   c_rarg4   - input length
2719   //
2720   // Output:
2721   //   x0        - input length
2722   //
2723   address generate_cipherBlockChaining_encryptAESCrypt() {
2724     assert(UseAES, "need AES instruction support");
2725     __ align(CodeEntryAlignment);
2726     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2727 
2728     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2729 
2730     const Register from        = c_rarg0;  // source array address
2731     const Register to          = c_rarg1;  // destination array address
2732     const Register key         = c_rarg2;  // key array address
2733     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2734                                            // and left with the results of the last encryption block
2735     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2736     const Register keylen      = rscratch1;
2737 
2738     address start = __ pc();
2739 
2740       __ enter();
2741 
2742       __ movw(rscratch2, len_reg);
2743 
2744       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2745 
2746       __ ld1(v0, __ T16B, rvec);
2747 
2748       __ cmpw(keylen, 52);
2749       __ br(Assembler::CC, L_loadkeys_44);
2750       __ br(Assembler::EQ, L_loadkeys_52);
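      // The three key-loading entry points below fall through into one another:
      // a 60-int (AES-256) key loads every pair, an AES-192 key skips the first
      // pair (v17/v18), and an AES-128 key also skips the second pair (v19/v20).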
2751 
2752       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2753       __ rev32(v17, __ T16B, v17);
2754       __ rev32(v18, __ T16B, v18);
2755     __ BIND(L_loadkeys_52);
2756       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2757       __ rev32(v19, __ T16B, v19);
2758       __ rev32(v20, __ T16B, v20);
2759     __ BIND(L_loadkeys_44);
2760       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2761       __ rev32(v21, __ T16B, v21);
2762       __ rev32(v22, __ T16B, v22);
2763       __ rev32(v23, __ T16B, v23);
2764       __ rev32(v24, __ T16B, v24);
2765       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2766       __ rev32(v25, __ T16B, v25);
2767       __ rev32(v26, __ T16B, v26);
2768       __ rev32(v27, __ T16B, v27);
2769       __ rev32(v28, __ T16B, v28);
2770       __ ld1(v29, v30, v31, __ T16B, key);
2771       __ rev32(v29, __ T16B, v29);
2772       __ rev32(v30, __ T16B, v30);
2773       __ rev32(v31, __ T16B, v31);
2774 
2775     __ BIND(L_aes_loop);
2776       __ ld1(v1, __ T16B, __ post(from, 16));
2777       __ eor(v0, __ T16B, v0, v1);
2778 
2779       __ br(Assembler::CC, L_rounds_44);
2780       __ br(Assembler::EQ, L_rounds_52);
2781 
2782       __ aese(v0, v17); __ aesmc(v0, v0);
2783       __ aese(v0, v18); __ aesmc(v0, v0);
2784     __ BIND(L_rounds_52);
2785       __ aese(v0, v19); __ aesmc(v0, v0);
2786       __ aese(v0, v20); __ aesmc(v0, v0);
2787     __ BIND(L_rounds_44);
2788       __ aese(v0, v21); __ aesmc(v0, v0);
2789       __ aese(v0, v22); __ aesmc(v0, v0);
2790       __ aese(v0, v23); __ aesmc(v0, v0);
2791       __ aese(v0, v24); __ aesmc(v0, v0);
2792       __ aese(v0, v25); __ aesmc(v0, v0);
2793       __ aese(v0, v26); __ aesmc(v0, v0);
2794       __ aese(v0, v27); __ aesmc(v0, v0);
2795       __ aese(v0, v28); __ aesmc(v0, v0);
2796       __ aese(v0, v29); __ aesmc(v0, v0);
2797       __ aese(v0, v30);
2798       __ eor(v0, __ T16B, v0, v31);
2799 
2800       __ st1(v0, __ T16B, __ post(to, 16));
2801 
2802       __ subw(len_reg, len_reg, 16);
2803       __ cbnzw(len_reg, L_aes_loop);
2804 
2805       __ st1(v0, __ T16B, rvec);
2806 
2807       __ mov(r0, rscratch2);
2808 
2809       __ leave();
2810       __ ret(lr);
2811 
2812       return start;
2813   }
2814 
2815   // Arguments:
2816   //
2817   // Inputs:
2818   //   c_rarg0   - source byte array address
2819   //   c_rarg1   - destination byte array address
2820   //   c_rarg2   - K (key) in little endian int array
2821   //   c_rarg3   - r vector byte array address
2822   //   c_rarg4   - input length
2823   //
2824   // Output:
2825   //   r0        - input length
2826   //
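       //
       // The loop in the generator below implements the standard CBC decrypt
       // recurrence P[i] = D_K(C[i]) ^ C[i-1], with C[-1] taken from rvec (the IV)
       // and rvec left holding the last ciphertext block. A minimal C sketch of
       // the same data flow (illustrative only; aes_decrypt_block is a
       // hypothetical single-block primitive, not a routine in this file):
       //
       //   for (int i = 0; i < len; i += 16) {
       //     unsigned char c[16];
       //     memcpy(c, from + i, 16);                   // keep C[i] for chaining
       //     aes_decrypt_block(key, from + i, to + i);  // D_K(C[i])
       //     for (int j = 0; j < 16; j++)
       //       to[i + j] ^= rvec[j];                    // xor with C[i-1] (or the IV)
       //     memcpy(rvec, c, 16);                       // rvec <- C[i]
       //   }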
2827   address generate_cipherBlockChaining_decryptAESCrypt() {
2828     assert(UseAES, "need AES instructions");
2829     __ align(CodeEntryAlignment);
2830     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2831 
2832     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2833 
2834     const Register from        = c_rarg0;  // source array address
2835     const Register to          = c_rarg1;  // destination array address
2836     const Register key         = c_rarg2;  // key array address
2837     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address,
2838                                            // and left holding the last ciphertext block (for chaining)
2839     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2840     const Register keylen      = rscratch1;
2841 
2842     address start = __ pc();
2843 
2844       __ enter();
2845 
2846       __ movw(rscratch2, len_reg);
2847 
2848       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2849 
2850       __ ld1(v2, __ T16B, rvec);
2851 
2852       __ ld1(v31, __ T16B, __ post(key, 16));
2853       __ rev32(v31, __ T16B, v31);
2854 
2855       __ cmpw(keylen, 52);
2856       __ br(Assembler::CC, L_loadkeys_44);
2857       __ br(Assembler::EQ, L_loadkeys_52);
2858 
2859       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2860       __ rev32(v17, __ T16B, v17);
2861       __ rev32(v18, __ T16B, v18);
2862     __ BIND(L_loadkeys_52);
2863       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2864       __ rev32(v19, __ T16B, v19);
2865       __ rev32(v20, __ T16B, v20);
2866     __ BIND(L_loadkeys_44);
2867       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2868       __ rev32(v21, __ T16B, v21);
2869       __ rev32(v22, __ T16B, v22);
2870       __ rev32(v23, __ T16B, v23);
2871       __ rev32(v24, __ T16B, v24);
2872       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2873       __ rev32(v25, __ T16B, v25);
2874       __ rev32(v26, __ T16B, v26);
2875       __ rev32(v27, __ T16B, v27);
2876       __ rev32(v28, __ T16B, v28);
2877       __ ld1(v29, v30, __ T16B, key);
2878       __ rev32(v29, __ T16B, v29);
2879       __ rev32(v30, __ T16B, v30);
2880 
2881     __ BIND(L_aes_loop);
2882       __ ld1(v0, __ T16B, __ post(from, 16));
2883       __ orr(v1, __ T16B, v0, v0);
2884 
2885       __ br(Assembler::CC, L_rounds_44);
2886       __ br(Assembler::EQ, L_rounds_52);
2887 
2888       __ aesd(v0, v17); __ aesimc(v0, v0);
2889       __ aesd(v0, v18); __ aesimc(v0, v0);
2890     __ BIND(L_rounds_52);
2891       __ aesd(v0, v19); __ aesimc(v0, v0);
2892       __ aesd(v0, v20); __ aesimc(v0, v0);
2893     __ BIND(L_rounds_44);
2894       __ aesd(v0, v21); __ aesimc(v0, v0);
2895       __ aesd(v0, v22); __ aesimc(v0, v0);
2896       __ aesd(v0, v23); __ aesimc(v0, v0);
2897       __ aesd(v0, v24); __ aesimc(v0, v0);
2898       __ aesd(v0, v25); __ aesimc(v0, v0);
2899       __ aesd(v0, v26); __ aesimc(v0, v0);
2900       __ aesd(v0, v27); __ aesimc(v0, v0);
2901       __ aesd(v0, v28); __ aesimc(v0, v0);
2902       __ aesd(v0, v29); __ aesimc(v0, v0);
2903       __ aesd(v0, v30);
2904       __ eor(v0, __ T16B, v0, v31);
2905       __ eor(v0, __ T16B, v0, v2);
2906 
2907       __ st1(v0, __ T16B, __ post(to, 16));
2908       __ orr(v2, __ T16B, v1, v1);
2909 
2910       __ subw(len_reg, len_reg, 16);
2911       __ cbnzw(len_reg, L_aes_loop);
2912 
2913       __ st1(v2, __ T16B, rvec);
2914 
2915       __ mov(r0, rscratch2);
2916 
2917       __ leave();
2918       __ ret(lr);
2919 
2920     return start;
2921   }
2922 
2923   // Arguments:
2924   //
2925   // Inputs:
2926   //   c_rarg0   - byte[]  source+offset
2927   //   c_rarg1   - int[]   SHA.state
2928   //   c_rarg2   - int     offset
2929   //   c_rarg3   - int     limit
2930   //
2931   address generate_sha1_implCompress(bool multi_block, const char *name) {
2932     __ align(CodeEntryAlignment);
2933     StubCodeMark mark(this, "StubRoutines", name);
2934     address start = __ pc();
2935 
2936     Register buf   = c_rarg0;
2937     Register state = c_rarg1;
2938     Register ofs   = c_rarg2;
2939     Register limit = c_rarg3;
2940 
2941     Label keys;
2942     Label sha1_loop;
2943 
2944     // load the keys into v0..v3
2945     __ adr(rscratch1, keys);
2946     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2947     // load 5 words state into v6, v7
2948     __ ldrq(v6, Address(state, 0));
2949     __ ldrs(v7, Address(state, 16));
2950 
2951 
2952     __ BIND(sha1_loop);
2953     // load 64 bytes of data into v16..v19
2954     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2955     __ rev32(v16, __ T16B, v16);
2956     __ rev32(v17, __ T16B, v17);
2957     __ rev32(v18, __ T16B, v18);
2958     __ rev32(v19, __ T16B, v19);
2959 
2960     // do the sha1
2961     __ addv(v4, __ T4S, v16, v0);
2962     __ orr(v20, __ T16B, v6, v6);
2963 
2964     FloatRegister d0 = v16;
2965     FloatRegister d1 = v17;
2966     FloatRegister d2 = v18;
2967     FloatRegister d3 = v19;
2968 
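         // Each iteration of the loop below processes 4 of SHA-1's 80 rounds:
         // the sha1c/sha1p/sha1m selection matches the per-round function
         // (Ch for rounds 0..19, Parity for 20..39 and 60..79, Maj for 40..59),
         // while sha1su0/sha1su1 extend the 16-word message schedule.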
2969     for (int round = 0; round < 20; round++) {
2970       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2971       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2972       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2973       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2974       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2975 
2976       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2977       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2978       __ sha1h(tmp2, __ T4S, v20);
2979       if (round < 5)
2980         __ sha1c(v20, __ T4S, tmp3, tmp4);
2981       else if (round < 10 || round >= 15)
2982         __ sha1p(v20, __ T4S, tmp3, tmp4);
2983       else
2984         __ sha1m(v20, __ T4S, tmp3, tmp4);
2985       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2986 
2987       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2988     }
2989 
2990     __ addv(v7, __ T2S, v7, v21);
2991     __ addv(v6, __ T4S, v6, v20);
2992 
2993     if (multi_block) {
2994       __ add(ofs, ofs, 64);
2995       __ cmp(ofs, limit);
2996       __ br(Assembler::LE, sha1_loop);
2997       __ mov(c_rarg0, ofs); // return ofs
2998     }
2999 
3000     __ strq(v6, Address(state, 0));
3001     __ strs(v7, Address(state, 16));
3002 
3003     __ ret(lr);
3004 
3005     __ bind(keys);
3006     __ emit_int32(0x5a827999);
3007     __ emit_int32(0x6ed9eba1);
3008     __ emit_int32(0x8f1bbcdc);
3009     __ emit_int32(0xca62c1d6);
3010 
3011     return start;
3012   }
3013 
3014 
3015   // Arguments:
3016   //
3017   // Inputs:
3018   //   c_rarg0   - byte[]  source+offset
3019   //   c_rarg1   - int[]   SHA.state
3020   //   c_rarg2   - int     offset
3021   //   c_rarg3   - int     limit
3022   //
3023   address generate_sha256_implCompress(bool multi_block, const char *name) {
3024     static const uint32_t round_consts[64] = {
3025       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3026       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3027       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3028       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3029       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3030       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3031       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3032       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3033       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3034       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3035       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3036       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3037       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3038       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3039       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3040       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3041     };
3042     __ align(CodeEntryAlignment);
3043     StubCodeMark mark(this, "StubRoutines", name);
3044     address start = __ pc();
3045 
3046     Register buf   = c_rarg0;
3047     Register state = c_rarg1;
3048     Register ofs   = c_rarg2;
3049     Register limit = c_rarg3;
3050 
3051     Label sha1_loop;
3052 
3053     __ stpd(v8, v9, __ pre(sp, -32));
3054     __ stpd(v10, v11, Address(sp, 16));
3055 
3056 // dga == v0
3057 // dgb == v1
3058 // dg0 == v2
3059 // dg1 == v3
3060 // dg2 == v4
3061 // t0 == v6
3062 // t1 == v7
3063 
3064     // load 16 keys to v16..v31
3065     __ lea(rscratch1, ExternalAddress((address)round_consts));
3066     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3067     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3068     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3069     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3070 
3071     // load 8 words (256 bits) state
3072     __ ldpq(v0, v1, state);
3073 
3074     __ BIND(sha1_loop);
3075     // load 64 bytes of data into v8..v11
3076     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3077     __ rev32(v8, __ T16B, v8);
3078     __ rev32(v9, __ T16B, v9);
3079     __ rev32(v10, __ T16B, v10);
3080     __ rev32(v11, __ T16B, v11);
3081 
3082     __ addv(v6, __ T4S, v8, v16);
3083     __ orr(v2, __ T16B, v0, v0);
3084     __ orr(v3, __ T16B, v1, v1);
3085 
3086     FloatRegister d0 = v8;
3087     FloatRegister d1 = v9;
3088     FloatRegister d2 = v10;
3089     FloatRegister d3 = v11;
3090 
3091 
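         // Each iteration of the loop below handles 4 of SHA-256's 64 rounds:
         // sha256h/sha256h2 update the two halves of the working state using the
         // round constants pre-loaded in v16..v31, and sha256su0/sha256su1
         // extend the message schedule.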
3092     for (int round = 0; round < 16; round++) {
3093       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3094       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3095       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3096       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3097 
3098       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3099        __ orr(v4, __ T16B, v2, v2);
3100       if (round < 15)
3101         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3102       __ sha256h(v2, __ T4S, v3, tmp2);
3103       __ sha256h2(v3, __ T4S, v4, tmp2);
3104       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3105 
3106       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3107     }
3108 
3109     __ addv(v0, __ T4S, v0, v2);
3110     __ addv(v1, __ T4S, v1, v3);
3111 
3112     if (multi_block) {
3113       __ add(ofs, ofs, 64);
3114       __ cmp(ofs, limit);
3115       __ br(Assembler::LE, sha1_loop);
3116       __ mov(c_rarg0, ofs); // return ofs
3117     }
3118 
3119     __ ldpd(v10, v11, Address(sp, 16));
3120     __ ldpd(v8, v9, __ post(sp, 32));
3121 
3122     __ stpq(v0, v1, state);
3123 
3124     __ ret(lr);
3125 
3126     return start;
3127   }
3128 
3129   // Safefetch stubs.
3130   void generate_safefetch(const char* name, int size, address* entry,
3131                           address* fault_pc, address* continuation_pc) {
3132     // safefetch signatures:
3133     //   int      SafeFetch32(int*      adr, int      errValue);
3134     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3135     //
3136     // arguments:
3137     //   c_rarg0 = adr
3138     //   c_rarg1 = errValue
3139     //
3140     // result:
3141     //   r0       = *adr or errValue
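         //
         //   Illustrative use (not generated here):
         //     int v = SafeFetch32((int*) addr, -1);  // *addr if readable, else -1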
3142 
3143     StubCodeMark mark(this, "StubRoutines", name);
3144 
3145     // Entry point, pc or function descriptor.
3146     *entry = __ pc();
3147 
3148     // Load *adr into c_rarg1, may fault.
3149     *fault_pc = __ pc();
3150     switch (size) {
3151       case 4:
3152         // int32_t
3153         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3154         break;
3155       case 8:
3156         // int64_t
3157         __ ldr(c_rarg1, Address(c_rarg0, 0));
3158         break;
3159       default:
3160         ShouldNotReachHere();
3161     }
3162 
3163     // return errValue or *adr
3164     *continuation_pc = __ pc();
3165     __ mov(r0, c_rarg1);
3166     __ ret(lr);
3167   }
3168 
3169   /**
3170    *  Arguments:
3171    *
3172    * Inputs:
3173    *   c_rarg0   - int crc
3174    *   c_rarg1   - byte* buf
3175    *   c_rarg2   - int length
3176    *
3177    * Output:
3178    *       r0   - int crc result
3179    */
3180   address generate_updateBytesCRC32() {
3181     assert(UseCRC32Intrinsics, "what are we doing here?");
3182 
3183     __ align(CodeEntryAlignment);
3184     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3185 
3186     address start = __ pc();
3187 
3188     const Register crc   = c_rarg0;  // crc
3189     const Register buf   = c_rarg1;  // source java byte array address
3190     const Register len   = c_rarg2;  // length
3191     const Register table0 = c_rarg3; // crc_table address
3192     const Register table1 = c_rarg4;
3193     const Register table2 = c_rarg5;
3194     const Register table3 = c_rarg6;
3195     const Register tmp3 = c_rarg7;
3196 
3197     BLOCK_COMMENT("Entry:");
3198     __ enter(); // required for proper stackwalking of RuntimeStub frame
3199 
3200     __ kernel_crc32(crc, buf, len,
3201               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3202 
3203     __ leave(); // required for proper stackwalking of RuntimeStub frame
3204     __ ret(lr);
3205 
3206     return start;
3207   }
3208 
3209   /**
3210    *  Arguments:
3211    *
3212    * Inputs:
3213    *   c_rarg0   - int crc
3214    *   c_rarg1   - byte* buf
3215    *   c_rarg2   - int length
3216    *   c_rarg3   - int* table
3217    *
3218    * Output:
3219    *       r0   - int crc result
3220    */
3221   address generate_updateBytesCRC32C() {
3222     assert(UseCRC32CIntrinsics, "what are we doing here?");
3223 
3224     __ align(CodeEntryAlignment);
3225     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3226 
3227     address start = __ pc();
3228 
3229     const Register crc   = c_rarg0;  // crc
3230     const Register buf   = c_rarg1;  // source java byte array address
3231     const Register len   = c_rarg2;  // length
3232     const Register table0 = c_rarg3; // crc_table address
3233     const Register table1 = c_rarg4;
3234     const Register table2 = c_rarg5;
3235     const Register table3 = c_rarg6;
3236     const Register tmp3 = c_rarg7;
3237 
3238     BLOCK_COMMENT("Entry:");
3239     __ enter(); // required for proper stackwalking of RuntimeStub frame
3240 
3241     __ kernel_crc32c(crc, buf, len,
3242               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3243 
3244     __ leave(); // required for proper stackwalking of RuntimeStub frame
3245     __ ret(lr);
3246 
3247     return start;
3248   }
3249 
3250   /***
3251    *  Arguments:
3252    *
3253    *  Inputs:
3254    *   c_rarg0   - int   adler
3255    *   c_rarg1   - byte* buff
3256    *   c_rarg2   - int   len
3257    *
3258    * Output:
3259    *   c_rarg0   - int adler result
3260    */
3261   address generate_updateBytesAdler32() {
3262     __ align(CodeEntryAlignment);
3263     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3264     address start = __ pc();
3265 
3266     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3267 
3268     // Aliases
3269     Register adler  = c_rarg0;
3270     Register s1     = c_rarg0;
3271     Register s2     = c_rarg3;
3272     Register buff   = c_rarg1;
3273     Register len    = c_rarg2;
3274     Register nmax  = r4;
3275     Register base  = r5;
3276     Register count = r6;
3277     Register temp0 = rscratch1;
3278     Register temp1 = rscratch2;
3279     FloatRegister vbytes = v0;
3280     FloatRegister vs1acc = v1;
3281     FloatRegister vs2acc = v2;
3282     FloatRegister vtable = v3;
3283 
3284     // Max number of bytes we can process before having to take the mod
3285     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3286     unsigned long BASE = 0xfff1;
3287     unsigned long NMAX = 0x15B0;
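         // As a worked check of that bound (not used by the code): for n = 5552,
         //   255*5552*5553/2 + 5553*(BASE-1) = 3930857640 + 363832560
         //                                   = 4294690200 <= 2^32-1,
         // while n = 5553 already exceeds 2^32-1, so NMAX = 5552.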
3288 
3289     __ mov(base, BASE);
3290     __ mov(nmax, NMAX);
3291 
3292     // Load accumulation coefficients for the upper 16 bits
3293     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3294     __ ld1(vtable, __ T16B, Address(temp0));
3295 
3296     // s1 is initialized to the lower 16 bits of adler
3297     // s2 is initialized to the upper 16 bits of adler
3298     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3299     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3300 
3301     // The pipelined loop needs at least 16 elements for one iteration.
3302     // It checks this itself, but for short inputs it is more effective to branch straight to the cleanup loop.
3303     __ cmp(len, (u1)16);
3304     __ br(Assembler::HS, L_nmax);
3305     __ cbz(len, L_combine);
3306 
3307     __ bind(L_simple_by1_loop);
3308     __ ldrb(temp0, Address(__ post(buff, 1)));
3309     __ add(s1, s1, temp0);
3310     __ add(s2, s2, s1);
3311     __ subs(len, len, 1);
3312     __ br(Assembler::HI, L_simple_by1_loop);
3313 
3314     // s1 = s1 % BASE
3315     __ subs(temp0, s1, base);
3316     __ csel(s1, temp0, s1, Assembler::HS);
3317 
3318     // s2 = s2 % BASE
3319     __ lsr(temp0, s2, 16);
3320     __ lsl(temp1, temp0, 4);
3321     __ sub(temp1, temp1, temp0);
3322     __ add(s2, temp1, s2, ext::uxth);
3323 
3324     __ subs(temp0, s2, base);
3325     __ csel(s2, temp0, s2, Assembler::HS);
3326 
3327     __ b(L_combine);
3328 
3329     __ bind(L_nmax);
3330     __ subs(len, len, nmax);
3331     __ sub(count, nmax, 16);
3332     __ br(Assembler::LO, L_by16);
3333 
3334     __ bind(L_nmax_loop);
3335 
3336     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3337                                       vbytes, vs1acc, vs2acc, vtable);
3338 
3339     __ subs(count, count, 16);
3340     __ br(Assembler::HS, L_nmax_loop);
3341 
3342     // s1 = s1 % BASE
3343     __ lsr(temp0, s1, 16);
3344     __ lsl(temp1, temp0, 4);
3345     __ sub(temp1, temp1, temp0);
3346     __ add(temp1, temp1, s1, ext::uxth);
3347 
3348     __ lsr(temp0, temp1, 16);
3349     __ lsl(s1, temp0, 4);
3350     __ sub(s1, s1, temp0);
3351     __ add(s1, s1, temp1, ext::uxth);
3352 
3353     __ subs(temp0, s1, base);
3354     __ csel(s1, temp0, s1, Assembler::HS);
3355 
3356     // s2 = s2 % BASE
3357     __ lsr(temp0, s2, 16);
3358     __ lsl(temp1, temp0, 4);
3359     __ sub(temp1, temp1, temp0);
3360     __ add(temp1, temp1, s2, ext::uxth);
3361 
3362     __ lsr(temp0, temp1, 16);
3363     __ lsl(s2, temp0, 4);
3364     __ sub(s2, s2, temp0);
3365     __ add(s2, s2, temp1, ext::uxth);
3366 
3367     __ subs(temp0, s2, base);
3368     __ csel(s2, temp0, s2, Assembler::HS);
3369 
3370     __ subs(len, len, nmax);
3371     __ sub(count, nmax, 16);
3372     __ br(Assembler::HS, L_nmax_loop);
3373 
3374     __ bind(L_by16);
3375     __ adds(len, len, count);
3376     __ br(Assembler::LO, L_by1);
3377 
3378     __ bind(L_by16_loop);
3379 
3380     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3381                                       vbytes, vs1acc, vs2acc, vtable);
3382 
3383     __ subs(len, len, 16);
3384     __ br(Assembler::HS, L_by16_loop);
3385 
3386     __ bind(L_by1);
3387     __ adds(len, len, 15);
3388     __ br(Assembler::LO, L_do_mod);
3389 
3390     __ bind(L_by1_loop);
3391     __ ldrb(temp0, Address(__ post(buff, 1)));
3392     __ add(s1, temp0, s1);
3393     __ add(s2, s2, s1);
3394     __ subs(len, len, 1);
3395     __ br(Assembler::HS, L_by1_loop);
3396 
3397     __ bind(L_do_mod);
3398     // s1 = s1 % BASE
3399     __ lsr(temp0, s1, 16);
3400     __ lsl(temp1, temp0, 4);
3401     __ sub(temp1, temp1, temp0);
3402     __ add(temp1, temp1, s1, ext::uxth);
3403 
3404     __ lsr(temp0, temp1, 16);
3405     __ lsl(s1, temp0, 4);
3406     __ sub(s1, s1, temp0);
3407     __ add(s1, s1, temp1, ext::uxth);
3408 
3409     __ subs(temp0, s1, base);
3410     __ csel(s1, temp0, s1, Assembler::HS);
3411 
3412     // s2 = s2 % BASE
3413     __ lsr(temp0, s2, 16);
3414     __ lsl(temp1, temp0, 4);
3415     __ sub(temp1, temp1, temp0);
3416     __ add(temp1, temp1, s2, ext::uxth);
3417 
3418     __ lsr(temp0, temp1, 16);
3419     __ lsl(s2, temp0, 4);
3420     __ sub(s2, s2, temp0);
3421     __ add(s2, s2, temp1, ext::uxth);
3422 
3423     __ subs(temp0, s2, base);
3424     __ csel(s2, temp0, s2, Assembler::HS);
3425 
3426     // Combine lower bits and higher bits
3427     __ bind(L_combine);
3428     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3429 
3430     __ ret(lr);
3431 
3432     return start;
3433   }
3434 
3435   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
3436           Register temp0, Register temp1, FloatRegister vbytes,
3437           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
3438     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
3439     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
3440     // In non-vectorized code, we update s1 and s2 as:
3441     //   s1 <- s1 + b1
3442     //   s2 <- s2 + s1
3443     //   s1 <- s1 + b2
3444     //   s2 <- s2 + s1
3445     //   ...
3446     //   s1 <- s1 + b16
3447     //   s2 <- s2 + s1
3448     // Putting above assignments together, we have:
3449     //   s1_new = s1 + b1 + b2 + ... + b16
3450     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
3451     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
3452     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
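         //
         // For reference, a scalar sketch of the same 16-byte step (illustrative
         // only; adler_accum16 is a hypothetical name, not part of this stub):
         //
         //   static void adler_accum16(unsigned& s1, unsigned& s2, const unsigned char* b) {
         //     for (int i = 0; i < 16; i++) {
         //       s1 += b[i];   // running byte sum
         //       s2 += s1;     // running sum of prefix sums
         //     }
         //   }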
3453     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
3454 
3455     // s2 = s2 + s1 * 16
3456     __ add(s2, s2, s1, Assembler::LSL, 4);
3457 
3458     // vs1acc = b1 + b2 + b3 + ... + b16
3459     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
3460     __ umullv(vs2acc, __ T8B, vtable, vbytes);
3461     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
3462     __ uaddlv(vs1acc, __ T16B, vbytes);
3463     __ uaddlv(vs2acc, __ T8H, vs2acc);
3464 
3465     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
3466     __ fmovd(temp0, vs1acc);
3467     __ fmovd(temp1, vs2acc);
3468     __ add(s1, s1, temp0);
3469     __ add(s2, s2, temp1);
3470   }
3471 
3472   /**
3473    *  Arguments:
3474    *
3475    *  Input:
3476    *    c_rarg0   - x address
3477    *    c_rarg1   - x length
3478    *    c_rarg2   - y address
3479    *    c_rarg3   - y length
3480    *    c_rarg4   - z address
3481    *    c_rarg5   - z length
3482    */
3483   address generate_multiplyToLen() {
3484     __ align(CodeEntryAlignment);
3485     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3486 
3487     address start = __ pc();
3488     const Register x     = r0;
3489     const Register xlen  = r1;
3490     const Register y     = r2;
3491     const Register ylen  = r3;
3492     const Register z     = r4;
3493     const Register zlen  = r5;
3494 
3495     const Register tmp1  = r10;
3496     const Register tmp2  = r11;
3497     const Register tmp3  = r12;
3498     const Register tmp4  = r13;
3499     const Register tmp5  = r14;
3500     const Register tmp6  = r15;
3501     const Register tmp7  = r16;
3502 
3503     BLOCK_COMMENT("Entry:");
3504     __ enter(); // required for proper stackwalking of RuntimeStub frame
3505     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3506     __ leave(); // required for proper stackwalking of RuntimeStub frame
3507     __ ret(lr);
3508 
3509     return start;
3510   }
3511 
3512   address generate_squareToLen() {
3513     // The squareToLen algorithm for sizes 1..127 described in Java code works
3514     // faster than multiply_to_len on some CPUs and slower on others, but
3515     // multiply_to_len shows slightly better results overall
3516     __ align(CodeEntryAlignment);
3517     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3518     address start = __ pc();
3519 
3520     const Register x     = r0;
3521     const Register xlen  = r1;
3522     const Register z     = r2;
3523     const Register zlen  = r3;
3524     const Register y     = r4; // == x
3525     const Register ylen  = r5; // == xlen
3526 
3527     const Register tmp1  = r10;
3528     const Register tmp2  = r11;
3529     const Register tmp3  = r12;
3530     const Register tmp4  = r13;
3531     const Register tmp5  = r14;
3532     const Register tmp6  = r15;
3533     const Register tmp7  = r16;
3534 
3535     RegSet spilled_regs = RegSet::of(y, ylen);
3536     BLOCK_COMMENT("Entry:");
3537     __ enter();
3538     __ push(spilled_regs, sp);
3539     __ mov(y, x);
3540     __ mov(ylen, xlen);
3541     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3542     __ pop(spilled_regs, sp);
3543     __ leave();
3544     __ ret(lr);
3545     return start;
3546   }
3547 
3548   address generate_mulAdd() {
3549     __ align(CodeEntryAlignment);
3550     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3551 
3552     address start = __ pc();
3553 
3554     const Register out     = r0;
3555     const Register in      = r1;
3556     const Register offset  = r2;
3557     const Register len     = r3;
3558     const Register k       = r4;
3559 
3560     BLOCK_COMMENT("Entry:");
3561     __ enter();
3562     __ mul_add(out, in, offset, len, k);
3563     __ leave();
3564     __ ret(lr);
3565 
3566     return start;
3567   }
3568 
3569   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3570                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3571                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3572     // Karatsuba multiplication performs a 128*128 -> 256-bit
3573     // multiplication in three 128-bit multiplications and a few
3574     // additions.
3575     //
3576     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3577     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3578     //
3579     // Inputs:
3580     //
3581     // A0 in a.d[0]     (subkey)
3582     // A1 in a.d[1]
3583     // (A1+A0) in a1_xor_a0.d[0]
3584     //
3585     // B0 in b.d[0]     (state)
3586     // B1 in b.d[1]
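         //
         // Schematically, with clmul64 standing for a hypothetical 64x64->128-bit
         // carry-less multiply (values treated as polynomials over GF(2)):
         //
         //   C = clmul64(A1, B1);            // high product
         //   D = clmul64(A0, B0);            // low product
         //   E = clmul64(A0 ^ A1, B0 ^ B1);  // cross product
         //   result = C*z^128 ^ (C ^ D ^ E)*z^64 ^ D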
3587 
3588     __ ext(tmp1, __ T16B, b, b, 0x08);
3589     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3590     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3591     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3592     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3593 
3594     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3595     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3596     __ eor(tmp2, __ T16B, tmp2, tmp4);
3597     __ eor(tmp2, __ T16B, tmp2, tmp3);
3598 
3599     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3600     __ ins(result_hi, __ D, tmp2, 0, 1);
3601     __ ins(result_lo, __ D, tmp2, 1, 0);
3602   }
3603 
3604   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3605                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3606     const FloatRegister t0 = result;
3607 
3608     // The GCM field polynomial f is z^128 + p(z), where p =
3609     // z^7+z^2+z+1.
3610     //
3611     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3612     //
3613     // so, given that the product we're reducing is
3614     //    a == lo + hi * z^128
3615     // substituting,
3616     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3617     //
3618     // we reduce by multiplying hi by p(z) and subtracting the result
3619     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3620     // bits we can do this with two 64-bit multiplications, lo*p and
3621     // hi*p.
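         //
         // Spelled out as an identity (no extra code, just the algebra above):
         //   a        = lo + hi * z^128
         //   z^128   == p(z)        (mod z^128 + p(z); characteristic 2, so -p == p)
         //   a mod f  = lo + hi * p(z)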
3622 
3623     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3624     __ ext(t1, __ T16B, t0, z, 8);
3625     __ eor(hi, __ T16B, hi, t1);
3626     __ ext(t1, __ T16B, z, t0, 8);
3627     __ eor(lo, __ T16B, lo, t1);
3628     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3629     __ eor(result, __ T16B, lo, t0);
3630   }
3631 
3632   address generate_has_negatives(address &has_negatives_long) {
3633     const u1 large_loop_size = 64;
3634     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
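         // 0x80 in every byte lane: AND-ing a loaded word with this mask is
         // nonzero iff some byte has its sign bit set, i.e. some element is negative.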
3635     int dcache_line = VM_Version::dcache_line_size();
3636 
3637     Register ary1 = r1, len = r2, result = r0;
3638 
3639     __ align(CodeEntryAlignment);
3640 
3641     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3642 
3643     address entry = __ pc();
3644 
3645     __ enter();
3646 
3647   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3648         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3649 
3650   __ cmp(len, (u1)15);
3651   __ br(Assembler::GT, LEN_OVER_15);
3652   // The only case when execution falls into this code is when the pointer is
3653   // near the end of a memory page and we have to avoid reading the next page
3654   __ add(ary1, ary1, len);
3655   __ subs(len, len, 8);
3656   __ br(Assembler::GT, LEN_OVER_8);
3657   __ ldr(rscratch2, Address(ary1, -8));
3658   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3659   __ lsrv(rscratch2, rscratch2, rscratch1);
3660   __ tst(rscratch2, UPPER_BIT_MASK);
3661   __ cset(result, Assembler::NE);
3662   __ leave();
3663   __ ret(lr);
3664   __ bind(LEN_OVER_8);
3665   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3666   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3667   __ tst(rscratch2, UPPER_BIT_MASK);
3668   __ br(Assembler::NE, RET_TRUE_NO_POP);
3669   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3670   __ lsrv(rscratch1, rscratch1, rscratch2);
3671   __ tst(rscratch1, UPPER_BIT_MASK);
3672   __ cset(result, Assembler::NE);
3673   __ leave();
3674   __ ret(lr);
3675 
3676   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3677   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3678 
3679   has_negatives_long = __ pc(); // 2nd entry point
3680 
3681   __ enter();
3682 
3683   __ bind(LEN_OVER_15);
3684     __ push(spilled_regs, sp);
3685     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3686     __ cbz(rscratch2, ALIGNED);
3687     __ ldp(tmp6, tmp1, Address(ary1));
3688     __ mov(tmp5, 16);
3689     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3690     __ add(ary1, ary1, rscratch1);
3691     __ sub(len, len, rscratch1);
3692     __ orr(tmp6, tmp6, tmp1);
3693     __ tst(tmp6, UPPER_BIT_MASK);
3694     __ br(Assembler::NE, RET_TRUE);
3695 
3696   __ bind(ALIGNED);
3697     __ cmp(len, large_loop_size);
3698     __ br(Assembler::LT, CHECK_16);
3699     // Perform a 16-byte load in the pre-loop as an early return, to handle the
3700     // case where an initially aligned large array has negative values in its
3701     // starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in the
3702     // worst case, which is slower. Cases with negative bytes further ahead are
3703     // barely affected; in fact they get faster due to the early loads, fewer
3704     // instructions and fewer branches in LARGE_LOOP.
3705     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3706     __ sub(len, len, 16);
3707     __ orr(tmp6, tmp6, tmp1);
3708     __ tst(tmp6, UPPER_BIT_MASK);
3709     __ br(Assembler::NE, RET_TRUE);
3710     __ cmp(len, large_loop_size);
3711     __ br(Assembler::LT, CHECK_16);
3712 
3713     if (SoftwarePrefetchHintDistance >= 0
3714         && SoftwarePrefetchHintDistance >= dcache_line) {
3715       // initial prefetch
3716       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3717     }
3718   __ bind(LARGE_LOOP);
3719     if (SoftwarePrefetchHintDistance >= 0) {
3720       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3721     }
3722     // Issue the load instructions first, since that can save a few CPU/memory
3723     // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)"
3724     // (one per ldp), it is better to generate 7 * orr(...) + 1 andr(...) +
3725     // 1 cbnz(...), which saves 3 instructions and has fewer branches. The price is
3726     // that early return is disabled, so all 64 bytes are loaded and checked every time.
3727     __ ldp(tmp2, tmp3, Address(ary1));
3728     __ ldp(tmp4, tmp5, Address(ary1, 16));
3729     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3730     __ ldp(tmp6, tmp1, Address(ary1, 48));
3731     __ add(ary1, ary1, large_loop_size);
3732     __ sub(len, len, large_loop_size);
3733     __ orr(tmp2, tmp2, tmp3);
3734     __ orr(tmp4, tmp4, tmp5);
3735     __ orr(rscratch1, rscratch1, rscratch2);
3736     __ orr(tmp6, tmp6, tmp1);
3737     __ orr(tmp2, tmp2, tmp4);
3738     __ orr(rscratch1, rscratch1, tmp6);
3739     __ orr(tmp2, tmp2, rscratch1);
3740     __ tst(tmp2, UPPER_BIT_MASK);
3741     __ br(Assembler::NE, RET_TRUE);
3742     __ cmp(len, large_loop_size);
3743     __ br(Assembler::GE, LARGE_LOOP);
3744 
3745   __ bind(CHECK_16); // small 16-byte load pre-loop
3746     __ cmp(len, (u1)16);
3747     __ br(Assembler::LT, POST_LOOP16);
3748 
3749   __ bind(LOOP16); // small 16-byte load loop
3750     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3751     __ sub(len, len, 16);
3752     __ orr(tmp2, tmp2, tmp3);
3753     __ tst(tmp2, UPPER_BIT_MASK);
3754     __ br(Assembler::NE, RET_TRUE);
3755     __ cmp(len, (u1)16);
3756     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3757 
3758   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3759     __ cmp(len, (u1)8);
3760     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3761     __ ldr(tmp3, Address(__ post(ary1, 8)));
3762     __ sub(len, len, 8);
3763     __ tst(tmp3, UPPER_BIT_MASK);
3764     __ br(Assembler::NE, RET_TRUE);
3765 
3766   __ bind(POST_LOOP16_LOAD_TAIL);
3767     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3768     __ ldr(tmp1, Address(ary1));
3769     __ mov(tmp2, 64);
3770     __ sub(tmp4, tmp2, len, __ LSL, 3);
3771     __ lslv(tmp1, tmp1, tmp4);
3772     __ tst(tmp1, UPPER_BIT_MASK);
3773     __ br(Assembler::NE, RET_TRUE);
3774     // Fallthrough
3775 
3776   __ bind(RET_FALSE);
3777     __ pop(spilled_regs, sp);
3778     __ leave();
3779     __ mov(result, zr);
3780     __ ret(lr);
3781 
3782   __ bind(RET_TRUE);
3783     __ pop(spilled_regs, sp);
3784   __ bind(RET_TRUE_NO_POP);
3785     __ leave();
3786     __ mov(result, 1);
3787     __ ret(lr);
3788 
3789   __ bind(DONE);
3790     __ pop(spilled_regs, sp);
3791     __ leave();
3792     __ ret(lr);
3793     return entry;
3794   }
3795 
3796   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3797         bool usePrefetch, Label &NOT_EQUAL) {
3798     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3799         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3800         tmp7 = r12, tmp8 = r13;
3801     Label LOOP;
3802 
3803     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3804     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3805     __ bind(LOOP);
3806     if (usePrefetch) {
3807       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3808       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3809     }
3810     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3811     __ eor(tmp1, tmp1, tmp2);
3812     __ eor(tmp3, tmp3, tmp4);
3813     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3814     __ orr(tmp1, tmp1, tmp3);
3815     __ cbnz(tmp1, NOT_EQUAL);
3816     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3817     __ eor(tmp5, tmp5, tmp6);
3818     __ eor(tmp7, tmp7, tmp8);
3819     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3820     __ orr(tmp5, tmp5, tmp7);
3821     __ cbnz(tmp5, NOT_EQUAL);
3822     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3823     __ eor(tmp1, tmp1, tmp2);
3824     __ eor(tmp3, tmp3, tmp4);
3825     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3826     __ orr(tmp1, tmp1, tmp3);
3827     __ cbnz(tmp1, NOT_EQUAL);
3828     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3829     __ eor(tmp5, tmp5, tmp6);
3830     __ sub(cnt1, cnt1, 8 * wordSize);
3831     __ eor(tmp7, tmp7, tmp8);
3832     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3833     // tmp6 is not used. MacroAssembler::subs is used here (rather than
3834     // cmp) because subs allows an unlimited range of immediate operands.
3835     __ subs(tmp6, cnt1, loopThreshold);
3836     __ orr(tmp5, tmp5, tmp7);
3837     __ cbnz(tmp5, NOT_EQUAL);
3838     __ br(__ GE, LOOP);
3839     // post-loop
3840     __ eor(tmp1, tmp1, tmp2);
3841     __ eor(tmp3, tmp3, tmp4);
3842     __ orr(tmp1, tmp1, tmp3);
3843     __ sub(cnt1, cnt1, 2 * wordSize);
3844     __ cbnz(tmp1, NOT_EQUAL);
3845   }
3846 
3847   void generate_large_array_equals_loop_simd(int loopThreshold,
3848         bool usePrefetch, Label &NOT_EQUAL) {
3849     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3850         tmp2 = rscratch2;
3851     Label LOOP;
3852 
3853     __ bind(LOOP);
3854     if (usePrefetch) {
3855       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3856       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3857     }
3858     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3859     __ sub(cnt1, cnt1, 8 * wordSize);
3860     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3861     __ subs(tmp1, cnt1, loopThreshold);
3862     __ eor(v0, __ T16B, v0, v4);
3863     __ eor(v1, __ T16B, v1, v5);
3864     __ eor(v2, __ T16B, v2, v6);
3865     __ eor(v3, __ T16B, v3, v7);
3866     __ orr(v0, __ T16B, v0, v1);
3867     __ orr(v1, __ T16B, v2, v3);
3868     __ orr(v0, __ T16B, v0, v1);
3869     __ umov(tmp1, v0, __ D, 0);
3870     __ umov(tmp2, v0, __ D, 1);
3871     __ orr(tmp1, tmp1, tmp2);
3872     __ cbnz(tmp1, NOT_EQUAL);
3873     __ br(__ GE, LOOP);
3874   }
3875 
3876   // a1 = r1 - array1 address
3877   // a2 = r2 - array2 address
3878   // result = r0 - return value. Already contains "false"
3879   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
3880   // r3-r5 are reserved temporary registers
3881   address generate_large_array_equals() {
3882     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3883         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3884         tmp7 = r12, tmp8 = r13;
3885     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3886         SMALL_LOOP, POST_LOOP;
3887     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3888     // calculate if at least 32 prefetched bytes are used
3889     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3890     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3891     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3892     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3893         tmp5, tmp6, tmp7, tmp8);
3894 
3895     __ align(CodeEntryAlignment);
3896 
3897     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3898 
3899     address entry = __ pc();
3900     __ enter();
3901     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
3902     // also advance pointers to use post-increment instead of pre-increment
3903     __ add(a1, a1, wordSize);
3904     __ add(a2, a2, wordSize);
3905     if (AvoidUnalignedAccesses) {
3906       // Both implementations (SIMD/non-SIMD) use relatively large load
3907       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
3908       // time) on some CPUs when the address is not at least 16-byte aligned.
3909       // Arrays are currently 8-byte aligned, so we can do an extra 8-byte load
3910       // if needed for the first address to make it 16-byte aligned.
3911       Label ALIGNED16;
3912       __ tbz(a1, 3, ALIGNED16);
3913       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3914       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3915       __ sub(cnt1, cnt1, wordSize);
3916       __ eor(tmp1, tmp1, tmp2);
3917       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3918       __ bind(ALIGNED16);
3919     }
3920     if (UseSIMDForArrayEquals) {
3921       if (SoftwarePrefetchHintDistance >= 0) {
3922         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3923         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3924         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3925             /* prfm = */ true, NOT_EQUAL);
3926         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3927         __ br(__ LT, TAIL);
3928       }
3929       __ bind(NO_PREFETCH_LARGE_LOOP);
3930       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3931           /* prfm = */ false, NOT_EQUAL);
3932     } else {
3933       __ push(spilled_regs, sp);
3934       if (SoftwarePrefetchHintDistance >= 0) {
3935         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3936         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3937         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3938             /* prfm = */ true, NOT_EQUAL);
3939         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3940         __ br(__ LT, TAIL);
3941       }
3942       __ bind(NO_PREFETCH_LARGE_LOOP);
3943       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3944           /* prfm = */ false, NOT_EQUAL);
3945     }
3946     __ bind(TAIL);
3947       __ cbz(cnt1, EQUAL);
3948       __ subs(cnt1, cnt1, wordSize);
3949       __ br(__ LE, POST_LOOP);
3950     __ bind(SMALL_LOOP);
3951       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3952       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3953       __ subs(cnt1, cnt1, wordSize);
3954       __ eor(tmp1, tmp1, tmp2);
3955       __ cbnz(tmp1, NOT_EQUAL);
3956       __ br(__ GT, SMALL_LOOP);
3957     __ bind(POST_LOOP);
3958       __ ldr(tmp1, Address(a1, cnt1));
3959       __ ldr(tmp2, Address(a2, cnt1));
3960       __ eor(tmp1, tmp1, tmp2);
3961       __ cbnz(tmp1, NOT_EQUAL);
3962     __ bind(EQUAL);
3963       __ mov(result, true);
3964     __ bind(NOT_EQUAL);
3965       if (!UseSIMDForArrayEquals) {
3966         __ pop(spilled_regs, sp);
3967       }
3968     __ bind(NOT_EQUAL_NO_POP);
3969     __ leave();
3970     __ ret(lr);
3971     return entry;
3972   }
3973 
3974   address generate_dsin_dcos(bool isCos) {
3975     __ align(CodeEntryAlignment);
3976     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
3977     address start = __ pc();
3978     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
3979         (address)StubRoutines::aarch64::_two_over_pi,
3980         (address)StubRoutines::aarch64::_pio2,
3981         (address)StubRoutines::aarch64::_dsin_coef,
3982         (address)StubRoutines::aarch64::_dcos_coef);
3983     return start;
3984   }
3985 
3986   address generate_dlog() {
3987     __ align(CodeEntryAlignment);
3988     StubCodeMark mark(this, "StubRoutines", "dlog");
3989     address entry = __ pc();
3990     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
3991         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
3992     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
3993     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
3994         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
3995     return entry;
3996   }
3997 
3998   // code for comparing 16 bytes of strings with same encoding
3999   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
4000     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
4001     __ ldr(rscratch1, Address(__ post(str1, 8)));
4002     __ eor(rscratch2, tmp1, tmp2);
4003     __ ldr(cnt1, Address(__ post(str2, 8)));
4004     __ cbnz(rscratch2, DIFF1);
4005     __ ldr(tmp1, Address(__ post(str1, 8)));
4006     __ eor(rscratch2, rscratch1, cnt1);
4007     __ ldr(tmp2, Address(__ post(str2, 8)));
4008     __ cbnz(rscratch2, DIFF2);
4009   }
4010 
4011   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
4012   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4013       Label &DIFF2) {
4014     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
4015     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
4016 
4017     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4018     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4019     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4020     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4021 
4022     __ fmovd(tmpL, vtmp3);
4023     __ eor(rscratch2, tmp3, tmpL);
4024     __ cbnz(rscratch2, DIFF2);
4025 
4026     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4027     __ umov(tmpL, vtmp3, __ D, 1);
4028     __ eor(rscratch2, tmpU, tmpL);
4029     __ cbnz(rscratch2, DIFF1);
4030 
4031     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4032     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4033     __ fmovd(tmpL, vtmp);
4034     __ eor(rscratch2, tmp3, tmpL);
4035     __ cbnz(rscratch2, DIFF2);
4036 
4037     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4038     __ umov(tmpL, vtmp, __ D, 1);
4039     __ eor(rscratch2, tmpU, tmpL);
4040     __ cbnz(rscratch2, DIFF1);
4041   }
4042 
4043   // r0  = result
4044   // r1  = str1
4045   // r2  = cnt1
4046   // r3  = str2
4047   // r4  = cnt2
4048   // r10 = tmp1
4049   // r11 = tmp2
4050   address generate_compare_long_string_different_encoding(bool isLU) {
4051     __ align(CodeEntryAlignment);
4052     StubCodeMark mark(this, "StubRoutines", isLU
4053         ? "compare_long_string_different_encoding LU"
4054         : "compare_long_string_different_encoding UL");
4055     address entry = __ pc();
4056     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4057         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
4058         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4059     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4060         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4061     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4062     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4063 
4064     int prefetchLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance/2);
4065 
4066     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4067     // cnt2 == number of characters left to compare
4068     // Check the already loaded first 4 symbols (vtmp and tmp2(LU)/tmp1(UL))
4069     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4070     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4071     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4072     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4073     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4074     __ eor(rscratch2, tmp1, tmp2);
4075     __ mov(rscratch1, tmp2);
4076     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4077     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4078              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4079     __ push(spilled_regs, sp);
4080     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
4081     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
4082 
4083     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4084 
4085     if (SoftwarePrefetchHintDistance >= 0) {
4086       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4087       __ br(__ LT, NO_PREFETCH);
4088       __ bind(LARGE_LOOP_PREFETCH);
4089         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4090         __ mov(tmp4, 2);
4091         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4092         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4093           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4094           __ subs(tmp4, tmp4, 1);
4095           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4096           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4097           __ mov(tmp4, 2);
4098         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4099           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4100           __ subs(tmp4, tmp4, 1);
4101           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4102           __ sub(cnt2, cnt2, 64);
4103           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4104           __ br(__ GE, LARGE_LOOP_PREFETCH);
4105     }
4106     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4107     __ bind(NO_PREFETCH);
4108     __ subs(cnt2, cnt2, 16);
4109     __ br(__ LT, TAIL);
4110     __ align(OptoLoopAlignment);
4111     __ bind(SMALL_LOOP); // smaller loop
4112       __ subs(cnt2, cnt2, 16);
4113       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4114       __ br(__ GE, SMALL_LOOP);
4115       __ cmn(cnt2, (u1)16);
4116       __ br(__ EQ, LOAD_LAST);
4117     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
4118       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
4119       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
4120       __ ldr(tmp3, Address(cnt1, -8));
4121       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
4122       __ b(LOAD_LAST);
4123     __ bind(DIFF2);
4124       __ mov(tmpU, tmp3);
4125     __ bind(DIFF1);
4126       __ pop(spilled_regs, sp);
4127       __ b(CALCULATE_DIFFERENCE);
4128     __ bind(LOAD_LAST);
4129       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
4130       // No need to load them again
4131       __ mov(tmpU, tmp3);
4132       __ pop(spilled_regs, sp);
4133 
4134       // tmp2 points to the address of the last 4 Latin1 characters right now
4135       __ ldrs(vtmp, Address(tmp2));
4136       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4137       __ fmovd(tmpL, vtmp);
4138 
4139       __ eor(rscratch2, tmpU, tmpL);
4140       __ cbz(rscratch2, DONE);
4141 
4142     // Find the first different characters in the longwords and
4143     // compute their difference.
4144     __ bind(CALCULATE_DIFFERENCE);
4145       __ rev(rscratch2, rscratch2);
4146       __ clz(rscratch2, rscratch2);
4147       __ andr(rscratch2, rscratch2, -16);
4148       __ lsrv(tmp1, tmp1, rscratch2);
4149       __ uxthw(tmp1, tmp1);
4150       __ lsrv(rscratch1, rscratch1, rscratch2);
4151       __ uxthw(rscratch1, rscratch1);
4152       __ subw(result, tmp1, rscratch1);
4153     __ bind(DONE);
4154       __ ret(lr);
4155     return entry;
4156   }
4157 
4158     address generate_method_entry_barrier() {
4159     __ align(CodeEntryAlignment);
4160     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
4161 
4162     Label deoptimize_label;
4163 
4164     address start = __ pc();
4165 
4166     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
4167 
4168     __ enter();
4169     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
4170 
4171     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
4172 
4173     __ push_call_clobbered_registers();
4174 
4175     __ mov(c_rarg0, rscratch2);
4176     __ call_VM_leaf
4177          (CAST_FROM_FN_PTR
4178           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
4179 
4180     __ reset_last_Java_frame(true);
4181 
4182     __ mov(rscratch1, r0);
4183 
4184     __ pop_call_clobbered_registers();
4185 
4186     __ cbnz(rscratch1, deoptimize_label);
4187 
4188     __ leave();
4189     __ ret(lr);
4190 
4191     __ BIND(deoptimize_label);
4192 
4193     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
4194     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
4195 
4196     __ mov(sp, rscratch1);
4197     __ br(rscratch2);
4198 
4199     return start;
4200   }
4201 
4202   // r0  = result
4203   // r1  = str1
4204   // r2  = cnt1
4205   // r3  = str2
4206   // r4  = cnt2
4207   // r10 = tmp1
4208   // r11 = tmp2
4209   address generate_compare_long_string_same_encoding(bool isLL) {
4210     __ align(CodeEntryAlignment);
4211     StubCodeMark mark(this, "StubRoutines", isLL
4212         ? "compare_long_string_same_encoding LL"
4213         : "compare_long_string_same_encoding UU");
4214     address entry = __ pc();
4215     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4216         tmp1 = r10, tmp2 = r11;
4217     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4218         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4219         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4220     // exit from the large loop when less than 64 bytes are left to read or we
4221     // are about to prefetch memory beyond the array boundary
4222     int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4223     // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used
4224     // update cnt2 counter with already loaded 8 bytes
4225     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4226     // update pointers, because of previous read
4227     __ add(str1, str1, wordSize);
4228     __ add(str2, str2, wordSize);
4229     if (SoftwarePrefetchHintDistance >= 0) {
4230       __ bind(LARGE_LOOP_PREFETCH);
4231         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4232         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4233         compare_string_16_bytes_same(DIFF, DIFF2);
4234         compare_string_16_bytes_same(DIFF, DIFF2);
4235         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4236         compare_string_16_bytes_same(DIFF, DIFF2);
4237         __ subs(rscratch2, cnt2, largeLoopExitCondition);
4238         compare_string_16_bytes_same(DIFF, DIFF2);
4239         __ br(__ GT, LARGE_LOOP_PREFETCH);
4240         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4241     }
4242     // less than 16 bytes left?
4243     __ subs(cnt2, cnt2, isLL ? 16 : 8);
4244     __ br(__ LT, TAIL);
4245     __ align(OptoLoopAlignment);
4246     __ bind(SMALL_LOOP);
4247       compare_string_16_bytes_same(DIFF, DIFF2);
4248       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4249       __ br(__ GE, SMALL_LOOP);
4250     __ bind(TAIL);
4251       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4252       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4253       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4254       __ br(__ LE, CHECK_LAST);
4255       __ eor(rscratch2, tmp1, tmp2);
4256       __ cbnz(rscratch2, DIFF);
4257       __ ldr(tmp1, Address(__ post(str1, 8)));
4258       __ ldr(tmp2, Address(__ post(str2, 8)));
4259       __ sub(cnt2, cnt2, isLL ? 8 : 4);
4260     __ bind(CHECK_LAST);
4261       if (!isLL) {
4262         __ add(cnt2, cnt2, cnt2); // now in bytes
4263       }
4264       __ eor(rscratch2, tmp1, tmp2);
4265       __ cbnz(rscratch2, DIFF);
4266       __ ldr(rscratch1, Address(str1, cnt2));
4267       __ ldr(cnt1, Address(str2, cnt2));
4268       __ eor(rscratch2, rscratch1, cnt1);
4269       __ cbz(rscratch2, LENGTH_DIFF);
4270       // Find the first different characters in the longwords and
4271       // compute their difference.
4272     __ bind(DIFF2);
4273       __ rev(rscratch2, rscratch2);
4274       __ clz(rscratch2, rscratch2);
4275       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4276       __ lsrv(rscratch1, rscratch1, rscratch2);
4277       if (isLL) {
4278         __ lsrv(cnt1, cnt1, rscratch2);
4279         __ uxtbw(rscratch1, rscratch1);
4280         __ uxtbw(cnt1, cnt1);
4281       } else {
4282         __ lsrv(cnt1, cnt1, rscratch2);
4283         __ uxthw(rscratch1, rscratch1);
4284         __ uxthw(cnt1, cnt1);
4285       }
4286       __ subw(result, rscratch1, cnt1);
4287       __ b(LENGTH_DIFF);
4288     __ bind(DIFF);
4289       __ rev(rscratch2, rscratch2);
4290       __ clz(rscratch2, rscratch2);
4291       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4292       __ lsrv(tmp1, tmp1, rscratch2);
4293       if (isLL) {
4294         __ lsrv(tmp2, tmp2, rscratch2);
4295         __ uxtbw(tmp1, tmp1);
4296         __ uxtbw(tmp2, tmp2);
4297       } else {
4298         __ lsrv(tmp2, tmp2, rscratch2);
4299         __ uxthw(tmp1, tmp1);
4300         __ uxthw(tmp2, tmp2);
4301       }
4302       __ subw(result, tmp1, tmp2);
4303       __ b(LENGTH_DIFF);
4304     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4305       __ eor(rscratch2, tmp1, tmp2);
4306       __ cbnz(rscratch2, DIFF);
4307     __ bind(LENGTH_DIFF);
4308       __ ret(lr);
4309     return entry;
4310   }
4311 
4312   void generate_compare_long_strings() {
4313       StubRoutines::aarch64::_compare_long_string_LL
4314           = generate_compare_long_string_same_encoding(true);
4315       StubRoutines::aarch64::_compare_long_string_UU
4316           = generate_compare_long_string_same_encoding(false);
4317       StubRoutines::aarch64::_compare_long_string_LU
4318           = generate_compare_long_string_different_encoding(true);
4319       StubRoutines::aarch64::_compare_long_string_UL
4320           = generate_compare_long_string_different_encoding(false);
4321   }
4322 
4323   // R0 = result
4324   // R1 = str2
4325   // R2 = cnt1
4326   // R3 = str1
4327   // R4 = cnt2
  // This generic linear code uses a few additional ideas which make it faster:
  // 1) we can safely keep at least the first register of the pattern (since
  // length >= 8) in order to skip the initial load (helps on systems with a
  // single load pipeline)
  // 2) we can use a "fast" algorithm for finding the first pattern character
  // with fewer branches (one branch per loaded register instead of one per
  // character); this is where constants like 0x0101...01, 0x00010001...0001,
  // 0x7f7f...7f and 0x7fff7fff...7fff come from
  // 3) after loading and analyzing the first register of the source string, it
  // can be used to search for every occurrence of the first character, saving
  // a few loads compared with a simpler-but-slower implementation
  // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
  // re-initializes and compresses register values, which makes the code larger
  // and a bit less readable; however, most of the extra operations are issued
  // during loads or branches, so the penalty is minimal
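  //
  // A minimal C sketch of idea 2) for the Latin-1 case (an illustration of
  // where the constants above come from, not the emitted code; 'first' is the
  // first pattern character and 'w' is a full register loaded from str2):
  //
  //   uint64_t x = w ^ (first * 0x0101010101010101ull); // 0x00 where a byte matches
  //   uint64_t hits = (x - 0x0101010101010101ull) & ~x & 0x8080808080808080ull;
  //
  // 'hits' is non-zero iff some byte of str2 equals the first pattern
  // character; rbit+clz then locate the first such byte, so a single branch
  // covers eight characters. The UTF-16 case uses the 0x0001...0001 and
  // 0x7fff...7fff constants analogously, covering four characters at a time.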
4342   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4343     const char* stubName = str1_isL
4344         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4345         : "indexof_linear_uu";
4346     __ align(CodeEntryAlignment);
4347     StubCodeMark mark(this, "StubRoutines", stubName);
4348     address entry = __ pc();
4349 
4350     int str1_chr_size = str1_isL ? 1 : 2;
4351     int str2_chr_size = str2_isL ? 1 : 2;
4352     int str1_chr_shift = str1_isL ? 0 : 1;
4353     int str2_chr_shift = str2_isL ? 0 : 1;
4354     bool isL = str1_isL && str2_isL;
    // parameters
4356     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4357     // temporary registers
4358     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4359     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4360     // redefinitions
4361     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4362 
4363     __ push(spilled_regs, sp);
4364     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
4365         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4366         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4367         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4368         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4369         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
    // Read a whole register from str1; safe, because length >= 8 here
    __ ldr(ch1, Address(str1));
    // Read a whole register from str2; safe, because length >= 8 here
    __ ldr(ch2, Address(str2));
4374     __ sub(cnt2, cnt2, cnt1);
4375     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4376     if (str1_isL != str2_isL) {
4377       __ eor(v0, __ T16B, v0, v0);
4378     }
4379     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4380     __ mul(first, first, tmp1);
4381     // check if we have less than 1 register to check
4382     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4383     if (str1_isL != str2_isL) {
4384       __ fmovd(v1, ch1);
4385     }
4386     __ br(__ LE, L_SMALL);
4387     __ eor(ch2, first, ch2);
4388     if (str1_isL != str2_isL) {
4389       __ zip1(v1, __ T16B, v1, v0);
4390     }
4391     __ sub(tmp2, ch2, tmp1);
4392     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4393     __ bics(tmp2, tmp2, ch2);
4394     if (str1_isL != str2_isL) {
4395       __ fmovd(ch1, v1);
4396     }
4397     __ br(__ NE, L_HAS_ZERO);
4398     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4399     __ add(result, result, wordSize/str2_chr_size);
4400     __ add(str2, str2, wordSize);
4401     __ br(__ LT, L_POST_LOOP);
4402     __ BIND(L_LOOP);
4403       __ ldr(ch2, Address(str2));
4404       __ eor(ch2, first, ch2);
4405       __ sub(tmp2, ch2, tmp1);
4406       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4407       __ bics(tmp2, tmp2, ch2);
4408       __ br(__ NE, L_HAS_ZERO);
4409     __ BIND(L_LOOP_PROCEED);
4410       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4411       __ add(str2, str2, wordSize);
4412       __ add(result, result, wordSize/str2_chr_size);
4413       __ br(__ GE, L_LOOP);
4414     __ BIND(L_POST_LOOP);
4415       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
4416       __ br(__ LE, NOMATCH);
4417       __ ldr(ch2, Address(str2));
4418       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4419       __ eor(ch2, first, ch2);
4420       __ sub(tmp2, ch2, tmp1);
4421       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4422       __ mov(tmp4, -1); // all bits set
4423       __ b(L_SMALL_PROCEED);
4424     __ align(OptoLoopAlignment);
4425     __ BIND(L_SMALL);
4426       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4427       __ eor(ch2, first, ch2);
4428       if (str1_isL != str2_isL) {
4429         __ zip1(v1, __ T16B, v1, v0);
4430       }
4431       __ sub(tmp2, ch2, tmp1);
4432       __ mov(tmp4, -1); // all bits set
4433       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4434       if (str1_isL != str2_isL) {
4435         __ fmovd(ch1, v1); // move converted 4 symbols
4436       }
4437     __ BIND(L_SMALL_PROCEED);
4438       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
4439       __ bic(tmp2, tmp2, ch2);
4440       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
4441       __ rbit(tmp2, tmp2);
4442       __ br(__ EQ, NOMATCH);
4443     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
4445       __ cmp(cnt1, u1(wordSize/str2_chr_size));
4446       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4447       if (str2_isL) { // LL
4448         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4449         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4450         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4451         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4452         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4453       } else {
4454         __ mov(ch2, 0xE); // all bits in byte set except last one
4455         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4456         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4457         __ lslv(tmp2, tmp2, tmp4);
4458         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4459         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4460         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4461         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4462       }
4463       __ cmp(ch1, ch2);
4464       __ mov(tmp4, wordSize/str2_chr_size);
4465       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4466     __ BIND(L_SMALL_CMP_LOOP);
4467       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4468                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4469       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4470                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4471       __ add(tmp4, tmp4, 1);
4472       __ cmp(tmp4, cnt1);
4473       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4474       __ cmp(first, ch2);
4475       __ br(__ EQ, L_SMALL_CMP_LOOP);
4476     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4477       __ cbz(tmp2, NOMATCH); // no more matches. exit
4478       __ clz(tmp4, tmp2);
4479       __ add(result, result, 1); // advance index
4480       __ add(str2, str2, str2_chr_size); // advance pointer
4481       __ b(L_SMALL_HAS_ZERO_LOOP);
4482     __ align(OptoLoopAlignment);
4483     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4484       __ cmp(first, ch2);
4485       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4486       __ b(DONE);
4487     __ align(OptoLoopAlignment);
4488     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4489       if (str2_isL) { // LL
4490         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4491         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4492         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4493         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4494         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4495       } else {
4496         __ mov(ch2, 0xE); // all bits in byte set except last one
4497         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4498         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4499         __ lslv(tmp2, tmp2, tmp4);
4500         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4501         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4502         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4503         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4504       }
4505       __ cmp(ch1, ch2);
4506       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4507       __ b(DONE);
4508     __ align(OptoLoopAlignment);
4509     __ BIND(L_HAS_ZERO);
4510       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
      // Now, compress the two counters (cnt2 and cnt1) into one register.
      // This is fine because both counters are 32-bit and are not changed in
      // this loop; they are restored on exit, so cnt1 can be re-used here.
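      // i.e., cnt2 = (cnt1 << 32) | cnt2; the halves are split apart again in
      // L_HAS_ZERO_LOOP_NOMATCH (lsr recovers cnt1, movw re-zero-extends cnt2).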
4515       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
4516       __ sub(result, result, 1);
4517     __ BIND(L_HAS_ZERO_LOOP);
4518       __ mov(cnt1, wordSize/str2_chr_size);
4519       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4520       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4521       if (str2_isL) {
4522         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4523         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4524         __ lslv(tmp2, tmp2, tmp4);
4525         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4526         __ add(tmp4, tmp4, 1);
4527         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4528         __ lsl(tmp2, tmp2, 1);
4529         __ mov(tmp4, wordSize/str2_chr_size);
4530       } else {
4531         __ mov(ch2, 0xE);
4532         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4533         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4534         __ lslv(tmp2, tmp2, tmp4);
4535         __ add(tmp4, tmp4, 1);
4536         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4537         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4538         __ lsl(tmp2, tmp2, 1);
4539         __ mov(tmp4, wordSize/str2_chr_size);
4540         __ sub(str2, str2, str2_chr_size);
4541       }
4542       __ cmp(ch1, ch2);
4543       __ mov(tmp4, wordSize/str2_chr_size);
4544       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4545     __ BIND(L_CMP_LOOP);
4546       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4547                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4548       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4549                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4550       __ add(tmp4, tmp4, 1);
4551       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4552       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4553       __ cmp(cnt1, ch2);
4554       __ br(__ EQ, L_CMP_LOOP);
4555     __ BIND(L_CMP_LOOP_NOMATCH);
      // no match at this position
4557       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4558       __ clz(tmp4, tmp2);
4559       __ add(str2, str2, str2_chr_size); // advance pointer
4560       __ b(L_HAS_ZERO_LOOP);
4561     __ align(OptoLoopAlignment);
4562     __ BIND(L_CMP_LOOP_LAST_CMP);
4563       __ cmp(cnt1, ch2);
4564       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4565       __ b(DONE);
4566     __ align(OptoLoopAlignment);
4567     __ BIND(L_CMP_LOOP_LAST_CMP2);
4568       if (str2_isL) {
4569         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4570         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4571         __ lslv(tmp2, tmp2, tmp4);
4572         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4573         __ add(tmp4, tmp4, 1);
4574         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4575         __ lsl(tmp2, tmp2, 1);
4576       } else {
4577         __ mov(ch2, 0xE);
4578         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4579         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4580         __ lslv(tmp2, tmp2, tmp4);
4581         __ add(tmp4, tmp4, 1);
4582         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4583         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4584         __ lsl(tmp2, tmp2, 1);
4585         __ sub(str2, str2, str2_chr_size);
4586       }
4587       __ cmp(ch1, ch2);
4588       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4589       __ b(DONE);
4590     __ align(OptoLoopAlignment);
4591     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. The index was a multiple of
      // wordSize/str2_chr_size until the L_HAS_ZERO block. The byte octet was
      // analyzed in L_HAS_ZERO_LOOP, so result was increased by at most
      // wordSize/str2_chr_size - 1 and the respective higher bits were not
      // changed. L_LOOP_PROCEED will increase result by the number of analyzed
      // characters, so we can simply reset the lower bits of result here:
      // clear the 2 lower bits for UU/UL and the 3 lower bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the
      // index of the last analyzed substring inside the current octet, so str2
      // is at the respective start address; we need to advance it to the next
      // octet.
4602       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4603       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4604       __ bfm(result, zr, 0, 2 - str2_chr_shift);
4605       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4606       __ movw(cnt2, cnt2);
4607       __ b(L_LOOP_PROCEED);
4608     __ align(OptoLoopAlignment);
4609     __ BIND(NOMATCH);
4610       __ mov(result, -1);
4611     __ BIND(DONE);
4612       __ pop(spilled_regs, sp);
4613       __ ret(lr);
4614     return entry;
4615   }
4616 
4617   void generate_string_indexof_stubs() {
4618     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
4619     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
4620     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
4621   }
4622 
4623   void inflate_and_store_2_fp_registers(bool generatePrfm,
4624       FloatRegister src1, FloatRegister src2) {
4625     Register dst = r1;
4626     __ zip1(v1, __ T16B, src1, v0);
4627     __ zip2(v2, __ T16B, src1, v0);
4628     if (generatePrfm) {
4629       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4630     }
4631     __ zip1(v3, __ T16B, src2, v0);
4632     __ zip2(v4, __ T16B, src2, v0);
4633     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4634   }
4635 
4636   // R0 = src
4637   // R1 = dst
4638   // R2 = len
4639   // R3 = len >> 3
4640   // V0 = 0
4641   // v1 = loaded 8 bytes
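  //
  // In C, approximately (a sketch of what the stub computes; the generated
  // code below additionally unrolls the loop, software-prefetches and uses
  // wide vector stores):
  //
  //   void inflate(const jbyte* src, jchar* dst, size_t len) {
  //     for (size_t i = 0; i < len; i++) {
  //       dst[i] = (jchar)(src[i] & 0xff);  // zero-extend each byte to a char
  //     }
  //   }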
4642   address generate_large_byte_array_inflate() {
4643     __ align(CodeEntryAlignment);
4644     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4645     address entry = __ pc();
4646     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4647     Register src = r0, dst = r1, len = r2, octetCounter = r3;
4648     const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
4649 
    // do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also lets us use a single store instruction
4652     __ ldrd(v2, __ post(src, 8));
4653     __ sub(octetCounter, octetCounter, 2);
4654     __ zip1(v1, __ T16B, v1, v0);
4655     __ zip1(v2, __ T16B, v2, v0);
4656     __ st1(v1, v2, __ T16B, __ post(dst, 32));
4657     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4658     __ subs(rscratch1, octetCounter, large_loop_threshold);
4659     __ br(__ LE, LOOP_START);
4660     __ b(LOOP_PRFM_START);
4661     __ bind(LOOP_PRFM);
4662       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4663     __ bind(LOOP_PRFM_START);
4664       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4665       __ sub(octetCounter, octetCounter, 8);
4666       __ subs(rscratch1, octetCounter, large_loop_threshold);
4667       inflate_and_store_2_fp_registers(true, v3, v4);
4668       inflate_and_store_2_fp_registers(true, v5, v6);
4669       __ br(__ GT, LOOP_PRFM);
4670       __ cmp(octetCounter, (u1)8);
4671       __ br(__ LT, DONE);
4672     __ bind(LOOP);
4673       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4674       __ bind(LOOP_START);
4675       __ sub(octetCounter, octetCounter, 8);
4676       __ cmp(octetCounter, (u1)8);
4677       inflate_and_store_2_fp_registers(false, v3, v4);
4678       inflate_and_store_2_fp_registers(false, v5, v6);
4679       __ br(__ GE, LOOP);
4680     __ bind(DONE);
4681       __ ret(lr);
4682     return entry;
4683   }
4684 
4685   /**
4686    *  Arguments:
4687    *
4688    *  Input:
4689    *  c_rarg0   - current state address
4690    *  c_rarg1   - H key address
4691    *  c_rarg2   - data address
4692    *  c_rarg3   - number of blocks
4693    *
4694    *  Output:
4695    *  Updated state at c_rarg0
4696    */
4697   address generate_ghash_processBlocks() {
4698     // Bafflingly, GCM uses little-endian for the byte order, but
4699     // big-endian for the bit order.  For example, the polynomial 1 is
4700     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4701     //
4702     // So, we must either reverse the bytes in each word and do
4703     // everything big-endian or reverse the bits in each byte and do
4704     // it little-endian.  On AArch64 it's more idiomatic to reverse
4705     // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order throughout the
4707     // calculation, bit-reversing the inputs and outputs.
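    //
    // In C-like pseudocode the loop below is approximately (a sketch only;
    // gf128_mul is a placeholder for the carry-less multiplication and
    // reduction performed by ghash_multiply/ghash_reduce):
    //
    //   state = rbit(state); Hr = rbit(H);     // bit-reverse once, up front
    //   for (int i = 0; i < blocks; i++)
    //     state = gf128_mul(state ^ rbit(data[i]), Hr);
    //   *state_out = rbit(state);              // undo the bit reversal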
4708 
4709     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4710     __ align(wordSize * 2);
4711     address p = __ pc();
4712     __ emit_int64(0x87);  // The low-order bits of the field
4713                           // polynomial (i.e. p = z^7+z^2+z+1)
4714                           // repeated in the low and high parts of a
4715                           // 128-bit vector
4716     __ emit_int64(0x87);
4717 
4718     __ align(CodeEntryAlignment);
4719     address start = __ pc();
4720 
4721     Register state   = c_rarg0;
4722     Register subkeyH = c_rarg1;
4723     Register data    = c_rarg2;
4724     Register blocks  = c_rarg3;
4725 
4726     FloatRegister vzr = v30;
4727     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4728 
4729     __ ldrq(v0, Address(state));
4730     __ ldrq(v1, Address(subkeyH));
4731 
4732     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4733     __ rbit(v0, __ T16B, v0);
4734     __ rev64(v1, __ T16B, v1);
4735     __ rbit(v1, __ T16B, v1);
4736 
4737     __ ldrq(v26, p);
4738 
4739     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4740     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4741 
4742     {
4743       Label L_ghash_loop;
4744       __ bind(L_ghash_loop);
4745 
4746       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4747                                                  // reversing each byte
4748       __ rbit(v2, __ T16B, v2);
4749       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4750 
4751       // Multiply state in v2 by subkey in v1
4752       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4753                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4754                      /*temps*/v6, v20, v18, v21);
4755       // Reduce v7:v5 by the field polynomial
4756       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4757 
4758       __ sub(blocks, blocks, 1);
4759       __ cbnz(blocks, L_ghash_loop);
4760     }
4761 
4762     // The bit-reversed result is at this point in v0
4763     __ rev64(v1, __ T16B, v0);
4764     __ rbit(v1, __ T16B, v1);
4765 
4766     __ st1(v1, __ T16B, state);
4767     __ ret(lr);
4768 
4769     return start;
4770   }
4771 
4772   // Continuation point for throwing of implicit exceptions that are
4773   // not handled in the current activation. Fabricates an exception
4774   // oop and initiates normal exception dispatching in this
4775   // frame. Since we need to preserve callee-saved values (currently
4776   // only for C2, but done for C1 as well) we need a callee-saved oop
4777   // map and therefore have to make these stubs into RuntimeStubs
4778   // rather than BufferBlobs.  If the compiler needs all registers to
4779   // be preserved between the fault point and the exception handler
4780   // then it must assume responsibility for that in
4781   // AbstractCompiler::continuation_for_implicit_null_exception or
4782   // continuation_for_implicit_division_by_zero_exception. All other
4783   // implicit exceptions (e.g., NullPointerException or
4784   // AbstractMethodError on entry) are either at call sites or
4785   // otherwise assume that stack unwinding will be initiated, so
4786   // caller saved registers were assumed volatile in the compiler.
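  //
  // In pseudocode, each stub produced by generate_throw_exception() below is
  // approximately (a sketch, not the emitted code):
  //
  //   void throw_stub(arg1, arg2) {
  //     enter();                                   // build a frame; an oop map
  //                                                // is recorded for the call
  //     set_last_Java_frame(sp, fp, pc);
  //     runtime_entry(current_thread, arg1, arg2); // fabricates the exception
  //     reset_last_Java_frame();
  //     leave();
  //     goto StubRoutines::forward_exception_entry();
  //   }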
4787 
4788 #undef __
4789 #define __ masm->
4790 
4791   address generate_throw_exception(const char* name,
4792                                    address runtime_entry,
4793                                    Register arg1 = noreg,
4794                                    Register arg2 = noreg) {
4795     // Information about frame layout at time of blocking runtime call.
4796     // Note that we only have to preserve callee-saved registers since
4797     // the compilers are responsible for supplying a continuation point
4798     // if they expect all registers to be preserved.
4799     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4800     enum layout {
4801       rfp_off = 0,
4802       rfp_off2,
4803       return_off,
4804       return_off2,
4805       framesize // inclusive of return address
4806     };
4807 
4808     int insts_size = 512;
4809     int locs_size  = 64;
4810 
4811     CodeBuffer code(name, insts_size, locs_size);
4812     OopMapSet* oop_maps  = new OopMapSet();
4813     MacroAssembler* masm = new MacroAssembler(&code);
4814 
4815     address start = __ pc();
4816 
4817     // This is an inlined and slightly modified version of call_VM
4818     // which has the ability to fetch the return PC out of
4819     // thread-local storage and also sets up last_Java_sp slightly
4820     // differently than the real call_VM
4821 
4822     __ enter(); // Save FP and LR before call
4823 
4824     assert(is_even(framesize/2), "sp not 16-byte aligned");
4825 
4826     // lr and fp are already in place
4827     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4828 
4829     int frame_complete = __ pc() - start;
4830 
4831     // Set up last_Java_sp and last_Java_fp
4832     address the_pc = __ pc();
4833     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
4834 
4835     // Call runtime
4836     if (arg1 != noreg) {
4837       assert(arg2 != c_rarg1, "clobbered");
4838       __ mov(c_rarg1, arg1);
4839     }
4840     if (arg2 != noreg) {
4841       __ mov(c_rarg2, arg2);
4842     }
4843     __ mov(c_rarg0, rthread);
4844     BLOCK_COMMENT("call runtime_entry");
4845     __ mov(rscratch1, runtime_entry);
4846     __ blr(rscratch1);
4847 
4848     // Generate oop map
4849     OopMap* map = new OopMap(framesize, 0);
4850 
4851     oop_maps->add_gc_map(the_pc - start, map);
4852 
4853     __ reset_last_Java_frame(true);
4854     __ maybe_isb();
4855 
4856     __ leave();
4857 
4858     // check for pending exceptions
4859 #ifdef ASSERT
4860     Label L;
4861     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4862     __ cbnz(rscratch1, L);
4863     __ should_not_reach_here();
4864     __ bind(L);
4865 #endif // ASSERT
4866     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4867 
4868 
4869     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4870     RuntimeStub* stub =
4871       RuntimeStub::new_runtime_stub(name,
4872                                     &code,
4873                                     frame_complete,
4874                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4875                                     oop_maps, false);
4876     return stub->entry_point();
4877   }
4878 
4879   class MontgomeryMultiplyGenerator : public MacroAssembler {
4880 
4881     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4882       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4883 
4884     RegSet _toSave;
4885     bool _squaring;
4886 
4887   public:
4888     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4889       : MacroAssembler(as->code()), _squaring(squaring) {
4890 
4891       // Register allocation
4892 
4893       Register reg = c_rarg0;
4894       Pa_base = reg;       // Argument registers
4895       if (squaring)
4896         Pb_base = Pa_base;
4897       else
4898         Pb_base = ++reg;
4899       Pn_base = ++reg;
4900       Rlen= ++reg;
4901       inv = ++reg;
4902       Pm_base = ++reg;
4903 
4904                           // Working registers:
4905       Ra =  ++reg;        // The current digit of a, b, n, and m.
4906       Rb =  ++reg;
4907       Rm =  ++reg;
4908       Rn =  ++reg;
4909 
4910       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4911       Pb =  ++reg;
4912       Pm =  ++reg;
4913       Pn =  ++reg;
4914 
4915       t0 =  ++reg;        // Three registers which form a
      t1 =  ++reg;        // triple-precision accumulator.
4917       t2 =  ++reg;
4918 
4919       Ri =  ++reg;        // Inner and outer loop indexes.
4920       Rj =  ++reg;
4921 
4922       Rhi_ab = ++reg;     // Product registers: low and high parts
4923       Rlo_ab = ++reg;     // of a*b and m*n.
4924       Rhi_mn = ++reg;
4925       Rlo_mn = ++reg;
4926 
4927       // r19 and up are callee-saved.
4928       _toSave = RegSet::range(r19, reg) + Pm_base;
4929     }
4930 
4931   private:
4932     void save_regs() {
4933       push(_toSave, sp);
4934     }
4935 
4936     void restore_regs() {
4937       pop(_toSave, sp);
4938     }
4939 
4940     template <typename T>
4941     void unroll_2(Register count, T block) {
4942       Label loop, end, odd;
4943       tbnz(count, 0, odd);
4944       cbz(count, end);
4945       align(16);
4946       bind(loop);
4947       (this->*block)();
4948       bind(odd);
4949       (this->*block)();
4950       subs(count, count, 2);
4951       br(Assembler::GT, loop);
4952       bind(end);
4953     }
4954 
4955     template <typename T>
4956     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4957       Label loop, end, odd;
4958       tbnz(count, 0, odd);
4959       cbz(count, end);
4960       align(16);
4961       bind(loop);
4962       (this->*block)(d, s, tmp);
4963       bind(odd);
4964       (this->*block)(d, s, tmp);
4965       subs(count, count, 2);
4966       br(Assembler::GT, loop);
4967       bind(end);
4968     }
4969 
4970     void pre1(RegisterOrConstant i) {
4971       block_comment("pre1");
4972       // Pa = Pa_base;
4973       // Pb = Pb_base + i;
4974       // Pm = Pm_base;
4975       // Pn = Pn_base + i;
4976       // Ra = *Pa;
4977       // Rb = *Pb;
4978       // Rm = *Pm;
4979       // Rn = *Pn;
4980       ldr(Ra, Address(Pa_base));
4981       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4982       ldr(Rm, Address(Pm_base));
4983       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4984       lea(Pa, Address(Pa_base));
4985       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4986       lea(Pm, Address(Pm_base));
4987       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4988 
4989       // Zero the m*n result.
4990       mov(Rhi_mn, zr);
4991       mov(Rlo_mn, zr);
4992     }
4993 
4994     // The core multiply-accumulate step of a Montgomery
4995     // multiplication.  The idea is to schedule operations as a
4996     // pipeline so that instructions with long latencies (loads and
4997     // multiplies) have time to complete before their results are
4998     // used.  This most benefits in-order implementations of the
4999     // architecture but out-of-order ones also benefit.
5000     void step() {
5001       block_comment("step");
5002       // MACC(Ra, Rb, t0, t1, t2);
5003       // Ra = *++Pa;
5004       // Rb = *--Pb;
5005       umulh(Rhi_ab, Ra, Rb);
5006       mul(Rlo_ab, Ra, Rb);
5007       ldr(Ra, pre(Pa, wordSize));
5008       ldr(Rb, pre(Pb, -wordSize));
5009       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
5010                                        // previous iteration.
5011       // MACC(Rm, Rn, t0, t1, t2);
5012       // Rm = *++Pm;
5013       // Rn = *--Pn;
5014       umulh(Rhi_mn, Rm, Rn);
5015       mul(Rlo_mn, Rm, Rn);
5016       ldr(Rm, pre(Pm, wordSize));
5017       ldr(Rn, pre(Pn, -wordSize));
5018       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5019     }
5020 
5021     void post1() {
5022       block_comment("post1");
5023 
5024       // MACC(Ra, Rb, t0, t1, t2);
5025       // Ra = *++Pa;
5026       // Rb = *--Pb;
5027       umulh(Rhi_ab, Ra, Rb);
5028       mul(Rlo_ab, Ra, Rb);
5029       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5030       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5031 
5032       // *Pm = Rm = t0 * inv;
5033       mul(Rm, t0, inv);
5034       str(Rm, Address(Pm));
5035 
5036       // MACC(Rm, Rn, t0, t1, t2);
5037       // t0 = t1; t1 = t2; t2 = 0;
5038       umulh(Rhi_mn, Rm, Rn);
5039 
5040 #ifndef PRODUCT
5041       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5042       {
5043         mul(Rlo_mn, Rm, Rn);
5044         add(Rlo_mn, t0, Rlo_mn);
5045         Label ok;
5046         cbz(Rlo_mn, ok); {
5047           stop("broken Montgomery multiply");
5048         } bind(ok);
5049       }
5050 #endif
5051       // We have very carefully set things up so that
5052       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5053       // the lower half of Rm * Rn because we know the result already:
5054       // it must be -t0.  t0 + (-t0) must generate a carry iff
5055       // t0 != 0.  So, rather than do a mul and an adds we just set
5056       // the carry flag iff t0 is nonzero.
5057       //
5058       // mul(Rlo_mn, Rm, Rn);
5059       // adds(zr, t0, Rlo_mn);
5060       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5061       adcs(t0, t1, Rhi_mn);
5062       adc(t1, t2, zr);
5063       mov(t2, zr);
5064     }
5065 
5066     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
5067       block_comment("pre2");
5068       // Pa = Pa_base + i-len;
5069       // Pb = Pb_base + len;
5070       // Pm = Pm_base + i-len;
5071       // Pn = Pn_base + len;
5072 
5073       if (i.is_register()) {
5074         sub(Rj, i.as_register(), len);
5075       } else {
5076         mov(Rj, i.as_constant());
5077         sub(Rj, Rj, len);
5078       }
5079       // Rj == i-len
5080 
5081       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5082       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5083       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5084       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5085 
5086       // Ra = *++Pa;
5087       // Rb = *--Pb;
5088       // Rm = *++Pm;
5089       // Rn = *--Pn;
5090       ldr(Ra, pre(Pa, wordSize));
5091       ldr(Rb, pre(Pb, -wordSize));
5092       ldr(Rm, pre(Pm, wordSize));
5093       ldr(Rn, pre(Pn, -wordSize));
5094 
5095       mov(Rhi_mn, zr);
5096       mov(Rlo_mn, zr);
5097     }
5098 
5099     void post2(RegisterOrConstant i, RegisterOrConstant len) {
5100       block_comment("post2");
5101       if (i.is_constant()) {
5102         mov(Rj, i.as_constant()-len.as_constant());
5103       } else {
5104         sub(Rj, i.as_register(), len);
5105       }
5106 
5107       adds(t0, t0, Rlo_mn); // The pending m*n, low part
5108 
5109       // As soon as we know the least significant digit of our result,
5110       // store it.
5111       // Pm_base[i-len] = t0;
5112       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5113 
5114       // t0 = t1; t1 = t2; t2 = 0;
5115       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5116       adc(t1, t2, zr);
5117       mov(t2, zr);
5118     }
5119 
5120     // A carry in t0 after Montgomery multiplication means that we
5121     // should subtract multiples of n from our result in m.  We'll
5122     // keep doing that until there is no carry.
5123     void normalize(RegisterOrConstant len) {
5124       block_comment("normalize");
5125       // while (t0)
5126       //   t0 = sub(Pm_base, Pn_base, t0, len);
5127       Label loop, post, again;
5128       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5129       cbz(t0, post); {
5130         bind(again); {
5131           mov(i, zr);
5132           mov(cnt, len);
5133           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5134           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5135           subs(zr, zr, zr); // set carry flag, i.e. no borrow
5136           align(16);
5137           bind(loop); {
5138             sbcs(Rm, Rm, Rn);
5139             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5140             add(i, i, 1);
5141             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5142             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5143             sub(cnt, cnt, 1);
5144           } cbnz(cnt, loop);
5145           sbc(t0, t0, zr);
5146         } cbnz(t0, again);
5147       } bind(post);
5148     }
5149 
5150     // Move memory at s to d, reversing words.
5151     //    Increments d to end of copied memory
5152     //    Destroys tmp1, tmp2
5153     //    Preserves len
5154     //    Leaves s pointing to the address which was in d at start
5155     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
5156       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5157 
5158       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
5159       mov(tmp1, len);
5160       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
5161       sub(s, d, len, ext::uxtw, LogBytesPerWord);
5162     }
5163     // where
5164     void reverse1(Register d, Register s, Register tmp) {
5165       ldr(tmp, pre(s, -wordSize));
5166       ror(tmp, tmp, 32);
5167       str(tmp, post(d, wordSize));
5168     }
5169 
5170     void step_squaring() {
5171       // An extra ACC
5172       step();
5173       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5174     }
5175 
5176     void last_squaring(RegisterOrConstant i) {
5177       Label dont;
5178       // if ((i & 1) == 0) {
5179       tbnz(i.as_register(), 0, dont); {
5180         // MACC(Ra, Rb, t0, t1, t2);
5181         // Ra = *++Pa;
5182         // Rb = *--Pb;
5183         umulh(Rhi_ab, Ra, Rb);
5184         mul(Rlo_ab, Ra, Rb);
5185         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5186       } bind(dont);
5187     }
5188 
5189     void extra_step_squaring() {
5190       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5191 
5192       // MACC(Rm, Rn, t0, t1, t2);
5193       // Rm = *++Pm;
5194       // Rn = *--Pn;
5195       umulh(Rhi_mn, Rm, Rn);
5196       mul(Rlo_mn, Rm, Rn);
5197       ldr(Rm, pre(Pm, wordSize));
5198       ldr(Rn, pre(Pn, -wordSize));
5199     }
5200 
5201     void post1_squaring() {
5202       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5203 
5204       // *Pm = Rm = t0 * inv;
5205       mul(Rm, t0, inv);
5206       str(Rm, Address(Pm));
5207 
5208       // MACC(Rm, Rn, t0, t1, t2);
5209       // t0 = t1; t1 = t2; t2 = 0;
5210       umulh(Rhi_mn, Rm, Rn);
5211 
5212 #ifndef PRODUCT
5213       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5214       {
5215         mul(Rlo_mn, Rm, Rn);
5216         add(Rlo_mn, t0, Rlo_mn);
5217         Label ok;
5218         cbz(Rlo_mn, ok); {
5219           stop("broken Montgomery multiply");
5220         } bind(ok);
5221       }
5222 #endif
5223       // We have very carefully set things up so that
5224       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5225       // the lower half of Rm * Rn because we know the result already:
5226       // it must be -t0.  t0 + (-t0) must generate a carry iff
5227       // t0 != 0.  So, rather than do a mul and an adds we just set
5228       // the carry flag iff t0 is nonzero.
5229       //
5230       // mul(Rlo_mn, Rm, Rn);
5231       // adds(zr, t0, Rlo_mn);
5232       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5233       adcs(t0, t1, Rhi_mn);
5234       adc(t1, t2, zr);
5235       mov(t2, zr);
5236     }
5237 
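    // Add the 128-bit product Rhi:Rlo into the triple-precision accumulator
    // t2:t1:t0.  In C, approximately:
    //   t0 += Rlo;  t1 += Rhi + carry(t0);  t2 += carry(t1);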
5238     void acc(Register Rhi, Register Rlo,
5239              Register t0, Register t1, Register t2) {
5240       adds(t0, t0, Rlo);
5241       adcs(t1, t1, Rhi);
5242       adc(t2, t2, zr);
5243     }
5244 
5245   public:
5246     /**
5247      * Fast Montgomery multiplication.  The derivation of the
5248      * algorithm is in A Cryptographic Library for the Motorola
5249      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5250      *
5251      * Arguments:
5252      *
5253      * Inputs for multiplication:
5254      *   c_rarg0   - int array elements a
5255      *   c_rarg1   - int array elements b
5256      *   c_rarg2   - int array elements n (the modulus)
5257      *   c_rarg3   - int length
5258      *   c_rarg4   - int inv
5259      *   c_rarg5   - int array elements m (the result)
5260      *
5261      * Inputs for squaring:
5262      *   c_rarg0   - int array elements a
5263      *   c_rarg1   - int array elements n (the modulus)
5264      *   c_rarg2   - int length
5265      *   c_rarg3   - int inv
5266      *   c_rarg4   - int array elements m (the result)
5267      *
5268      */
5269     address generate_multiply() {
5270       Label argh, nothing;
5271       bind(argh);
5272       stop("MontgomeryMultiply total_allocation must be <= 8192");
5273 
5274       align(CodeEntryAlignment);
5275       address entry = pc();
5276 
5277       cbzw(Rlen, nothing);
5278 
5279       enter();
5280 
5281       // Make room.
5282       cmpw(Rlen, 512);
5283       br(Assembler::HI, argh);
5284       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5285       andr(sp, Ra, -2 * wordSize);
5286 
5287       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5288 
5289       {
5290         // Copy input args, reversing as we go.  We use Ra as a
5291         // temporary variable.
5292         reverse(Ra, Pa_base, Rlen, t0, t1);
5293         if (!_squaring)
5294           reverse(Ra, Pb_base, Rlen, t0, t1);
5295         reverse(Ra, Pn_base, Rlen, t0, t1);
5296       }
5297 
5298       // Push all call-saved registers and also Pm_base which we'll need
5299       // at the end.
5300       save_regs();
5301 
5302 #ifndef PRODUCT
5303       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
5304       {
5305         ldr(Rn, Address(Pn_base, 0));
5306         mul(Rlo_mn, Rn, inv);
5307         subs(zr, Rlo_mn, -1);
5308         Label ok;
5309         br(EQ, ok); {
5310           stop("broken inverse in Montgomery multiply");
5311         } bind(ok);
5312       }
5313 #endif
5314 
5315       mov(Pm_base, Ra);
5316 
5317       mov(t0, zr);
5318       mov(t1, zr);
5319       mov(t2, zr);
5320 
5321       block_comment("for (int i = 0; i < len; i++) {");
5322       mov(Ri, zr); {
5323         Label loop, end;
5324         cmpw(Ri, Rlen);
5325         br(Assembler::GE, end);
5326 
5327         bind(loop);
5328         pre1(Ri);
5329 
5330         block_comment("  for (j = i; j; j--) {"); {
5331           movw(Rj, Ri);
5332           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5333         } block_comment("  } // j");
5334 
5335         post1();
5336         addw(Ri, Ri, 1);
5337         cmpw(Ri, Rlen);
5338         br(Assembler::LT, loop);
5339         bind(end);
5340         block_comment("} // i");
5341       }
5342 
5343       block_comment("for (int i = len; i < 2*len; i++) {");
5344       mov(Ri, Rlen); {
5345         Label loop, end;
5346         cmpw(Ri, Rlen, Assembler::LSL, 1);
5347         br(Assembler::GE, end);
5348 
5349         bind(loop);
5350         pre2(Ri, Rlen);
5351 
5352         block_comment("  for (j = len*2-i-1; j; j--) {"); {
5353           lslw(Rj, Rlen, 1);
5354           subw(Rj, Rj, Ri);
5355           subw(Rj, Rj, 1);
5356           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5357         } block_comment("  } // j");
5358 
5359         post2(Ri, Rlen);
5360         addw(Ri, Ri, 1);
5361         cmpw(Ri, Rlen, Assembler::LSL, 1);
5362         br(Assembler::LT, loop);
5363         bind(end);
5364       }
5365       block_comment("} // i");
5366 
5367       normalize(Rlen);
5368 
5369       mov(Ra, Pm_base);  // Save Pm_base in Ra
5370       restore_regs();  // Restore caller's Pm_base
5371 
5372       // Copy our result into caller's Pm_base
5373       reverse(Pm_base, Ra, Rlen, t0, t1);
5374 
5375       leave();
5376       bind(nothing);
5377       ret(lr);
5378 
5379       return entry;
5380     }
5381     // In C, approximately:
5382 
5383     // void
5384     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
5385     //                     unsigned long Pn_base[], unsigned long Pm_base[],
5386     //                     unsigned long inv, int len) {
5387     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5388     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5389     //   unsigned long Ra, Rb, Rn, Rm;
5390 
5391     //   int i;
5392 
5393     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5394 
5395     //   for (i = 0; i < len; i++) {
5396     //     int j;
5397 
5398     //     Pa = Pa_base;
5399     //     Pb = Pb_base + i;
5400     //     Pm = Pm_base;
5401     //     Pn = Pn_base + i;
5402 
5403     //     Ra = *Pa;
5404     //     Rb = *Pb;
5405     //     Rm = *Pm;
5406     //     Rn = *Pn;
5407 
5408     //     int iters = i;
5409     //     for (j = 0; iters--; j++) {
5410     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5411     //       MACC(Ra, Rb, t0, t1, t2);
5412     //       Ra = *++Pa;
5413     //       Rb = *--Pb;
5414     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5415     //       MACC(Rm, Rn, t0, t1, t2);
5416     //       Rm = *++Pm;
5417     //       Rn = *--Pn;
5418     //     }
5419 
5420     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
5421     //     MACC(Ra, Rb, t0, t1, t2);
5422     //     *Pm = Rm = t0 * inv;
5423     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5424     //     MACC(Rm, Rn, t0, t1, t2);
5425 
5426     //     assert(t0 == 0, "broken Montgomery multiply");
5427 
5428     //     t0 = t1; t1 = t2; t2 = 0;
5429     //   }
5430 
5431     //   for (i = len; i < 2*len; i++) {
5432     //     int j;
5433 
5434     //     Pa = Pa_base + i-len;
5435     //     Pb = Pb_base + len;
5436     //     Pm = Pm_base + i-len;
5437     //     Pn = Pn_base + len;
5438 
5439     //     Ra = *++Pa;
5440     //     Rb = *--Pb;
5441     //     Rm = *++Pm;
5442     //     Rn = *--Pn;
5443 
5444     //     int iters = len*2-i-1;
5445     //     for (j = i-len+1; iters--; j++) {
5446     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5447     //       MACC(Ra, Rb, t0, t1, t2);
5448     //       Ra = *++Pa;
5449     //       Rb = *--Pb;
5450     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5451     //       MACC(Rm, Rn, t0, t1, t2);
5452     //       Rm = *++Pm;
5453     //       Rn = *--Pn;
5454     //     }
5455 
5456     //     Pm_base[i-len] = t0;
5457     //     t0 = t1; t1 = t2; t2 = 0;
5458     //   }
5459 
5460     //   while (t0)
5461     //     t0 = sub(Pm_base, Pn_base, t0, len);
5462     // }
5463 
5464     /**
5465      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
5466      * multiplies than Montgomery multiplication so it should be up to
5467      * 25% faster.  However, its loop control is more complex and it
5468      * may actually run slower on some machines.
5469      *
5470      * Arguments:
5471      *
5472      * Inputs:
5473      *   c_rarg0   - int array elements a
5474      *   c_rarg1   - int array elements n (the modulus)
5475      *   c_rarg2   - int length
5476      *   c_rarg3   - int inv
5477      *   c_rarg4   - int array elements m (the result)
5478      *
5479      */
5480     address generate_square() {
5481       Label argh;
5482       bind(argh);
5483       stop("MontgomeryMultiply total_allocation must be <= 8192");
5484 
5485       align(CodeEntryAlignment);
5486       address entry = pc();
5487 
5488       enter();
5489 
5490       // Make room.
5491       cmpw(Rlen, 512);
5492       br(Assembler::HI, argh);
5493       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5494       andr(sp, Ra, -2 * wordSize);
5495 
5496       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5497 
5498       {
5499         // Copy input args, reversing as we go.  We use Ra as a
5500         // temporary variable.
5501         reverse(Ra, Pa_base, Rlen, t0, t1);
5502         reverse(Ra, Pn_base, Rlen, t0, t1);
5503       }
5504 
5505       // Push all call-saved registers and also Pm_base which we'll need
5506       // at the end.
5507       save_regs();
5508 
5509       mov(Pm_base, Ra);
5510 
5511       mov(t0, zr);
5512       mov(t1, zr);
5513       mov(t2, zr);
5514 
5515       block_comment("for (int i = 0; i < len; i++) {");
5516       mov(Ri, zr); {
5517         Label loop, end;
5518         bind(loop);
5519         cmp(Ri, Rlen);
5520         br(Assembler::GE, end);
5521 
5522         pre1(Ri);
5523 
5524         block_comment("for (j = (i+1)/2; j; j--) {"); {
5525           add(Rj, Ri, 1);
5526           lsr(Rj, Rj, 1);
5527           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5528         } block_comment("  } // j");
5529 
5530         last_squaring(Ri);
5531 
5532         block_comment("  for (j = i/2; j; j--) {"); {
5533           lsr(Rj, Ri, 1);
5534           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5535         } block_comment("  } // j");
5536 
5537         post1_squaring();
5538         add(Ri, Ri, 1);
5539         cmp(Ri, Rlen);
5540         br(Assembler::LT, loop);
5541 
5542         bind(end);
5543         block_comment("} // i");
5544       }
5545 
5546       block_comment("for (int i = len; i < 2*len; i++) {");
5547       mov(Ri, Rlen); {
5548         Label loop, end;
5549         bind(loop);
5550         cmp(Ri, Rlen, Assembler::LSL, 1);
5551         br(Assembler::GE, end);
5552 
5553         pre2(Ri, Rlen);
5554 
5555         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
5556           lsl(Rj, Rlen, 1);
5557           sub(Rj, Rj, Ri);
5558           sub(Rj, Rj, 1);
5559           lsr(Rj, Rj, 1);
5560           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5561         } block_comment("  } // j");
5562 
5563         last_squaring(Ri);
5564 
5565         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
5566           lsl(Rj, Rlen, 1);
5567           sub(Rj, Rj, Ri);
5568           lsr(Rj, Rj, 1);
5569           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5570         } block_comment("  } // j");
5571 
5572         post2(Ri, Rlen);
5573         add(Ri, Ri, 1);
5574         cmp(Ri, Rlen, Assembler::LSL, 1);
5575 
5576         br(Assembler::LT, loop);
5577         bind(end);
5578         block_comment("} // i");
5579       }
5580 
5581       normalize(Rlen);
5582 
5583       mov(Ra, Pm_base);  // Save Pm_base in Ra
5584       restore_regs();  // Restore caller's Pm_base
5585 
5586       // Copy our result into caller's Pm_base
5587       reverse(Pm_base, Ra, Rlen, t0, t1);
5588 
5589       leave();
5590       ret(lr);
5591 
5592       return entry;
5593     }
5594     // In C, approximately:
5595 
5596     // void
5597     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
5598     //                   unsigned long Pm_base[], unsigned long inv, int len) {
5599     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5600     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5601     //   unsigned long Ra, Rb, Rn, Rm;
5602 
5603     //   int i;
5604 
5605     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5606 
5607     //   for (i = 0; i < len; i++) {
5608     //     int j;
5609 
5610     //     Pa = Pa_base;
5611     //     Pb = Pa_base + i;
5612     //     Pm = Pm_base;
5613     //     Pn = Pn_base + i;
5614 
5615     //     Ra = *Pa;
5616     //     Rb = *Pb;
5617     //     Rm = *Pm;
5618     //     Rn = *Pn;
5619 
5620     //     int iters = (i+1)/2;
5621     //     for (j = 0; iters--; j++) {
5622     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5623     //       MACC2(Ra, Rb, t0, t1, t2);
5624     //       Ra = *++Pa;
5625     //       Rb = *--Pb;
5626     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5627     //       MACC(Rm, Rn, t0, t1, t2);
5628     //       Rm = *++Pm;
5629     //       Rn = *--Pn;
5630     //     }
5631     //     if ((i & 1) == 0) {
5632     //       assert(Ra == Pa_base[j], "must be");
5633     //       MACC(Ra, Ra, t0, t1, t2);
5634     //     }
5635     //     iters = i/2;
5636     //     assert(iters == i-j, "must be");
5637     //     for (; iters--; j++) {
5638     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5639     //       MACC(Rm, Rn, t0, t1, t2);
5640     //       Rm = *++Pm;
5641     //       Rn = *--Pn;
5642     //     }
5643 
5644     //     *Pm = Rm = t0 * inv;
5645     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5646     //     MACC(Rm, Rn, t0, t1, t2);
5647 
5648     //     assert(t0 == 0, "broken Montgomery multiply");
5649 
5650     //     t0 = t1; t1 = t2; t2 = 0;
5651     //   }
5652 
5653     //   for (i = len; i < 2*len; i++) {
5654     //     int start = i-len+1;
5655     //     int end = start + (len - start)/2;
5656     //     int j;
5657 
5658     //     Pa = Pa_base + i-len;
5659     //     Pb = Pa_base + len;
5660     //     Pm = Pm_base + i-len;
5661     //     Pn = Pn_base + len;
5662 
5663     //     Ra = *++Pa;
5664     //     Rb = *--Pb;
5665     //     Rm = *++Pm;
5666     //     Rn = *--Pn;
5667 
5668     //     int iters = (2*len-i-1)/2;
5669     //     assert(iters == end-start, "must be");
5670     //     for (j = start; iters--; j++) {
5671     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5672     //       MACC2(Ra, Rb, t0, t1, t2);
5673     //       Ra = *++Pa;
5674     //       Rb = *--Pb;
5675     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5676     //       MACC(Rm, Rn, t0, t1, t2);
5677     //       Rm = *++Pm;
5678     //       Rn = *--Pn;
5679     //     }
5680     //     if ((i & 1) == 0) {
5681     //       assert(Ra == Pa_base[j], "must be");
5682     //       MACC(Ra, Ra, t0, t1, t2);
5683     //     }
5684     //     iters =  (2*len-i)/2;
5685     //     assert(iters == len-j, "must be");
5686     //     for (; iters--; j++) {
5687     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5688     //       MACC(Rm, Rn, t0, t1, t2);
5689     //       Rm = *++Pm;
5690     //       Rn = *--Pn;
5691     //     }
5692     //     Pm_base[i-len] = t0;
5693     //     t0 = t1; t1 = t2; t2 = 0;
5694     //   }
5695 
5696     //   while (t0)
5697     //     t0 = sub(Pm_base, Pn_base, t0, len);
5698     // }
5699   };
5700 
5701 
5702   // Initialization
5703   void generate_initial() {
    // Generate initial stubs and initialize the entry points

    // Entry points that exist on all platforms. Note: This is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.
5711 
5712     StubRoutines::_forward_exception_entry = generate_forward_exception();
5713 
5714     StubRoutines::_call_stub_entry =
5715       generate_call_stub(StubRoutines::_call_stub_return_address);
5716 
5717     // is referenced by megamorphic call
5718     StubRoutines::_catch_exception_entry = generate_catch_exception();
5719 
5720     // Build this early so it's available for the interpreter.
5721     StubRoutines::_throw_StackOverflowError_entry =
5722       generate_throw_exception("StackOverflowError throw_exception",
5723                                CAST_FROM_FN_PTR(address,
5724                                                 SharedRuntime::throw_StackOverflowError));
5725     StubRoutines::_throw_delayed_StackOverflowError_entry =
5726       generate_throw_exception("delayed StackOverflowError throw_exception",
5727                                CAST_FROM_FN_PTR(address,
5728                                                 SharedRuntime::throw_delayed_StackOverflowError));
5729     if (UseCRC32Intrinsics) {
      // set the table address before generating the stubs that use it
5731       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5732       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5733     }
5734 
5735     if (UseCRC32CIntrinsics) {
5736       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5737     }
5738 
5739     // Disabled until JDK-8210858 is fixed
5740     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5741     //   StubRoutines::_dlog = generate_dlog();
5742     // }
5743 
5744     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5745       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5746     }
5747 
5748     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5749       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5750     }
5751 
5752     // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
                                                       &StubRoutines::_safefetch32_fault_pc,
                                                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                                                       &StubRoutines::_safefetchN_fault_pc,
                                                       &StubRoutines::_safefetchN_continuation_pc);
  }

  void generate_all() {
    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    // has negatives stub for large arrays.
    StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    generate_compare_long_strings();

    generate_string_indexof_stubs();

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
    if (bs_nm != NULL) {
      StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
    }
#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }
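    // Note (a hedged sketch, not the stub ABI): with squaring enabled the
    // generated stub runs the multiply path with both operands equal, so
    // semantically
    //
    //   montgomery_square(a, n, len, inv, m) == montgomery_multiply(a, a, n, len, inv, m)
    //
    // for word arrays a (multiplicand), n (modulus), m (result) and the
    // precomputed inverse inv.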
#endif // COMPILER2

    // generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

    StubRoutines::aarch64::set_completed();
  }

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

#define UCM_TABLE_MAX_ENTRIES 8
void StubGenerator_generate(CodeBuffer* code, bool all) {
  if (UnsafeCopyMemory::_table == NULL) {
    UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
  }
  StubGenerator g(code, all);
}
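
// A hedged usage sketch: the VM calls this entry point twice from
// stubRoutines.cpp, once early with all == false so generate_initial()
// can provide the stubs the interpreter needs, and again later with
// all == true to run generate_all().  The buffer names below are
// illustrative only:
//
//   StubGenerator_generate(&buffer1, false);  // StubRoutines::initialize1()
//   StubGenerator_generate(&buffer2, true);   // StubRoutines::initialize2()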