1 /*
   2  * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "memory/universe.hpp"
  33 #include "nativeInst_aarch64.hpp"
  34 #include "oops/instanceOop.hpp"
  35 #include "oops/method.hpp"
  36 #include "oops/objArrayKlass.hpp"
  37 #include "oops/oop.inline.hpp"
  38 #include "prims/methodHandles.hpp"
  39 #include "runtime/frame.inline.hpp"
  40 #include "runtime/handles.inline.hpp"
  41 #include "runtime/sharedRuntime.hpp"
  42 #include "runtime/stubCodeGenerator.hpp"
  43 #include "runtime/stubRoutines.hpp"
  44 #include "runtime/thread.inline.hpp"
  45 #include "utilities/align.hpp"
  46 #include "utilities/powerOfTwo.hpp"
  47 #ifdef COMPILER2
  48 #include "opto/runtime.hpp"
  49 #endif
  50 #if INCLUDE_ZGC
  51 #include "gc/z/zThreadLocalData.hpp"
  52 #endif
  53 
  54 // Declaration and definition of StubGenerator (no .hpp file).
  55 // For a more detailed description of the stub routine structure
  56 // see the comment in stubRoutines.hpp
  57 
  58 #undef __
  59 #define __ _masm->
  60 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  61 
  62 #ifdef PRODUCT
  63 #define BLOCK_COMMENT(str) /* nothing */
  64 #else
  65 #define BLOCK_COMMENT(str) __ block_comment(str)
  66 #endif
  67 
  68 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  69 
  70 // Stub Code definitions
  71 
  72 class StubGenerator: public StubCodeGenerator {
  73  private:
  74 
  75 #ifdef PRODUCT
  76 #define inc_counter_np(counter) ((void)0)
  77 #else
  78   void inc_counter_np_(int& counter) {
  79     __ lea(rscratch2, ExternalAddress((address)&counter));
  80     __ ldrw(rscratch1, Address(rscratch2));
  81     __ addw(rscratch1, rscratch1, 1);
  82     __ strw(rscratch1, Address(rscratch2));
  83   }
  84 #define inc_counter_np(counter) \
  85   BLOCK_COMMENT("inc_counter " #counter); \
  86   inc_counter_np_(counter);
  87 #endif
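       // n.b. the counter update above is a plain (non-atomic)
       // load/increment/store through rscratch1/rscratch2; these counters
       // only feed debug statistics, so the occasional lost update is
       // assumed to be acceptable here.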
  88 
  89   // Call stubs are used to call Java from C
  90   //
  91   // Arguments:
  92   //    c_rarg0:   call wrapper address                   address
  93   //    c_rarg1:   result                                 address
  94   //    c_rarg2:   result type                            BasicType
  95   //    c_rarg3:   method                                 Method*
  96   //    c_rarg4:   (interpreter) entry point              address
  97   //    c_rarg5:   parameters                             intptr_t*
  98   //    c_rarg6:   parameter size (in words)              int
  99   //    c_rarg7:   thread                                 Thread*
 100   //
 101   // There is no return from the stub itself as any Java result
 102   // is written to result
 103   //
 104   // we save r30 (lr) as the return PC at the base of the frame and
 105   // link r29 (fp) below it as the frame pointer, then install sp (r31)
 106   // into fp.
 107   //
 108   // we save r0-r7, which accounts for all the c arguments.
 109   //
 110   // TODO: strictly do we need to save them all? they are treated as
 111   // volatile by C so could we omit saving the ones we are going to
 112   // place in global registers (thread? method?) or those we only use
 113   // during setup of the Java call?
 114   //
 115   // we don't need to save r8 which C uses as an indirect result location
 116   // return register.
 117   //
 118   // we don't need to save r9-r15 which both C and Java treat as
 119   // volatile
 120   //
 121   // we don't need to save r16-18 because Java does not use them
 122   //
 123   // we save r19-r28 which Java uses as scratch registers and C
 124   // expects to be callee-save
 125   //
 126   // we save the bottom 64 bits of each value stored in v8-v15; it is
 127   // the responsibility of the caller to preserve larger values.
 128   //
 129   // so the stub frame looks like this when we enter Java code
 130   //
 131   //     [ return_from_Java     ] <--- sp
 132   //     [ argument word n      ]
 133   //      ...
 134   // -27 [ argument word 1      ]
 135   // -26 [ saved v15            ] <--- sp_after_call
 136   // -25 [ saved v14            ]
 137   // -24 [ saved v13            ]
 138   // -23 [ saved v12            ]
 139   // -22 [ saved v11            ]
 140   // -21 [ saved v10            ]
 141   // -20 [ saved v9             ]
 142   // -19 [ saved v8             ]
 143   // -18 [ saved r28            ]
 144   // -17 [ saved r27            ]
 145   // -16 [ saved r26            ]
 146   // -15 [ saved r25            ]
 147   // -14 [ saved r24            ]
 148   // -13 [ saved r23            ]
 149   // -12 [ saved r22            ]
 150   // -11 [ saved r21            ]
 151   // -10 [ saved r20            ]
 152   //  -9 [ saved r19            ]
 153   //  -8 [ call wrapper    (r0) ]
 154   //  -7 [ result          (r1) ]
 155   //  -6 [ result type     (r2) ]
 156   //  -5 [ method          (r3) ]
 157   //  -4 [ entry point     (r4) ]
 158   //  -3 [ parameters      (r5) ]
 159   //  -2 [ parameter size  (r6) ]
 160   //  -1 [ thread          (r7) ]
 161   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 162   //   1 [ saved lr       (r30) ]
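
       // For orientation, the VM reaches this stub through the CallStub
       // function pointer type declared in stubRoutines.hpp; a rough sketch
       // of that C-level signature (see stubRoutines.hpp for the
       // authoritative declaration) and how it maps onto the argument
       // registers listed above:
       //
       //   typedef void (*CallStub)(address   link,          // c_rarg0
       //                            intptr_t* result,        // c_rarg1
       //                            BasicType result_type,   // c_rarg2
       //                            Method*   method,        // c_rarg3
       //                            address   entry_point,   // c_rarg4
       //                            intptr_t* parameters,    // c_rarg5
       //                            int       size_of_parameters, // c_rarg6
       //                            TRAPS);                  // c_rarg7 (thread)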
 163 
 164   // Call stub stack layout word offsets from fp
 165   enum call_stub_layout {
 166     sp_after_call_off = -26,
 167 
 168     d15_off            = -26,
 169     d13_off            = -24,
 170     d11_off            = -22,
 171     d9_off             = -20,
 172 
 173     r28_off            = -18,
 174     r26_off            = -16,
 175     r24_off            = -14,
 176     r22_off            = -12,
 177     r20_off            = -10,
 178     call_wrapper_off   =  -8,
 179     result_off         =  -7,
 180     result_type_off    =  -6,
 181     method_off         =  -5,
 182     entry_point_off    =  -4,
 183     parameter_size_off =  -2,
 184     thread_off         =  -1,
 185     fp_f               =   0,
 186     retaddr_off        =   1,
 187   };
 188 
 189   address generate_call_stub(address& return_address) {
 190     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 191            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 192            "adjust this code");
 193 
 194     StubCodeMark mark(this, "StubRoutines", "call_stub");
 195     address start = __ pc();
 196 
 197     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 198 
 199     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 200     const Address result        (rfp, result_off         * wordSize);
 201     const Address result_type   (rfp, result_type_off    * wordSize);
 202     const Address method        (rfp, method_off         * wordSize);
 203     const Address entry_point   (rfp, entry_point_off    * wordSize);
 204     const Address parameter_size(rfp, parameter_size_off * wordSize);
 205 
 206     const Address thread        (rfp, thread_off         * wordSize);
 207 
 208     const Address d15_save      (rfp, d15_off * wordSize);
 209     const Address d13_save      (rfp, d13_off * wordSize);
 210     const Address d11_save      (rfp, d11_off * wordSize);
 211     const Address d9_save       (rfp, d9_off * wordSize);
 212 
 213     const Address r28_save      (rfp, r28_off * wordSize);
 214     const Address r26_save      (rfp, r26_off * wordSize);
 215     const Address r24_save      (rfp, r24_off * wordSize);
 216     const Address r22_save      (rfp, r22_off * wordSize);
 217     const Address r20_save      (rfp, r20_off * wordSize);
 218 
 219     // stub code
 220 
 221     address aarch64_entry = __ pc();
 222 
 223     // set up frame and move sp to end of save area
 224     __ enter();
 225     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 226 
 227     // save register parameters and Java scratch/global registers
 228     // n.b. we save thread even though it gets installed in
 229     // rthread because we want to sanity check rthread later
 230     __ str(c_rarg7,  thread);
 231     __ strw(c_rarg6, parameter_size);
 232     __ stp(c_rarg4, c_rarg5,  entry_point);
 233     __ stp(c_rarg2, c_rarg3,  result_type);
 234     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 235 
 236     __ stp(r20, r19,   r20_save);
 237     __ stp(r22, r21,   r22_save);
 238     __ stp(r24, r23,   r24_save);
 239     __ stp(r26, r25,   r26_save);
 240     __ stp(r28, r27,   r28_save);
 241 
 242     __ stpd(v9,  v8,   d9_save);
 243     __ stpd(v11, v10,  d11_save);
 244     __ stpd(v13, v12,  d13_save);
 245     __ stpd(v15, v14,  d15_save);
 246 
 247     // install Java thread in global register now we have saved
 248     // whatever value it held
 249     __ mov(rthread, c_rarg7);
 250     // And method
 251     __ mov(rmethod, c_rarg3);
 252 
 253     // set up the heapbase register
 254     __ reinit_heapbase();
 255 
 256 #ifdef ASSERT
 257     // make sure we have no pending exceptions
 258     {
 259       Label L;
 260       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 261       __ cmp(rscratch1, (u1)NULL_WORD);
 262       __ br(Assembler::EQ, L);
 263       __ stop("StubRoutines::call_stub: entered with pending exception");
 264       __ BIND(L);
 265     }
 266 #endif
 267     // pass parameters if any
 268     __ mov(esp, sp);
 269     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 270     __ andr(sp, rscratch1, -2 * wordSize);
 271 
 272     BLOCK_COMMENT("pass parameters if any");
 273     Label parameters_done;
 274     // parameter count is still in c_rarg6
 275     // and parameter pointer identifying param 1 is in c_rarg5
 276     __ cbzw(c_rarg6, parameters_done);
 277 
 278     address loop = __ pc();
 279     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 280     __ subsw(c_rarg6, c_rarg6, 1);
 281     __ push(rscratch1);
 282     __ br(Assembler::GT, loop);
 283 
 284     __ BIND(parameters_done);
 285 
 286     // call Java entry -- passing Method* and current sp
 287     //      rmethod: Method*
 288     //      r13: sender sp
 289     BLOCK_COMMENT("call Java function");
 290     __ mov(r13, sp);
 291     __ blr(c_rarg4);
 292 
 293     // we do this here because the notify will already have been done
 294     // if we get to the next instruction via an exception
 295     //
 296     // n.b. adding this instruction here affects the calculation of
 297     // whether or not a routine returns to the call stub (used when
 298     // doing stack walks) since the normal test is to check the return
 299     // pc against the address saved below. so we may need to allow for
 300     // this extra instruction in the check.
 301 
 302     // save current address for use by exception handling code
 303 
 304     return_address = __ pc();
 305 
 306     // store result depending on type (everything that is not
 307     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 308     // n.b. this assumes Java returns an integral result in r0
 309     // and a floating result in j_farg0
 310     __ ldr(j_rarg2, result);
 311     Label is_long, is_float, is_double, exit;
 312     __ ldr(j_rarg1, result_type);
 313     __ cmp(j_rarg1, (u1)T_OBJECT);
 314     __ br(Assembler::EQ, is_long);
 315     __ cmp(j_rarg1, (u1)T_LONG);
 316     __ br(Assembler::EQ, is_long);
 317     __ cmp(j_rarg1, (u1)T_FLOAT);
 318     __ br(Assembler::EQ, is_float);
 319     __ cmp(j_rarg1, (u1)T_DOUBLE);
 320     __ br(Assembler::EQ, is_double);
 321 
 322     // handle T_INT case
 323     __ strw(r0, Address(j_rarg2));
 324 
 325     __ BIND(exit);
 326 
 327     // pop parameters
 328     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 329 
 330 #ifdef ASSERT
 331     // verify that threads correspond
 332     {
 333       Label L, S;
 334       __ ldr(rscratch1, thread);
 335       __ cmp(rthread, rscratch1);
 336       __ br(Assembler::NE, S);
 337       __ get_thread(rscratch1);
 338       __ cmp(rthread, rscratch1);
 339       __ br(Assembler::EQ, L);
 340       __ BIND(S);
 341       __ stop("StubRoutines::call_stub: threads must correspond");
 342       __ BIND(L);
 343     }
 344 #endif
 345 
 346     // restore callee-save registers
 347     __ ldpd(v15, v14,  d15_save);
 348     __ ldpd(v13, v12,  d13_save);
 349     __ ldpd(v11, v10,  d11_save);
 350     __ ldpd(v9,  v8,   d9_save);
 351 
 352     __ ldp(r28, r27,   r28_save);
 353     __ ldp(r26, r25,   r26_save);
 354     __ ldp(r24, r23,   r24_save);
 355     __ ldp(r22, r21,   r22_save);
 356     __ ldp(r20, r19,   r20_save);
 357 
 358     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 359     __ ldrw(c_rarg2, result_type);
 360     __ ldr(c_rarg3,  method);
 361     __ ldp(c_rarg4, c_rarg5,  entry_point);
 362     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 363 
 364     // leave frame and return to caller
 365     __ leave();
 366     __ ret(lr);
 367 
 368     // handle return types different from T_INT
 369 
 370     __ BIND(is_long);
 371     __ str(r0, Address(j_rarg2, 0));
 372     __ br(Assembler::AL, exit);
 373 
 374     __ BIND(is_float);
 375     __ strs(j_farg0, Address(j_rarg2, 0));
 376     __ br(Assembler::AL, exit);
 377 
 378     __ BIND(is_double);
 379     __ strd(j_farg0, Address(j_rarg2, 0));
 380     __ br(Assembler::AL, exit);
 381 
 382     return start;
 383   }
 384 
 385   // Return point for a Java call if there's an exception thrown in
 386   // Java code.  The exception is caught and transformed into a
 387   // pending exception stored in JavaThread that can be tested from
 388   // within the VM.
 389   //
 390   // Note: Usually the parameters are removed by the callee. In case
 391   // of an exception crossing an activation frame boundary, that is
 392   // not the case if the callee is compiled code => need to setup the
 393   // rsp.
 394   //
 395   // r0: exception oop
 396 
 397   address generate_catch_exception() {
 398     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 399     address start = __ pc();
 400 
 401     // same as in generate_call_stub():
 402     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 403     const Address thread        (rfp, thread_off         * wordSize);
 404 
 405 #ifdef ASSERT
 406     // verify that threads correspond
 407     {
 408       Label L, S;
 409       __ ldr(rscratch1, thread);
 410       __ cmp(rthread, rscratch1);
 411       __ br(Assembler::NE, S);
 412       __ get_thread(rscratch1);
 413       __ cmp(rthread, rscratch1);
 414       __ br(Assembler::EQ, L);
 415       __ bind(S);
 416       __ stop("StubRoutines::catch_exception: threads must correspond");
 417       __ bind(L);
 418     }
 419 #endif
 420 
 421     // set pending exception
 422     __ verify_oop(r0);
 423 
 424     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 425     __ mov(rscratch1, (address)__FILE__);
 426     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 427     __ movw(rscratch1, (int)__LINE__);
 428     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 429 
 430     // complete return to VM
 431     assert(StubRoutines::_call_stub_return_address != NULL,
 432            "_call_stub_return_address must have been generated before");
 433     __ b(StubRoutines::_call_stub_return_address);
 434 
 435     return start;
 436   }
 437 
 438   // Continuation point for runtime calls returning with a pending
 439   // exception.  The pending exception check happened in the runtime
 440   // or native call stub.  The pending exception in Thread is
 441   // converted into a Java-level exception.
 442   //
 443   // Contract with Java-level exception handlers:
 444   // r0: exception
 445   // r3: throwing pc
 446   //
 447   // NOTE: At entry of this stub, exception-pc must be in LR !!
 448 
 449   // NOTE: this is always used as a jump target within generated code
 450   // so it just needs to be generated code with no prologue
 451 
 452   address generate_forward_exception() {
 453     StubCodeMark mark(this, "StubRoutines", "forward exception");
 454     address start = __ pc();
 455 
 456     // Upon entry, LR points to the return address returning into
 457     // Java (interpreted or compiled) code; i.e., the return address
 458     // becomes the throwing pc.
 459     //
 460     // Arguments pushed before the runtime call are still on the stack
 461     // but the exception handler will reset the stack pointer ->
 462     // ignore them.  A potential result in registers can be ignored as
 463     // well.
 464 
 465 #ifdef ASSERT
 466     // make sure this code is only executed if there is a pending exception
 467     {
 468       Label L;
 469       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 470       __ cbnz(rscratch1, L);
 471       __ stop("StubRoutines::forward exception: no pending exception (1)");
 472       __ bind(L);
 473     }
 474 #endif
 475 
 476     // compute exception handler into r19
 477 
 478     // call the VM to find the handler address associated with the
 479     // caller address. pass thread in r0 and caller pc (ret address)
 480     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 481     // the stack.
 482     __ mov(c_rarg1, lr);
 483     // lr will be trashed by the VM call so we move it to R19
 484     // (callee-saved) because we also need to pass it to the handler
 485     // returned by this call.
 486     __ mov(r19, lr);
 487     BLOCK_COMMENT("call exception_handler_for_return_address");
 488     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 489                          SharedRuntime::exception_handler_for_return_address),
 490                     rthread, c_rarg1);
 491     if (UseSVE > 0) {
 492       // Reinitialize the ptrue predicate register, in case the external runtime
 493       // call clobbers ptrue reg, as we may return to SVE compiled code.
 494       __ reinitialize_ptrue();
 495     }
 496     // we should not really care that lr is no longer the callee
 497     // address. we saved the value the handler needs in r19 so we can
 498     // just copy it to r3. however, the C2 handler will push its own
 499     // frame and then calls into the VM and the VM code asserts that
 500     // the PC for the frame above the handler belongs to a compiled
 501     // Java method. So, we restore lr here to satisfy that assert.
 502     __ mov(lr, r19);
 503     // setup r0 & r3 & clear pending exception
 504     __ mov(r3, r19);
 505     __ mov(r19, r0);
 506     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 507     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 508 
 509 #ifdef ASSERT
 510     // make sure exception is set
 511     {
 512       Label L;
 513       __ cbnz(r0, L);
 514       __ stop("StubRoutines::forward exception: no pending exception (2)");
 515       __ bind(L);
 516     }
 517 #endif
 518 
 519     // continue at exception handler
 520     // r0: exception
 521     // r3: throwing pc
 522     // r19: exception handler
 523     __ verify_oop(r0);
 524     __ br(r19);
 525 
 526     return start;
 527   }
 528 
 529   // Non-destructive plausibility checks for oops
 530   //
 531   // Arguments:
 532   //    r0: oop to verify
 533   //    rscratch1: error message
 534   //
 535   // Stack after saving c_rarg3:
 536   //    [tos + 0]: saved c_rarg3
 537   //    [tos + 1]: saved c_rarg2
 538   //    [tos + 2]: saved lr
 539   //    [tos + 3]: saved rscratch2
 540   //    [tos + 4]: saved r0
 541   //    [tos + 5]: saved rscratch1
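       //
       // For orientation: the expected caller is MacroAssembler::verify_oop,
       // which (roughly) saves r0, rscratch1, rscratch2 and lr, moves the
       // oop into r0 and the address of the error message into rscratch1,
       // and then calls this stub indirectly through
       // StubRoutines::verify_oop_subroutine_entry -- see
       // macroAssembler_aarch64.cpp for the authoritative sequence.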
 542   address generate_verify_oop() {
 543 
 544     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 545     address start = __ pc();
 546 
 547     Label exit, error;
 548 
 549     // save c_rarg2 and c_rarg3
 550     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 551 
 552     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 553     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 554     __ ldr(c_rarg3, Address(c_rarg2));
 555     __ add(c_rarg3, c_rarg3, 1);
 556     __ str(c_rarg3, Address(c_rarg2));
 557 
 558     // object is in r0
 559     // make sure object is 'reasonable'
 560     __ cbz(r0, exit); // if obj is NULL it is OK
 561 
 562 #if INCLUDE_ZGC
 563     if (UseZGC) {
 564       // Check if mask is good.
 565       // verifies that ZAddressBadMask & r0 == 0
 566       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 567       __ andr(c_rarg2, r0, c_rarg3);
 568       __ cbnz(c_rarg2, error);
 569     }
 570 #endif
 571 
 572     // Check if the oop is in the right area of memory
 573     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 574     __ andr(c_rarg2, r0, c_rarg3);
 575     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 576 
 577     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 578     // instruction here because the flags register is live.
 579     __ eor(c_rarg2, c_rarg2, c_rarg3);
 580     __ cbnz(c_rarg2, error);
 581 
 582     // make sure klass is 'reasonable', which is not zero.
 583     __ load_klass(r0, r0);  // get klass
 584     __ cbz(r0, error);      // if klass is NULL it is broken
 585 
 586     // return if everything seems ok
 587     __ bind(exit);
 588 
 589     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 590     __ ret(lr);
 591 
 592     // handle errors
 593     __ bind(error);
 594     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 595 
 596     __ push(RegSet::range(r0, r29), sp);
 597     // debug(char* msg, int64_t pc, int64_t regs[])
 598     __ mov(c_rarg0, rscratch1);      // pass address of error message
 599     __ mov(c_rarg1, lr);             // pass return address
 600     __ mov(c_rarg2, sp);             // pass address of regs on stack
 601 #ifndef PRODUCT
 602     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 603 #endif
 604     BLOCK_COMMENT("call MacroAssembler::debug");
 605     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 606     __ blr(rscratch1);
 607     __ hlt(0);
 608 
 609     return start;
 610   }
 611 
 612   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 613 
 614   // The inner part of zero_words().  This is the bulk operation,
 615   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 616   // caller is responsible for zeroing the last few words.
 617   //
 618   // Inputs:
 619   // r10: the HeapWord-aligned base address of an array to zero.
 620   // r11: the count in HeapWords, r11 > 0.
 621   //
 622   // Returns r10 and r11, adjusted for the caller to clear.
 623   // r10: the base address of the tail of words left to clear.
 624   // r11: the number of words in the tail.
 625   //      r11 < MacroAssembler::zero_words_block_size.
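       //
       // Expected usage (a sketch, not an additional contract): the caller
       // is MacroAssembler::zero_words, which branches here for large
       // counts and then itself stores zeroes into the at most
       // zero_words_block_size - 1 trailing words reported back in r10/r11.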
 626 
 627   address generate_zero_blocks() {
 628     Label done;
 629     Label base_aligned;
 630 
 631     Register base = r10, cnt = r11;
 632 
 633     __ align(CodeEntryAlignment);
 634     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 635     address start = __ pc();
 636 
 637     if (UseBlockZeroing) {
 638       int zva_length = VM_Version::zva_length();
 639 
 640       // Ensure ZVA length can be divided by 16. This is required by
 641       // the subsequent operations.
 642       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 643 
 644       __ tbz(base, 3, base_aligned);
 645       __ str(zr, Address(__ post(base, 8)));
 646       __ sub(cnt, cnt, 1);
 647       __ bind(base_aligned);
 648 
 649       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 650       // alignment.
 651       Label small;
 652       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
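           // n.b. zva_length and BlockZeroingLowLimit are byte counts while
           // cnt is in (8-byte) words, hence the >> 3 in the comparison below.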
 653       __ subs(rscratch1, cnt, low_limit >> 3);
 654       __ br(Assembler::LT, small);
 655       __ zero_dcache_blocks(base, cnt);
 656       __ bind(small);
 657     }
 658 
 659     {
 660       // Number of stp instructions we'll unroll
 661       const int unroll =
 662         MacroAssembler::zero_words_block_size / 2;
 663       // Clear the remaining blocks.
 664       Label loop;
 665       __ subs(cnt, cnt, unroll * 2);
 666       __ br(Assembler::LT, done);
 667       __ bind(loop);
 668       for (int i = 0; i < unroll; i++)
 669         __ stp(zr, zr, __ post(base, 16));
 670       __ subs(cnt, cnt, unroll * 2);
 671       __ br(Assembler::GE, loop);
 672       __ bind(done);
 673       __ add(cnt, cnt, unroll * 2);
 674     }
 675 
 676     __ ret(lr);
 677 
 678     return start;
 679   }
 680 
 681 
 682   typedef enum {
 683     copy_forwards = 1,
 684     copy_backwards = -1
 685   } copy_direction;
 686 
 687   // Bulk copy of blocks of 8 words.
 688   //
 689   // count is a count of words.
 690   //
 691   // Precondition: count >= 8
 692   //
 693   // Postconditions:
 694   //
 695   // The least significant bit of count contains the remaining count
 696   // of words to copy.  The rest of count is trash.
 697   //
 698   // s and d are adjusted to point to the remaining words to copy
 699   //
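       // For example (just illustrating the contract above): with
       // count == 23 the code below copies 16 words in its main loop,
       // then the 4-word and 2-word subblocks selected by bits 2 and 1
       // of count, leaving s and d advanced past 22 words and bit 0 of
       // count telling the caller that one odd word remains.
       //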
 700   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 701                            copy_direction direction) {
 702     int unit = wordSize * direction;
 703     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 704 
 705     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 706       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 707     const Register stride = r13;
 708 
 709     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 710     assert_different_registers(s, d, count, rscratch1);
 711 
 712     Label again, drain;
 713     const char *stub_name;
 714     if (direction == copy_forwards)
 715       stub_name = "forward_copy_longs";
 716     else
 717       stub_name = "backward_copy_longs";
 718 
 719     __ align(CodeEntryAlignment);
 720 
 721     StubCodeMark mark(this, "StubRoutines", stub_name);
 722 
 723     __ bind(start);
 724 
 725     Label unaligned_copy_long;
 726     if (AvoidUnalignedAccesses) {
 727       __ tbnz(d, 3, unaligned_copy_long);
 728     }
 729 
 730     if (direction == copy_forwards) {
 731       __ sub(s, s, bias);
 732       __ sub(d, d, bias);
 733     }
 734 
 735 #ifdef ASSERT
 736     // Make sure we are never given < 8 words
 737     {
 738       Label L;
 739       __ cmp(count, (u1)8);
 740       __ br(Assembler::GE, L);
 741       __ stop("generate_copy_longs called with < 8 words");
 742       __ bind(L);
 743     }
 744 #endif
 745 
 746     // Fill 8 registers
 747     if (UseSIMDForMemoryOps) {
 748       __ ldpq(v0, v1, Address(s, 4 * unit));
 749       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 750     } else {
 751       __ ldp(t0, t1, Address(s, 2 * unit));
 752       __ ldp(t2, t3, Address(s, 4 * unit));
 753       __ ldp(t4, t5, Address(s, 6 * unit));
 754       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 755     }
 756 
 757     __ subs(count, count, 16);
 758     __ br(Assembler::LO, drain);
 759 
 760     int prefetch = PrefetchCopyIntervalInBytes;
 761     bool use_stride = false;
 762     if (direction == copy_backwards) {
 763        use_stride = prefetch > 256;
 764        prefetch = -prefetch;
 765        if (use_stride) __ mov(stride, prefetch);
 766     }
 767 
 768     __ bind(again);
 769 
 770     if (PrefetchCopyIntervalInBytes > 0)
 771       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 772 
 773     if (UseSIMDForMemoryOps) {
 774       __ stpq(v0, v1, Address(d, 4 * unit));
 775       __ ldpq(v0, v1, Address(s, 4 * unit));
 776       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 777       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 778     } else {
 779       __ stp(t0, t1, Address(d, 2 * unit));
 780       __ ldp(t0, t1, Address(s, 2 * unit));
 781       __ stp(t2, t3, Address(d, 4 * unit));
 782       __ ldp(t2, t3, Address(s, 4 * unit));
 783       __ stp(t4, t5, Address(d, 6 * unit));
 784       __ ldp(t4, t5, Address(s, 6 * unit));
 785       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 786       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 787     }
 788 
 789     __ subs(count, count, 8);
 790     __ br(Assembler::HS, again);
 791 
 792     // Drain
 793     __ bind(drain);
 794     if (UseSIMDForMemoryOps) {
 795       __ stpq(v0, v1, Address(d, 4 * unit));
 796       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 797     } else {
 798       __ stp(t0, t1, Address(d, 2 * unit));
 799       __ stp(t2, t3, Address(d, 4 * unit));
 800       __ stp(t4, t5, Address(d, 6 * unit));
 801       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 802     }
 803 
 804     {
 805       Label L1, L2;
 806       __ tbz(count, exact_log2(4), L1);
 807       if (UseSIMDForMemoryOps) {
 808         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 809         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 810       } else {
 811         __ ldp(t0, t1, Address(s, 2 * unit));
 812         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 813         __ stp(t0, t1, Address(d, 2 * unit));
 814         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 815       }
 816       __ bind(L1);
 817 
 818       if (direction == copy_forwards) {
 819         __ add(s, s, bias);
 820         __ add(d, d, bias);
 821       }
 822 
 823       __ tbz(count, 1, L2);
 824       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 825       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 826       __ bind(L2);
 827     }
 828 
 829     __ ret(lr);
 830 
 831     if (AvoidUnalignedAccesses) {
 832       Label drain, again;
 833       // Register order for storing. Order is different for backward copy.
 834 
 835       __ bind(unaligned_copy_long);
 836 
 837       // source address is even aligned, target odd aligned
 838       //
 839       // when forward copying word pairs we read long pairs at offsets
 840       // {0, 2, 4, 6} (in long words). when backwards copying we read
 841       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 842       // address by -2 in the forwards case so we can compute the
 843       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 844       // or -1.
 845       //
 846       // when forward copying we need to store 1 word, 3 pairs and
 847       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 848       // zero offset we adjust the destination by -1, which means we
 849       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 850       //
 851       // When backwards copying we need to store 1 word, 3 pairs and
 852       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 853       // offsets {1, 3, 5, 7, 8} * unit.
 854 
 855       if (direction == copy_forwards) {
 856         __ sub(s, s, 16);
 857         __ sub(d, d, 8);
 858       }
 859 
 860       // Fill 8 registers
 861       //
 862       // for forwards copy s was offset by -16 from the original input
 863       // value of s so the register contents are at these offsets
 864       // relative to the 64 byte block addressed by that original input
 865       // and so on for each successive 64 byte block when s is updated
 866       //
 867       // t0 at offset 0,  t1 at offset 8
 868       // t2 at offset 16, t3 at offset 24
 869       // t4 at offset 32, t5 at offset 40
 870       // t6 at offset 48, t7 at offset 56
 871 
 872       // for backwards copy s was not offset so the register contents
 873       // are at these offsets into the preceding 64 byte block
 874       // relative to that original input and so on for each successive
 875       // preceding 64 byte block when s is updated. this explains the
 876       // slightly counter-intuitive looking pattern of register usage
 877       // in the stp instructions for backwards copy.
 878       //
 879       // t0 at offset -16, t1 at offset -8
 880       // t2 at offset -32, t3 at offset -24
 881       // t4 at offset -48, t5 at offset -40
 882       // t6 at offset -64, t7 at offset -56
 883 
 884       __ ldp(t0, t1, Address(s, 2 * unit));
 885       __ ldp(t2, t3, Address(s, 4 * unit));
 886       __ ldp(t4, t5, Address(s, 6 * unit));
 887       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 888 
 889       __ subs(count, count, 16);
 890       __ br(Assembler::LO, drain);
 891 
 892       int prefetch = PrefetchCopyIntervalInBytes;
 893       bool use_stride = false;
 894       if (direction == copy_backwards) {
 895          use_stride = prefetch > 256;
 896          prefetch = -prefetch;
 897          if (use_stride) __ mov(stride, prefetch);
 898       }
 899 
 900       __ bind(again);
 901 
 902       if (PrefetchCopyIntervalInBytes > 0)
 903         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 904 
 905       if (direction == copy_forwards) {
 906        // allowing for the offset of -8 the store instructions place
 907        // registers into the target 64 byte block at the following
 908        // offsets
 909        //
 910        // t0 at offset 0
 911        // t1 at offset 8,  t2 at offset 16
 912        // t3 at offset 24, t4 at offset 32
 913        // t5 at offset 40, t6 at offset 48
 914        // t7 at offset 56
 915 
 916         __ str(t0, Address(d, 1 * unit));
 917         __ stp(t1, t2, Address(d, 2 * unit));
 918         __ ldp(t0, t1, Address(s, 2 * unit));
 919         __ stp(t3, t4, Address(d, 4 * unit));
 920         __ ldp(t2, t3, Address(s, 4 * unit));
 921         __ stp(t5, t6, Address(d, 6 * unit));
 922         __ ldp(t4, t5, Address(s, 6 * unit));
 923         __ str(t7, Address(__ pre(d, 8 * unit)));
 924         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 925       } else {
 926        // d was not offset when we started so the registers are
 927        // written into the 64 byte block preceding d with the following
 928        // offsets
 929        //
 930        // t1 at offset -8
 931        // t3 at offset -24, t0 at offset -16
 932        // t5 at offset -40, t2 at offset -32
 933        // t7 at offset -56, t4 at offset -48
 934        //                   t6 at offset -64
 935        //
 936        // note that this matches the offsets previously noted for the
 937        // loads
 938 
 939         __ str(t1, Address(d, 1 * unit));
 940         __ stp(t3, t0, Address(d, 3 * unit));
 941         __ ldp(t0, t1, Address(s, 2 * unit));
 942         __ stp(t5, t2, Address(d, 5 * unit));
 943         __ ldp(t2, t3, Address(s, 4 * unit));
 944         __ stp(t7, t4, Address(d, 7 * unit));
 945         __ ldp(t4, t5, Address(s, 6 * unit));
 946         __ str(t6, Address(__ pre(d, 8 * unit)));
 947         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 948       }
 949 
 950       __ subs(count, count, 8);
 951       __ br(Assembler::HS, again);
 952 
 953       // Drain
 954       //
 955       // this uses the same pattern of offsets and register arguments
 956       // as above
 957       __ bind(drain);
 958       if (direction == copy_forwards) {
 959         __ str(t0, Address(d, 1 * unit));
 960         __ stp(t1, t2, Address(d, 2 * unit));
 961         __ stp(t3, t4, Address(d, 4 * unit));
 962         __ stp(t5, t6, Address(d, 6 * unit));
 963         __ str(t7, Address(__ pre(d, 8 * unit)));
 964       } else {
 965         __ str(t1, Address(d, 1 * unit));
 966         __ stp(t3, t0, Address(d, 3 * unit));
 967         __ stp(t5, t2, Address(d, 5 * unit));
 968         __ stp(t7, t4, Address(d, 7 * unit));
 969         __ str(t6, Address(__ pre(d, 8 * unit)));
 970       }
 971       // now we need to copy any remaining part block which may
 972       // include a 4 word subblock and/or a 2 word subblock.
 973       // bits 2 and 1 in the count are the tell-tale for whether we
 974       // have each such subblock
 975       {
 976         Label L1, L2;
 977         __ tbz(count, exact_log2(4), L1);
 978        // this is the same as above but copying only 4 longs hence
 979        // with only one intervening stp between the str instructions
 980        // but note that the offsets and registers still follow the
 981        // same pattern
 982         __ ldp(t0, t1, Address(s, 2 * unit));
 983         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 984         if (direction == copy_forwards) {
 985           __ str(t0, Address(d, 1 * unit));
 986           __ stp(t1, t2, Address(d, 2 * unit));
 987           __ str(t3, Address(__ pre(d, 4 * unit)));
 988         } else {
 989           __ str(t1, Address(d, 1 * unit));
 990           __ stp(t3, t0, Address(d, 3 * unit));
 991           __ str(t2, Address(__ pre(d, 4 * unit)));
 992         }
 993         __ bind(L1);
 994 
 995         __ tbz(count, 1, L2);
 996        // this is the same as above but copying only 2 longs hence
 997        // there is no intervening stp between the str instructions
 998        // but note that the offset and register patterns are still
 999        // the same
1000         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1001         if (direction == copy_forwards) {
1002           __ str(t0, Address(d, 1 * unit));
1003           __ str(t1, Address(__ pre(d, 2 * unit)));
1004         } else {
1005           __ str(t1, Address(d, 1 * unit));
1006           __ str(t0, Address(__ pre(d, 2 * unit)));
1007         }
1008         __ bind(L2);
1009 
1010        // for forwards copy we need to re-adjust the offsets we
1011        // applied so that s and d follow the last words written
1012 
1013        if (direction == copy_forwards) {
1014          __ add(s, s, 16);
1015          __ add(d, d, 8);
1016        }
1017 
1018       }
1019 
1020       __ ret(lr);
1021     }
1022   }
1023 
1024   // Small copy: less than 16 bytes.
1025   //
1026   // NB: Ignores all of the bits of count which represent more than 15
1027   // bytes, so a caller doesn't have to mask them.
1028 
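       // For example (illustrative only): a byte copy (granularity 1) with
       // count == 13 (0b1101) moves 8 + 4 + 1 bytes via the doubleword,
       // word and byte tests below; the halfword test is skipped because
       // bit 1 of count is clear.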
1029   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1030     bool is_backwards = step < 0;
1031     size_t granularity = uabs(step);
1032     int direction = is_backwards ? -1 : 1;
1033     int unit = wordSize * direction;
1034 
1035     Label Lword, Lint, Lshort, Lbyte;
1036 
1037     assert(granularity
1038            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1039 
1040     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1041 
1042     // ??? I don't know if this bit-test-and-branch is the right thing
1043     // to do.  It does a lot of jumping, resulting in several
1044     // mispredicted branches.  It might make more sense to do this
1045     // with something like Duff's device with a single computed branch.
1046 
1047     __ tbz(count, 3 - exact_log2(granularity), Lword);
1048     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1049     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1050     __ bind(Lword);
1051 
1052     if (granularity <= sizeof (jint)) {
1053       __ tbz(count, 2 - exact_log2(granularity), Lint);
1054       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1055       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1056       __ bind(Lint);
1057     }
1058 
1059     if (granularity <= sizeof (jshort)) {
1060       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1061       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1062       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1063       __ bind(Lshort);
1064     }
1065 
1066     if (granularity <= sizeof (jbyte)) {
1067       __ tbz(count, 0, Lbyte);
1068       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1069       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1070       __ bind(Lbyte);
1071     }
1072   }
1073 
1074   Label copy_f, copy_b;
1075 
1076   // All-singing all-dancing memory copy.
1077   //
1078   // Copy count units of memory from s to d.  The size of a unit is
1079   // step, which can be positive or negative depending on the direction
1080   // of copy.  If is_aligned is false, we align the source address.
1081   //
1082 
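       // In outline (a sketch of the code below): copies of at most 80
       // bytes (96 with SIMD) are done inline with overlapping loads and
       // stores; anything larger aligns the source to a 2-word boundary,
       // bulk-copies whole words via copy_f/copy_b and finishes the
       // sub-word tail with copy_memory_small.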
1083   void copy_memory(bool is_aligned, Register s, Register d,
1084                    Register count, Register tmp, int step) {
1085     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1086     bool is_backwards = step < 0;
1087     int granularity = uabs(step);
1088     const Register t0 = r3, t1 = r4;
1089 
1090     // <= 96 bytes do inline. Direction doesn't matter because we always
1091     // load all the data before writing anything
1092     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1093     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1094     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1095     const Register send = r17, dend = r18;
1096 
1097     if (PrefetchCopyIntervalInBytes > 0)
1098       __ prfm(Address(s, 0), PLDL1KEEP);
1099     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1100     __ br(Assembler::HI, copy_big);
1101 
1102     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1103     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1104 
1105     __ cmp(count, u1(16/granularity));
1106     __ br(Assembler::LS, copy16);
1107 
1108     __ cmp(count, u1(64/granularity));
1109     __ br(Assembler::HI, copy80);
1110 
1111     __ cmp(count, u1(32/granularity));
1112     __ br(Assembler::LS, copy32);
1113 
1114     // 33..64 bytes
1115     if (UseSIMDForMemoryOps) {
1116       __ ldpq(v0, v1, Address(s, 0));
1117       __ ldpq(v2, v3, Address(send, -32));
1118       __ stpq(v0, v1, Address(d, 0));
1119       __ stpq(v2, v3, Address(dend, -32));
1120     } else {
1121       __ ldp(t0, t1, Address(s, 0));
1122       __ ldp(t2, t3, Address(s, 16));
1123       __ ldp(t4, t5, Address(send, -32));
1124       __ ldp(t6, t7, Address(send, -16));
1125 
1126       __ stp(t0, t1, Address(d, 0));
1127       __ stp(t2, t3, Address(d, 16));
1128       __ stp(t4, t5, Address(dend, -32));
1129       __ stp(t6, t7, Address(dend, -16));
1130     }
1131     __ b(finish);
1132 
1133     // 17..32 bytes
1134     __ bind(copy32);
1135     __ ldp(t0, t1, Address(s, 0));
1136     __ ldp(t2, t3, Address(send, -16));
1137     __ stp(t0, t1, Address(d, 0));
1138     __ stp(t2, t3, Address(dend, -16));
1139     __ b(finish);
1140 
1141     // 65..80/96 bytes
1142     // (96 bytes if SIMD because we do 32 bytes per instruction)
1143     __ bind(copy80);
1144     if (UseSIMDForMemoryOps) {
1145       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1146       __ ldpq(v4, v5, Address(send, -32));
1147       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1148       __ stpq(v4, v5, Address(dend, -32));
1149     } else {
1150       __ ldp(t0, t1, Address(s, 0));
1151       __ ldp(t2, t3, Address(s, 16));
1152       __ ldp(t4, t5, Address(s, 32));
1153       __ ldp(t6, t7, Address(s, 48));
1154       __ ldp(t8, t9, Address(send, -16));
1155 
1156       __ stp(t0, t1, Address(d, 0));
1157       __ stp(t2, t3, Address(d, 16));
1158       __ stp(t4, t5, Address(d, 32));
1159       __ stp(t6, t7, Address(d, 48));
1160       __ stp(t8, t9, Address(dend, -16));
1161     }
1162     __ b(finish);
1163 
1164     // 0..16 bytes
1165     __ bind(copy16);
1166     __ cmp(count, u1(8/granularity));
1167     __ br(Assembler::LO, copy8);
1168 
1169     // 8..16 bytes
1170     __ ldr(t0, Address(s, 0));
1171     __ ldr(t1, Address(send, -8));
1172     __ str(t0, Address(d, 0));
1173     __ str(t1, Address(dend, -8));
1174     __ b(finish);
1175 
1176     if (granularity < 8) {
1177       // 4..7 bytes
1178       __ bind(copy8);
1179       __ tbz(count, 2 - exact_log2(granularity), copy4);
1180       __ ldrw(t0, Address(s, 0));
1181       __ ldrw(t1, Address(send, -4));
1182       __ strw(t0, Address(d, 0));
1183       __ strw(t1, Address(dend, -4));
1184       __ b(finish);
1185       if (granularity < 4) {
1186         // 0..3 bytes
1187         __ bind(copy4);
1188         __ cbz(count, finish); // get rid of 0 case
1189         if (granularity == 2) {
1190           __ ldrh(t0, Address(s, 0));
1191           __ strh(t0, Address(d, 0));
1192         } else { // granularity == 1
1193           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1194           // the first and last byte.
1195           // Handle the 3 byte case by loading and storing base + count/2
1196           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1197           // This does mean that in the 1 byte case we load/store the same
1198           // byte 3 times.
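               // For example (illustrative): count == 3 copies s+0, s+2 and
               // s+1 (count >> 1 == 1) to d+0, d+2 and d+1 respectively;
               // count == 1 stores the same byte three times as noted.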
1199           __ lsr(count, count, 1);
1200           __ ldrb(t0, Address(s, 0));
1201           __ ldrb(t1, Address(send, -1));
1202           __ ldrb(t2, Address(s, count));
1203           __ strb(t0, Address(d, 0));
1204           __ strb(t1, Address(dend, -1));
1205           __ strb(t2, Address(d, count));
1206         }
1207         __ b(finish);
1208       }
1209     }
1210 
1211     __ bind(copy_big);
1212     if (is_backwards) {
1213       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1214       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1215     }
1216 
1217     // Now that we've got the small case out of the way we can align the
1218     // source address on a 2-word boundary.
1219 
1220     Label aligned;
1221 
1222     if (is_aligned) {
1223       // We may have to adjust by 1 word to get s 2-word-aligned.
1224       __ tbz(s, exact_log2(wordSize), aligned);
1225       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1226       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1227       __ sub(count, count, wordSize/granularity);
1228     } else {
1229       if (is_backwards) {
1230         __ andr(rscratch2, s, 2 * wordSize - 1);
1231       } else {
1232         __ neg(rscratch2, s);
1233         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1234       }
1235       // rscratch2 is the byte adjustment needed to align s.
1236       __ cbz(rscratch2, aligned);
1237       int shift = exact_log2(granularity);
1238       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1239       __ sub(count, count, rscratch2);
1240 
1241 #if 0
1242       // ?? This code is only correct for a disjoint copy.  It may or
1243       // may not make sense to use it in that case.
1244 
1245       // Copy the first pair; s and d may not be aligned.
1246       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1247       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1248 
1249       // Align s and d, adjust count
1250       if (is_backwards) {
1251         __ sub(s, s, rscratch2);
1252         __ sub(d, d, rscratch2);
1253       } else {
1254         __ add(s, s, rscratch2);
1255         __ add(d, d, rscratch2);
1256       }
1257 #else
1258       copy_memory_small(s, d, rscratch2, rscratch1, step);
1259 #endif
1260     }
1261 
1262     __ bind(aligned);
1263 
1264     // s is now 2-word-aligned.
1265 
1266     // We have a count of units and some trailing bytes.  Adjust the
1267     // count and do a bulk copy of words.
1268     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1269     if (direction == copy_forwards)
1270       __ bl(copy_f);
1271     else
1272       __ bl(copy_b);
1273 
1274     // And the tail.
1275     copy_memory_small(s, d, count, tmp, step);
1276 
1277     if (granularity >= 8) __ bind(copy8);
1278     if (granularity >= 4) __ bind(copy4);
1279     __ bind(finish);
1280   }
1281 
1282 
1283   void clobber_registers() {
1284 #ifdef ASSERT
1285     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1286     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1287     for (Register r = r3; r <= r18; r++)
1288       if (r != rscratch1) __ mov(r, rscratch1);
1289 #endif
1290   }
1291 
1292   // Scan over array at a for count oops, verifying each one.
1293   // Preserves a and count, clobbers rscratch1 and rscratch2.
1294   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1295     Label loop, end;
1296     __ mov(rscratch1, a);
1297     __ mov(rscratch2, zr);
1298     __ bind(loop);
1299     __ cmp(rscratch2, count);
1300     __ br(Assembler::HS, end);
1301     if (size == (size_t)wordSize) {
1302       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1303       __ verify_oop(temp);
1304     } else {
1305       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1306       __ decode_heap_oop(temp); // calls verify_oop
1307     }
1308     __ add(rscratch2, rscratch2, size);
1309     __ b(loop);
1310     __ bind(end);
1311   }
1312 
1313   // Arguments:
1314   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1315   //             ignored
1316   //   is_oop  - true => oop array, so generate store check code
1317   //   name    - stub name string
1318   //
1319   // Inputs:
1320   //   c_rarg0   - source array address
1321   //   c_rarg1   - destination array address
1322   //   c_rarg2   - element count, treated as ssize_t, can be zero
1323   //
1324   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1325   // the hardware handle it.  The two dwords within qwords that span
1326   // cache line boundaries will still be loaded and stored atomically.
1327   //
1328   // Side Effects:
1329   //   disjoint_int_copy_entry is set to the no-overlap entry point
1330   //   used by generate_conjoint_int_oop_copy().
1331   //
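       // (For orientation: these copy generators are wired into
       // StubRoutines::_jbyte_disjoint_arraycopy and friends by
       // generate_arraycopy_stubs() later in this file; the name argument
       // here is only used for the stub code mark.)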
1332   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1333                                   const char *name, bool dest_uninitialized = false) {
1334     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1335     RegSet saved_reg = RegSet::of(s, d, count);
1336     __ align(CodeEntryAlignment);
1337     StubCodeMark mark(this, "StubRoutines", name);
1338     address start = __ pc();
1339     __ enter();
1340 
1341     if (entry != NULL) {
1342       *entry = __ pc();
1343       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1344       BLOCK_COMMENT("Entry:");
1345     }
1346 
1347     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1348     if (dest_uninitialized) {
1349       decorators |= IS_DEST_UNINITIALIZED;
1350     }
1351     if (aligned) {
1352       decorators |= ARRAYCOPY_ALIGNED;
1353     }
1354 
1355     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1356     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1357 
1358     if (is_oop) {
1359       // save regs before copy_memory
1360       __ push(RegSet::of(d, count), sp);
1361     }
1362     {
1363       // UnsafeCopyMemory page error: continue after ucm
1364       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1365       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1366       copy_memory(aligned, s, d, count, rscratch1, size);
1367     }
1368 
1369     if (is_oop) {
1370       __ pop(RegSet::of(d, count), sp);
1371       if (VerifyOops)
1372         verify_oop_array(size, d, count, r16);
1373     }
1374 
1375     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1376 
1377     __ leave();
1378     __ mov(r0, zr); // return 0
1379     __ ret(lr);
1380     return start;
1381   }
1382 
1383   // Arguments:
1384   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1385   //             ignored
1386   //   is_oop  - true => oop array, so generate store check code
1387   //   name    - stub name string
1388   //
1389   // Inputs:
1390   //   c_rarg0   - source array address
1391   //   c_rarg1   - destination array address
1392   //   c_rarg2   - element count, treated as ssize_t, can be zero
1393   //
1394   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1395   // the hardware handle it.  The two dwords within qwords that span
1396   // cache line boundaries will still be loaded and stored atomically.
1397   //
1398   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1399                                  address *entry, const char *name,
1400                                  bool dest_uninitialized = false) {
1401     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1402     RegSet saved_regs = RegSet::of(s, d, count);
1403     StubCodeMark mark(this, "StubRoutines", name);
1404     address start = __ pc();
1405     __ enter();
1406 
1407     if (entry != NULL) {
1408       *entry = __ pc();
1409       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1410       BLOCK_COMMENT("Entry:");
1411     }
1412 
1413     // use fwd copy when (d-s) above_equal (count*size)
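         // (unsigned compare: if d precedes s the subtraction wraps to a
         // large value, so we also take the branch and the plain forward
         // copy handles that case safely)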
1414     __ sub(rscratch1, d, s);
1415     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1416     __ br(Assembler::HS, nooverlap_target);
1417 
1418     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1419     if (dest_uninitialized) {
1420       decorators |= IS_DEST_UNINITIALIZED;
1421     }
1422     if (aligned) {
1423       decorators |= ARRAYCOPY_ALIGNED;
1424     }
1425 
1426     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1427     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1428 
1429     if (is_oop) {
1430       // save regs before copy_memory
1431       __ push(RegSet::of(d, count), sp);
1432     }
1433     {
1434       // UnsafeCopyMemory page error: continue after ucm
1435       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1436       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1437       copy_memory(aligned, s, d, count, rscratch1, -size);
1438     }
1439     if (is_oop) {
1440       __ pop(RegSet::of(d, count), sp);
1441       if (VerifyOops)
1442         verify_oop_array(size, d, count, r16);
1443     }
1444     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1445     __ leave();
1446     __ mov(r0, zr); // return 0
1447     __ ret(lr);
1448     return start;
1449   }
1450 
1451   // Arguments:
1452   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1453   //             ignored
1454   //   name    - stub name string
1455   //
1456   // Inputs:
1457   //   c_rarg0   - source array address
1458   //   c_rarg1   - destination array address
1459   //   c_rarg2   - element count, treated as ssize_t, can be zero
1460   //
1461   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1462   // we let the hardware handle it.  The one to eight bytes within words,
1463   // dwords or qwords that span cache line boundaries will still be loaded
1464   // and stored atomically.
1465   //
1473   // Side Effects:
1474   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1475   //   used by generate_conjoint_byte_copy().
1476   //
1477   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1478     const bool not_oop = false;
1479     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1480   }
1481 
1482   // Arguments:
1483   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1484   //             ignored
1485   //   name    - stub name string
1486   //
1487   // Inputs:
1488   //   c_rarg0   - source array address
1489   //   c_rarg1   - destination array address
1490   //   c_rarg2   - element count, treated as ssize_t, can be zero
1491   //
1492   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1493   // we let the hardware handle it.  The one to eight bytes within words,
1494   // dwords or qwords that span cache line boundaries will still be loaded
1495   // and stored atomically.
1496   //
1497   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1498                                       address* entry, const char *name) {
1499     const bool not_oop = false;
1500     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1501   }
1502 
1503   // Arguments:
1504   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1505   //             ignored
1506   //   name    - stub name string
1507   //
1508   // Inputs:
1509   //   c_rarg0   - source array address
1510   //   c_rarg1   - destination array address
1511   //   c_rarg2   - element count, treated as ssize_t, can be zero
1512   //
1513   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1514   // let the hardware handle it.  The two or four words within dwords
1515   // or qwords that span cache line boundaries will still be loaded
1516   // and stored atomically.
1517   //
1518   // Side Effects:
1519   //   disjoint_short_copy_entry is set to the no-overlap entry point
1520   //   used by generate_conjoint_short_copy().
1521   //
1522   address generate_disjoint_short_copy(bool aligned,
1523                                        address* entry, const char *name) {
1524     const bool not_oop = false;
1525     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1526   }
1527 
1528   // Arguments:
1529   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1530   //             ignored
1531   //   name    - stub name string
1532   //
1533   // Inputs:
1534   //   c_rarg0   - source array address
1535   //   c_rarg1   - destination array address
1536   //   c_rarg2   - element count, treated as ssize_t, can be zero
1537   //
1538   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1539   // let the hardware handle it.  The two or four words within dwords
1540   // or qwords that span cache line boundaries will still be loaded
1541   // and stored atomically.
1542   //
1543   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1544                                        address *entry, const char *name) {
1545     const bool not_oop = false;
1546     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1548   }
1549   // Arguments:
1550   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1551   //             ignored
1552   //   name    - stub name string
1553   //
1554   // Inputs:
1555   //   c_rarg0   - source array address
1556   //   c_rarg1   - destination array address
1557   //   c_rarg2   - element count, treated as ssize_t, can be zero
1558   //
1559   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1560   // the hardware handle it.  The two dwords within qwords that span
1561   // cache line boundaries will still be loaded and stored atomically.
1562   //
1563   // Side Effects:
1564   //   disjoint_int_copy_entry is set to the no-overlap entry point
1565   //   used by generate_conjoint_int_copy().
1566   //
1567   address generate_disjoint_int_copy(bool aligned, address *entry,
1568                                          const char *name, bool dest_uninitialized = false) {
1569     const bool not_oop = false;
1570     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1571   }
1572 
1573   // Arguments:
1574   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1575   //             ignored
1576   //   name    - stub name string
1577   //
1578   // Inputs:
1579   //   c_rarg0   - source array address
1580   //   c_rarg1   - destination array address
1581   //   c_rarg2   - element count, treated as ssize_t, can be zero
1582   //
1583   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1584   // the hardware handle it.  The two dwords within qwords that span
1585   // cache line boundaries will still be loaded and stored atomically.
1586   //
1587   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1588                                      address *entry, const char *name,
1589                                      bool dest_uninitialized = false) {
1590     const bool not_oop = false;
1591     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1592   }
1593 
1594 
1595   // Arguments:
1596   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1597   //             ignored
1598   //   name    - stub name string
1599   //
1600   // Inputs:
1601   //   c_rarg0   - source array address
1602   //   c_rarg1   - destination array address
1603   //   c_rarg2   - element count, treated as size_t, can be zero
1604   //
1605   // Side Effects:
1606   //   disjoint_long_copy_entry is set to the no-overlap entry point
1607   //   used by generate_conjoint_long_copy().
1608   //
1609   address generate_disjoint_long_copy(bool aligned, address *entry,
1610                                           const char *name, bool dest_uninitialized = false) {
1611     const bool not_oop = false;
1612     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1613   }
1614 
1615   // Arguments:
1616   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1617   //             ignored
1618   //   name    - stub name string
1619   //
1620   // Inputs:
1621   //   c_rarg0   - source array address
1622   //   c_rarg1   - destination array address
1623   //   c_rarg2   - element count, treated as size_t, can be zero
1624   //
1625   address generate_conjoint_long_copy(bool aligned,
1626                                       address nooverlap_target, address *entry,
1627                                       const char *name, bool dest_uninitialized = false) {
1628     const bool not_oop = false;
1629     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1630   }
1631 
1632   // Arguments:
1633   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1634   //             ignored
1635   //   name    - stub name string
1636   //
1637   // Inputs:
1638   //   c_rarg0   - source array address
1639   //   c_rarg1   - destination array address
1640   //   c_rarg2   - element count, treated as size_t, can be zero
1641   //
1642   // Side Effects:
1643   //   disjoint_oop_copy_entry is set to the no-overlap entry point
1644   //   used by generate_conjoint_oop_copy().
1645   //
1646   address generate_disjoint_oop_copy(bool aligned, address *entry,
1647                                      const char *name, bool dest_uninitialized) {
1648     const bool is_oop = true;
1649     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1650     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1651   }
1652 
1653   // Arguments:
1654   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1655   //             ignored
1656   //   name    - stub name string
1657   //
1658   // Inputs:
1659   //   c_rarg0   - source array address
1660   //   c_rarg1   - destination array address
1661   //   c_rarg2   - element count, treated as size_t, can be zero
1662   //
1663   address generate_conjoint_oop_copy(bool aligned,
1664                                      address nooverlap_target, address *entry,
1665                                      const char *name, bool dest_uninitialized) {
1666     const bool is_oop = true;
1667     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1668     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1669                                   name, dest_uninitialized);
1670   }
1671 
1672 
1673   // Helper for generating a dynamic type check.
1674   // Smashes rscratch1, rscratch2.
1675   void generate_type_check(Register sub_klass,
1676                            Register super_check_offset,
1677                            Register super_klass,
1678                            Label& L_success) {
1679     assert_different_registers(sub_klass, super_check_offset, super_klass);
1680 
1681     BLOCK_COMMENT("type_check:");
1682 
1683     Label L_miss;
1684 
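         // The fast path consults the super_check_offset slot of the sub_klass
         // (and the secondary super cache); the slow path scans the secondary
         // supers array.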
1685     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1686                                      super_check_offset);
1687     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1688 
1689     // Fall through on failure!
1690     __ BIND(L_miss);
1691   }
1692 
1693   //
1694   //  Generate checkcasting array copy stub
1695   //
1696   //  Input:
1697   //    c_rarg0   - source array address
1698   //    c_rarg1   - destination array address
1699   //    c_rarg2   - element count, treated as ssize_t, can be zero
1700   //    c_rarg3   - size_t ckoff (super_check_offset)
1701   //    c_rarg4   - oop ckval (super_klass)
1702   //
1703   //  Output:
1704   //    r0 ==  0  -  success
1705   //    r0 == -1^K - failure, where K is partial transfer count
1706   //
1707   address generate_checkcast_copy(const char *name, address *entry,
1708                                   bool dest_uninitialized = false) {
1709 
1710     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1711 
1712     // Input registers (after setup_arg_regs)
1713     const Register from        = c_rarg0;   // source array address
1714     const Register to          = c_rarg1;   // destination array address
1715     const Register count       = c_rarg2;   // elements count
1716     const Register ckoff       = c_rarg3;   // super_check_offset
1717     const Register ckval       = c_rarg4;   // super_klass
1718 
1719     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1720     RegSet wb_post_saved_regs = RegSet::of(count);
1721 
1722     // Registers used as temps (r18, r19, r20 are save-on-entry)
1723     const Register count_save  = r21;       // orig elements count
1724     const Register start_to    = r20;       // destination array start address
1725     const Register copied_oop  = r18;       // actual oop copied
1726     const Register r19_klass   = r19;       // oop._klass
1727 
1728     //---------------------------------------------------------------
1729     // Assembler stub will be used for this call to arraycopy
1730     // if the two arrays are subtypes of Object[] but the
1731     // destination array type is not equal to or a supertype
1732     // of the source type.  Each element must be separately
1733     // checked.
1734 
1735     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1736                                copied_oop, r19_klass, count_save);
1737 
1738     __ align(CodeEntryAlignment);
1739     StubCodeMark mark(this, "StubRoutines", name);
1740     address start = __ pc();
1741 
1742     __ enter(); // required for proper stackwalking of RuntimeStub frame
1743 
1744 #ifdef ASSERT
1745     // caller guarantees that the arrays really are different
1746     // otherwise, we would have to make conjoint checks
1747     { Label L;
1748       array_overlap_test(L, TIMES_OOP);
1749       __ stop("checkcast_copy within a single array");
1750       __ bind(L);
1751     }
1752 #endif //ASSERT
1753 
1754     // Caller of this entry point must set up the argument registers.
1755     if (entry != NULL) {
1756       *entry = __ pc();
1757       BLOCK_COMMENT("Entry:");
1758     }
1759 
1760      // Empty array:  Nothing to do.
1761     __ cbz(count, L_done);
1762 
1763     __ push(RegSet::of(r18, r19, r20, r21), sp);
1764 
1765 #ifdef ASSERT
1766     BLOCK_COMMENT("assert consistent ckoff/ckval");
1767     // The ckoff and ckval must be mutually consistent,
1768     // even though caller generates both.
1769     { Label L;
1770       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1771       __ ldrw(start_to, Address(ckval, sco_offset));
1772       __ cmpw(ckoff, start_to);
1773       __ br(Assembler::EQ, L);
1774       __ stop("super_check_offset inconsistent");
1775       __ bind(L);
1776     }
1777 #endif //ASSERT
1778 
1779     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1780     bool is_oop = true;
1781     if (dest_uninitialized) {
1782       decorators |= IS_DEST_UNINITIALIZED;
1783     }
1784 
1785     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1786     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1787 
1788     // save the original count
1789     __ mov(count_save, count);
1790 
1791     // Copy from low to high addresses
1792     __ mov(start_to, to);              // Save destination array start address
1793     __ b(L_load_element);
1794 
1795     // ======== begin loop ========
1796     // (Loop is rotated; its entry is L_load_element.)
1797     // Loop control:
1798     //   for (; count != 0; count--) {
1799     //     copied_oop = load_heap_oop(from++);
1800     //     ... generate_type_check ...;
1801     //     store_heap_oop(to++, copied_oop);
1802     //   }
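         // The loop is rotated so that each iteration ends with the count test;
         // a null element branches straight to the store, skipping the type check.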
1803     __ align(OptoLoopAlignment);
1804 
1805     __ BIND(L_store_element);
1806     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1807     __ sub(count, count, 1);
1808     __ cbz(count, L_do_card_marks);
1809 
1810     // ======== loop entry is here ========
1811     __ BIND(L_load_element);
1812     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1813     __ cbz(copied_oop, L_store_element);
1814 
1815     __ load_klass(r19_klass, copied_oop);// query the object klass
1816     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1817     // ======== end loop ========
1818 
1819     // It was a real error; we must depend on the caller to finish the job.
1820     // Register count = remaining oops, count_orig = total oops.
1821     // Emit GC store barriers for the oops we have copied and report
1822     // their number to the caller.
1823 
1824     __ subs(count, count_save, count);     // K = partially copied oop count
1825     __ eon(count, count, zr);                   // report (-1^K) to caller
1826     __ br(Assembler::EQ, L_done_pop);      // K == 0: nothing was copied, skip the card marks
1827 
1828     __ BIND(L_do_card_marks);
1829     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1830 
1831     __ bind(L_done_pop);
1832     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1833     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1834 
1835     __ bind(L_done);
1836     __ mov(r0, count);
1837     __ leave();
1838     __ ret(lr);
1839 
1840     return start;
1841   }
1842 
1843   // Perform range checks on the proposed arraycopy.
1844   // Kills temp, but nothing else.
1845   // Also, clean the sign bits of src_pos and dst_pos.
1846   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1847                               Register src_pos, // source position (c_rarg1)
1848                               Register dst,     // destination array oop (c_rarg2)
1849                               Register dst_pos, // destination position (c_rarg3)
1850                               Register length,
1851                               Register temp,
1852                               Label& L_failed) {
1853     BLOCK_COMMENT("arraycopy_range_checks:");
1854 
1855     assert_different_registers(rscratch1, temp);
1856 
1857     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1858     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1859     __ addw(temp, length, src_pos);
1860     __ cmpw(temp, rscratch1);
1861     __ br(Assembler::HI, L_failed);
1862 
1863     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1864     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1865     __ addw(temp, length, dst_pos);
1866     __ cmpw(temp, rscratch1);
1867     __ br(Assembler::HI, L_failed);
1868 
1869     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1870     __ movw(src_pos, src_pos);
1871     __ movw(dst_pos, dst_pos);
1872 
1873     BLOCK_COMMENT("arraycopy_range_checks done");
1874   }
1875 
1876   // These stubs get called from some dumb test routine.
1877   // I'll write them properly when they're called from
1878   // something that's actually doing something.
1879   static void fake_arraycopy_stub(address src, address dst, int count) {
1880     assert(count == 0, "huh?");
1881   }
1882 
1883 
1884   //
1885   //  Generate 'unsafe' array copy stub
1886   //  Though just as safe as the other stubs, it takes an unscaled
1887   //  size_t argument instead of an element count.
1888   //
1889   //  Input:
1890   //    c_rarg0   - source array address
1891   //    c_rarg1   - destination array address
1892   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1893   //
1894   // Examines the alignment of the operands and dispatches
1895   // to a long, int, short, or byte copy loop.
1896   //
1897   address generate_unsafe_copy(const char *name,
1898                                address byte_copy_entry,
1899                                address short_copy_entry,
1900                                address int_copy_entry,
1901                                address long_copy_entry) {
1902     Label L_long_aligned, L_int_aligned, L_short_aligned;
1903     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1904 
1905     __ align(CodeEntryAlignment);
1906     StubCodeMark mark(this, "StubRoutines", name);
1907     address start = __ pc();
1908     __ enter(); // required for proper stackwalking of RuntimeStub frame
1909 
1910     // bump this on entry, not on exit:
1911     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1912 
1913     __ orr(rscratch1, s, d);
1914     __ orr(rscratch1, rscratch1, count);
1915 
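         // rscratch1 now holds s | d | count: a low bit set in any of them rules
         // out the corresponding copy granularity, so test the widest alignment
         // first and fall back to narrower element sizes.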
1916     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1917     __ cbz(rscratch1, L_long_aligned);
1918     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1919     __ cbz(rscratch1, L_int_aligned);
1920     __ tbz(rscratch1, 0, L_short_aligned);
1921     __ b(RuntimeAddress(byte_copy_entry));
1922 
1923     __ BIND(L_short_aligned);
1924     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1925     __ b(RuntimeAddress(short_copy_entry));
1926     __ BIND(L_int_aligned);
1927     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1928     __ b(RuntimeAddress(int_copy_entry));
1929     __ BIND(L_long_aligned);
1930     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1931     __ b(RuntimeAddress(long_copy_entry));
1932 
1933     return start;
1934   }
1935 
1936   //
1937   //  Generate generic array copy stubs
1938   //
1939   //  Input:
1940   //    c_rarg0    -  src oop
1941   //    c_rarg1    -  src_pos (32-bits)
1942   //    c_rarg2    -  dst oop
1943   //    c_rarg3    -  dst_pos (32-bits)
1944   //    c_rarg4    -  element count (32-bits)
1945   //
1946   //  Output:
1947   //    r0 ==  0  -  success
1948   //    r0 == -1^K - failure, where K is partial transfer count
1949   //
1950   address generate_generic_copy(const char *name,
1951                                 address byte_copy_entry, address short_copy_entry,
1952                                 address int_copy_entry, address oop_copy_entry,
1953                                 address long_copy_entry, address checkcast_copy_entry) {
1954 
1955     Label L_failed, L_objArray;
1956     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1957 
1958     // Input registers
1959     const Register src        = c_rarg0;  // source array oop
1960     const Register src_pos    = c_rarg1;  // source position
1961     const Register dst        = c_rarg2;  // destination array oop
1962     const Register dst_pos    = c_rarg3;  // destination position
1963     const Register length     = c_rarg4;
1964 
1965 
1966     // Registers used as temps
1967     const Register dst_klass  = c_rarg5;
1968 
1969     __ align(CodeEntryAlignment);
1970 
1971     StubCodeMark mark(this, "StubRoutines", name);
1972 
1973     address start = __ pc();
1974 
1975     __ enter(); // required for proper stackwalking of RuntimeStub frame
1976 
1977     // bump this on entry, not on exit:
1978     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1979 
1980     //-----------------------------------------------------------------------
1981     // Assembler stub will be used for this call to arraycopy
1982     // if the following conditions are met:
1983     //
1984     // (1) src and dst must not be null.
1985     // (2) src_pos must not be negative.
1986     // (3) dst_pos must not be negative.
1987     // (4) length  must not be negative.
1988     // (5) src klass and dst klass should be the same and not NULL.
1989     // (6) src and dst should be arrays.
1990     // (7) src_pos + length must not exceed length of src.
1991     // (8) dst_pos + length must not exceed length of dst.
1992     //
1993 
1994     //  if (src == NULL) return -1;
1995     __ cbz(src, L_failed);
1996 
1997     //  if (src_pos < 0) return -1;
1998     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
1999 
2000     //  if (dst == NULL) return -1;
2001     __ cbz(dst, L_failed);
2002 
2003     //  if (dst_pos < 0) return -1;
2004     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2005 
2006     // registers used as temp
2007     const Register scratch_length    = r16; // elements count to copy
2008     const Register scratch_src_klass = r17; // array klass
2009     const Register lh                = r18; // layout helper
2010 
2011     //  if (length < 0) return -1;
2012     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2013     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2014 
2015     __ load_klass(scratch_src_klass, src);
2016 #ifdef ASSERT
2017     //  assert(src->klass() != NULL);
2018     {
2019       BLOCK_COMMENT("assert klasses not null {");
2020       Label L1, L2;
2021       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2022       __ bind(L1);
2023       __ stop("broken null klass");
2024       __ bind(L2);
2025       __ load_klass(rscratch1, dst);
2026       __ cbz(rscratch1, L1);     // this would be broken also
2027       BLOCK_COMMENT("} assert klasses not null done");
2028     }
2029 #endif
2030 
2031     // Load layout helper (32-bits)
2032     //
2033     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2034     // 32        30    24            16              8     2                 0
2035     //
2036     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2037     //
2038 
2039     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2040 
2041     // Handle objArrays completely differently...
2042     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2043     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2044     __ movw(rscratch1, objArray_lh);
2045     __ eorw(rscratch2, lh, rscratch1);
2046     __ cbzw(rscratch2, L_objArray);
2047 
2048     //  if (src->klass() != dst->klass()) return -1;
2049     __ load_klass(rscratch2, dst);
2050     __ eor(rscratch2, rscratch2, scratch_src_klass);
2051     __ cbnz(rscratch2, L_failed);
2052 
2053     //  if (!src->is_Array()) return -1;
2054     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2055 
2056     // At this point, it is known to be a typeArray (array_tag 0x3).
2057 #ifdef ASSERT
2058     {
2059       BLOCK_COMMENT("assert primitive array {");
2060       Label L;
2061       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2062       __ cmpw(lh, rscratch2);
2063       __ br(Assembler::GE, L);
2064       __ stop("must be a primitive array");
2065       __ bind(L);
2066       BLOCK_COMMENT("} assert primitive array done");
2067     }
2068 #endif
2069 
2070     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2071                            rscratch2, L_failed);
2072 
2073     // TypeArrayKlass
2074     //
2075     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2076     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2077     //
2078 
2079     const Register rscratch1_offset = rscratch1;    // array offset
2080     const Register r18_elsize = lh; // element size
2081 
2082     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2083            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2084     __ add(src, src, rscratch1_offset);           // src array offset
2085     __ add(dst, dst, rscratch1_offset);           // dst array offset
2086     BLOCK_COMMENT("choose copy loop based on element size");
2087 
2088     // next registers should be set before the jump to corresponding stub
2089     const Register from     = c_rarg0;  // source array address
2090     const Register to       = c_rarg1;  // destination array address
2091     const Register count    = c_rarg2;  // elements count
2092 
2093     // 'from', 'to', 'count' registers should be set in such order
2094     // since they are the same as 'src', 'src_pos', 'dst'.
2095 
2096     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2097 
2098     // The possible values of elsize are 0-3, i.e. exact_log2(element
2099     // size in bytes).  We do a simple bitwise binary search.
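         // elsize 0 = byte, 1 = short, 2 = int, 3 = long: bit 1 selects the
         // int/long half, bit 0 then picks within each pair.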
2100   __ BIND(L_copy_bytes);
2101     __ tbnz(r18_elsize, 1, L_copy_ints);
2102     __ tbnz(r18_elsize, 0, L_copy_shorts);
2103     __ lea(from, Address(src, src_pos));// src_addr
2104     __ lea(to,   Address(dst, dst_pos));// dst_addr
2105     __ movw(count, scratch_length); // length
2106     __ b(RuntimeAddress(byte_copy_entry));
2107 
2108   __ BIND(L_copy_shorts);
2109     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2110     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2111     __ movw(count, scratch_length); // length
2112     __ b(RuntimeAddress(short_copy_entry));
2113 
2114   __ BIND(L_copy_ints);
2115     __ tbnz(r18_elsize, 0, L_copy_longs);
2116     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2117     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2118     __ movw(count, scratch_length); // length
2119     __ b(RuntimeAddress(int_copy_entry));
2120 
2121   __ BIND(L_copy_longs);
2122 #ifdef ASSERT
2123     {
2124       BLOCK_COMMENT("assert long copy {");
2125       Label L;
2126       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2127       __ cmpw(r18_elsize, LogBytesPerLong);
2128       __ br(Assembler::EQ, L);
2129       __ stop("must be long copy, but elsize is wrong");
2130       __ bind(L);
2131       BLOCK_COMMENT("} assert long copy done");
2132     }
2133 #endif
2134     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2135     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2136     __ movw(count, scratch_length); // length
2137     __ b(RuntimeAddress(long_copy_entry));
2138 
2139     // ObjArrayKlass
2140   __ BIND(L_objArray);
2141     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2142 
2143     Label L_plain_copy, L_checkcast_copy;
2144     //  test array classes for subtyping
2145     __ load_klass(r18, dst);
2146     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2147     __ br(Assembler::NE, L_checkcast_copy);
2148 
2149     // Identically typed arrays can be copied without element-wise checks.
2150     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2151                            rscratch2, L_failed);
2152 
2153     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2154     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2155     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2156     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2157     __ movw(count, scratch_length); // length
2158   __ BIND(L_plain_copy);
2159     __ b(RuntimeAddress(oop_copy_entry));
2160 
2161   __ BIND(L_checkcast_copy);
2162     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2163     {
2164       // Before looking at dst.length, make sure dst is also an objArray.
2165       __ ldrw(rscratch1, Address(r18, lh_offset));
2166       __ movw(rscratch2, objArray_lh);
2167       __ eorw(rscratch1, rscratch1, rscratch2);
2168       __ cbnzw(rscratch1, L_failed);
2169 
2170       // It is safe to examine both src.length and dst.length.
2171       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2172                              r18, L_failed);
2173 
2174       __ load_klass(dst_klass, dst); // reload
2175 
2176       // Marshal the base address arguments now, freeing registers.
2177       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2178       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2179       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2180       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2181       __ movw(count, length);           // length (reloaded)
2182       Register sco_temp = c_rarg3;      // this register is free now
2183       assert_different_registers(from, to, count, sco_temp,
2184                                  dst_klass, scratch_src_klass);
2185       // assert_clean_int(count, sco_temp);
2186 
2187       // Generate the type check.
2188       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2189       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2190 
2191       // Smashes rscratch1, rscratch2
2192       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2193 
2194       // Fetch destination element klass from the ObjArrayKlass header.
2195       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2196       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2197       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2198 
2199       // the checkcast_copy loop needs two extra arguments:
2200       assert(c_rarg3 == sco_temp, "#3 already in place");
2201       // Set up arguments for checkcast_copy_entry.
2202       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2203       __ b(RuntimeAddress(checkcast_copy_entry));
2204     }
2205 
2206   __ BIND(L_failed);
2207     __ mov(r0, -1);
2208     __ leave();   // required for proper stackwalking of RuntimeStub frame
2209     __ ret(lr);
2210 
2211     return start;
2212   }
2213 
2214   //
2215   // Generate stub for array fill. If "aligned" is true, the
2216   // "to" address is assumed to be heapword aligned.
2217   //
2218   // Arguments for generated stub:
2219   //   to:    c_rarg0
2220   //   value: c_rarg1
2221   //   count: c_rarg2 treated as signed
2222   //
2223   address generate_fill(BasicType t, bool aligned, const char *name) {
2224     __ align(CodeEntryAlignment);
2225     StubCodeMark mark(this, "StubRoutines", name);
2226     address start = __ pc();
2227 
2228     BLOCK_COMMENT("Entry:");
2229 
2230     const Register to        = c_rarg0;  // source array address
2231     const Register value     = c_rarg1;  // value
2232     const Register count     = c_rarg2;  // elements count
2233 
2234     const Register bz_base = r10;        // base for block_zero routine
2235     const Register cnt_words = r11;      // temp register
2236 
2237     __ enter();
2238 
2239     Label L_fill_elements, L_exit1;
2240 
2241     int shift = -1;
2242     switch (t) {
2243       case T_BYTE:
2244         shift = 0;
2245         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2246         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2247         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2248         __ br(Assembler::LO, L_fill_elements);
2249         break;
2250       case T_SHORT:
2251         shift = 1;
2252         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2253         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2254         __ br(Assembler::LO, L_fill_elements);
2255         break;
2256       case T_INT:
2257         shift = 2;
2258         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2259         __ br(Assembler::LO, L_fill_elements);
2260         break;
2261       default: ShouldNotReachHere();
2262     }
2263 
2264     // Align source address at 8 bytes address boundary.
2265     Label L_skip_align1, L_skip_align2, L_skip_align4;
2266     if (!aligned) {
2267       switch (t) {
2268         case T_BYTE:
2269           // One byte misalignment happens only for byte arrays.
2270           __ tbz(to, 0, L_skip_align1);
2271           __ strb(value, Address(__ post(to, 1)));
2272           __ subw(count, count, 1);
2273           __ bind(L_skip_align1);
2274           // Fallthrough
2275         case T_SHORT:
2276           // Two bytes misalignment happens only for byte and short (char) arrays.
2277           __ tbz(to, 1, L_skip_align2);
2278           __ strh(value, Address(__ post(to, 2)));
2279           __ subw(count, count, 2 >> shift);
2280           __ bind(L_skip_align2);
2281           // Fallthrough
2282         case T_INT:
2283           // Align to 8 bytes, we know we are 4 byte aligned to start.
2284           __ tbz(to, 2, L_skip_align4);
2285           __ strw(value, Address(__ post(to, 4)));
2286           __ subw(count, count, 4 >> shift);
2287           __ bind(L_skip_align4);
2288           break;
2289         default: ShouldNotReachHere();
2290       }
2291     }
2292 
2293     //
2294     //  Fill large chunks
2295     //
2296     __ lsrw(cnt_words, count, 3 - shift); // number of words
2297     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2298     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); // elements left over after the whole words
2299     if (UseBlockZeroing) {
2300       Label non_block_zeroing, rest;
2301       // If the fill value is zero we can use the fast zero_words().
2302       __ cbnz(value, non_block_zeroing);
2303       __ mov(bz_base, to);
2304       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2305       __ zero_words(bz_base, cnt_words);
2306       __ b(rest);
2307       __ bind(non_block_zeroing);
2308       __ fill_words(to, cnt_words, value);
2309       __ bind(rest);
2310     } else {
2311       __ fill_words(to, cnt_words, value);
2312     }
2313 
2314     // Remaining count is less than 8 bytes. Fill it by a single store.
2315     // Note that the total length is no less than 8 bytes.
2316     if (t == T_BYTE || t == T_SHORT) {
2317       Label L_exit1;
2318       __ cbzw(count, L_exit1);
2319       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2320       __ str(value, Address(to, -8));    // overwrite some elements
2321       __ bind(L_exit1);
2322       __ leave();
2323       __ ret(lr);
2324     }
2325 
2326     // Handle copies less than 8 bytes.
2327     Label L_fill_2, L_fill_4, L_exit2;
2328     __ bind(L_fill_elements);
2329     switch (t) {
2330       case T_BYTE:
2331         __ tbz(count, 0, L_fill_2);
2332         __ strb(value, Address(__ post(to, 1)));
2333         __ bind(L_fill_2);
2334         __ tbz(count, 1, L_fill_4);
2335         __ strh(value, Address(__ post(to, 2)));
2336         __ bind(L_fill_4);
2337         __ tbz(count, 2, L_exit2);
2338         __ strw(value, Address(to));
2339         break;
2340       case T_SHORT:
2341         __ tbz(count, 0, L_fill_4);
2342         __ strh(value, Address(__ post(to, 2)));
2343         __ bind(L_fill_4);
2344         __ tbz(count, 1, L_exit2);
2345         __ strw(value, Address(to));
2346         break;
2347       case T_INT:
2348         __ cbzw(count, L_exit2);
2349         __ strw(value, Address(to));
2350         break;
2351       default: ShouldNotReachHere();
2352     }
2353     __ bind(L_exit2);
2354     __ leave();
2355     __ ret(lr);
2356     return start;
2357   }
2358 
2359   address generate_data_cache_writeback() {
2360     const Register line        = c_rarg0;  // address of line to write back
2361 
2362     __ align(CodeEntryAlignment);
2363 
2364     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2365 
2366     address start = __ pc();
2367     __ enter();
2368     __ cache_wb(Address(line, 0));
2369     __ leave();
2370     __ ret(lr);
2371 
2372     return start;
2373   }
2374 
2375   address generate_data_cache_writeback_sync() {
2376     const Register is_pre     = c_rarg0;  // pre or post sync
2377 
2378     __ align(CodeEntryAlignment);
2379 
2380     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2381 
2382     // pre wbsync is a no-op
2383     // post wbsync translates to a memory barrier (there is no sfence on AArch64)
2384 
2385     Label skip;
2386     address start = __ pc();
2387     __ enter();
2388     __ cbnz(is_pre, skip);
2389     __ cache_wbsync(false);
2390     __ bind(skip);
2391     __ leave();
2392     __ ret(lr);
2393 
2394     return start;
2395   }
2396 
2397   void generate_arraycopy_stubs() {
2398     address entry;
2399     address entry_jbyte_arraycopy;
2400     address entry_jshort_arraycopy;
2401     address entry_jint_arraycopy;
2402     address entry_oop_arraycopy;
2403     address entry_jlong_arraycopy;
2404     address entry_checkcast_arraycopy;
2405 
2406     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2407     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2408 
2409     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2410 
2411     //*** jbyte
2412     // Always need aligned and unaligned versions
2413     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2414                                                                                   "jbyte_disjoint_arraycopy");
2415     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2416                                                                                   &entry_jbyte_arraycopy,
2417                                                                                   "jbyte_arraycopy");
2418     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2419                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2420     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2421                                                                                   "arrayof_jbyte_arraycopy");
2422 
2423     //*** jshort
2424     // Always need aligned and unaligned versions
2425     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2426                                                                                     "jshort_disjoint_arraycopy");
2427     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2428                                                                                     &entry_jshort_arraycopy,
2429                                                                                     "jshort_arraycopy");
2430     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2431                                                                                     "arrayof_jshort_disjoint_arraycopy");
2432     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2433                                                                                     "arrayof_jshort_arraycopy");
2434 
2435     //*** jint
2436     // Aligned versions
2437     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2438                                                                                 "arrayof_jint_disjoint_arraycopy");
2439     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2440                                                                                 "arrayof_jint_arraycopy");
2441     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2442     // entry_jint_arraycopy always points to the unaligned version
2443     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2444                                                                                 "jint_disjoint_arraycopy");
2445     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2446                                                                                 &entry_jint_arraycopy,
2447                                                                                 "jint_arraycopy");
2448 
2449     //*** jlong
2450     // It is always aligned
2451     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2452                                                                                   "arrayof_jlong_disjoint_arraycopy");
2453     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2454                                                                                   "arrayof_jlong_arraycopy");
2455     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2456     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2457 
2458     //*** oops
2459     {
2460       // With compressed oops we need unaligned versions; notice that
2461       // we overwrite entry_oop_arraycopy.
2462       bool aligned = !UseCompressedOops;
2463 
2464       StubRoutines::_arrayof_oop_disjoint_arraycopy
2465         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2466                                      /*dest_uninitialized*/false);
2467       StubRoutines::_arrayof_oop_arraycopy
2468         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2469                                      /*dest_uninitialized*/false);
2470       // Aligned versions without pre-barriers
2471       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2472         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2473                                      /*dest_uninitialized*/true);
2474       StubRoutines::_arrayof_oop_arraycopy_uninit
2475         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2476                                      /*dest_uninitialized*/true);
2477     }
2478 
2479     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2480     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2481     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2482     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2483 
2484     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2485     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2486                                                                         /*dest_uninitialized*/true);
2487 
2488     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2489                                                               entry_jbyte_arraycopy,
2490                                                               entry_jshort_arraycopy,
2491                                                               entry_jint_arraycopy,
2492                                                               entry_jlong_arraycopy);
2493 
2494     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2495                                                                entry_jbyte_arraycopy,
2496                                                                entry_jshort_arraycopy,
2497                                                                entry_jint_arraycopy,
2498                                                                entry_oop_arraycopy,
2499                                                                entry_jlong_arraycopy,
2500                                                                entry_checkcast_arraycopy);
2501 
2502     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2503     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2504     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2505     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2506     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2507     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2508   }
2509 
2510   void generate_math_stubs() { Unimplemented(); }
2511 
2512   // Arguments:
2513   //
2514   // Inputs:
2515   //   c_rarg0   - source byte array address
2516   //   c_rarg1   - destination byte array address
2517   //   c_rarg2   - K (key) in little endian int array
2518   //
2519   address generate_aescrypt_encryptBlock() {
2520     __ align(CodeEntryAlignment);
2521     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2522 
2523     Label L_doLast;
2524 
2525     const Register from        = c_rarg0;  // source array address
2526     const Register to          = c_rarg1;  // destination array address
2527     const Register key         = c_rarg2;  // key array address
2528     const Register keylen      = rscratch1;
2529 
2530     address start = __ pc();
2531     __ enter();
2532 
2533     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2534 
2535     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2536 
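         // The expanded key comes from the Java key schedule as an int[], so
         // rev32 byte-swaps each 32-bit lane into the byte order expected by
         // the AESE/AESMC instructions.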
2537     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2538     __ rev32(v1, __ T16B, v1);
2539     __ rev32(v2, __ T16B, v2);
2540     __ rev32(v3, __ T16B, v3);
2541     __ rev32(v4, __ T16B, v4);
2542     __ aese(v0, v1);
2543     __ aesmc(v0, v0);
2544     __ aese(v0, v2);
2545     __ aesmc(v0, v0);
2546     __ aese(v0, v3);
2547     __ aesmc(v0, v0);
2548     __ aese(v0, v4);
2549     __ aesmc(v0, v0);
2550 
2551     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2552     __ rev32(v1, __ T16B, v1);
2553     __ rev32(v2, __ T16B, v2);
2554     __ rev32(v3, __ T16B, v3);
2555     __ rev32(v4, __ T16B, v4);
2556     __ aese(v0, v1);
2557     __ aesmc(v0, v0);
2558     __ aese(v0, v2);
2559     __ aesmc(v0, v0);
2560     __ aese(v0, v3);
2561     __ aesmc(v0, v0);
2562     __ aese(v0, v4);
2563     __ aesmc(v0, v0);
2564 
2565     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2566     __ rev32(v1, __ T16B, v1);
2567     __ rev32(v2, __ T16B, v2);
2568 
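         // keylen is the expanded key length in ints: 44, 52 or 60 for
         // AES-128, AES-192 and AES-256 (10, 12 or 14 rounds).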
2569     __ cmpw(keylen, 44);
2570     __ br(Assembler::EQ, L_doLast);
2571 
2572     __ aese(v0, v1);
2573     __ aesmc(v0, v0);
2574     __ aese(v0, v2);
2575     __ aesmc(v0, v0);
2576 
2577     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2578     __ rev32(v1, __ T16B, v1);
2579     __ rev32(v2, __ T16B, v2);
2580 
2581     __ cmpw(keylen, 52);
2582     __ br(Assembler::EQ, L_doLast);
2583 
2584     __ aese(v0, v1);
2585     __ aesmc(v0, v0);
2586     __ aese(v0, v2);
2587     __ aesmc(v0, v0);
2588 
2589     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2590     __ rev32(v1, __ T16B, v1);
2591     __ rev32(v2, __ T16B, v2);
2592 
2593     __ BIND(L_doLast);
2594 
2595     __ aese(v0, v1);
2596     __ aesmc(v0, v0);
2597     __ aese(v0, v2);
2598 
2599     __ ld1(v1, __ T16B, key);
2600     __ rev32(v1, __ T16B, v1);
2601     __ eor(v0, __ T16B, v0, v1);
2602 
2603     __ st1(v0, __ T16B, to);
2604 
2605     __ mov(r0, 0);
2606 
2607     __ leave();
2608     __ ret(lr);
2609 
2610     return start;
2611   }
2612 
2613   // Arguments:
2614   //
2615   // Inputs:
2616   //   c_rarg0   - source byte array address
2617   //   c_rarg1   - destination byte array address
2618   //   c_rarg2   - K (key) in little endian int array
2619   //
2620   address generate_aescrypt_decryptBlock() {
2621     assert(UseAES, "need AES instructions");
2622     __ align(CodeEntryAlignment);
2623     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2624     Label L_doLast;
2625 
2626     const Register from        = c_rarg0;  // source array address
2627     const Register to          = c_rarg1;  // destination array address
2628     const Register key         = c_rarg2;  // key array address
2629     const Register keylen      = rscratch1;
2630 
2631     address start = __ pc();
2632     __ enter(); // required for proper stackwalking of RuntimeStub frame
2633 
2634     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2635 
2636     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2637 
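         // v5 receives the first 16 bytes of the expanded key; it is applied
         // as the final whitening XOR after the last round (see the eor below).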
2638     __ ld1(v5, __ T16B, __ post(key, 16));
2639     __ rev32(v5, __ T16B, v5);
2640 
2641     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2642     __ rev32(v1, __ T16B, v1);
2643     __ rev32(v2, __ T16B, v2);
2644     __ rev32(v3, __ T16B, v3);
2645     __ rev32(v4, __ T16B, v4);
2646     __ aesd(v0, v1);
2647     __ aesimc(v0, v0);
2648     __ aesd(v0, v2);
2649     __ aesimc(v0, v0);
2650     __ aesd(v0, v3);
2651     __ aesimc(v0, v0);
2652     __ aesd(v0, v4);
2653     __ aesimc(v0, v0);
2654 
2655     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2656     __ rev32(v1, __ T16B, v1);
2657     __ rev32(v2, __ T16B, v2);
2658     __ rev32(v3, __ T16B, v3);
2659     __ rev32(v4, __ T16B, v4);
2660     __ aesd(v0, v1);
2661     __ aesimc(v0, v0);
2662     __ aesd(v0, v2);
2663     __ aesimc(v0, v0);
2664     __ aesd(v0, v3);
2665     __ aesimc(v0, v0);
2666     __ aesd(v0, v4);
2667     __ aesimc(v0, v0);
2668 
2669     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2670     __ rev32(v1, __ T16B, v1);
2671     __ rev32(v2, __ T16B, v2);
2672 
2673     __ cmpw(keylen, 44);
2674     __ br(Assembler::EQ, L_doLast);
2675 
2676     __ aesd(v0, v1);
2677     __ aesimc(v0, v0);
2678     __ aesd(v0, v2);
2679     __ aesimc(v0, v0);
2680 
2681     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2682     __ rev32(v1, __ T16B, v1);
2683     __ rev32(v2, __ T16B, v2);
2684 
2685     __ cmpw(keylen, 52);
2686     __ br(Assembler::EQ, L_doLast);
2687 
2688     __ aesd(v0, v1);
2689     __ aesimc(v0, v0);
2690     __ aesd(v0, v2);
2691     __ aesimc(v0, v0);
2692 
2693     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2694     __ rev32(v1, __ T16B, v1);
2695     __ rev32(v2, __ T16B, v2);
2696 
2697     __ BIND(L_doLast);
2698 
2699     __ aesd(v0, v1);
2700     __ aesimc(v0, v0);
2701     __ aesd(v0, v2);
2702 
2703     __ eor(v0, __ T16B, v0, v5);
2704 
2705     __ st1(v0, __ T16B, to);
2706 
2707     __ mov(r0, 0);
2708 
2709     __ leave();
2710     __ ret(lr);
2711 
2712     return start;
2713   }
2714 
2715   // Arguments:
2716   //
2717   // Inputs:
2718   //   c_rarg0   - source byte array address
2719   //   c_rarg1   - destination byte array address
2720   //   c_rarg2   - K (key) in little endian int array
2721   //   c_rarg3   - r vector byte array address
2722   //   c_rarg4   - input length
2723   //
2724   // Output:
2725   //   r0        - input length
2726   //
2727   address generate_cipherBlockChaining_encryptAESCrypt() {
2728     assert(UseAES, "need AES instructions");
2729     __ align(CodeEntryAlignment);
2730     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2731 
2732     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2733 
2734     const Register from        = c_rarg0;  // source array address
2735     const Register to          = c_rarg1;  // destination array address
2736     const Register key         = c_rarg2;  // key array address
2737     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2738                                            // and left with the results of the last encryption block
2739     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2740     const Register keylen      = rscratch1;
2741 
2742     address start = __ pc();
2743 
2744       __ enter();
2745 
2746       __ movw(rscratch2, len_reg);
2747 
2748       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2749 
2750       __ ld1(v0, __ T16B, rvec);
2751 
2752       __ cmpw(keylen, 52);
2753       __ br(Assembler::CC, L_loadkeys_44);
2754       __ br(Assembler::EQ, L_loadkeys_52);
2755 
2756       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2757       __ rev32(v17, __ T16B, v17);
2758       __ rev32(v18, __ T16B, v18);
2759     __ BIND(L_loadkeys_52);
2760       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2761       __ rev32(v19, __ T16B, v19);
2762       __ rev32(v20, __ T16B, v20);
2763     __ BIND(L_loadkeys_44);
2764       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2765       __ rev32(v21, __ T16B, v21);
2766       __ rev32(v22, __ T16B, v22);
2767       __ rev32(v23, __ T16B, v23);
2768       __ rev32(v24, __ T16B, v24);
2769       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2770       __ rev32(v25, __ T16B, v25);
2771       __ rev32(v26, __ T16B, v26);
2772       __ rev32(v27, __ T16B, v27);
2773       __ rev32(v28, __ T16B, v28);
2774       __ ld1(v29, v30, v31, __ T16B, key);
2775       __ rev32(v29, __ T16B, v29);
2776       __ rev32(v30, __ T16B, v30);
2777       __ rev32(v31, __ T16B, v31);
2778 
2779     __ BIND(L_aes_loop);
2780       __ ld1(v1, __ T16B, __ post(from, 16));
2781       __ eor(v0, __ T16B, v0, v1);
2782 
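           // The flags set by cmpw(keylen, 52) above are still valid here: nothing
           // in the loop body modifies the condition flags, so the key length is
           // effectively re-tested on every iteration for free.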
2783       __ br(Assembler::CC, L_rounds_44);
2784       __ br(Assembler::EQ, L_rounds_52);
2785 
2786       __ aese(v0, v17); __ aesmc(v0, v0);
2787       __ aese(v0, v18); __ aesmc(v0, v0);
2788     __ BIND(L_rounds_52);
2789       __ aese(v0, v19); __ aesmc(v0, v0);
2790       __ aese(v0, v20); __ aesmc(v0, v0);
2791     __ BIND(L_rounds_44);
2792       __ aese(v0, v21); __ aesmc(v0, v0);
2793       __ aese(v0, v22); __ aesmc(v0, v0);
2794       __ aese(v0, v23); __ aesmc(v0, v0);
2795       __ aese(v0, v24); __ aesmc(v0, v0);
2796       __ aese(v0, v25); __ aesmc(v0, v0);
2797       __ aese(v0, v26); __ aesmc(v0, v0);
2798       __ aese(v0, v27); __ aesmc(v0, v0);
2799       __ aese(v0, v28); __ aesmc(v0, v0);
2800       __ aese(v0, v29); __ aesmc(v0, v0);
2801       __ aese(v0, v30);
2802       __ eor(v0, __ T16B, v0, v31);
2803 
2804       __ st1(v0, __ T16B, __ post(to, 16));
2805 
2806       __ subw(len_reg, len_reg, 16);
2807       __ cbnzw(len_reg, L_aes_loop);
2808 
2809       __ st1(v0, __ T16B, rvec);
2810 
2811       __ mov(r0, rscratch2);
2812 
2813       __ leave();
2814       __ ret(lr);
2815 
2816       return start;
2817   }
2818 
2819   // Arguments:
2820   //
2821   // Inputs:
2822   //   c_rarg0   - source byte array address
2823   //   c_rarg1   - destination byte array address
2824   //   c_rarg2   - K (key) in little endian int array
2825   //   c_rarg3   - r vector byte array address
2826   //   c_rarg4   - input length
2827   //
2828   // Output:
2829   //   r0        - input length
2830   //
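       // A rough scalar sketch of what this stub computes (block size 16;
       // "aes_dec", "xor16" and "copy16" are hypothetical helpers used only to
       // illustrate the CBC chaining):
       //
       //   for (int i = 0; i < 16; ) { }  // see loop below
       //   for (int i = 0; i < len; i += 16) {
       //     copy16(saved, from + i);        // keep ciphertext as the next chaining value
       //     copy16(block, from + i);
       //     aes_dec(block, key);            // one raw AES block decryption
       //     xor16(to + i, block, rvec);     // plaintext = decrypted block ^ chaining value
       //     copy16(rvec, saved);
       //   }
       //   return len;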
2831   address generate_cipherBlockChaining_decryptAESCrypt() {
2832     assert(UseAES, "need AES instructions");
2833     __ align(CodeEntryAlignment);
2834     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2835 
2836     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2837 
2838     const Register from        = c_rarg0;  // source array address
2839     const Register to          = c_rarg1;  // destination array address
2840     const Register key         = c_rarg2;  // key array address
2841     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2842                                            // and left with the last input (ciphertext) block, the new chaining value
2843     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2844     const Register keylen      = rscratch1;
2845 
2846     address start = __ pc();
2847 
2848       __ enter();
2849 
2850       __ movw(rscratch2, len_reg);
2851 
2852       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2853 
2854       __ ld1(v2, __ T16B, rvec);
2855 
2856       __ ld1(v31, __ T16B, __ post(key, 16));
2857       __ rev32(v31, __ T16B, v31);
2858 
2859       __ cmpw(keylen, 52);
2860       __ br(Assembler::CC, L_loadkeys_44);
2861       __ br(Assembler::EQ, L_loadkeys_52);
2862 
2863       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2864       __ rev32(v17, __ T16B, v17);
2865       __ rev32(v18, __ T16B, v18);
2866     __ BIND(L_loadkeys_52);
2867       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2868       __ rev32(v19, __ T16B, v19);
2869       __ rev32(v20, __ T16B, v20);
2870     __ BIND(L_loadkeys_44);
2871       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2872       __ rev32(v21, __ T16B, v21);
2873       __ rev32(v22, __ T16B, v22);
2874       __ rev32(v23, __ T16B, v23);
2875       __ rev32(v24, __ T16B, v24);
2876       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2877       __ rev32(v25, __ T16B, v25);
2878       __ rev32(v26, __ T16B, v26);
2879       __ rev32(v27, __ T16B, v27);
2880       __ rev32(v28, __ T16B, v28);
2881       __ ld1(v29, v30, __ T16B, key);
2882       __ rev32(v29, __ T16B, v29);
2883       __ rev32(v30, __ T16B, v30);
2884 
2885     __ BIND(L_aes_loop);
2886       __ ld1(v0, __ T16B, __ post(from, 16));
2887       __ orr(v1, __ T16B, v0, v0);
2888 
2889       __ br(Assembler::CC, L_rounds_44);
2890       __ br(Assembler::EQ, L_rounds_52);
2891 
2892       __ aesd(v0, v17); __ aesimc(v0, v0);
2893       __ aesd(v0, v18); __ aesimc(v0, v0);
2894     __ BIND(L_rounds_52);
2895       __ aesd(v0, v19); __ aesimc(v0, v0);
2896       __ aesd(v0, v20); __ aesimc(v0, v0);
2897     __ BIND(L_rounds_44);
2898       __ aesd(v0, v21); __ aesimc(v0, v0);
2899       __ aesd(v0, v22); __ aesimc(v0, v0);
2900       __ aesd(v0, v23); __ aesimc(v0, v0);
2901       __ aesd(v0, v24); __ aesimc(v0, v0);
2902       __ aesd(v0, v25); __ aesimc(v0, v0);
2903       __ aesd(v0, v26); __ aesimc(v0, v0);
2904       __ aesd(v0, v27); __ aesimc(v0, v0);
2905       __ aesd(v0, v28); __ aesimc(v0, v0);
2906       __ aesd(v0, v29); __ aesimc(v0, v0);
2907       __ aesd(v0, v30);
2908       __ eor(v0, __ T16B, v0, v31);
2909       __ eor(v0, __ T16B, v0, v2);
2910 
2911       __ st1(v0, __ T16B, __ post(to, 16));
2912       __ orr(v2, __ T16B, v1, v1);
2913 
2914       __ subw(len_reg, len_reg, 16);
2915       __ cbnzw(len_reg, L_aes_loop);
2916 
2917       __ st1(v2, __ T16B, rvec);
2918 
2919       __ mov(r0, rscratch2);
2920 
2921       __ leave();
2922       __ ret(lr);
2923 
2924     return start;
2925   }
2926 
2927   // Arguments:
2928   //
2929   // Inputs:
2930   //   c_rarg0   - byte[]  source+offset
2931   //   c_rarg1   - int[]   SHA.state
2932   //   c_rarg2   - int     offset
2933   //   c_rarg3   - int     limit
2934   //
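       // When multi_block is true, the stub consumes successive 64-byte blocks
       // while ofs <= limit and returns the updated ofs; roughly (caller-side
       // sketch, helper name hypothetical):
       //
       //   do { sha1_block(state, buf); buf += 64; ofs += 64; } while (ofs <= limit);
       //   return ofs;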
2935   address generate_sha1_implCompress(bool multi_block, const char *name) {
2936     __ align(CodeEntryAlignment);
2937     StubCodeMark mark(this, "StubRoutines", name);
2938     address start = __ pc();
2939 
2940     Register buf   = c_rarg0;
2941     Register state = c_rarg1;
2942     Register ofs   = c_rarg2;
2943     Register limit = c_rarg3;
2944 
2945     Label keys;
2946     Label sha1_loop;
2947 
2948     // load the keys into v0..v3
2949     __ adr(rscratch1, keys);
2950     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2951     // load 5 words state into v6, v7
2952     __ ldrq(v6, Address(state, 0));
2953     __ ldrs(v7, Address(state, 16));
2954 
2955 
2956     __ BIND(sha1_loop);
2957     // load 64 bytes of data into v16..v19
2958     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2959     __ rev32(v16, __ T16B, v16);
2960     __ rev32(v17, __ T16B, v17);
2961     __ rev32(v18, __ T16B, v18);
2962     __ rev32(v19, __ T16B, v19);
2963 
2964     // do the sha1
2965     __ addv(v4, __ T4S, v16, v0);
2966     __ orr(v20, __ T16B, v6, v6);
2967 
2968     FloatRegister d0 = v16;
2969     FloatRegister d1 = v17;
2970     FloatRegister d2 = v18;
2971     FloatRegister d3 = v19;
2972 
2973     for (int round = 0; round < 20; round++) {
2974       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2975       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2976       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2977       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2978       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2979 
2980       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2981       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2982       __ sha1h(tmp2, __ T4S, v20);
2983       if (round < 5)
2984         __ sha1c(v20, __ T4S, tmp3, tmp4);
2985       else if (round < 10 || round >= 15)
2986         __ sha1p(v20, __ T4S, tmp3, tmp4);
2987       else
2988         __ sha1m(v20, __ T4S, tmp3, tmp4);
2989       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2990 
2991       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2992     }
2993 
2994     __ addv(v7, __ T2S, v7, v21);
2995     __ addv(v6, __ T4S, v6, v20);
2996 
2997     if (multi_block) {
2998       __ add(ofs, ofs, 64);
2999       __ cmp(ofs, limit);
3000       __ br(Assembler::LE, sha1_loop);
3001       __ mov(c_rarg0, ofs); // return ofs
3002     }
3003 
3004     __ strq(v6, Address(state, 0));
3005     __ strs(v7, Address(state, 16));
3006 
3007     __ ret(lr);
3008 
3009     __ bind(keys);
3010     __ emit_int32(0x5a827999);
3011     __ emit_int32(0x6ed9eba1);
3012     __ emit_int32(0x8f1bbcdc);
3013     __ emit_int32(0xca62c1d6);
3014 
3015     return start;
3016   }
3017 
3018 
3019   // Arguments:
3020   //
3021   // Inputs:
3022   //   c_rarg0   - byte[]  source+offset
3023   //   c_rarg1   - int[]   SHA.state
3024   //   c_rarg2   - int     offset
3025   //   c_rarg3   - int     limit
3026   //
3027   address generate_sha256_implCompress(bool multi_block, const char *name) {
3028     static const uint32_t round_consts[64] = {
3029       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3030       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3031       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3032       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3033       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3034       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3035       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3036       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3037       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3038       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3039       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3040       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3041       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3042       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3043       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3044       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3045     };
3046     __ align(CodeEntryAlignment);
3047     StubCodeMark mark(this, "StubRoutines", name);
3048     address start = __ pc();
3049 
3050     Register buf   = c_rarg0;
3051     Register state = c_rarg1;
3052     Register ofs   = c_rarg2;
3053     Register limit = c_rarg3;
3054 
3055     Label sha1_loop;
3056 
3057     __ stpd(v8, v9, __ pre(sp, -32));
3058     __ stpd(v10, v11, Address(sp, 16));
3059 
3060 // dga == v0
3061 // dgb == v1
3062 // dg0 == v2
3063 // dg1 == v3
3064 // dg2 == v4
3065 // t0 == v6
3066 // t1 == v7
3067 
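         // The 64 round constants live in v16..v31, four per register. In the
         // round loop below, as_FloatRegister(round + 17) therefore selects the
         // constants for the next group of four rounds (v17 for round 0 up to
         // v31 for round 14); the first group (v16) is consumed by the addv
         // issued just before the loop.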
3068     // load 16 keys to v16..v31
3069     __ lea(rscratch1, ExternalAddress((address)round_consts));
3070     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3071     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3072     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3073     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3074 
3075     // load 8 words (256 bits) state
3076     __ ldpq(v0, v1, state);
3077 
3078     __ BIND(sha1_loop);
3079     // load 64 bytes of data into v8..v11
3080     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3081     __ rev32(v8, __ T16B, v8);
3082     __ rev32(v9, __ T16B, v9);
3083     __ rev32(v10, __ T16B, v10);
3084     __ rev32(v11, __ T16B, v11);
3085 
3086     __ addv(v6, __ T4S, v8, v16);
3087     __ orr(v2, __ T16B, v0, v0);
3088     __ orr(v3, __ T16B, v1, v1);
3089 
3090     FloatRegister d0 = v8;
3091     FloatRegister d1 = v9;
3092     FloatRegister d2 = v10;
3093     FloatRegister d3 = v11;
3094 
3095 
3096     for (int round = 0; round < 16; round++) {
3097       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3098       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3099       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3100       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3101 
3102       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3103        __ orr(v4, __ T16B, v2, v2);
3104       if (round < 15)
3105         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3106       __ sha256h(v2, __ T4S, v3, tmp2);
3107       __ sha256h2(v3, __ T4S, v4, tmp2);
3108       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3109 
3110       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3111     }
3112 
3113     __ addv(v0, __ T4S, v0, v2);
3114     __ addv(v1, __ T4S, v1, v3);
3115 
3116     if (multi_block) {
3117       __ add(ofs, ofs, 64);
3118       __ cmp(ofs, limit);
3119       __ br(Assembler::LE, sha1_loop);
3120       __ mov(c_rarg0, ofs); // return ofs
3121     }
3122 
3123     __ ldpd(v10, v11, Address(sp, 16));
3124     __ ldpd(v8, v9, __ post(sp, 32));
3125 
3126     __ stpq(v0, v1, state);
3127 
3128     __ ret(lr);
3129 
3130     return start;
3131   }
3132 
3133   // Arguments:
3134   //
3135   // Inputs:
3136   //   c_rarg0   - byte[]  source+offset
3137   //   c_rarg1   - int[]   SHA.state
3138   //   c_rarg2   - int     offset
3139   //   c_rarg3   - int     limit
3140   //
3141   address generate_sha512_implCompress(bool multi_block, const char *name) {
3142     static const uint64_t round_consts[80] = {
3143       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3144       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3145       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3146       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3147       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3148       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3149       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3150       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3151       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3152       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3153       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3154       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3155       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3156       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3157       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3158       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3159       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3160       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3161       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3162       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3163       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3164       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3165       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3166       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3167       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3168       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3169       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3170     };
3171 
3172     // Double rounds for sha512.
3173     #define sha512_dround(dr, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \
3174       if (dr < 36)                                                                   \
3175         __ ld1(v##rc1, __ T2D, __ post(rscratch2, 16));                              \
3176       __ addv(v5, __ T2D, v##rc0, v##in0);                                           \
3177       __ ext(v6, __ T16B, v##i2, v##i3, 8);                                          \
3178       __ ext(v5, __ T16B, v5, v5, 8);                                                \
3179       __ ext(v7, __ T16B, v##i1, v##i2, 8);                                          \
3180       __ addv(v##i3, __ T2D, v##i3, v5);                                             \
3181       if (dr < 32) {                                                                 \
3182         __ ext(v5, __ T16B, v##in3, v##in4, 8);                                      \
3183         __ sha512su0(v##in0, __ T2D, v##in1);                                        \
3184       }                                                                              \
3185       __ sha512h(v##i3, __ T2D, v6, v7);                                             \
3186       if (dr < 32)                                                                   \
3187         __ sha512su1(v##in0, __ T2D, v##in2, v5);                                    \
3188       __ addv(v##i4, __ T2D, v##i1, v##i3);                                          \
3189       __ sha512h2(v##i3, __ T2D, v##i1, v##i0);                                      \
3190 
3191     __ align(CodeEntryAlignment);
3192     StubCodeMark mark(this, "StubRoutines", name);
3193     address start = __ pc();
3194 
3195     Register buf   = c_rarg0;
3196     Register state = c_rarg1;
3197     Register ofs   = c_rarg2;
3198     Register limit = c_rarg3;
3199 
3200     __ stpd(v8, v9, __ pre(sp, -64));
3201     __ stpd(v10, v11, Address(sp, 16));
3202     __ stpd(v12, v13, Address(sp, 32));
3203     __ stpd(v14, v15, Address(sp, 48));
3204 
3205     Label sha512_loop;
3206 
3207     // load state
3208     __ ld1(v8, v9, v10, v11, __ T2D, state);
3209 
3210     // load first 4 round constants
3211     __ lea(rscratch1, ExternalAddress((address)round_consts));
3212     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3213 
3214     __ BIND(sha512_loop);
3215     // load 128B of data into v12..v19
3216     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3217     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3218     __ rev64(v12, __ T16B, v12);
3219     __ rev64(v13, __ T16B, v13);
3220     __ rev64(v14, __ T16B, v14);
3221     __ rev64(v15, __ T16B, v15);
3222     __ rev64(v16, __ T16B, v16);
3223     __ rev64(v17, __ T16B, v17);
3224     __ rev64(v18, __ T16B, v18);
3225     __ rev64(v19, __ T16B, v19);
3226 
3227     __ mov(rscratch2, rscratch1);
3228 
3229     __ mov(v0, __ T16B, v8);
3230     __ mov(v1, __ T16B, v9);
3231     __ mov(v2, __ T16B, v10);
3232     __ mov(v3, __ T16B, v11);
3233 
3234     sha512_dround( 0, 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17);
3235     sha512_dround( 1, 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18);
3236     sha512_dround( 2, 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19);
3237     sha512_dround( 3, 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12);
3238     sha512_dround( 4, 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13);
3239     sha512_dround( 5, 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14);
3240     sha512_dround( 6, 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15);
3241     sha512_dround( 7, 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16);
3242     sha512_dround( 8, 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17);
3243     sha512_dround( 9, 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18);
3244     sha512_dround(10, 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19);
3245     sha512_dround(11, 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12);
3246     sha512_dround(12, 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13);
3247     sha512_dround(13, 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14);
3248     sha512_dround(14, 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15);
3249     sha512_dround(15, 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16);
3250     sha512_dround(16, 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17);
3251     sha512_dround(17, 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18);
3252     sha512_dround(18, 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19);
3253     sha512_dround(19, 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12);
3254     sha512_dround(20, 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13);
3255     sha512_dround(21, 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14);
3256     sha512_dround(22, 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15);
3257     sha512_dround(23, 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16);
3258     sha512_dround(24, 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17);
3259     sha512_dround(25, 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18);
3260     sha512_dround(26, 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19);
3261     sha512_dround(27, 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12);
3262     sha512_dround(28, 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13);
3263     sha512_dround(29, 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14);
3264     sha512_dround(30, 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15);
3265     sha512_dround(31, 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16);
3266     sha512_dround(32, 2, 3, 1, 4, 0, 28, 24, 12,  0,  0,  0,  0);
3267     sha512_dround(33, 4, 2, 0, 1, 3, 29, 25, 13,  0,  0,  0,  0);
3268     sha512_dround(34, 1, 4, 3, 0, 2, 30, 26, 14,  0,  0,  0,  0);
3269     sha512_dround(35, 0, 1, 2, 3, 4, 31, 27, 15,  0,  0,  0,  0);
3270     sha512_dround(36, 3, 0, 4, 2, 1, 24,  0, 16,  0,  0,  0,  0);
3271     sha512_dround(37, 2, 3, 1, 4, 0, 25,  0, 17,  0,  0,  0,  0);
3272     sha512_dround(38, 4, 2, 0, 1, 3, 26,  0, 18,  0,  0,  0,  0);
3273     sha512_dround(39, 1, 4, 3, 0, 2, 27,  0, 19,  0,  0,  0,  0);
3274 
3275     __ addv(v8, __ T2D, v8, v0);
3276     __ addv(v9, __ T2D, v9, v1);
3277     __ addv(v10, __ T2D, v10, v2);
3278     __ addv(v11, __ T2D, v11, v3);
3279 
3280     if (multi_block) {
3281       __ add(ofs, ofs, 128);
3282       __ cmp(ofs, limit);
3283       __ br(Assembler::LE, sha512_loop);
3284       __ mov(c_rarg0, ofs); // return ofs
3285     }
3286 
3287     __ st1(v8, v9, v10, v11, __ T2D, state);
3288 
3289     __ ldpd(v14, v15, Address(sp, 48));
3290     __ ldpd(v12, v13, Address(sp, 32));
3291     __ ldpd(v10, v11, Address(sp, 16));
3292     __ ldpd(v8, v9, __ post(sp, 64));
3293 
3294     __ ret(lr);
3295 
3296     return start;
3297   }
3298 
3299   // Safefetch stubs.
3300   void generate_safefetch(const char* name, int size, address* entry,
3301                           address* fault_pc, address* continuation_pc) {
3302     // safefetch signatures:
3303     //   int      SafeFetch32(int*      adr, int      errValue);
3304     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3305     //
3306     // arguments:
3307     //   c_rarg0 = adr
3308     //   c_rarg1 = errValue
3309     //
3310     // result:
3311     //   r0       = *adr or errValue
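         //
         // A typical (hypothetical) use is probing memory that may be unmapped
         // without risking a crash:
         //
         //   int v = SafeFetch32(p, 0xBAD);
         //   if (v == 0xBAD) { /* p was unreadable, or really contained 0xBAD */ }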
3312 
3313     StubCodeMark mark(this, "StubRoutines", name);
3314 
3315     // Entry point, pc or function descriptor.
3316     *entry = __ pc();
3317 
3318     // Load *adr into c_rarg1, may fault.
3319     *fault_pc = __ pc();
3320     switch (size) {
3321       case 4:
3322         // int32_t
3323         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3324         break;
3325       case 8:
3326         // int64_t
3327         __ ldr(c_rarg1, Address(c_rarg0, 0));
3328         break;
3329       default:
3330         ShouldNotReachHere();
3331     }
3332 
3333     // return errValue or *adr
3334     *continuation_pc = __ pc();
3335     __ mov(r0, c_rarg1);
3336     __ ret(lr);
3337   }
3338 
3339   /**
3340    *  Arguments:
3341    *
3342    * Inputs:
3343    *   c_rarg0   - int crc
3344    *   c_rarg1   - byte* buf
3345    *   c_rarg2   - int length
3346    *
3347    * Output:
3348    *       r0    - int crc result
3349    */
3350   address generate_updateBytesCRC32() {
3351     assert(UseCRC32Intrinsics, "what are we doing here?");
3352 
3353     __ align(CodeEntryAlignment);
3354     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3355 
3356     address start = __ pc();
3357 
3358     const Register crc   = c_rarg0;  // crc
3359     const Register buf   = c_rarg1;  // source java byte array address
3360     const Register len   = c_rarg2;  // length
3361     const Register table0 = c_rarg3; // crc_table address
3362     const Register table1 = c_rarg4;
3363     const Register table2 = c_rarg5;
3364     const Register table3 = c_rarg6;
3365     const Register tmp3 = c_rarg7;
3366 
3367     BLOCK_COMMENT("Entry:");
3368     __ enter(); // required for proper stackwalking of RuntimeStub frame
3369 
3370     __ kernel_crc32(crc, buf, len,
3371               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3372 
3373     __ leave(); // required for proper stackwalking of RuntimeStub frame
3374     __ ret(lr);
3375 
3376     return start;
3377   }
3378 
3379   /**
3380    *  Arguments:
3381    *
3382    * Inputs:
3383    *   c_rarg0   - int crc
3384    *   c_rarg1   - byte* buf
3385    *   c_rarg2   - int length
3386    *   c_rarg3   - int* table
3387    *
3388    * Output:
3389    *       r0   - int crc result
3390    */
3391   address generate_updateBytesCRC32C() {
3392     assert(UseCRC32CIntrinsics, "what are we doing here?");
3393 
3394     __ align(CodeEntryAlignment);
3395     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3396 
3397     address start = __ pc();
3398 
3399     const Register crc   = c_rarg0;  // crc
3400     const Register buf   = c_rarg1;  // source java byte array address
3401     const Register len   = c_rarg2;  // length
3402     const Register table0 = c_rarg3; // crc_table address
3403     const Register table1 = c_rarg4;
3404     const Register table2 = c_rarg5;
3405     const Register table3 = c_rarg6;
3406     const Register tmp3 = c_rarg7;
3407 
3408     BLOCK_COMMENT("Entry:");
3409     __ enter(); // required for proper stackwalking of RuntimeStub frame
3410 
3411     __ kernel_crc32c(crc, buf, len,
3412               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3413 
3414     __ leave(); // required for proper stackwalking of RuntimeStub frame
3415     __ ret(lr);
3416 
3417     return start;
3418   }
3419 
3420   /***
3421    *  Arguments:
3422    *
3423    *  Inputs:
3424    *   c_rarg0   - int   adler
3425    *   c_rarg1   - byte* buff
3426    *   c_rarg2   - int   len
3427    *
3428    * Output:
3429    *   c_rarg0   - int adler result
3430    */
3431   address generate_updateBytesAdler32() {
3432     __ align(CodeEntryAlignment);
3433     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3434     address start = __ pc();
3435 
3436     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3437 
3438     // Aliases
3439     Register adler  = c_rarg0;
3440     Register s1     = c_rarg0;
3441     Register s2     = c_rarg3;
3442     Register buff   = c_rarg1;
3443     Register len    = c_rarg2;
3444     Register nmax  = r4;
3445     Register base  = r5;
3446     Register count = r6;
3447     Register temp0 = rscratch1;
3448     Register temp1 = rscratch2;
3449     FloatRegister vbytes = v0;
3450     FloatRegister vs1acc = v1;
3451     FloatRegister vs2acc = v2;
3452     FloatRegister vtable = v3;
3453 
3454     // Max number of bytes we can process before having to take the mod
3455     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3456     uint64_t BASE = 0xfff1;
3457     uint64_t NMAX = 0x15B0;
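         // The modular reductions below rely on 2^16 == 15 (mod BASE): a value x
         // that fits in 32 bits can be folded as x -> 15 * (x >> 16) + (x & 0xffff);
         // two folds plus one conditional subtract land in [0, BASE). Scalar sketch
         // of one fold (illustration only, not code used here):
         //
         //   uint64_t fold(uint64_t x) { return 15 * (x >> 16) + (x & 0xffff); }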
3458 
3459     __ mov(base, BASE);
3460     __ mov(nmax, NMAX);
3461 
3462     // Load accumulation coefficients for the upper 16 bits
3463     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3464     __ ld1(vtable, __ T16B, Address(temp0));
3465 
3466     // s1 is initialized to the lower 16 bits of adler
3467     // s2 is initialized to the upper 16 bits of adler
3468     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3469     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3470 
3471     // The pipelined loop needs at least 16 elements for one iteration; it
3472     // checks this itself, but it is more efficient to skip straight to the cleanup loop
3473     __ cmp(len, (u1)16);
3474     __ br(Assembler::HS, L_nmax);
3475     __ cbz(len, L_combine);
3476 
3477     __ bind(L_simple_by1_loop);
3478     __ ldrb(temp0, Address(__ post(buff, 1)));
3479     __ add(s1, s1, temp0);
3480     __ add(s2, s2, s1);
3481     __ subs(len, len, 1);
3482     __ br(Assembler::HI, L_simple_by1_loop);
3483 
3484     // s1 = s1 % BASE
3485     __ subs(temp0, s1, base);
3486     __ csel(s1, temp0, s1, Assembler::HS);
3487 
3488     // s2 = s2 % BASE
3489     __ lsr(temp0, s2, 16);
3490     __ lsl(temp1, temp0, 4);
3491     __ sub(temp1, temp1, temp0);
3492     __ add(s2, temp1, s2, ext::uxth);
3493 
3494     __ subs(temp0, s2, base);
3495     __ csel(s2, temp0, s2, Assembler::HS);
3496 
3497     __ b(L_combine);
3498 
3499     __ bind(L_nmax);
3500     __ subs(len, len, nmax);
3501     __ sub(count, nmax, 16);
3502     __ br(Assembler::LO, L_by16);
3503 
3504     __ bind(L_nmax_loop);
3505 
3506     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3507                                       vbytes, vs1acc, vs2acc, vtable);
3508 
3509     __ subs(count, count, 16);
3510     __ br(Assembler::HS, L_nmax_loop);
3511 
3512     // s1 = s1 % BASE
3513     __ lsr(temp0, s1, 16);
3514     __ lsl(temp1, temp0, 4);
3515     __ sub(temp1, temp1, temp0);
3516     __ add(temp1, temp1, s1, ext::uxth);
3517 
3518     __ lsr(temp0, temp1, 16);
3519     __ lsl(s1, temp0, 4);
3520     __ sub(s1, s1, temp0);
3521     __ add(s1, s1, temp1, ext::uxth);
3522 
3523     __ subs(temp0, s1, base);
3524     __ csel(s1, temp0, s1, Assembler::HS);
3525 
3526     // s2 = s2 % BASE
3527     __ lsr(temp0, s2, 16);
3528     __ lsl(temp1, temp0, 4);
3529     __ sub(temp1, temp1, temp0);
3530     __ add(temp1, temp1, s2, ext::uxth);
3531 
3532     __ lsr(temp0, temp1, 16);
3533     __ lsl(s2, temp0, 4);
3534     __ sub(s2, s2, temp0);
3535     __ add(s2, s2, temp1, ext::uxth);
3536 
3537     __ subs(temp0, s2, base);
3538     __ csel(s2, temp0, s2, Assembler::HS);
3539 
3540     __ subs(len, len, nmax);
3541     __ sub(count, nmax, 16);
3542     __ br(Assembler::HS, L_nmax_loop);
3543 
3544     __ bind(L_by16);
3545     __ adds(len, len, count);
3546     __ br(Assembler::LO, L_by1);
3547 
3548     __ bind(L_by16_loop);
3549 
3550     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3551                                       vbytes, vs1acc, vs2acc, vtable);
3552 
3553     __ subs(len, len, 16);
3554     __ br(Assembler::HS, L_by16_loop);
3555 
3556     __ bind(L_by1);
3557     __ adds(len, len, 15);
3558     __ br(Assembler::LO, L_do_mod);
3559 
3560     __ bind(L_by1_loop);
3561     __ ldrb(temp0, Address(__ post(buff, 1)));
3562     __ add(s1, temp0, s1);
3563     __ add(s2, s2, s1);
3564     __ subs(len, len, 1);
3565     __ br(Assembler::HS, L_by1_loop);
3566 
3567     __ bind(L_do_mod);
3568     // s1 = s1 % BASE
3569     __ lsr(temp0, s1, 16);
3570     __ lsl(temp1, temp0, 4);
3571     __ sub(temp1, temp1, temp0);
3572     __ add(temp1, temp1, s1, ext::uxth);
3573 
3574     __ lsr(temp0, temp1, 16);
3575     __ lsl(s1, temp0, 4);
3576     __ sub(s1, s1, temp0);
3577     __ add(s1, s1, temp1, ext::uxth);
3578 
3579     __ subs(temp0, s1, base);
3580     __ csel(s1, temp0, s1, Assembler::HS);
3581 
3582     // s2 = s2 % BASE
3583     __ lsr(temp0, s2, 16);
3584     __ lsl(temp1, temp0, 4);
3585     __ sub(temp1, temp1, temp0);
3586     __ add(temp1, temp1, s2, ext::uxth);
3587 
3588     __ lsr(temp0, temp1, 16);
3589     __ lsl(s2, temp0, 4);
3590     __ sub(s2, s2, temp0);
3591     __ add(s2, s2, temp1, ext::uxth);
3592 
3593     __ subs(temp0, s2, base);
3594     __ csel(s2, temp0, s2, Assembler::HS);
3595 
3596     // Combine lower bits and higher bits
3597     __ bind(L_combine);
3598     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3599 
3600     __ ret(lr);
3601 
3602     return start;
3603   }
3604 
3605   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
3606           Register temp0, Register temp1, FloatRegister vbytes,
3607           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
3608     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
3609     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
3610     // In non-vectorized code, we update s1 and s2 as:
3611     //   s1 <- s1 + b1
3612     //   s2 <- s2 + s1
3613     //   s1 <- s1 + b2
3614     //   s2 <- s2 + s1
3615     //   ...
3616     //   s1 <- s1 + b16
3617     //   s2 <- s2 + s1
3618     // Putting above assignments together, we have:
3619     //   s1_new = s1 + b1 + b2 + ... + b16
3620     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
3621     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
3622     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
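         //
         // Scalar reference of this combined 16-byte step (illustration only):
         //
         //   s2 += 16 * s1;
         //   for (int i = 0; i < 16; i++) {
         //     s1 += b[i];
         //     s2 += (16 - i) * b[i];
         //   }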
3623     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
3624 
3625     // s2 = s2 + s1 * 16
3626     __ add(s2, s2, s1, Assembler::LSL, 4);
3627 
3628     // vs1acc = b1 + b2 + b3 + ... + b16
3629     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
3630     __ umullv(vs2acc, __ T8B, vtable, vbytes);
3631     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
3632     __ uaddlv(vs1acc, __ T16B, vbytes);
3633     __ uaddlv(vs2acc, __ T8H, vs2acc);
3634 
3635     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
3636     __ fmovd(temp0, vs1acc);
3637     __ fmovd(temp1, vs2acc);
3638     __ add(s1, s1, temp0);
3639     __ add(s2, s2, temp1);
3640   }
3641 
3642   /**
3643    *  Arguments:
3644    *
3645    *  Input:
3646    *    c_rarg0   - x address
3647    *    c_rarg1   - x length
3648    *    c_rarg2   - y address
3649    *    c_rarg3   - y length
3650    *    c_rarg4   - z address
3651    *    c_rarg5   - z length
3652    */
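       // This stub backs the BigInteger.multiplyToLen intrinsic. Informally
       // (a description, not a verified contract): x, y and z are big-endian
       // int arrays, and z, of length xlen + ylen, receives the full product
       // x * y.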
3653   address generate_multiplyToLen() {
3654     __ align(CodeEntryAlignment);
3655     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3656 
3657     address start = __ pc();
3658     const Register x     = r0;
3659     const Register xlen  = r1;
3660     const Register y     = r2;
3661     const Register ylen  = r3;
3662     const Register z     = r4;
3663     const Register zlen  = r5;
3664 
3665     const Register tmp1  = r10;
3666     const Register tmp2  = r11;
3667     const Register tmp3  = r12;
3668     const Register tmp4  = r13;
3669     const Register tmp5  = r14;
3670     const Register tmp6  = r15;
3671     const Register tmp7  = r16;
3672 
3673     BLOCK_COMMENT("Entry:");
3674     __ enter(); // required for proper stackwalking of RuntimeStub frame
3675     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3676     __ leave(); // required for proper stackwalking of RuntimeStub frame
3677     __ ret(lr);
3678 
3679     return start;
3680   }
3681 
3682   address generate_squareToLen() {
3683     // The squareToLen algorithm for sizes 1..127 described in the Java code is
3684     // faster than multiply_to_len on some CPUs and slower on others, but
3685     // multiply_to_len gives slightly better results overall.
3686     __ align(CodeEntryAlignment);
3687     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3688     address start = __ pc();
3689 
3690     const Register x     = r0;
3691     const Register xlen  = r1;
3692     const Register z     = r2;
3693     const Register zlen  = r3;
3694     const Register y     = r4; // == x
3695     const Register ylen  = r5; // == xlen
3696 
3697     const Register tmp1  = r10;
3698     const Register tmp2  = r11;
3699     const Register tmp3  = r12;
3700     const Register tmp4  = r13;
3701     const Register tmp5  = r14;
3702     const Register tmp6  = r15;
3703     const Register tmp7  = r16;
3704 
3705     RegSet spilled_regs = RegSet::of(y, ylen);
3706     BLOCK_COMMENT("Entry:");
3707     __ enter();
3708     __ push(spilled_regs, sp);
3709     __ mov(y, x);
3710     __ mov(ylen, xlen);
3711     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3712     __ pop(spilled_regs, sp);
3713     __ leave();
3714     __ ret(lr);
3715     return start;
3716   }
3717 
3718   address generate_mulAdd() {
3719     __ align(CodeEntryAlignment);
3720     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3721 
3722     address start = __ pc();
3723 
3724     const Register out     = r0;
3725     const Register in      = r1;
3726     const Register offset  = r2;
3727     const Register len     = r3;
3728     const Register k       = r4;
3729 
3730     BLOCK_COMMENT("Entry:");
3731     __ enter();
3732     __ mul_add(out, in, offset, len, k);
3733     __ leave();
3734     __ ret(lr);
3735 
3736     return start;
3737   }
3738 
3739   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3740                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3741                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3742     // Karatsuba multiplication performs a 128*128 -> 256-bit
3743     // multiplication in three 128-bit multiplications and a few
3744     // additions.
3745     //
3746     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3747     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3748     //
3749     // Inputs:
3750     //
3751     // A0 in a.d[0]     (subkey)
3752     // A1 in a.d[1]
3753     // (A1+A0) in a1_xor_a0.d[0]
3754     //
3755     // B0 in b.d[0]     (state)
3756     // B1 in b.d[1]
3757 
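         // Because additions in GF(2)[x] are XORs, the 256-bit carry-less product
         // is simply (hi:lo) ^ ((E ^ hi ^ lo) << 64), where hi = A1*B1, lo = A0*B0
         // and E = (A1+A0)(B1+B0); that is what the ext/eor/ins sequence below
         // assembles from the three pmull results.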
3758     __ ext(tmp1, __ T16B, b, b, 0x08);
3759     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3760     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3761     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3762     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3763 
3764     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3765     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3766     __ eor(tmp2, __ T16B, tmp2, tmp4);
3767     __ eor(tmp2, __ T16B, tmp2, tmp3);
3768 
3769     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3770     __ ins(result_hi, __ D, tmp2, 0, 1);
3771     __ ins(result_lo, __ D, tmp2, 1, 0);
3772   }
3773 
3774   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3775                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3776     const FloatRegister t0 = result;
3777 
3778     // The GCM field polynomial f is z^128 + p(z), where p =
3779     // z^7+z^2+z+1.
3780     //
3781     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3782     //
3783     // so, given that the product we're reducing is
3784     //    a == lo + hi * z^128
3785     // substituting,
3786     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3787     //
3788     // we reduce by multiplying hi by p(z) and subtracting the result
3789     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3790     // bits we can do this with two 64-bit multiplications, lo*p and
3791     // hi*p.
3792 
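         // Roughly, with clmul a 64x64->128 carry-less multiply and ^ meaning XOR
         // (an informal sketch of the folding below, not a separate algorithm):
         //
         //   t = clmul(top 64 bits of hi, p);     // fold the topmost 64 bits first
         //   hi ^= t >> 64;  lo ^= t << 64;
         //   result = lo ^ clmul(low 64 bits of hi, p);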
3793     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3794     __ ext(t1, __ T16B, t0, z, 8);
3795     __ eor(hi, __ T16B, hi, t1);
3796     __ ext(t1, __ T16B, z, t0, 8);
3797     __ eor(lo, __ T16B, lo, t1);
3798     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3799     __ eor(result, __ T16B, lo, t0);
3800   }
3801 
3802   address generate_has_negatives(address &has_negatives_long) {
3803     const u1 large_loop_size = 64;
3804     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
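         // Testing a 64-bit word against UPPER_BIT_MASK checks the sign bit of all
         // eight bytes at once; scalar sketch (illustration only):
         //
         //   bool any_negative(uint64_t eight_bytes) {
         //     return (eight_bytes & 0x8080808080808080ULL) != 0;
         //   }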
3805     int dcache_line = VM_Version::dcache_line_size();
3806 
3807     Register ary1 = r1, len = r2, result = r0;
3808 
3809     __ align(CodeEntryAlignment);
3810 
3811     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3812 
3813     address entry = __ pc();
3814 
3815     __ enter();
3816 
3817   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3818         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3819 
3820   __ cmp(len, (u1)15);
3821   __ br(Assembler::GT, LEN_OVER_15);
3822   // The only case in which execution falls into this code is when the pointer
3823   // is near the end of a memory page and we have to avoid reading the next page
3824   __ add(ary1, ary1, len);
3825   __ subs(len, len, 8);
3826   __ br(Assembler::GT, LEN_OVER_8);
3827   __ ldr(rscratch2, Address(ary1, -8));
3828   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3829   __ lsrv(rscratch2, rscratch2, rscratch1);
3830   __ tst(rscratch2, UPPER_BIT_MASK);
3831   __ cset(result, Assembler::NE);
3832   __ leave();
3833   __ ret(lr);
3834   __ bind(LEN_OVER_8);
3835   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3836   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3837   __ tst(rscratch2, UPPER_BIT_MASK);
3838   __ br(Assembler::NE, RET_TRUE_NO_POP);
3839   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3840   __ lsrv(rscratch1, rscratch1, rscratch2);
3841   __ tst(rscratch1, UPPER_BIT_MASK);
3842   __ cset(result, Assembler::NE);
3843   __ leave();
3844   __ ret(lr);
3845 
3846   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3847   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3848 
3849   has_negatives_long = __ pc(); // 2nd entry point
3850 
3851   __ enter();
3852 
3853   __ bind(LEN_OVER_15);
3854     __ push(spilled_regs, sp);
3855     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3856     __ cbz(rscratch2, ALIGNED);
3857     __ ldp(tmp6, tmp1, Address(ary1));
3858     __ mov(tmp5, 16);
3859     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3860     __ add(ary1, ary1, rscratch1);
3861     __ sub(len, len, rscratch1);
3862     __ orr(tmp6, tmp6, tmp1);
3863     __ tst(tmp6, UPPER_BIT_MASK);
3864     __ br(Assembler::NE, RET_TRUE);
3865 
3866   __ bind(ALIGNED);
3867     __ cmp(len, large_loop_size);
3868     __ br(Assembler::LT, CHECK_16);
3869     // Perform a 16-byte load as an early return in the pre-loop to handle the
3870     // case where an initially aligned large array has negative values in its
3871     // starting bytes; otherwise LARGE_LOOP would do up to 4 reads instead of 1
3872     // in the worst case, which is slower. Cases with negative bytes further
3873     // ahead are barely affected; in fact they get faster due to the early
3874     // loads, fewer instructions and fewer branches in LARGE_LOOP.
3875     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3876     __ sub(len, len, 16);
3877     __ orr(tmp6, tmp6, tmp1);
3878     __ tst(tmp6, UPPER_BIT_MASK);
3879     __ br(Assembler::NE, RET_TRUE);
3880     __ cmp(len, large_loop_size);
3881     __ br(Assembler::LT, CHECK_16);
3882 
3883     if (SoftwarePrefetchHintDistance >= 0
3884         && SoftwarePrefetchHintDistance >= dcache_line) {
3885       // initial prefetch
3886       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3887     }
3888   __ bind(LARGE_LOOP);
3889     if (SoftwarePrefetchHintDistance >= 0) {
3890       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3891     }
3892     // Issue the load instructions first, since that can save a few CPU/memory
3893     // cycles. Instead of 4 "orr(...); tst(...); cbnz(...)" triples (one per ldp),
3894     // generate 7 * orr(...) + 1 tst(...) + 1 cbnz(...), which uses fewer
3895     // instructions and branches; the trade-off is that early return is disabled,
3896     // so all 64 bytes are loaded and checked every time.
3897     __ ldp(tmp2, tmp3, Address(ary1));
3898     __ ldp(tmp4, tmp5, Address(ary1, 16));
3899     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3900     __ ldp(tmp6, tmp1, Address(ary1, 48));
3901     __ add(ary1, ary1, large_loop_size);
3902     __ sub(len, len, large_loop_size);
3903     __ orr(tmp2, tmp2, tmp3);
3904     __ orr(tmp4, tmp4, tmp5);
3905     __ orr(rscratch1, rscratch1, rscratch2);
3906     __ orr(tmp6, tmp6, tmp1);
3907     __ orr(tmp2, tmp2, tmp4);
3908     __ orr(rscratch1, rscratch1, tmp6);
3909     __ orr(tmp2, tmp2, rscratch1);
3910     __ tst(tmp2, UPPER_BIT_MASK);
3911     __ br(Assembler::NE, RET_TRUE);
3912     __ cmp(len, large_loop_size);
3913     __ br(Assembler::GE, LARGE_LOOP);
3914 
3915   __ bind(CHECK_16); // small 16-byte load pre-loop
3916     __ cmp(len, (u1)16);
3917     __ br(Assembler::LT, POST_LOOP16);
3918 
3919   __ bind(LOOP16); // small 16-byte load loop
3920     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3921     __ sub(len, len, 16);
3922     __ orr(tmp2, tmp2, tmp3);
3923     __ tst(tmp2, UPPER_BIT_MASK);
3924     __ br(Assembler::NE, RET_TRUE);
3925     __ cmp(len, (u1)16);
3926     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3927 
3928   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3929     __ cmp(len, (u1)8);
3930     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3931     __ ldr(tmp3, Address(__ post(ary1, 8)));
3932     __ sub(len, len, 8);
3933     __ tst(tmp3, UPPER_BIT_MASK);
3934     __ br(Assembler::NE, RET_TRUE);
3935 
3936   __ bind(POST_LOOP16_LOAD_TAIL);
3937     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3938     __ ldr(tmp1, Address(ary1));
3939     __ mov(tmp2, 64);
3940     __ sub(tmp4, tmp2, len, __ LSL, 3);
3941     __ lslv(tmp1, tmp1, tmp4);
3942     __ tst(tmp1, UPPER_BIT_MASK);
3943     __ br(Assembler::NE, RET_TRUE);
3944     // Fallthrough
3945 
3946   __ bind(RET_FALSE);
3947     __ pop(spilled_regs, sp);
3948     __ leave();
3949     __ mov(result, zr);
3950     __ ret(lr);
3951 
3952   __ bind(RET_TRUE);
3953     __ pop(spilled_regs, sp);
3954   __ bind(RET_TRUE_NO_POP);
3955     __ leave();
3956     __ mov(result, 1);
3957     __ ret(lr);
3958 
3959   __ bind(DONE);
3960     __ pop(spilled_regs, sp);
3961     __ leave();
3962     __ ret(lr);
3963     return entry;
3964   }
3965 
3966   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3967         bool usePrefetch, Label &NOT_EQUAL) {
3968     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3969         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3970         tmp7 = r12, tmp8 = r13;
3971     Label LOOP;
3972 
3973     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3974     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3975     __ bind(LOOP);
3976     if (usePrefetch) {
3977       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3978       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3979     }
3980     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3981     __ eor(tmp1, tmp1, tmp2);
3982     __ eor(tmp3, tmp3, tmp4);
3983     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3984     __ orr(tmp1, tmp1, tmp3);
3985     __ cbnz(tmp1, NOT_EQUAL);
3986     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3987     __ eor(tmp5, tmp5, tmp6);
3988     __ eor(tmp7, tmp7, tmp8);
3989     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3990     __ orr(tmp5, tmp5, tmp7);
3991     __ cbnz(tmp5, NOT_EQUAL);
3992     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3993     __ eor(tmp1, tmp1, tmp2);
3994     __ eor(tmp3, tmp3, tmp4);
3995     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3996     __ orr(tmp1, tmp1, tmp3);
3997     __ cbnz(tmp1, NOT_EQUAL);
3998     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3999     __ eor(tmp5, tmp5, tmp6);
4000     __ sub(cnt1, cnt1, 8 * wordSize);
4001     __ eor(tmp7, tmp7, tmp8);
4002     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4003     // tmp6 is not used. MacroAssembler::subs is used here (rather than
4004     // cmp) because subs allows an unlimited range of immediate operand.
4005     __ subs(tmp6, cnt1, loopThreshold);
4006     __ orr(tmp5, tmp5, tmp7);
4007     __ cbnz(tmp5, NOT_EQUAL);
4008     __ br(__ GE, LOOP);
4009     // post-loop
4010     __ eor(tmp1, tmp1, tmp2);
4011     __ eor(tmp3, tmp3, tmp4);
4012     __ orr(tmp1, tmp1, tmp3);
4013     __ sub(cnt1, cnt1, 2 * wordSize);
4014     __ cbnz(tmp1, NOT_EQUAL);
4015   }
4016 
4017   void generate_large_array_equals_loop_simd(int loopThreshold,
4018         bool usePrefetch, Label &NOT_EQUAL) {
4019     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4020         tmp2 = rscratch2;
4021     Label LOOP;
4022 
4023     __ bind(LOOP);
4024     if (usePrefetch) {
4025       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4026       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4027     }
4028     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
4029     __ sub(cnt1, cnt1, 8 * wordSize);
4030     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
4031     __ subs(tmp1, cnt1, loopThreshold);
4032     __ eor(v0, __ T16B, v0, v4);
4033     __ eor(v1, __ T16B, v1, v5);
4034     __ eor(v2, __ T16B, v2, v6);
4035     __ eor(v3, __ T16B, v3, v7);
4036     __ orr(v0, __ T16B, v0, v1);
4037     __ orr(v1, __ T16B, v2, v3);
4038     __ orr(v0, __ T16B, v0, v1);
4039     __ umov(tmp1, v0, __ D, 0);
4040     __ umov(tmp2, v0, __ D, 1);
4041     __ orr(tmp1, tmp1, tmp2);
4042     __ cbnz(tmp1, NOT_EQUAL);
4043     __ br(__ GE, LOOP);
4044   }
4045 
4046   // a1 = r1 - array1 address
4047   // a2 = r2 - array2 address
4048   // result = r0 - return value. Already contains "false"
4049   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
4050   // r3-r5 are reserved temporary registers
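       // The overall idea is to XOR corresponding chunks of the two arrays and
       // OR-accumulate the results; any nonzero accumulator means a difference.
       // Scalar sketch per 8-byte word (illustration only):
       //
       //   uint64_t acc = 0;
       //   for (size_t i = 0; i < n_words; i++) acc |= a1[i] ^ a2[i];
       //   return acc == 0;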
4051   address generate_large_array_equals() {
4052     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4053         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4054         tmp7 = r12, tmp8 = r13;
4055     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
4056         SMALL_LOOP, POST_LOOP;
4057     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
4058     // calculate if at least 32 prefetched bytes are used
4059     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
4060     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
4061     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
4062     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
4063         tmp5, tmp6, tmp7, tmp8);
4064 
4065     __ align(CodeEntryAlignment);
4066 
4067     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
4068 
4069     address entry = __ pc();
4070     __ enter();
4071     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
4072     // also advance pointers to use post-increment instead of pre-increment
4073     __ add(a1, a1, wordSize);
4074     __ add(a2, a2, wordSize);
4075     if (AvoidUnalignedAccesses) {
4076       // Both implementations (SIMD and non-SIMD) use relatively wide load
4077       // instructions (ld1/ldp), which carry a large penalty (up to 2x execution
4078       // time) on some CPUs when the address is not at least 16-byte aligned.
4079       // Arrays are currently 8-byte aligned, so do an additional 8-byte load if
4080       // needed to make at least the 1st address 16-byte aligned.
4081       Label ALIGNED16;
4082       __ tbz(a1, 3, ALIGNED16);
4083       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4084       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4085       __ sub(cnt1, cnt1, wordSize);
4086       __ eor(tmp1, tmp1, tmp2);
4087       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
4088       __ bind(ALIGNED16);
4089     }
4090     if (UseSIMDForArrayEquals) {
4091       if (SoftwarePrefetchHintDistance >= 0) {
4092         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4093         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4094         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
4095             /* prfm = */ true, NOT_EQUAL);
4096         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4097         __ br(__ LT, TAIL);
4098       }
4099       __ bind(NO_PREFETCH_LARGE_LOOP);
4100       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
4101           /* prfm = */ false, NOT_EQUAL);
4102     } else {
4103       __ push(spilled_regs, sp);
4104       if (SoftwarePrefetchHintDistance >= 0) {
4105         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4106         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4107         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
4108             /* prfm = */ true, NOT_EQUAL);
4109         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4110         __ br(__ LT, TAIL);
4111       }
4112       __ bind(NO_PREFETCH_LARGE_LOOP);
4113       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
4114           /* prfm = */ false, NOT_EQUAL);
4115     }
4116     __ bind(TAIL);
4117       __ cbz(cnt1, EQUAL);
4118       __ subs(cnt1, cnt1, wordSize);
4119       __ br(__ LE, POST_LOOP);
4120     __ bind(SMALL_LOOP);
4121       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4122       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4123       __ subs(cnt1, cnt1, wordSize);
4124       __ eor(tmp1, tmp1, tmp2);
4125       __ cbnz(tmp1, NOT_EQUAL);
4126       __ br(__ GT, SMALL_LOOP);
4127     __ bind(POST_LOOP);
4128       __ ldr(tmp1, Address(a1, cnt1));
4129       __ ldr(tmp2, Address(a2, cnt1));
4130       __ eor(tmp1, tmp1, tmp2);
4131       __ cbnz(tmp1, NOT_EQUAL);
4132     __ bind(EQUAL);
4133       __ mov(result, true);
4134     __ bind(NOT_EQUAL);
4135       if (!UseSIMDForArrayEquals) {
4136         __ pop(spilled_regs, sp);
4137       }
4138     __ bind(NOT_EQUAL_NO_POP);
4139     __ leave();
4140     __ ret(lr);
4141     return entry;
4142   }
4143 
4144   address generate_dsin_dcos(bool isCos) {
4145     __ align(CodeEntryAlignment);
4146     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
4147     address start = __ pc();
4148     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
4149         (address)StubRoutines::aarch64::_two_over_pi,
4150         (address)StubRoutines::aarch64::_pio2,
4151         (address)StubRoutines::aarch64::_dsin_coef,
4152         (address)StubRoutines::aarch64::_dcos_coef);
4153     return start;
4154   }
4155 
4156   address generate_dlog() {
4157     __ align(CodeEntryAlignment);
4158     StubCodeMark mark(this, "StubRoutines", "dlog");
4159     address entry = __ pc();
4160     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
4161         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
4162     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
4163     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
4164         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
4165     return entry;
4166   }
4167 
4168   // code for comparing 16 bytes of strings with same encoding
4169   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
4170     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
4171     __ ldr(rscratch1, Address(__ post(str1, 8)));
4172     __ eor(rscratch2, tmp1, tmp2);
4173     __ ldr(cnt1, Address(__ post(str2, 8)));
4174     __ cbnz(rscratch2, DIFF1);
4175     __ ldr(tmp1, Address(__ post(str1, 8)));
4176     __ eor(rscratch2, rscratch1, cnt1);
4177     __ ldr(tmp2, Address(__ post(str2, 8)));
4178     __ cbnz(rscratch2, DIFF2);
4179   }
4180 
4181   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
4182   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4183       Label &DIFF2) {
4184     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
4185     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
4186 
4187     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4188     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4189     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4190     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4191 
4192     __ fmovd(tmpL, vtmp3);
4193     __ eor(rscratch2, tmp3, tmpL);
4194     __ cbnz(rscratch2, DIFF2);
4195 
4196     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4197     __ umov(tmpL, vtmp3, __ D, 1);
4198     __ eor(rscratch2, tmpU, tmpL);
4199     __ cbnz(rscratch2, DIFF1);
4200 
4201     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4202     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4203     __ fmovd(tmpL, vtmp);
4204     __ eor(rscratch2, tmp3, tmpL);
4205     __ cbnz(rscratch2, DIFF2);
4206 
4207     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4208     __ umov(tmpL, vtmp, __ D, 1);
4209     __ eor(rscratch2, tmpU, tmpL);
4210     __ cbnz(rscratch2, DIFF1);
4211   }
4212 
4213   // r0  = result
4214   // r1  = str1
4215   // r2  = cnt1
4216   // r3  = str2
4217   // r4  = cnt2
4218   // r10 = tmp1
4219   // r11 = tmp2
4220   address generate_compare_long_string_different_encoding(bool isLU) {
4221     __ align(CodeEntryAlignment);
4222     StubCodeMark mark(this, "StubRoutines", isLU
4223         ? "compare_long_string_different_encoding LU"
4224         : "compare_long_string_different_encoding UL");
4225     address entry = __ pc();
4226     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4227         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
4228         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4229     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4230         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4231     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4232     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4233 
4234     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
4235 
4236     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4237     // cnt2 == amount of characters left to compare
4238     // Check the already loaded first 4 symbols (vtmp and tmp2(LU)/tmp1(UL))
4239     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4240     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4241     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4242     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4243     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4244     __ eor(rscratch2, tmp1, tmp2);
4245     __ mov(rscratch1, tmp2);
4246     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4247     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4248              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4249     __ push(spilled_regs, sp);
4250     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
4251     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
4252 
4253     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4254 
4255     if (SoftwarePrefetchHintDistance >= 0) {
4256       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4257       __ br(__ LT, NO_PREFETCH);
4258       __ bind(LARGE_LOOP_PREFETCH);
4259         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4260         __ mov(tmp4, 2);
4261         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4262         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4263           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4264           __ subs(tmp4, tmp4, 1);
4265           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4266           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4267           __ mov(tmp4, 2);
4268         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4269           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4270           __ subs(tmp4, tmp4, 1);
4271           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4272           __ sub(cnt2, cnt2, 64);
4273           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4274           __ br(__ GE, LARGE_LOOP_PREFETCH);
4275     }
4276     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4277     __ bind(NO_PREFETCH);
4278     __ subs(cnt2, cnt2, 16);
4279     __ br(__ LT, TAIL);
4280     __ align(OptoLoopAlignment);
4281     __ bind(SMALL_LOOP); // smaller loop
4282       __ subs(cnt2, cnt2, 16);
4283       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4284       __ br(__ GE, SMALL_LOOP);
4285       __ cmn(cnt2, (u1)16);
4286       __ br(__ EQ, LOAD_LAST);
4287     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
4288       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
4289       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
4290       __ ldr(tmp3, Address(cnt1, -8));
4291       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
4292       __ b(LOAD_LAST);
4293     __ bind(DIFF2);
4294       __ mov(tmpU, tmp3);
4295     __ bind(DIFF1);
4296       __ pop(spilled_regs, sp);
4297       __ b(CALCULATE_DIFFERENCE);
4298     __ bind(LOAD_LAST);
4299       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
4300       // No need to load them again
4301       __ mov(tmpU, tmp3);
4302       __ pop(spilled_regs, sp);
4303 
4304       // tmp2 points to the address of the last 4 Latin1 characters right now
4305       __ ldrs(vtmp, Address(tmp2));
4306       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4307       __ fmovd(tmpL, vtmp);
4308 
4309       __ eor(rscratch2, tmpU, tmpL);
4310       __ cbz(rscratch2, DONE);
4311 
4312     // Find the first different characters in the longwords and
4313     // compute their difference.
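         // In C, approximately (a sketch; clz/byte_reverse stand in for the CLZ and
         // REV instructions, 'diff' is the non-zero XOR of the two differing 4-char
         // groups, and 'c1'/'c2' are those groups from str1 and str2 respectively):
         //
         //   int k = clz(byte_reverse(diff)) & ~15;      // bit offset of the first
         //                                               // differing 16-bit character
         //   return (jchar)(c1 >> k) - (jchar)(c2 >> k);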
4314     __ bind(CALCULATE_DIFFERENCE);
4315       __ rev(rscratch2, rscratch2);
4316       __ clz(rscratch2, rscratch2);
4317       __ andr(rscratch2, rscratch2, -16);
4318       __ lsrv(tmp1, tmp1, rscratch2);
4319       __ uxthw(tmp1, tmp1);
4320       __ lsrv(rscratch1, rscratch1, rscratch2);
4321       __ uxthw(rscratch1, rscratch1);
4322       __ subw(result, tmp1, rscratch1);
4323     __ bind(DONE);
4324       __ ret(lr);
4325     return entry;
4326   }
4327 
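       // The nmethod entry barrier stub: it calls the runtime
       // (BarrierSetNMethod::nmethod_stub_entry_barrier) with the address of the
       // saved return address; a non-zero result takes the deoptimize path, which
       // continues at the {sp, fp, lr, pc} placed in the four stack words reserved
       // below.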
4328   address generate_method_entry_barrier() {
4329     __ align(CodeEntryAlignment);
4330     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
4331 
4332     Label deoptimize_label;
4333 
4334     address start = __ pc();
4335 
4336     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
4337 
4338     __ enter();
4339     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
4340 
4341     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
4342 
4343     __ push_call_clobbered_registers();
4344 
4345     __ mov(c_rarg0, rscratch2);
4346     __ call_VM_leaf
4347          (CAST_FROM_FN_PTR
4348           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
4349 
4350     __ reset_last_Java_frame(true);
4351 
4352     __ mov(rscratch1, r0);
4353 
4354     __ pop_call_clobbered_registers();
4355 
4356     __ cbnz(rscratch1, deoptimize_label);
4357 
4358     __ leave();
4359     __ ret(lr);
4360 
4361     __ BIND(deoptimize_label);
4362 
4363     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
4364     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
4365 
4366     __ mov(sp, rscratch1);
4367     __ br(rscratch2);
4368 
4369     return start;
4370   }
4371 
4372   // r0  = result
4373   // r1  = str1
4374   // r2  = cnt1
4375   // r3  = str2
4376   // r4  = cnt2
4377   // r10 = tmp1
4378   // r11 = tmp2
4379   address generate_compare_long_string_same_encoding(bool isLL) {
4380     __ align(CodeEntryAlignment);
4381     StubCodeMark mark(this, "StubRoutines", isLL
4382         ? "compare_long_string_same_encoding LL"
4383         : "compare_long_string_same_encoding UU");
4384     address entry = __ pc();
4385     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4386         tmp1 = r10, tmp2 = r11;
4387     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4388         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4389         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4390     // exit from the large loop when fewer than 64 bytes are left to read or we're
4391     // about to prefetch memory beyond the array bounds
4392     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
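         // (for example, if SoftwarePrefetchHintDistance were 192, the prefetch loop
         // below would be left once cnt2 is at most 192 for LL or 96 for UU, i.e.
         // once at most 192 bytes remain)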
4393     // cnt1/cnt2 contain the number of characters to compare. cnt1 can be re-used
4394     // update the cnt2 counter to account for the 8 bytes already loaded
4395     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4396     // update pointers, because of previous read
4397     __ add(str1, str1, wordSize);
4398     __ add(str2, str2, wordSize);
4399     if (SoftwarePrefetchHintDistance >= 0) {
4400       __ bind(LARGE_LOOP_PREFETCH);
4401         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4402         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4403         compare_string_16_bytes_same(DIFF, DIFF2);
4404         compare_string_16_bytes_same(DIFF, DIFF2);
4405         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4406         compare_string_16_bytes_same(DIFF, DIFF2);
4407         __ subs(rscratch2, cnt2, largeLoopExitCondition);
4408         compare_string_16_bytes_same(DIFF, DIFF2);
4409         __ br(__ GT, LARGE_LOOP_PREFETCH);
4410         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4411     }
4412     // less than 16 bytes left?
4413     __ subs(cnt2, cnt2, isLL ? 16 : 8);
4414     __ br(__ LT, TAIL);
4415     __ align(OptoLoopAlignment);
4416     __ bind(SMALL_LOOP);
4417       compare_string_16_bytes_same(DIFF, DIFF2);
4418       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4419       __ br(__ GE, SMALL_LOOP);
4420     __ bind(TAIL);
4421       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4422       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4423       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4424       __ br(__ LE, CHECK_LAST);
4425       __ eor(rscratch2, tmp1, tmp2);
4426       __ cbnz(rscratch2, DIFF);
4427       __ ldr(tmp1, Address(__ post(str1, 8)));
4428       __ ldr(tmp2, Address(__ post(str2, 8)));
4429       __ sub(cnt2, cnt2, isLL ? 8 : 4);
4430     __ bind(CHECK_LAST);
4431       if (!isLL) {
4432         __ add(cnt2, cnt2, cnt2); // now in bytes
4433       }
4434       __ eor(rscratch2, tmp1, tmp2);
4435       __ cbnz(rscratch2, DIFF);
4436       __ ldr(rscratch1, Address(str1, cnt2));
4437       __ ldr(cnt1, Address(str2, cnt2));
4438       __ eor(rscratch2, rscratch1, cnt1);
4439       __ cbz(rscratch2, LENGTH_DIFF);
4440       // Find the first different characters in the longwords and
4441       // compute their difference.
4442     __ bind(DIFF2);
4443       __ rev(rscratch2, rscratch2);
4444       __ clz(rscratch2, rscratch2);
4445       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4446       __ lsrv(rscratch1, rscratch1, rscratch2);
4447       if (isLL) {
4448         __ lsrv(cnt1, cnt1, rscratch2);
4449         __ uxtbw(rscratch1, rscratch1);
4450         __ uxtbw(cnt1, cnt1);
4451       } else {
4452         __ lsrv(cnt1, cnt1, rscratch2);
4453         __ uxthw(rscratch1, rscratch1);
4454         __ uxthw(cnt1, cnt1);
4455       }
4456       __ subw(result, rscratch1, cnt1);
4457       __ b(LENGTH_DIFF);
4458     __ bind(DIFF);
4459       __ rev(rscratch2, rscratch2);
4460       __ clz(rscratch2, rscratch2);
4461       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4462       __ lsrv(tmp1, tmp1, rscratch2);
4463       if (isLL) {
4464         __ lsrv(tmp2, tmp2, rscratch2);
4465         __ uxtbw(tmp1, tmp1);
4466         __ uxtbw(tmp2, tmp2);
4467       } else {
4468         __ lsrv(tmp2, tmp2, rscratch2);
4469         __ uxthw(tmp1, tmp1);
4470         __ uxthw(tmp2, tmp2);
4471       }
4472       __ subw(result, tmp1, tmp2);
4473       __ b(LENGTH_DIFF);
4474     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4475       __ eor(rscratch2, tmp1, tmp2);
4476       __ cbnz(rscratch2, DIFF);
4477     __ bind(LENGTH_DIFF);
4478       __ ret(lr);
4479     return entry;
4480   }
4481 
4482   void generate_compare_long_strings() {
4483       StubRoutines::aarch64::_compare_long_string_LL
4484           = generate_compare_long_string_same_encoding(true);
4485       StubRoutines::aarch64::_compare_long_string_UU
4486           = generate_compare_long_string_same_encoding(false);
4487       StubRoutines::aarch64::_compare_long_string_LU
4488           = generate_compare_long_string_different_encoding(true);
4489       StubRoutines::aarch64::_compare_long_string_UL
4490           = generate_compare_long_string_different_encoding(false);
4491   }
4492 
4493   // R0 = result
4494   // R1 = str2
4495   // R2 = cnt1
4496   // R3 = str1
4497   // R4 = cnt2
4498   // This generic linear code uses a few additional ideas that make it faster:
4499   // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
4500   // in order to skip the initial load (this helps on systems with a single load pipeline)
4501   // 2) we can use a "fast" algorithm for finding the first character, with fewer
4502   // branches (1 branch per loaded register instead of a branch per symbol);
4503   // this is where constants like
4504   // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
4505   // (a C sketch of this trick follows this comment)
4506   // 3) after loading and analyzing the 1st register of the source string, it can be
4507   // used to search for every occurrence of the 1st character, saving a few loads
4508   // compared with a simpler-but-slower implementation
4509   // 4) in order to avoid lots of push/pop operations, the code below heavily
4510   // re-uses/re-initializes/compresses register values, which makes the code larger
4511   // and a bit less readable; however, most of the extra operations are issued during loads or branches, so the penalty is minimal
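       //
       // A C sketch of idea (2) for a Latin1 str2 (the UTF-16 case uses the
       // 0x0001...0001 and 0x7fff...7fff constants and 16-bit lanes instead);
       // 'chunk' is one register-load of str2, and 'tmp1'/'first' match the
       // registers of the same names below:
       //
       //   uint64_t tmp1  = 0x0101010101010101ULL;
       //   uint64_t first = (str1[0] & 0xff) * tmp1;   // 1st pattern char in every lane
       //   uint64_t x     = chunk ^ first;             // a zero byte <=> that lane matches
       //   uint64_t hit   = (x - tmp1) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
       //
       // 'hit' is non-zero iff some byte of 'chunk' equals the first pattern
       // character; its lowest set bit marks the first such position (higher lanes
       // can hold false positives, which the verification loops reject anyway), so
       // a single branch covers a whole register of loaded characters.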
4512   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4513     const char* stubName = str1_isL
4514         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4515         : "indexof_linear_uu";
4516     __ align(CodeEntryAlignment);
4517     StubCodeMark mark(this, "StubRoutines", stubName);
4518     address entry = __ pc();
4519 
4520     int str1_chr_size = str1_isL ? 1 : 2;
4521     int str2_chr_size = str2_isL ? 1 : 2;
4522     int str1_chr_shift = str1_isL ? 0 : 1;
4523     int str2_chr_shift = str2_isL ? 0 : 1;
4524     bool isL = str1_isL && str2_isL;
4525     // parameters
4526     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4527     // temporary registers
4528     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4529     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4530     // redefinitions
4531     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4532 
4533     __ push(spilled_regs, sp);
4534     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
4535         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4536         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4537         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4538         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4539         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
4540     // Read whole register from str1. It is safe, because length >=8 here
4541     __ ldr(ch1, Address(str1));
4542     // Read whole register from str2. It is safe, because length >=8 here
4543     __ ldr(ch2, Address(str2));
4544     __ sub(cnt2, cnt2, cnt1);
4545     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4546     if (str1_isL != str2_isL) {
4547       __ eor(v0, __ T16B, v0, v0);
4548     }
4549     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4550     __ mul(first, first, tmp1);
4551     // check if we have less than 1 register to check
4552     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4553     if (str1_isL != str2_isL) {
4554       __ fmovd(v1, ch1);
4555     }
4556     __ br(__ LE, L_SMALL);
4557     __ eor(ch2, first, ch2);
4558     if (str1_isL != str2_isL) {
4559       __ zip1(v1, __ T16B, v1, v0);
4560     }
4561     __ sub(tmp2, ch2, tmp1);
4562     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4563     __ bics(tmp2, tmp2, ch2);
4564     if (str1_isL != str2_isL) {
4565       __ fmovd(ch1, v1);
4566     }
4567     __ br(__ NE, L_HAS_ZERO);
4568     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4569     __ add(result, result, wordSize/str2_chr_size);
4570     __ add(str2, str2, wordSize);
4571     __ br(__ LT, L_POST_LOOP);
4572     __ BIND(L_LOOP);
4573       __ ldr(ch2, Address(str2));
4574       __ eor(ch2, first, ch2);
4575       __ sub(tmp2, ch2, tmp1);
4576       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4577       __ bics(tmp2, tmp2, ch2);
4578       __ br(__ NE, L_HAS_ZERO);
4579     __ BIND(L_LOOP_PROCEED);
4580       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4581       __ add(str2, str2, wordSize);
4582       __ add(result, result, wordSize/str2_chr_size);
4583       __ br(__ GE, L_LOOP);
4584     __ BIND(L_POST_LOOP);
4585       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
4586       __ br(__ LE, NOMATCH);
4587       __ ldr(ch2, Address(str2));
4588       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4589       __ eor(ch2, first, ch2);
4590       __ sub(tmp2, ch2, tmp1);
4591       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4592       __ mov(tmp4, -1); // all bits set
4593       __ b(L_SMALL_PROCEED);
4594     __ align(OptoLoopAlignment);
4595     __ BIND(L_SMALL);
4596       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4597       __ eor(ch2, first, ch2);
4598       if (str1_isL != str2_isL) {
4599         __ zip1(v1, __ T16B, v1, v0);
4600       }
4601       __ sub(tmp2, ch2, tmp1);
4602       __ mov(tmp4, -1); // all bits set
4603       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4604       if (str1_isL != str2_isL) {
4605         __ fmovd(ch1, v1); // move converted 4 symbols
4606       }
4607     __ BIND(L_SMALL_PROCEED);
4608       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
4609       __ bic(tmp2, tmp2, ch2);
4610       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
4611       __ rbit(tmp2, tmp2);
4612       __ br(__ EQ, NOMATCH);
4613     __ BIND(L_SMALL_HAS_ZERO_LOOP);
4614       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
4615       __ cmp(cnt1, u1(wordSize/str2_chr_size));
4616       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4617       if (str2_isL) { // LL
4618         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4619         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4620         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4621         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4622         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4623       } else {
4624         __ mov(ch2, 0xE); // all bits in byte set except last one
4625         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4626         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4627         __ lslv(tmp2, tmp2, tmp4);
4628         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4629         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4630         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4631         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4632       }
4633       __ cmp(ch1, ch2);
4634       __ mov(tmp4, wordSize/str2_chr_size);
4635       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4636     __ BIND(L_SMALL_CMP_LOOP);
4637       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4638                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4639       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4640                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4641       __ add(tmp4, tmp4, 1);
4642       __ cmp(tmp4, cnt1);
4643       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4644       __ cmp(first, ch2);
4645       __ br(__ EQ, L_SMALL_CMP_LOOP);
4646     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4647       __ cbz(tmp2, NOMATCH); // no more matches. exit
4648       __ clz(tmp4, tmp2);
4649       __ add(result, result, 1); // advance index
4650       __ add(str2, str2, str2_chr_size); // advance pointer
4651       __ b(L_SMALL_HAS_ZERO_LOOP);
4652     __ align(OptoLoopAlignment);
4653     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4654       __ cmp(first, ch2);
4655       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4656       __ b(DONE);
4657     __ align(OptoLoopAlignment);
4658     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4659       if (str2_isL) { // LL
4660         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4661         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4662         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4663         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4664         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4665       } else {
4666         __ mov(ch2, 0xE); // all bits in byte set except last one
4667         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4668         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4669         __ lslv(tmp2, tmp2, tmp4);
4670         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4671         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4672         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4673         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4674       }
4675       __ cmp(ch1, ch2);
4676       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4677       __ b(DONE);
4678     __ align(OptoLoopAlignment);
4679     __ BIND(L_HAS_ZERO);
4680       __ rbit(tmp2, tmp2);
4681       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
4682       // Now pack both counters (cnt2 and cnt1) into one register: cnt2 = (cnt1 << 32) | cnt2.
4683       // This is fine because both counters are 32-bit and are not changed in this
4684       // loop; they are restored on exit, so cnt1 can be re-used inside the loop.
4685       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
4686       __ sub(result, result, 1);
4687     __ BIND(L_HAS_ZERO_LOOP);
4688       __ mov(cnt1, wordSize/str2_chr_size);
4689       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4690       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4691       if (str2_isL) {
4692         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4693         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4694         __ lslv(tmp2, tmp2, tmp4);
4695         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4696         __ add(tmp4, tmp4, 1);
4697         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4698         __ lsl(tmp2, tmp2, 1);
4699         __ mov(tmp4, wordSize/str2_chr_size);
4700       } else {
4701         __ mov(ch2, 0xE);
4702         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4703         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4704         __ lslv(tmp2, tmp2, tmp4);
4705         __ add(tmp4, tmp4, 1);
4706         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4707         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4708         __ lsl(tmp2, tmp2, 1);
4709         __ mov(tmp4, wordSize/str2_chr_size);
4710         __ sub(str2, str2, str2_chr_size);
4711       }
4712       __ cmp(ch1, ch2);
4713       __ mov(tmp4, wordSize/str2_chr_size);
4714       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4715     __ BIND(L_CMP_LOOP);
4716       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4717                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4718       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4719                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4720       __ add(tmp4, tmp4, 1);
4721       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4722       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4723       __ cmp(cnt1, ch2);
4724       __ br(__ EQ, L_CMP_LOOP);
4725     __ BIND(L_CMP_LOOP_NOMATCH);
4726       // we did not match here
4727       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4728       __ clz(tmp4, tmp2);
4729       __ add(str2, str2, str2_chr_size); // advance pointer
4730       __ b(L_HAS_ZERO_LOOP);
4731     __ align(OptoLoopAlignment);
4732     __ BIND(L_CMP_LOOP_LAST_CMP);
4733       __ cmp(cnt1, ch2);
4734       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4735       __ b(DONE);
4736     __ align(OptoLoopAlignment);
4737     __ BIND(L_CMP_LOOP_LAST_CMP2);
4738       if (str2_isL) {
4739         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4740         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4741         __ lslv(tmp2, tmp2, tmp4);
4742         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4743         __ add(tmp4, tmp4, 1);
4744         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4745         __ lsl(tmp2, tmp2, 1);
4746       } else {
4747         __ mov(ch2, 0xE);
4748         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4749         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4750         __ lslv(tmp2, tmp2, tmp4);
4751         __ add(tmp4, tmp4, 1);
4752         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4753         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4754         __ lsl(tmp2, tmp2, 1);
4755         __ sub(str2, str2, str2_chr_size);
4756       }
4757       __ cmp(ch1, ch2);
4758       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4759       __ b(DONE);
4760     __ align(OptoLoopAlignment);
4761     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
4762       // 1) Restore the "result" index. Until the L_HAS_ZERO block the index was a
4763       // multiple of wordSize/str2_chr_size. One octet of bytes was analyzed in
4764       // L_HAS_ZERO_LOOP, so result was increased by at most wordSize/str2_chr_size - 1
4765       // and the higher bits were not changed. L_LOOP_PROCEED will increase
4766       // result by the number of analyzed characters, so we can just reset the lower
4767       // bits of result here: clear the 2 lower bits for UU/UL and 3 bits for LL.
4768       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
4769       // 3) Advance str2 to the next str2 octet. result & 7 (LL) / result & 3 (UU/UL) is
4770       // the index of the last analyzed substring inside the current octet, so str2 is
4771       // at the corresponding start address; we need to advance it to the next octet.
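           // In C, approximately (recalling that cnt2 was packed as (cnt1 << 32) | cnt2):
           //   int analyzed = result & (wordSize/str2_chr_size - 1); // chars analyzed in this octet
           //   cnt1    = cnt2 >> 32;                                 // unpack cnt1
           //   result &= ~(wordSize/str2_chr_size - 1);              // clear the low bits
           //   str2   -= analyzed << str2_chr_shift;                 // back to the octet start
           //   cnt2    = (uint32_t)cnt2;                             // movw zero-extends the low half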
4772       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4773       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4774       __ bfm(result, zr, 0, 2 - str2_chr_shift);
4775       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4776       __ movw(cnt2, cnt2);
4777       __ b(L_LOOP_PROCEED);
4778     __ align(OptoLoopAlignment);
4779     __ BIND(NOMATCH);
4780       __ mov(result, -1);
4781     __ BIND(DONE);
4782       __ pop(spilled_regs, sp);
4783       __ ret(lr);
4784     return entry;
4785   }
4786 
4787   void generate_string_indexof_stubs() {
4788     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
4789     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
4790     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
4791   }
4792 
4793   void inflate_and_store_2_fp_registers(bool generatePrfm,
4794       FloatRegister src1, FloatRegister src2) {
4795     Register dst = r1;
4796     __ zip1(v1, __ T16B, src1, v0);
4797     __ zip2(v2, __ T16B, src1, v0);
4798     if (generatePrfm) {
4799       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4800     }
4801     __ zip1(v3, __ T16B, src2, v0);
4802     __ zip2(v4, __ T16B, src2, v0);
4803     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4804   }
4805 
4806   // R0 = src
4807   // R1 = dst
4808   // R2 = len
4809   // R3 = len >> 3
4810   // V0 = 0
4811   // V1 = loaded 8 bytes
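       //
       // In C, approximately (a per-character sketch of the inflation done below;
       // the stub itself works 64 bytes at a time on a count set up by its caller):
       //
       //   for (size_t i = 0; i < len; i++)
       //     dst[i] = (jchar)(src[i] & 0xff);   // widen each Latin1 byte to a UTF-16 char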
4812   address generate_large_byte_array_inflate() {
4813     __ align(CodeEntryAlignment);
4814     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4815     address entry = __ pc();
4816     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4817     Register src = r0, dst = r1, len = r2, octetCounter = r3;
4818     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
4819 
4820     // do one more 8-byte read so that the address is 16-byte aligned in most cases;
4821     // this also lets a single store instruction be used
4822     __ ldrd(v2, __ post(src, 8));
4823     __ sub(octetCounter, octetCounter, 2);
4824     __ zip1(v1, __ T16B, v1, v0);
4825     __ zip1(v2, __ T16B, v2, v0);
4826     __ st1(v1, v2, __ T16B, __ post(dst, 32));
4827     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4828     __ subs(rscratch1, octetCounter, large_loop_threshold);
4829     __ br(__ LE, LOOP_START);
4830     __ b(LOOP_PRFM_START);
4831     __ bind(LOOP_PRFM);
4832       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4833     __ bind(LOOP_PRFM_START);
4834       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4835       __ sub(octetCounter, octetCounter, 8);
4836       __ subs(rscratch1, octetCounter, large_loop_threshold);
4837       inflate_and_store_2_fp_registers(true, v3, v4);
4838       inflate_and_store_2_fp_registers(true, v5, v6);
4839       __ br(__ GT, LOOP_PRFM);
4840       __ cmp(octetCounter, (u1)8);
4841       __ br(__ LT, DONE);
4842     __ bind(LOOP);
4843       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4844       __ bind(LOOP_START);
4845       __ sub(octetCounter, octetCounter, 8);
4846       __ cmp(octetCounter, (u1)8);
4847       inflate_and_store_2_fp_registers(false, v3, v4);
4848       inflate_and_store_2_fp_registers(false, v5, v6);
4849       __ br(__ GE, LOOP);
4850     __ bind(DONE);
4851       __ ret(lr);
4852     return entry;
4853   }
4854 
4855   /**
4856    *  Arguments:
4857    *
4858    *  Input:
4859    *  c_rarg0   - current state address
4860    *  c_rarg1   - H key address
4861    *  c_rarg2   - data address
4862    *  c_rarg3   - number of blocks
4863    *
4864    *  Output:
4865    *  Updated state at c_rarg0
4866    */
4867   address generate_ghash_processBlocks() {
4868     // Bafflingly, GCM uses little-endian for the byte order, but
4869     // big-endian for the bit order.  For example, the polynomial 1 is
4870     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4871     //
4872     // So, we must either reverse the bytes in each word and do
4873     // everything big-endian or reverse the bits in each byte and do
4874     // it little-endian.  On AArch64 it's more idiomatic to reverse
4875     // the bits in each byte (we have an instruction, RBIT, to do
4876     // that) and keep the data in little-endian bit order throughout the
4877     // calculation, bit-reversing the inputs and outputs.
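         //
         // For example, RBIT maps the byte 0x80 (1000 0000b) to 0x01, so after
         // bit-reversing each byte the polynomial "1" above becomes the ordinary
         // little-endian integer 1 and the usual shift/multiply identities apply.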
4878 
4879     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4880     __ align(wordSize * 2);
4881     address p = __ pc();
4882     __ emit_int64(0x87);  // The low-order bits of the field
4883                           // polynomial (i.e. p = z^7+z^2+z+1)
4884                           // repeated in the low and high parts of a
4885                           // 128-bit vector
4886     __ emit_int64(0x87);
4887 
4888     __ align(CodeEntryAlignment);
4889     address start = __ pc();
4890 
4891     Register state   = c_rarg0;
4892     Register subkeyH = c_rarg1;
4893     Register data    = c_rarg2;
4894     Register blocks  = c_rarg3;
4895 
4896     FloatRegister vzr = v30;
4897     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4898 
4899     __ ldrq(v0, Address(state));
4900     __ ldrq(v1, Address(subkeyH));
4901 
4902     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4903     __ rbit(v0, __ T16B, v0);
4904     __ rev64(v1, __ T16B, v1);
4905     __ rbit(v1, __ T16B, v1);
4906 
4907     __ ldrq(v26, p);
4908 
4909     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4910     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4911 
4912     {
4913       Label L_ghash_loop;
4914       __ bind(L_ghash_loop);
4915 
4916       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4917                                                  // reversing each byte
4918       __ rbit(v2, __ T16B, v2);
4919       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4920 
4921       // Multiply state in v2 by subkey in v1
4922       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4923                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4924                      /*temps*/v6, v20, v18, v21);
4925       // Reduce v7:v5 by the field polynomial
4926       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4927 
4928       __ sub(blocks, blocks, 1);
4929       __ cbnz(blocks, L_ghash_loop);
4930     }
4931 
4932     // The bit-reversed result is at this point in v0
4933     __ rev64(v1, __ T16B, v0);
4934     __ rbit(v1, __ T16B, v1);
4935 
4936     __ st1(v1, __ T16B, state);
4937     __ ret(lr);
4938 
4939     return start;
4940   }
4941 
4942   // Continuation point for throwing of implicit exceptions that are
4943   // not handled in the current activation. Fabricates an exception
4944   // oop and initiates normal exception dispatching in this
4945   // frame. Since we need to preserve callee-saved values (currently
4946   // only for C2, but done for C1 as well) we need a callee-saved oop
4947   // map and therefore have to make these stubs into RuntimeStubs
4948   // rather than BufferBlobs.  If the compiler needs all registers to
4949   // be preserved between the fault point and the exception handler
4950   // then it must assume responsibility for that in
4951   // AbstractCompiler::continuation_for_implicit_null_exception or
4952   // continuation_for_implicit_division_by_zero_exception. All other
4953   // implicit exceptions (e.g., NullPointerException or
4954   // AbstractMethodError on entry) are either at call sites or
4955   // otherwise assume that stack unwinding will be initiated, so
4956   // caller saved registers were assumed volatile in the compiler.
4957 
4958 #undef __
4959 #define __ masm->
4960 
4961   address generate_throw_exception(const char* name,
4962                                    address runtime_entry,
4963                                    Register arg1 = noreg,
4964                                    Register arg2 = noreg) {
4965     // Information about frame layout at time of blocking runtime call.
4966     // Note that we only have to preserve callee-saved registers since
4967     // the compilers are responsible for supplying a continuation point
4968     // if they expect all registers to be preserved.
4969     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4970     enum layout {
4971       rfp_off = 0,
4972       rfp_off2,
4973       return_off,
4974       return_off2,
4975       framesize // inclusive of return address
4976     };
4977 
4978     int insts_size = 512;
4979     int locs_size  = 64;
4980 
4981     CodeBuffer code(name, insts_size, locs_size);
4982     OopMapSet* oop_maps  = new OopMapSet();
4983     MacroAssembler* masm = new MacroAssembler(&code);
4984 
4985     address start = __ pc();
4986 
4987     // This is an inlined and slightly modified version of call_VM
4988     // which has the ability to fetch the return PC out of
4989     // thread-local storage and also sets up last_Java_sp slightly
4990     // differently than the real call_VM
4991 
4992     __ enter(); // Save FP and LR before call
4993 
4994     assert(is_even(framesize/2), "sp not 16-byte aligned");
4995 
4996     // lr and fp are already in place
4997     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4998 
4999     int frame_complete = __ pc() - start;
5000 
5001     // Set up last_Java_sp and last_Java_fp
5002     address the_pc = __ pc();
5003     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
5004 
5005     // Call runtime
5006     if (arg1 != noreg) {
5007       assert(arg2 != c_rarg1, "clobbered");
5008       __ mov(c_rarg1, arg1);
5009     }
5010     if (arg2 != noreg) {
5011       __ mov(c_rarg2, arg2);
5012     }
5013     __ mov(c_rarg0, rthread);
5014     BLOCK_COMMENT("call runtime_entry");
5015     __ mov(rscratch1, runtime_entry);
5016     __ blr(rscratch1);
5017 
5018     // Generate oop map
5019     OopMap* map = new OopMap(framesize, 0);
5020 
5021     oop_maps->add_gc_map(the_pc - start, map);
5022 
5023     __ reset_last_Java_frame(true);
5024     __ maybe_isb();
5025 
5026     if (UseSVE > 0) {
5027       // Reinitialize the ptrue predicate register, in case the external runtime
5028       // call clobbers ptrue reg, as we may return to SVE compiled code.
5029       __ reinitialize_ptrue();
5030     }
5031 
5032     __ leave();
5033 
5034     // check for pending exceptions
5035 #ifdef ASSERT
5036     Label L;
5037     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
5038     __ cbnz(rscratch1, L);
5039     __ should_not_reach_here();
5040     __ bind(L);
5041 #endif // ASSERT
5042     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
5043 
5044 
5045     // codeBlob framesize is in words (not VMRegImpl::slot_size)
5046     RuntimeStub* stub =
5047       RuntimeStub::new_runtime_stub(name,
5048                                     &code,
5049                                     frame_complete,
5050                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
5051                                     oop_maps, false);
5052     return stub->entry_point();
5053   }
5054 
5055   class MontgomeryMultiplyGenerator : public MacroAssembler {
5056 
5057     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
5058       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
5059 
5060     RegSet _toSave;
5061     bool _squaring;
5062 
5063   public:
5064     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
5065       : MacroAssembler(as->code()), _squaring(squaring) {
5066 
5067       // Register allocation
5068 
5069       Register reg = c_rarg0;
5070       Pa_base = reg;       // Argument registers
5071       if (squaring)
5072         Pb_base = Pa_base;
5073       else
5074         Pb_base = ++reg;
5075       Pn_base = ++reg;
5076       Rlen= ++reg;
5077       inv = ++reg;
5078       Pm_base = ++reg;
5079 
5080                           // Working registers:
5081       Ra =  ++reg;        // The current digit of a, b, n, and m.
5082       Rb =  ++reg;
5083       Rm =  ++reg;
5084       Rn =  ++reg;
5085 
5086       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
5087       Pb =  ++reg;
5088       Pm =  ++reg;
5089       Pn =  ++reg;
5090 
5091       t0 =  ++reg;        // Three registers which form a
5092       t1 =  ++reg;        // triple-precision accumulator.
5093       t2 =  ++reg;
5094 
5095       Ri =  ++reg;        // Inner and outer loop indexes.
5096       Rj =  ++reg;
5097 
5098       Rhi_ab = ++reg;     // Product registers: low and high parts
5099       Rlo_ab = ++reg;     // of a*b and m*n.
5100       Rhi_mn = ++reg;
5101       Rlo_mn = ++reg;
5102 
5103       // r19 and up are callee-saved.
5104       _toSave = RegSet::range(r19, reg) + Pm_base;
5105     }
5106 
5107   private:
5108     void save_regs() {
5109       push(_toSave, sp);
5110     }
5111 
5112     void restore_regs() {
5113       pop(_toSave, sp);
5114     }
5115 
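         // Run 'block' count times.  The loop body holds two copies of the block;
         // an odd count enters at the second copy, so the subtract-by-2 exit test
         // still terminates correctly.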
5116     template <typename T>
5117     void unroll_2(Register count, T block) {
5118       Label loop, end, odd;
5119       tbnz(count, 0, odd);
5120       cbz(count, end);
5121       align(16);
5122       bind(loop);
5123       (this->*block)();
5124       bind(odd);
5125       (this->*block)();
5126       subs(count, count, 2);
5127       br(Assembler::GT, loop);
5128       bind(end);
5129     }
5130 
5131     template <typename T>
5132     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
5133       Label loop, end, odd;
5134       tbnz(count, 0, odd);
5135       cbz(count, end);
5136       align(16);
5137       bind(loop);
5138       (this->*block)(d, s, tmp);
5139       bind(odd);
5140       (this->*block)(d, s, tmp);
5141       subs(count, count, 2);
5142       br(Assembler::GT, loop);
5143       bind(end);
5144     }
5145 
5146     void pre1(RegisterOrConstant i) {
5147       block_comment("pre1");
5148       // Pa = Pa_base;
5149       // Pb = Pb_base + i;
5150       // Pm = Pm_base;
5151       // Pn = Pn_base + i;
5152       // Ra = *Pa;
5153       // Rb = *Pb;
5154       // Rm = *Pm;
5155       // Rn = *Pn;
5156       ldr(Ra, Address(Pa_base));
5157       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
5158       ldr(Rm, Address(Pm_base));
5159       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5160       lea(Pa, Address(Pa_base));
5161       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
5162       lea(Pm, Address(Pm_base));
5163       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5164 
5165       // Zero the m*n result.
5166       mov(Rhi_mn, zr);
5167       mov(Rlo_mn, zr);
5168     }
5169 
5170     // The core multiply-accumulate step of a Montgomery
5171     // multiplication.  The idea is to schedule operations as a
5172     // pipeline so that instructions with long latencies (loads and
5173     // multiplies) have time to complete before their results are
5174     // used.  This benefits in-order implementations of the
5175     // architecture the most, but out-of-order ones benefit as well.
5176     void step() {
5177       block_comment("step");
5178       // MACC(Ra, Rb, t0, t1, t2);
5179       // Ra = *++Pa;
5180       // Rb = *--Pb;
5181       umulh(Rhi_ab, Ra, Rb);
5182       mul(Rlo_ab, Ra, Rb);
5183       ldr(Ra, pre(Pa, wordSize));
5184       ldr(Rb, pre(Pb, -wordSize));
5185       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
5186                                        // previous iteration.
5187       // MACC(Rm, Rn, t0, t1, t2);
5188       // Rm = *++Pm;
5189       // Rn = *--Pn;
5190       umulh(Rhi_mn, Rm, Rn);
5191       mul(Rlo_mn, Rm, Rn);
5192       ldr(Rm, pre(Pm, wordSize));
5193       ldr(Rn, pre(Pn, -wordSize));
5194       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5195     }
5196 
5197     void post1() {
5198       block_comment("post1");
5199 
5200       // MACC(Ra, Rb, t0, t1, t2);
5201       // Ra = *++Pa;
5202       // Rb = *--Pb;
5203       umulh(Rhi_ab, Ra, Rb);
5204       mul(Rlo_ab, Ra, Rb);
5205       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5206       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5207 
5208       // *Pm = Rm = t0 * inv;
5209       mul(Rm, t0, inv);
5210       str(Rm, Address(Pm));
5211 
5212       // MACC(Rm, Rn, t0, t1, t2);
5213       // t0 = t1; t1 = t2; t2 = 0;
5214       umulh(Rhi_mn, Rm, Rn);
5215 
5216 #ifndef PRODUCT
5217       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5218       {
5219         mul(Rlo_mn, Rm, Rn);
5220         add(Rlo_mn, t0, Rlo_mn);
5221         Label ok;
5222         cbz(Rlo_mn, ok); {
5223           stop("broken Montgomery multiply");
5224         } bind(ok);
5225       }
5226 #endif
5227       // We have very carefully set things up so that
5228       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5229       // the lower half of Rm * Rn because we know the result already:
5230       // it must be -t0.  t0 + (-t0) must generate a carry iff
5231       // t0 != 0.  So, rather than do a mul and an adds we just set
5232       // the carry flag iff t0 is nonzero.
5233       //
5234       // mul(Rlo_mn, Rm, Rn);
5235       // adds(zr, t0, Rlo_mn);
5236       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5237       adcs(t0, t1, Rhi_mn);
5238       adc(t1, t2, zr);
5239       mov(t2, zr);
5240     }
5241 
5242     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
5243       block_comment("pre2");
5244       // Pa = Pa_base + i-len;
5245       // Pb = Pb_base + len;
5246       // Pm = Pm_base + i-len;
5247       // Pn = Pn_base + len;
5248 
5249       if (i.is_register()) {
5250         sub(Rj, i.as_register(), len);
5251       } else {
5252         mov(Rj, i.as_constant());
5253         sub(Rj, Rj, len);
5254       }
5255       // Rj == i-len
5256 
5257       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5258       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5259       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5260       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5261 
5262       // Ra = *++Pa;
5263       // Rb = *--Pb;
5264       // Rm = *++Pm;
5265       // Rn = *--Pn;
5266       ldr(Ra, pre(Pa, wordSize));
5267       ldr(Rb, pre(Pb, -wordSize));
5268       ldr(Rm, pre(Pm, wordSize));
5269       ldr(Rn, pre(Pn, -wordSize));
5270 
5271       mov(Rhi_mn, zr);
5272       mov(Rlo_mn, zr);
5273     }
5274 
5275     void post2(RegisterOrConstant i, RegisterOrConstant len) {
5276       block_comment("post2");
5277       if (i.is_constant()) {
5278         mov(Rj, i.as_constant()-len.as_constant());
5279       } else {
5280         sub(Rj, i.as_register(), len);
5281       }
5282 
5283       adds(t0, t0, Rlo_mn); // The pending m*n, low part
5284 
5285       // As soon as we know the least significant digit of our result,
5286       // store it.
5287       // Pm_base[i-len] = t0;
5288       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5289 
5290       // t0 = t1; t1 = t2; t2 = 0;
5291       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5292       adc(t1, t2, zr);
5293       mov(t2, zr);
5294     }
5295 
5296     // A carry in t0 after Montgomery multiplication means that we
5297     // should subtract multiples of n from our result in m.  We'll
5298     // keep doing that until there is no carry.
5299     void normalize(RegisterOrConstant len) {
5300       block_comment("normalize");
5301       // while (t0)
5302       //   t0 = sub(Pm_base, Pn_base, t0, len);
5303       Label loop, post, again;
5304       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5305       cbz(t0, post); {
5306         bind(again); {
5307           mov(i, zr);
5308           mov(cnt, len);
5309           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5310           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5311           subs(zr, zr, zr); // set carry flag, i.e. no borrow
5312           align(16);
5313           bind(loop); {
5314             sbcs(Rm, Rm, Rn);
5315             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5316             add(i, i, 1);
5317             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5318             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5319             sub(cnt, cnt, 1);
5320           } cbnz(cnt, loop);
5321           sbc(t0, t0, zr);
5322         } cbnz(t0, again);
5323       } bind(post);
5324     }
5325 
5326     // Move memory at s to d, reversing words.
5327     //    Increments d to end of copied memory
5328     //    Destroys tmp1, tmp2
5329     //    Preserves len
5330     //    Leaves s pointing to the address which was in d at start
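         //
         // Viewed as arrays of 32-bit words (the Java int layout, with len counting
         // 64-bit words), this is approximately
         //   for (int i = 0; i < 2*len; i++)  d[i] = s[2*len - 1 - i];
         // implemented 64 bits at a time with a rotate-by-32 to swap each pair of halves.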
5331     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
5332       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5333 
5334       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
5335       mov(tmp1, len);
5336       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
5337       sub(s, d, len, ext::uxtw, LogBytesPerWord);
5338     }
5339     // where
5340     void reverse1(Register d, Register s, Register tmp) {
5341       ldr(tmp, pre(s, -wordSize));
5342       ror(tmp, tmp, 32);
5343       str(tmp, post(d, wordSize));
5344     }
5345 
5346     void step_squaring() {
5347       // An extra ACC
5348       step();
5349       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5350     }
5351 
5352     void last_squaring(RegisterOrConstant i) {
5353       Label dont;
5354       // if ((i & 1) == 0) {
5355       tbnz(i.as_register(), 0, dont); {
5356         // MACC(Ra, Rb, t0, t1, t2);
5357         // Ra = *++Pa;
5358         // Rb = *--Pb;
5359         umulh(Rhi_ab, Ra, Rb);
5360         mul(Rlo_ab, Ra, Rb);
5361         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5362       } bind(dont);
5363     }
5364 
5365     void extra_step_squaring() {
5366       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5367 
5368       // MACC(Rm, Rn, t0, t1, t2);
5369       // Rm = *++Pm;
5370       // Rn = *--Pn;
5371       umulh(Rhi_mn, Rm, Rn);
5372       mul(Rlo_mn, Rm, Rn);
5373       ldr(Rm, pre(Pm, wordSize));
5374       ldr(Rn, pre(Pn, -wordSize));
5375     }
5376 
5377     void post1_squaring() {
5378       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5379 
5380       // *Pm = Rm = t0 * inv;
5381       mul(Rm, t0, inv);
5382       str(Rm, Address(Pm));
5383 
5384       // MACC(Rm, Rn, t0, t1, t2);
5385       // t0 = t1; t1 = t2; t2 = 0;
5386       umulh(Rhi_mn, Rm, Rn);
5387 
5388 #ifndef PRODUCT
5389       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5390       {
5391         mul(Rlo_mn, Rm, Rn);
5392         add(Rlo_mn, t0, Rlo_mn);
5393         Label ok;
5394         cbz(Rlo_mn, ok); {
5395           stop("broken Montgomery multiply");
5396         } bind(ok);
5397       }
5398 #endif
5399       // We have very carefully set things up so that
5400       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5401       // the lower half of Rm * Rn because we know the result already:
5402       // it must be -t0.  t0 + (-t0) must generate a carry iff
5403       // t0 != 0.  So, rather than do a mul and an adds we just set
5404       // the carry flag iff t0 is nonzero.
5405       //
5406       // mul(Rlo_mn, Rm, Rn);
5407       // adds(zr, t0, Rlo_mn);
5408       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5409       adcs(t0, t1, Rhi_mn);
5410       adc(t1, t2, zr);
5411       mov(t2, zr);
5412     }
5413 
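         // Add the 128-bit product Rhi:Rlo into the triple-precision accumulator
         // t2:t1:t0, propagating carries.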
5414     void acc(Register Rhi, Register Rlo,
5415              Register t0, Register t1, Register t2) {
5416       adds(t0, t0, Rlo);
5417       adcs(t1, t1, Rhi);
5418       adc(t2, t2, zr);
5419     }
5420 
5421   public:
5422     /**
5423      * Fast Montgomery multiplication.  The derivation of the
5424      * algorithm is in A Cryptographic Library for the Motorola
5425      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5426      *
5427      * Arguments:
5428      *
5429      * Inputs for multiplication:
5430      *   c_rarg0   - int array elements a
5431      *   c_rarg1   - int array elements b
5432      *   c_rarg2   - int array elements n (the modulus)
5433      *   c_rarg3   - int length
5434      *   c_rarg4   - int inv
5435      *   c_rarg5   - int array elements m (the result)
5436      *
5437      * Inputs for squaring:
5438      *   c_rarg0   - int array elements a
5439      *   c_rarg1   - int array elements n (the modulus)
5440      *   c_rarg2   - int length
5441      *   c_rarg3   - int inv
5442      *   c_rarg4   - int array elements m (the result)
5443      *
5444      */
5445     address generate_multiply() {
5446       Label argh, nothing;
5447       bind(argh);
5448       stop("MontgomeryMultiply total_allocation must be <= 8192");
5449 
5450       align(CodeEntryAlignment);
5451       address entry = pc();
5452 
5453       cbzw(Rlen, nothing);
5454 
5455       enter();
5456 
5457       // Make room.
5458       cmpw(Rlen, 512);
5459       br(Assembler::HI, argh);
5460       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5461       andr(sp, Ra, -2 * wordSize);
5462 
5463       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5464 
5465       {
5466         // Copy input args, reversing as we go.  We use Ra as a
5467         // temporary variable.
5468         reverse(Ra, Pa_base, Rlen, t0, t1);
5469         if (!_squaring)
5470           reverse(Ra, Pb_base, Rlen, t0, t1);
5471         reverse(Ra, Pn_base, Rlen, t0, t1);
5472       }
5473 
5474       // Push all call-saved registers and also Pm_base which we'll need
5475       // at the end.
5476       save_regs();
5477 
5478 #ifndef PRODUCT
5479       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
5480       {
5481         ldr(Rn, Address(Pn_base, 0));
5482         mul(Rlo_mn, Rn, inv);
5483         subs(zr, Rlo_mn, -1);
5484         Label ok;
5485         br(EQ, ok); {
5486           stop("broken inverse in Montgomery multiply");
5487         } bind(ok);
5488       }
5489 #endif
5490 
5491       mov(Pm_base, Ra);
5492 
5493       mov(t0, zr);
5494       mov(t1, zr);
5495       mov(t2, zr);
5496 
5497       block_comment("for (int i = 0; i < len; i++) {");
5498       mov(Ri, zr); {
5499         Label loop, end;
5500         cmpw(Ri, Rlen);
5501         br(Assembler::GE, end);
5502 
5503         bind(loop);
5504         pre1(Ri);
5505 
5506         block_comment("  for (j = i; j; j--) {"); {
5507           movw(Rj, Ri);
5508           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5509         } block_comment("  } // j");
5510 
5511         post1();
5512         addw(Ri, Ri, 1);
5513         cmpw(Ri, Rlen);
5514         br(Assembler::LT, loop);
5515         bind(end);
5516         block_comment("} // i");
5517       }
5518 
5519       block_comment("for (int i = len; i < 2*len; i++) {");
5520       mov(Ri, Rlen); {
5521         Label loop, end;
5522         cmpw(Ri, Rlen, Assembler::LSL, 1);
5523         br(Assembler::GE, end);
5524 
5525         bind(loop);
5526         pre2(Ri, Rlen);
5527 
5528         block_comment("  for (j = len*2-i-1; j; j--) {"); {
5529           lslw(Rj, Rlen, 1);
5530           subw(Rj, Rj, Ri);
5531           subw(Rj, Rj, 1);
5532           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5533         } block_comment("  } // j");
5534 
5535         post2(Ri, Rlen);
5536         addw(Ri, Ri, 1);
5537         cmpw(Ri, Rlen, Assembler::LSL, 1);
5538         br(Assembler::LT, loop);
5539         bind(end);
5540       }
5541       block_comment("} // i");
5542 
5543       normalize(Rlen);
5544 
5545       mov(Ra, Pm_base);  // Save Pm_base in Ra
5546       restore_regs();  // Restore caller's Pm_base
5547 
5548       // Copy our result into caller's Pm_base
5549       reverse(Pm_base, Ra, Rlen, t0, t1);
5550 
5551       leave();
5552       bind(nothing);
5553       ret(lr);
5554 
5555       return entry;
5556     }
5557     // In C, approximately:
5558 
5559     // void
5560     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
5561     //                     julong Pn_base[], julong Pm_base[],
5562     //                     julong inv, int len) {
5563     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5564     //   julong *Pa, *Pb, *Pn, *Pm;
5565     //   julong Ra, Rb, Rn, Rm;
5566 
5567     //   int i;
5568 
5569     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5570 
5571     //   for (i = 0; i < len; i++) {
5572     //     int j;
5573 
5574     //     Pa = Pa_base;
5575     //     Pb = Pb_base + i;
5576     //     Pm = Pm_base;
5577     //     Pn = Pn_base + i;
5578 
5579     //     Ra = *Pa;
5580     //     Rb = *Pb;
5581     //     Rm = *Pm;
5582     //     Rn = *Pn;
5583 
5584     //     int iters = i;
5585     //     for (j = 0; iters--; j++) {
5586     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5587     //       MACC(Ra, Rb, t0, t1, t2);
5588     //       Ra = *++Pa;
5589     //       Rb = *--Pb;
5590     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5591     //       MACC(Rm, Rn, t0, t1, t2);
5592     //       Rm = *++Pm;
5593     //       Rn = *--Pn;
5594     //     }
5595 
5596     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
5597     //     MACC(Ra, Rb, t0, t1, t2);
5598     //     *Pm = Rm = t0 * inv;
5599     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5600     //     MACC(Rm, Rn, t0, t1, t2);
5601 
5602     //     assert(t0 == 0, "broken Montgomery multiply");
5603 
5604     //     t0 = t1; t1 = t2; t2 = 0;
5605     //   }
5606 
5607     //   for (i = len; i < 2*len; i++) {
5608     //     int j;
5609 
5610     //     Pa = Pa_base + i-len;
5611     //     Pb = Pb_base + len;
5612     //     Pm = Pm_base + i-len;
5613     //     Pn = Pn_base + len;
5614 
5615     //     Ra = *++Pa;
5616     //     Rb = *--Pb;
5617     //     Rm = *++Pm;
5618     //     Rn = *--Pn;
5619 
5620     //     int iters = len*2-i-1;
5621     //     for (j = i-len+1; iters--; j++) {
5622     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5623     //       MACC(Ra, Rb, t0, t1, t2);
5624     //       Ra = *++Pa;
5625     //       Rb = *--Pb;
5626     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5627     //       MACC(Rm, Rn, t0, t1, t2);
5628     //       Rm = *++Pm;
5629     //       Rn = *--Pn;
5630     //     }
5631 
5632     //     Pm_base[i-len] = t0;
5633     //     t0 = t1; t1 = t2; t2 = 0;
5634     //   }
5635 
5636     //   while (t0)
5637     //     t0 = sub(Pm_base, Pn_base, t0, len);
5638     // }
5639 
5640     /**
5641      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
5642      * multiplies than Montgomery multiplication so it should be up to
5643      * 25% faster.  However, its loop control is more complex and it
5644      * may actually run slower on some machines.
5645      *
5646      * Arguments:
5647      *
5648      * Inputs:
5649      *   c_rarg0   - int array elements a
5650      *   c_rarg1   - int array elements n (the modulus)
5651      *   c_rarg2   - int length
5652      *   c_rarg3   - int inv
5653      *   c_rarg4   - int array elements m (the result)
5654      *
5655      */
5656     address generate_square() {
5657       Label argh;
5658       bind(argh);
5659       stop("MontgomeryMultiply total_allocation must be <= 8192");
5660 
5661       align(CodeEntryAlignment);
5662       address entry = pc();
5663 
5664       enter();
5665 
5666       // Make room.
5667       cmpw(Rlen, 512);
5668       br(Assembler::HI, argh);
5669       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5670       andr(sp, Ra, -2 * wordSize);
5671 
5672       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5673 
5674       {
5675         // Copy input args, reversing as we go.  We use Ra as a
5676         // temporary variable.
5677         reverse(Ra, Pa_base, Rlen, t0, t1);
5678         reverse(Ra, Pn_base, Rlen, t0, t1);
5679       }
5680 
5681       // Push all call-saved registers, and also Pm_base, which we'll
5682       // need at the end.

5683       save_regs();
5684 
5685       mov(Pm_base, Ra);
5686 
5687       mov(t0, zr);
5688       mov(t1, zr);
5689       mov(t2, zr);
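           // t2:t1:t0 is the triple-precision accumulator, cleared here; it
           // corresponds to "julong t0 = 0, t1 = 0, t2 = 0" in the C sketch
           // below.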
5690 
5691       block_comment("for (int i = 0; i < len; i++) {");
5692       mov(Ri, zr); {
5693         Label loop, end;
5694         bind(loop);
5695         cmp(Ri, Rlen);
5696         br(Assembler::GE, end);
5697 
5698         pre1(Ri);
5699 
5700         block_comment("for (j = (i+1)/2; j; j--) {"); {
5701           add(Rj, Ri, 1);
5702           lsr(Rj, Rj, 1);
5703           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5704         } block_comment("  } // j");
5705 
5706         last_squaring(Ri);
5707 
5708         block_comment("  for (j = i/2; j; j--) {"); {
5709           lsr(Rj, Ri, 1);
5710           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5711         } block_comment("  } // j");
5712 
5713         post1_squaring();
5714         add(Ri, Ri, 1);
5715         cmp(Ri, Rlen);
5716         br(Assembler::LT, loop);
5717 
5718         bind(end);
5719         block_comment("} // i");
5720       }
5721 
5722       block_comment("for (int i = len; i < 2*len; i++) {");
5723       mov(Ri, Rlen); {
5724         Label loop, end;
5725         bind(loop);
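             // Loop while Ri < 2 * Rlen; the shifted operand (Rlen, LSL #1)
             // doubles Rlen in the compare.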
5726         cmp(Ri, Rlen, Assembler::LSL, 1);
5727         br(Assembler::GE, end);
5728 
5729         pre2(Ri, Rlen);
5730 
5731         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
5732           lsl(Rj, Rlen, 1);
5733           sub(Rj, Rj, Ri);
5734           sub(Rj, Rj, 1);
5735           lsr(Rj, Rj, 1);
5736           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5737         } block_comment("  } // j");
5738 
5739         last_squaring(Ri);
5740 
5741         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
5742           lsl(Rj, Rlen, 1);
5743           sub(Rj, Rj, Ri);
5744           lsr(Rj, Rj, 1);
5745           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5746         } block_comment("  } // j");
5747 
5748         post2(Ri, Rlen);
5749         add(Ri, Ri, 1);
5750         cmp(Ri, Rlen, Assembler::LSL, 1);
5751 
5752         br(Assembler::LT, loop);
5753         bind(end);
5754         block_comment("} // i");
5755       }
5756 
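           // Final reduction: keep subtracting the modulus while the carry
           // word is nonzero -- the "while (t0) t0 = sub(...)" loop in the C
           // sketch below is what normalize() is expected to correspond to.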
5757       normalize(Rlen);
5758 
5759       mov(Ra, Pm_base);  // Save Pm_base in Ra
5760       restore_regs();  // Restore caller's Pm_base
5761 
5762       // Copy our result into caller's Pm_base
5763       reverse(Pm_base, Ra, Rlen, t0, t1);
5764 
5765       leave();
5766       ret(lr);
5767 
5768       return entry;
5769     }
5770     // In C, approximately:
5771 
5772     // void
5773     // montgomery_square(julong Pa_base[], julong Pn_base[],
5774     //                   julong Pm_base[], julong inv, int len) {
5775     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5776     //   julong *Pa, *Pb, *Pn, *Pm;
5777     //   julong Ra, Rb, Rn, Rm;
5778 
5779     //   int i;
5780 
5781     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5782 
5783     //   for (i = 0; i < len; i++) {
5784     //     int j;
5785 
5786     //     Pa = Pa_base;
5787     //     Pb = Pa_base + i;
5788     //     Pm = Pm_base;
5789     //     Pn = Pn_base + i;
5790 
5791     //     Ra = *Pa;
5792     //     Rb = *Pb;
5793     //     Rm = *Pm;
5794     //     Rn = *Pn;
5795 
5796     //     int iters = (i+1)/2;
5797     //     for (j = 0; iters--; j++) {
5798     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5799     //       MACC2(Ra, Rb, t0, t1, t2);
5800     //       Ra = *++Pa;
5801     //       Rb = *--Pb;
5802     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5803     //       MACC(Rm, Rn, t0, t1, t2);
5804     //       Rm = *++Pm;
5805     //       Rn = *--Pn;
5806     //     }
5807     //     if ((i & 1) == 0) {
5808     //       assert(Ra == Pa_base[j], "must be");
5809     //       MACC(Ra, Ra, t0, t1, t2);
5810     //     }
5811     //     iters = i/2;
5812     //     assert(iters == i-j, "must be");
5813     //     for (; iters--; j++) {
5814     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5815     //       MACC(Rm, Rn, t0, t1, t2);
5816     //       Rm = *++Pm;
5817     //       Rn = *--Pn;
5818     //     }
5819 
5820     //     *Pm = Rm = t0 * inv;
5821     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5822     //     MACC(Rm, Rn, t0, t1, t2);
5823 
5824     //     assert(t0 == 0, "broken Montgomery multiply");
5825 
5826     //     t0 = t1; t1 = t2; t2 = 0;
5827     //   }
5828 
5829     //   for (i = len; i < 2*len; i++) {
5830     //     int start = i-len+1;
5831     //     int end = start + (len - start)/2;
5832     //     int j;
5833 
5834     //     Pa = Pa_base + i-len;
5835     //     Pb = Pa_base + len;
5836     //     Pm = Pm_base + i-len;
5837     //     Pn = Pn_base + len;
5838 
5839     //     Ra = *++Pa;
5840     //     Rb = *--Pb;
5841     //     Rm = *++Pm;
5842     //     Rn = *--Pn;
5843 
5844     //     int iters = (2*len-i-1)/2;
5845     //     assert(iters == end-start, "must be");
5846     //     for (j = start; iters--; j++) {
5847     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5848     //       MACC2(Ra, Rb, t0, t1, t2);
5849     //       Ra = *++Pa;
5850     //       Rb = *--Pb;
5851     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5852     //       MACC(Rm, Rn, t0, t1, t2);
5853     //       Rm = *++Pm;
5854     //       Rn = *--Pn;
5855     //     }
5856     //     if ((i & 1) == 0) {
5857     //       assert(Ra == Pa_base[j], "must be");
5858     //       MACC(Ra, Ra, t0, t1, t2);
5859     //     }
5860     //     iters =  (2*len-i)/2;
5861     //     assert(iters == len-j, "must be");
5862     //     for (; iters--; j++) {
5863     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5864     //       MACC(Rm, Rn, t0, t1, t2);
5865     //       Rm = *++Pm;
5866     //       Rn = *--Pn;
5867     //     }
5868     //     Pm_base[i-len] = t0;
5869     //     t0 = t1; t1 = t2; t2 = 0;
5870     //   }
5871 
5872     //   while (t0)
5873     //     t0 = sub(Pm_base, Pn_base, t0, len);
5874     // }
5875   };
5876 
5877 
5878   // Initialization
5879   void generate_initial() {
5880     // Generate the initial stubs and initialize the entry points.
5881 
5882     // These are entry points that exist on all platforms. Note: this is
5883     // code that could be shared among different platforms; however, the
5884     // benefit seems to be smaller than the disadvantage of a much more
5885     // complicated generator structure. See also the comment in
5886     // stubRoutines.hpp.
5887 
5888     StubRoutines::_forward_exception_entry = generate_forward_exception();
5889 
5890     StubRoutines::_call_stub_entry =
5891       generate_call_stub(StubRoutines::_call_stub_return_address);
5892 
5893     // Referenced by megamorphic calls.
5894     StubRoutines::_catch_exception_entry = generate_catch_exception();
5895 
5896     // Build this early so it's available for the interpreter.
5897     StubRoutines::_throw_StackOverflowError_entry =
5898       generate_throw_exception("StackOverflowError throw_exception",
5899                                CAST_FROM_FN_PTR(address,
5900                                                 SharedRuntime::throw_StackOverflowError));
5901     StubRoutines::_throw_delayed_StackOverflowError_entry =
5902       generate_throw_exception("delayed StackOverflowError throw_exception",
5903                                CAST_FROM_FN_PTR(address,
5904                                                 SharedRuntime::throw_delayed_StackOverflowError));
5905     if (UseCRC32Intrinsics) {
5906       // Set the table address before generating the stubs that use it.
5907       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5908       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5909     }
5910 
5911     if (UseCRC32CIntrinsics) {
5912       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5913     }
5914 
5915     // Disabled until JDK-8210858 is fixed
5916     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5917     //   StubRoutines::_dlog = generate_dlog();
5918     // }
5919 
5920     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5921       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5922     }
5923 
5924     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5925       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5926     }
5927 
5928     // Safefetch stubs.
5929     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5930                                                        &StubRoutines::_safefetch32_fault_pc,
5931                                                        &StubRoutines::_safefetch32_continuation_pc);
5932     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5933                                                        &StubRoutines::_safefetchN_fault_pc,
5934                                                        &StubRoutines::_safefetchN_continuation_pc);
5935   }
5936 
5937   void generate_all() {
5938     // support for verify_oop (must happen after universe_init)
5939     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
5940     StubRoutines::_throw_AbstractMethodError_entry =
5941       generate_throw_exception("AbstractMethodError throw_exception",
5942                                CAST_FROM_FN_PTR(address,
5943                                                 SharedRuntime::
5944                                                 throw_AbstractMethodError));
5945 
5946     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5947       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5948                                CAST_FROM_FN_PTR(address,
5949                                                 SharedRuntime::
5950                                                 throw_IncompatibleClassChangeError));
5951 
5952     StubRoutines::_throw_NullPointerException_at_call_entry =
5953       generate_throw_exception("NullPointerException at call throw_exception",
5954                                CAST_FROM_FN_PTR(address,
5955                                                 SharedRuntime::
5956                                                 throw_NullPointerException_at_call));
5957 
5958     // arraycopy stubs used by compilers
5959     generate_arraycopy_stubs();
5960 
5961     // has negatives stub for large arrays.
5962     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5963 
5964     // array equals stub for large arrays.
5965     if (!UseSimpleArrayEquals) {
5966       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5967     }
5968 
5969     generate_compare_long_strings();
5970 
5971     generate_string_indexof_stubs();
5972 
5973     // byte_array_inflate stub for large arrays.
5974     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
5975 
5976     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
5977     if (bs_nm != NULL) {
5978       StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
5979     }
5980 #ifdef COMPILER2
5981     if (UseMultiplyToLenIntrinsic) {
5982       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5983     }
5984 
5985     if (UseSquareToLenIntrinsic) {
5986       StubRoutines::_squareToLen = generate_squareToLen();
5987     }
5988 
5989     if (UseMulAddIntrinsic) {
5990       StubRoutines::_mulAdd = generate_mulAdd();
5991     }
5992 
5993     if (UseMontgomeryMultiplyIntrinsic) {
5994       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5995       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5996       StubRoutines::_montgomeryMultiply = g.generate_multiply();
5997     }
5998 
5999     if (UseMontgomerySquareIntrinsic) {
6000       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
6001       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
6002       // We use generate_multiply() rather than generate_square()
6003       // because it's faster for the sizes of modulus we care about.
6004       StubRoutines::_montgomerySquare = g.generate_multiply();
6005     }
6006 #endif // COMPILER2
6007 
6008     // generate GHASH intrinsics code
6009     if (UseGHASHIntrinsics) {
6010       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
6011     }
6012 
6013     // data cache line writeback
6014     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
6015     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
6016 
6017     if (UseAESIntrinsics) {
6018       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
6019       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
6020       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
6021       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
6022     }
6023 
6024     if (UseSHA1Intrinsics) {
6025       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
6026       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
6027     }
6028     if (UseSHA256Intrinsics) {
6029       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
6030       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
6031     }
6032     if (UseSHA512Intrinsics) {
6033       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
6034       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
6035     }
6036 
6037     // generate Adler32 intrinsics code
6038     if (UseAdler32Intrinsics) {
6039       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
6040     }
6041 
6042     StubRoutines::aarch64::set_completed();
6043   }
6044 
6045  public:
6046   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
6047     if (all) {
6048       generate_all();
6049     } else {
6050       generate_initial();
6051     }
6052   }
6053 }; // end class declaration
6054 
6055 #define UCM_TABLE_MAX_ENTRIES 8
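     // StubGenerator_generate is the entry point used by stub initialization
     // (see stubRoutines.cpp); it is expected to be called twice during VM
     // startup, first with all == false for the initial stubs the interpreter
     // needs and later with all == true for the rest, mirroring the
     // generate_initial()/generate_all() split above.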
6056 void StubGenerator_generate(CodeBuffer* code, bool all) {
6057   if (UnsafeCopyMemory::_table == NULL) {
6058     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
6059   }
6060   StubGenerator g(code, all);
6061 }