1 /*
   2  * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "memory/universe.hpp"
  33 #include "nativeInst_aarch64.hpp"
  34 #include "oops/instanceOop.hpp"
  35 #include "oops/method.hpp"
  36 #include "oops/objArrayKlass.hpp"
  37 #include "oops/oop.inline.hpp"
  38 #include "prims/methodHandles.hpp"
  39 #include "runtime/frame.inline.hpp"
  40 #include "runtime/handles.inline.hpp"
  41 #include "runtime/sharedRuntime.hpp"
  42 #include "runtime/stubCodeGenerator.hpp"
  43 #include "runtime/stubRoutines.hpp"
  44 #include "runtime/thread.inline.hpp"
  45 #include "utilities/align.hpp"
  46 #include "utilities/powerOfTwo.hpp"
  47 #ifdef COMPILER2
  48 #include "opto/runtime.hpp"
  49 #endif
  50 #if INCLUDE_ZGC
  51 #include "gc/z/zThreadLocalData.hpp"
  52 #endif
  53 
  54 // Declaration and definition of StubGenerator (no .hpp file).
  55 // For a more detailed description of the stub routine structure
  56 // see the comment in stubRoutines.hpp
  57 
  58 #undef __
  59 #define __ _masm->
  60 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
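// TIMES_OOP scales a sign-extended 32-bit array index by the in-heap oop
// size: a shift of 2 when compressed oops are in use, 3 otherwise.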
  61 
  62 #ifdef PRODUCT
  63 #define BLOCK_COMMENT(str) /* nothing */
  64 #else
  65 #define BLOCK_COMMENT(str) __ block_comment(str)
  66 #endif
  67 
  68 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  69 
  70 // Stub Code definitions
  71 
  72 class StubGenerator: public StubCodeGenerator {
  73  private:
  74 
  75 #ifdef PRODUCT
  76 #define inc_counter_np(counter) ((void)0)
  77 #else
  78   void inc_counter_np_(int& counter) {
  79     __ lea(rscratch2, ExternalAddress((address)&counter));
  80     __ ldrw(rscratch1, Address(rscratch2));
  81     __ addw(rscratch1, rscratch1, 1);
  82     __ strw(rscratch1, Address(rscratch2));
  83   }
  84 #define inc_counter_np(counter) \
  85   BLOCK_COMMENT("inc_counter " #counter); \
  86   inc_counter_np_(counter);
  87 #endif
  88 
  89   // Call stubs are used to call Java from C
  90   //
  91   // Arguments:
  92   //    c_rarg0:   call wrapper address                   address
  93   //    c_rarg1:   result                                 address
  94   //    c_rarg2:   result type                            BasicType
  95   //    c_rarg3:   method                                 Method*
  96   //    c_rarg4:   (interpreter) entry point              address
  97   //    c_rarg5:   parameters                             intptr_t*
  98   //    c_rarg6:   parameter size (in words)              int
  99   //    c_rarg7:   thread                                 Thread*
 100   //
 101   // There is no return from the stub itself as any Java result
 102   // is written to result
 103   //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer, then install sp (r31)
  // into fp.
 107   //
 108   // we save r0-r7, which accounts for all the c arguments.
 109   //
 110   // TODO: strictly do we need to save them all? they are treated as
 111   // volatile by C so could we omit saving the ones we are going to
 112   // place in global registers (thread? method?) or those we only use
 113   // during setup of the Java call?
 114   //
 115   // we don't need to save r8 which C uses as an indirect result location
 116   // return register.
 117   //
 118   // we don't need to save r9-r15 which both C and Java treat as
 119   // volatile
 120   //
 121   // we don't need to save r16-18 because Java does not use them
 122   //
 123   // we save r19-r28 which Java uses as scratch registers and C
 124   // expects to be callee-save
 125   //
 126   // we save the bottom 64 bits of each value stored in v8-v15; it is
 127   // the responsibility of the caller to preserve larger values.
 128   //
 129   // so the stub frame looks like this when we enter Java code
 130   //
 131   //     [ return_from_Java     ] <--- sp
 132   //     [ argument word n      ]
 133   //      ...
 134   // -27 [ argument word 1      ]
 135   // -26 [ saved v15            ] <--- sp_after_call
 136   // -25 [ saved v14            ]
 137   // -24 [ saved v13            ]
 138   // -23 [ saved v12            ]
 139   // -22 [ saved v11            ]
 140   // -21 [ saved v10            ]
 141   // -20 [ saved v9             ]
 142   // -19 [ saved v8             ]
 143   // -18 [ saved r28            ]
 144   // -17 [ saved r27            ]
 145   // -16 [ saved r26            ]
 146   // -15 [ saved r25            ]
 147   // -14 [ saved r24            ]
 148   // -13 [ saved r23            ]
 149   // -12 [ saved r22            ]
 150   // -11 [ saved r21            ]
 151   // -10 [ saved r20            ]
 152   //  -9 [ saved r19            ]
 153   //  -8 [ call wrapper    (r0) ]
 154   //  -7 [ result          (r1) ]
 155   //  -6 [ result type     (r2) ]
 156   //  -5 [ method          (r3) ]
 157   //  -4 [ entry point     (r4) ]
 158   //  -3 [ parameters      (r5) ]
 159   //  -2 [ parameter size  (r6) ]
 160   //  -1 [ thread (r7)          ]
 161   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 162   //   1 [ saved lr       (r30) ]
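  //
  // For reference, the VM reaches this stub through the CallStub function
  // pointer type declared in stubRoutines.hpp, so the call made by
  // JavaCalls::call_helper looks roughly like
  //
  //   StubRoutines::call_stub()(link, result_val, result_type, method,
  //                             entry_point, parameters, size_of_parameters,
  //                             thread);
  //
  // which is what places the eight values listed above in c_rarg0..c_rarg7.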
 163 
 164   // Call stub stack layout word offsets from fp
 165   enum call_stub_layout {
 166     sp_after_call_off = -26,
 167 
 168     d15_off            = -26,
 169     d13_off            = -24,
 170     d11_off            = -22,
 171     d9_off             = -20,
 172 
 173     r28_off            = -18,
 174     r26_off            = -16,
 175     r24_off            = -14,
 176     r22_off            = -12,
 177     r20_off            = -10,
 178     call_wrapper_off   =  -8,
 179     result_off         =  -7,
 180     result_type_off    =  -6,
 181     method_off         =  -5,
 182     entry_point_off    =  -4,
 183     parameter_size_off =  -2,
 184     thread_off         =  -1,
 185     fp_f               =   0,
 186     retaddr_off        =   1,
 187   };
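  // Each of the *_off values used with stp/stpd below names the lower slot
  // of a register pair; the second register of the pair lands in the slot
  // immediately above it (e.g. r19 at -9 above r20_off, and the parameters
  // pointer at -3 above entry_point_off), which is why some slots in the
  // frame diagram have no named offset.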
 188 
 189   address generate_call_stub(address& return_address) {
 190     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 191            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 192            "adjust this code");
 193 
 194     StubCodeMark mark(this, "StubRoutines", "call_stub");
 195     address start = __ pc();
 196 
 197     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 198 
 199     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 200     const Address result        (rfp, result_off         * wordSize);
 201     const Address result_type   (rfp, result_type_off    * wordSize);
 202     const Address method        (rfp, method_off         * wordSize);
 203     const Address entry_point   (rfp, entry_point_off    * wordSize);
 204     const Address parameter_size(rfp, parameter_size_off * wordSize);
 205 
 206     const Address thread        (rfp, thread_off         * wordSize);
 207 
 208     const Address d15_save      (rfp, d15_off * wordSize);
 209     const Address d13_save      (rfp, d13_off * wordSize);
 210     const Address d11_save      (rfp, d11_off * wordSize);
 211     const Address d9_save       (rfp, d9_off * wordSize);
 212 
 213     const Address r28_save      (rfp, r28_off * wordSize);
 214     const Address r26_save      (rfp, r26_off * wordSize);
 215     const Address r24_save      (rfp, r24_off * wordSize);
 216     const Address r22_save      (rfp, r22_off * wordSize);
 217     const Address r20_save      (rfp, r20_off * wordSize);
 218 
 219     // stub code
 220 
 221     address aarch64_entry = __ pc();
 222 
 223     // set up frame and move sp to end of save area
 224     __ enter();
 225     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 226 
 227     // save register parameters and Java scratch/global registers
 228     // n.b. we save thread even though it gets installed in
 229     // rthread because we want to sanity check rthread later
 230     __ str(c_rarg7,  thread);
 231     __ strw(c_rarg6, parameter_size);
 232     __ stp(c_rarg4, c_rarg5,  entry_point);
 233     __ stp(c_rarg2, c_rarg3,  result_type);
 234     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 235 
 236     __ stp(r20, r19,   r20_save);
 237     __ stp(r22, r21,   r22_save);
 238     __ stp(r24, r23,   r24_save);
 239     __ stp(r26, r25,   r26_save);
 240     __ stp(r28, r27,   r28_save);
 241 
 242     __ stpd(v9,  v8,   d9_save);
 243     __ stpd(v11, v10,  d11_save);
 244     __ stpd(v13, v12,  d13_save);
 245     __ stpd(v15, v14,  d15_save);
 246 
 247     // install Java thread in global register now we have saved
 248     // whatever value it held
 249     __ mov(rthread, c_rarg7);
 250     // And method
 251     __ mov(rmethod, c_rarg3);
 252 
 253     // set up the heapbase register
 254     __ reinit_heapbase();
 255 
 256 #ifdef ASSERT
 257     // make sure we have no pending exceptions
 258     {
 259       Label L;
 260       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 261       __ cmp(rscratch1, (u1)NULL_WORD);
 262       __ br(Assembler::EQ, L);
 263       __ stop("StubRoutines::call_stub: entered with pending exception");
 264       __ BIND(L);
 265     }
 266 #endif
 267     // pass parameters if any
 268     __ mov(esp, sp);
 269     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 270     __ andr(sp, rscratch1, -2 * wordSize);
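    // sp now leaves room for the incoming parameters and is rounded down
    // to a 16-byte boundary, as AArch64 requires of the stack pointer.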
 271 
 272     BLOCK_COMMENT("pass parameters if any");
 273     Label parameters_done;
 274     // parameter count is still in c_rarg6
 275     // and parameter pointer identifying param 1 is in c_rarg5
 276     __ cbzw(c_rarg6, parameters_done);
 277 
 278     address loop = __ pc();
 279     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 280     __ subsw(c_rarg6, c_rarg6, 1);
 281     __ push(rscratch1);
 282     __ br(Assembler::GT, loop);
 283 
 284     __ BIND(parameters_done);
 285 
    // call Java entry -- passing Method* and current sp
 287     //      rmethod: Method*
 288     //      r13: sender sp
 289     BLOCK_COMMENT("call Java function");
 290     __ mov(r13, sp);
 291     __ blr(c_rarg4);
 292 
 293     // we do this here because the notify will already have been done
 294     // if we get to the next instruction via an exception
 295     //
 296     // n.b. adding this instruction here affects the calculation of
 297     // whether or not a routine returns to the call stub (used when
 298     // doing stack walks) since the normal test is to check the return
 299     // pc against the address saved below. so we may need to allow for
 300     // this extra instruction in the check.
 301 
 302     // save current address for use by exception handling code
 303 
 304     return_address = __ pc();
 305 
 306     // store result depending on type (everything that is not
 307     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 308     // n.b. this assumes Java returns an integral result in r0
 309     // and a floating result in j_farg0
 310     __ ldr(j_rarg2, result);
 311     Label is_long, is_float, is_double, exit;
 312     __ ldr(j_rarg1, result_type);
 313     __ cmp(j_rarg1, (u1)T_OBJECT);
 314     __ br(Assembler::EQ, is_long);
 315     __ cmp(j_rarg1, (u1)T_LONG);
 316     __ br(Assembler::EQ, is_long);
 317     __ cmp(j_rarg1, (u1)T_FLOAT);
 318     __ br(Assembler::EQ, is_float);
 319     __ cmp(j_rarg1, (u1)T_DOUBLE);
 320     __ br(Assembler::EQ, is_double);
 321 
 322     // handle T_INT case
 323     __ strw(r0, Address(j_rarg2));
 324 
 325     __ BIND(exit);
 326 
 327     // pop parameters
 328     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 329 
 330 #ifdef ASSERT
 331     // verify that threads correspond
 332     {
 333       Label L, S;
 334       __ ldr(rscratch1, thread);
 335       __ cmp(rthread, rscratch1);
 336       __ br(Assembler::NE, S);
 337       __ get_thread(rscratch1);
 338       __ cmp(rthread, rscratch1);
 339       __ br(Assembler::EQ, L);
 340       __ BIND(S);
 341       __ stop("StubRoutines::call_stub: threads must correspond");
 342       __ BIND(L);
 343     }
 344 #endif
 345 
 346     // restore callee-save registers
 347     __ ldpd(v15, v14,  d15_save);
 348     __ ldpd(v13, v12,  d13_save);
 349     __ ldpd(v11, v10,  d11_save);
 350     __ ldpd(v9,  v8,   d9_save);
 351 
 352     __ ldp(r28, r27,   r28_save);
 353     __ ldp(r26, r25,   r26_save);
 354     __ ldp(r24, r23,   r24_save);
 355     __ ldp(r22, r21,   r22_save);
 356     __ ldp(r20, r19,   r20_save);
 357 
 358     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 359     __ ldrw(c_rarg2, result_type);
 360     __ ldr(c_rarg3,  method);
 361     __ ldp(c_rarg4, c_rarg5,  entry_point);
 362     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 363 
 364     // leave frame and return to caller
 365     __ leave();
 366     __ ret(lr);
 367 
 368     // handle return types different from T_INT
 369 
 370     __ BIND(is_long);
 371     __ str(r0, Address(j_rarg2, 0));
 372     __ br(Assembler::AL, exit);
 373 
 374     __ BIND(is_float);
 375     __ strs(j_farg0, Address(j_rarg2, 0));
 376     __ br(Assembler::AL, exit);
 377 
 378     __ BIND(is_double);
 379     __ strd(j_farg0, Address(j_rarg2, 0));
 380     __ br(Assembler::AL, exit);
 381 
 382     return start;
 383   }
 384 
 385   // Return point for a Java call if there's an exception thrown in
 386   // Java code.  The exception is caught and transformed into a
 387   // pending exception stored in JavaThread that can be tested from
 388   // within the VM.
 389   //
 390   // Note: Usually the parameters are removed by the callee. In case
 391   // of an exception crossing an activation frame boundary, that is
 392   // not the case if the callee is compiled code => need to setup the
 393   // rsp.
 394   //
 395   // r0: exception oop
 396 
 397   address generate_catch_exception() {
 398     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 399     address start = __ pc();
 400 
 401     // same as in generate_call_stub():
 402     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 403     const Address thread        (rfp, thread_off         * wordSize);
 404 
 405 #ifdef ASSERT
 406     // verify that threads correspond
 407     {
 408       Label L, S;
 409       __ ldr(rscratch1, thread);
 410       __ cmp(rthread, rscratch1);
 411       __ br(Assembler::NE, S);
 412       __ get_thread(rscratch1);
 413       __ cmp(rthread, rscratch1);
 414       __ br(Assembler::EQ, L);
 415       __ bind(S);
 416       __ stop("StubRoutines::catch_exception: threads must correspond");
 417       __ bind(L);
 418     }
 419 #endif
 420 
 421     // set pending exception
 422     __ verify_oop(r0);
 423 
 424     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 425     __ mov(rscratch1, (address)__FILE__);
 426     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 427     __ movw(rscratch1, (int)__LINE__);
 428     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 429 
 430     // complete return to VM
 431     assert(StubRoutines::_call_stub_return_address != NULL,
 432            "_call_stub_return_address must have been generated before");
 433     __ b(StubRoutines::_call_stub_return_address);
 434 
 435     return start;
 436   }
 437 
 438   // Continuation point for runtime calls returning with a pending
 439   // exception.  The pending exception check happened in the runtime
 440   // or native call stub.  The pending exception in Thread is
 441   // converted into a Java-level exception.
 442   //
 443   // Contract with Java-level exception handlers:
 444   // r0: exception
 445   // r3: throwing pc
 446   //
 447   // NOTE: At entry of this stub, exception-pc must be in LR !!
 448 
 449   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prologue
 451 
 452   address generate_forward_exception() {
 453     StubCodeMark mark(this, "StubRoutines", "forward exception");
 454     address start = __ pc();
 455 
 456     // Upon entry, LR points to the return address returning into
 457     // Java (interpreted or compiled) code; i.e., the return address
 458     // becomes the throwing pc.
 459     //
 460     // Arguments pushed before the runtime call are still on the stack
 461     // but the exception handler will reset the stack pointer ->
 462     // ignore them.  A potential result in registers can be ignored as
 463     // well.
 464 
 465 #ifdef ASSERT
 466     // make sure this code is only executed if there is a pending exception
 467     {
 468       Label L;
 469       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 470       __ cbnz(rscratch1, L);
 471       __ stop("StubRoutines::forward exception: no pending exception (1)");
 472       __ bind(L);
 473     }
 474 #endif
 475 
 476     // compute exception handler into r19
 477 
 478     // call the VM to find the handler address associated with the
 479     // caller address. pass thread in r0 and caller pc (ret address)
 480     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 481     // the stack.
 482     __ mov(c_rarg1, lr);
 483     // lr will be trashed by the VM call so we move it to R19
 484     // (callee-saved) because we also need to pass it to the handler
 485     // returned by this call.
 486     __ mov(r19, lr);
 487     BLOCK_COMMENT("call exception_handler_for_return_address");
 488     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 489                          SharedRuntime::exception_handler_for_return_address),
 490                     rthread, c_rarg1);
 491     // we should not really care that lr is no longer the callee
 492     // address. we saved the value the handler needs in r19 so we can
 493     // just copy it to r3. however, the C2 handler will push its own
 494     // frame and then calls into the VM and the VM code asserts that
 495     // the PC for the frame above the handler belongs to a compiled
 496     // Java method. So, we restore lr here to satisfy that assert.
 497     __ mov(lr, r19);
 498     // setup r0 & r3 & clear pending exception
 499     __ mov(r3, r19);
 500     __ mov(r19, r0);
 501     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 502     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 503 
 504 #ifdef ASSERT
 505     // make sure exception is set
 506     {
 507       Label L;
 508       __ cbnz(r0, L);
 509       __ stop("StubRoutines::forward exception: no pending exception (2)");
 510       __ bind(L);
 511     }
 512 #endif
 513 
 514     // continue at exception handler
 515     // r0: exception
 516     // r3: throwing pc
 517     // r19: exception handler
 518     __ verify_oop(r0);
 519     __ br(r19);
 520 
 521     return start;
 522   }
 523 
 524   // Non-destructive plausibility checks for oops
 525   //
 526   // Arguments:
 527   //    r0: oop to verify
 528   //    rscratch1: error message
 529   //
 530   // Stack after saving c_rarg3:
 531   //    [tos + 0]: saved c_rarg3
 532   //    [tos + 1]: saved c_rarg2
 533   //    [tos + 2]: saved lr
 534   //    [tos + 3]: saved rscratch2
 535   //    [tos + 4]: saved r0
 536   //    [tos + 5]: saved rscratch1
 537   address generate_verify_oop() {
 538 
 539     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 540     address start = __ pc();
 541 
 542     Label exit, error;
 543 
 544     // save c_rarg2 and c_rarg3
 545     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 546 
 547     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 548     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 549     __ ldr(c_rarg3, Address(c_rarg2));
 550     __ add(c_rarg3, c_rarg3, 1);
 551     __ str(c_rarg3, Address(c_rarg2));
 552 
 553     // object is in r0
 554     // make sure object is 'reasonable'
 555     __ cbz(r0, exit); // if obj is NULL it is OK
 556 
 557 #if INCLUDE_ZGC
 558     if (UseZGC) {
 559       // Check if mask is good.
 560       // verifies that ZAddressBadMask & r0 == 0
 561       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 562       __ andr(c_rarg2, r0, c_rarg3);
 563       __ cbnz(c_rarg2, error);
 564     }
 565 #endif
 566 
 567     // Check if the oop is in the right area of memory
 568     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 569     __ andr(c_rarg2, r0, c_rarg3);
 570     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 571 
 572     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 573     // instruction here because the flags register is live.
 574     __ eor(c_rarg2, c_rarg2, c_rarg3);
 575     __ cbnz(c_rarg2, error);
 576 
 577     // make sure klass is 'reasonable', which is not zero.
 578     __ load_klass(r0, r0);  // get klass
 579     __ cbz(r0, error);      // if klass is NULL it is broken
 580 
 581     // return if everything seems ok
 582     __ bind(exit);
 583 
 584     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 585     __ ret(lr);
 586 
 587     // handle errors
 588     __ bind(error);
 589     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 590 
 591     __ push(RegSet::range(r0, r29), sp);
 592     // debug(char* msg, int64_t pc, int64_t regs[])
 593     __ mov(c_rarg0, rscratch1);      // pass address of error message
 594     __ mov(c_rarg1, lr);             // pass return address
 595     __ mov(c_rarg2, sp);             // pass address of regs on stack
 596 #ifndef PRODUCT
 597     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 598 #endif
 599     BLOCK_COMMENT("call MacroAssembler::debug");
 600     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 601     __ blr(rscratch1);
 602     __ hlt(0);
 603 
 604     return start;
 605   }
 606 
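  // On AArch64 the conjoint copy stubs decide between a forward and a
  // backward copy themselves (see the distance check in
  // generate_conjoint_copy), so this helper only has to branch to the
  // no-overlap continuation.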
 607   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 608 
 609   // The inner part of zero_words().  This is the bulk operation,
 610   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 611   // caller is responsible for zeroing the last few words.
 612   //
 613   // Inputs:
 614   // r10: the HeapWord-aligned base address of an array to zero.
 615   // r11: the count in HeapWords, r11 > 0.
 616   //
 617   // Returns r10 and r11, adjusted for the caller to clear.
 618   // r10: the base address of the tail of words left to clear.
 619   // r11: the number of words in the tail.
 620   //      r11 < MacroAssembler::zero_words_block_size.
 621 
 622   address generate_zero_blocks() {
 623     Label done;
 624     Label base_aligned;
 625 
 626     Register base = r10, cnt = r11;
 627 
 628     __ align(CodeEntryAlignment);
 629     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 630     address start = __ pc();
 631 
 632     if (UseBlockZeroing) {
 633       int zva_length = VM_Version::zva_length();
 634 
 635       // Ensure ZVA length can be divided by 16. This is required by
 636       // the subsequent operations.
 637       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 638 
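      // If base is only 8-byte aligned, zero one word by hand so that the
      // block-zeroing code below starts on a 16-byte boundary.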
 639       __ tbz(base, 3, base_aligned);
 640       __ str(zr, Address(__ post(base, 8)));
 641       __ sub(cnt, cnt, 1);
 642       __ bind(base_aligned);
 643 
 644       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 645       // alignment.
 646       Label small;
 647       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 648       __ subs(rscratch1, cnt, low_limit >> 3);
 649       __ br(Assembler::LT, small);
 650       __ zero_dcache_blocks(base, cnt);
 651       __ bind(small);
 652     }
 653 
 654     {
 655       // Number of stp instructions we'll unroll
 656       const int unroll =
 657         MacroAssembler::zero_words_block_size / 2;
 658       // Clear the remaining blocks.
 659       Label loop;
 660       __ subs(cnt, cnt, unroll * 2);
 661       __ br(Assembler::LT, done);
 662       __ bind(loop);
 663       for (int i = 0; i < unroll; i++)
 664         __ stp(zr, zr, __ post(base, 16));
 665       __ subs(cnt, cnt, unroll * 2);
 666       __ br(Assembler::GE, loop);
 667       __ bind(done);
 668       __ add(cnt, cnt, unroll * 2);
 669     }
 670 
 671     __ ret(lr);
 672 
 673     return start;
 674   }
 675 
 676 
 677   typedef enum {
 678     copy_forwards = 1,
 679     copy_backwards = -1
 680   } copy_direction;
 681 
 682   // Bulk copy of blocks of 8 words.
 683   //
 684   // count is a count of words.
 685   //
 686   // Precondition: count >= 8
 687   //
 688   // Postconditions:
 689   //
 690   // The least significant bit of count contains the remaining count
 691   // of words to copy.  The rest of count is trash.
 692   //
 693   // s and d are adjusted to point to the remaining words to copy
 694   //
 695   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 696                            copy_direction direction) {
 697     int unit = wordSize * direction;
 698     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
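    // unit is the signed step between copied word pairs (+8 or -8 bytes).
    // bias is the forward-copy pre-adjustment applied to s and d below so
    // that the same offset pattern ({4, 8} * unit with SIMD, {2, 4, 6, 8}
    // * unit without) walks one 64-byte block in either direction.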
 699 
 700     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 701       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 702     const Register stride = r13;
 703 
 704     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 705     assert_different_registers(s, d, count, rscratch1);
 706 
 707     Label again, drain;
 708     const char *stub_name;
 709     if (direction == copy_forwards)
 710       stub_name = "forward_copy_longs";
 711     else
 712       stub_name = "backward_copy_longs";
 713 
 714     __ align(CodeEntryAlignment);
 715 
 716     StubCodeMark mark(this, "StubRoutines", stub_name);
 717 
 718     __ bind(start);
 719 
 720     Label unaligned_copy_long;
 721     if (AvoidUnalignedAccesses) {
 722       __ tbnz(d, 3, unaligned_copy_long);
 723     }
 724 
 725     if (direction == copy_forwards) {
 726       __ sub(s, s, bias);
 727       __ sub(d, d, bias);
 728     }
 729 
 730 #ifdef ASSERT
 731     // Make sure we are never given < 8 words
 732     {
 733       Label L;
 734       __ cmp(count, (u1)8);
 735       __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
 737       __ bind(L);
 738     }
 739 #endif
 740 
 741     // Fill 8 registers
 742     if (UseSIMDForMemoryOps) {
 743       __ ldpq(v0, v1, Address(s, 4 * unit));
 744       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 745     } else {
 746       __ ldp(t0, t1, Address(s, 2 * unit));
 747       __ ldp(t2, t3, Address(s, 4 * unit));
 748       __ ldp(t4, t5, Address(s, 6 * unit));
 749       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 750     }
 751 
 752     __ subs(count, count, 16);
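    // Eight words are already in flight in the registers; the pipelined
    // loop below stores one block while loading the next, so skip to the
    // drain code unless at least 16 words remain.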
 753     __ br(Assembler::LO, drain);
 754 
 755     int prefetch = PrefetchCopyIntervalInBytes;
 756     bool use_stride = false;
 757     if (direction == copy_backwards) {
 758        use_stride = prefetch > 256;
 759        prefetch = -prefetch;
 760        if (use_stride) __ mov(stride, prefetch);
 761     }
 762 
 763     __ bind(again);
 764 
 765     if (PrefetchCopyIntervalInBytes > 0)
 766       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 767 
 768     if (UseSIMDForMemoryOps) {
 769       __ stpq(v0, v1, Address(d, 4 * unit));
 770       __ ldpq(v0, v1, Address(s, 4 * unit));
 771       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 772       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 773     } else {
 774       __ stp(t0, t1, Address(d, 2 * unit));
 775       __ ldp(t0, t1, Address(s, 2 * unit));
 776       __ stp(t2, t3, Address(d, 4 * unit));
 777       __ ldp(t2, t3, Address(s, 4 * unit));
 778       __ stp(t4, t5, Address(d, 6 * unit));
 779       __ ldp(t4, t5, Address(s, 6 * unit));
 780       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 781       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 782     }
 783 
 784     __ subs(count, count, 8);
 785     __ br(Assembler::HS, again);
 786 
 787     // Drain
 788     __ bind(drain);
 789     if (UseSIMDForMemoryOps) {
 790       __ stpq(v0, v1, Address(d, 4 * unit));
 791       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 792     } else {
 793       __ stp(t0, t1, Address(d, 2 * unit));
 794       __ stp(t2, t3, Address(d, 4 * unit));
 795       __ stp(t4, t5, Address(d, 6 * unit));
 796       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 797     }
 798 
 799     {
 800       Label L1, L2;
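      // Bit 2 of count selects a remaining 4-word subblock, bit 1 (tested
      // further down) a final 2-word subblock.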
 801       __ tbz(count, exact_log2(4), L1);
 802       if (UseSIMDForMemoryOps) {
 803         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 804         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 805       } else {
 806         __ ldp(t0, t1, Address(s, 2 * unit));
 807         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 808         __ stp(t0, t1, Address(d, 2 * unit));
 809         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 810       }
 811       __ bind(L1);
 812 
 813       if (direction == copy_forwards) {
 814         __ add(s, s, bias);
 815         __ add(d, d, bias);
 816       }
 817 
 818       __ tbz(count, 1, L2);
 819       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 820       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 821       __ bind(L2);
 822     }
 823 
 824     __ ret(lr);
 825 
 826     if (AvoidUnalignedAccesses) {
 827       Label drain, again;
 828       // Register order for storing. Order is different for backward copy.
 829 
 830       __ bind(unaligned_copy_long);
 831 
 832       // source address is even aligned, target odd aligned
 833       //
 834       // when forward copying word pairs we read long pairs at offsets
 835       // {0, 2, 4, 6} (in long words). when backwards copying we read
 836       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 837       // address by -2 in the forwards case so we can compute the
 838       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 839       // or -1.
 840       //
 841       // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1, which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
 847       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 848       // offsets {1, 3, 5, 7, 8} * unit.
 849 
 850       if (direction == copy_forwards) {
 851         __ sub(s, s, 16);
 852         __ sub(d, d, 8);
 853       }
 854 
 855       // Fill 8 registers
 856       //
 857       // for forwards copy s was offset by -16 from the original input
 858       // value of s so the register contents are at these offsets
 859       // relative to the 64 bit block addressed by that original input
 860       // and so on for each successive 64 byte block when s is updated
 861       //
 862       // t0 at offset 0,  t1 at offset 8
 863       // t2 at offset 16, t3 at offset 24
 864       // t4 at offset 32, t5 at offset 40
 865       // t6 at offset 48, t7 at offset 56
 866 
 867       // for backwards copy s was not offset so the register contents
 868       // are at these offsets into the preceding 64 byte block
 869       // relative to that original input and so on for each successive
 870       // preceding 64 byte block when s is updated. this explains the
 871       // slightly counter-intuitive looking pattern of register usage
 872       // in the stp instructions for backwards copy.
 873       //
 874       // t0 at offset -16, t1 at offset -8
 875       // t2 at offset -32, t3 at offset -24
 876       // t4 at offset -48, t5 at offset -40
 877       // t6 at offset -64, t7 at offset -56
 878 
 879       __ ldp(t0, t1, Address(s, 2 * unit));
 880       __ ldp(t2, t3, Address(s, 4 * unit));
 881       __ ldp(t4, t5, Address(s, 6 * unit));
 882       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 883 
 884       __ subs(count, count, 16);
 885       __ br(Assembler::LO, drain);
 886 
 887       int prefetch = PrefetchCopyIntervalInBytes;
 888       bool use_stride = false;
 889       if (direction == copy_backwards) {
 890          use_stride = prefetch > 256;
 891          prefetch = -prefetch;
 892          if (use_stride) __ mov(stride, prefetch);
 893       }
 894 
 895       __ bind(again);
 896 
 897       if (PrefetchCopyIntervalInBytes > 0)
 898         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 899 
 900       if (direction == copy_forwards) {
 901        // allowing for the offset of -8 the store instructions place
 902        // registers into the target 64 bit block at the following
 903        // offsets
 904        //
 905        // t0 at offset 0
 906        // t1 at offset 8,  t2 at offset 16
 907        // t3 at offset 24, t4 at offset 32
 908        // t5 at offset 40, t6 at offset 48
 909        // t7 at offset 56
 910 
 911         __ str(t0, Address(d, 1 * unit));
 912         __ stp(t1, t2, Address(d, 2 * unit));
 913         __ ldp(t0, t1, Address(s, 2 * unit));
 914         __ stp(t3, t4, Address(d, 4 * unit));
 915         __ ldp(t2, t3, Address(s, 4 * unit));
 916         __ stp(t5, t6, Address(d, 6 * unit));
 917         __ ldp(t4, t5, Address(s, 6 * unit));
 918         __ str(t7, Address(__ pre(d, 8 * unit)));
 919         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 920       } else {
 921        // d was not offset when we started so the registers are
 922        // written into the 64 bit block preceding d with the following
 923        // offsets
 924        //
 925        // t1 at offset -8
 926        // t3 at offset -24, t0 at offset -16
 927        // t5 at offset -48, t2 at offset -32
 928        // t7 at offset -56, t4 at offset -48
 929        //                   t6 at offset -64
 930        //
 931        // note that this matches the offsets previously noted for the
 932        // loads
 933 
 934         __ str(t1, Address(d, 1 * unit));
 935         __ stp(t3, t0, Address(d, 3 * unit));
 936         __ ldp(t0, t1, Address(s, 2 * unit));
 937         __ stp(t5, t2, Address(d, 5 * unit));
 938         __ ldp(t2, t3, Address(s, 4 * unit));
 939         __ stp(t7, t4, Address(d, 7 * unit));
 940         __ ldp(t4, t5, Address(s, 6 * unit));
 941         __ str(t6, Address(__ pre(d, 8 * unit)));
 942         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 943       }
 944 
 945       __ subs(count, count, 8);
 946       __ br(Assembler::HS, again);
 947 
 948       // Drain
 949       //
 950       // this uses the same pattern of offsets and register arguments
 951       // as above
 952       __ bind(drain);
 953       if (direction == copy_forwards) {
 954         __ str(t0, Address(d, 1 * unit));
 955         __ stp(t1, t2, Address(d, 2 * unit));
 956         __ stp(t3, t4, Address(d, 4 * unit));
 957         __ stp(t5, t6, Address(d, 6 * unit));
 958         __ str(t7, Address(__ pre(d, 8 * unit)));
 959       } else {
 960         __ str(t1, Address(d, 1 * unit));
 961         __ stp(t3, t0, Address(d, 3 * unit));
 962         __ stp(t5, t2, Address(d, 5 * unit));
 963         __ stp(t7, t4, Address(d, 7 * unit));
 964         __ str(t6, Address(__ pre(d, 8 * unit)));
 965       }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
 970       {
 971         Label L1, L2;
 972         __ tbz(count, exact_log2(4), L1);
 973        // this is the same as above but copying only 4 longs hence
       // with only one intervening stp between the str instructions
 975        // but note that the offsets and registers still follow the
 976        // same pattern
 977         __ ldp(t0, t1, Address(s, 2 * unit));
 978         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 979         if (direction == copy_forwards) {
 980           __ str(t0, Address(d, 1 * unit));
 981           __ stp(t1, t2, Address(d, 2 * unit));
 982           __ str(t3, Address(__ pre(d, 4 * unit)));
 983         } else {
 984           __ str(t1, Address(d, 1 * unit));
 985           __ stp(t3, t0, Address(d, 3 * unit));
 986           __ str(t2, Address(__ pre(d, 4 * unit)));
 987         }
 988         __ bind(L1);
 989 
 990         __ tbz(count, 1, L2);
 991        // this is the same as above but copying only 2 longs hence
 992        // there is no intervening stp between the str instructions
 993        // but note that the offset and register patterns are still
 994        // the same
 995         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
 996         if (direction == copy_forwards) {
 997           __ str(t0, Address(d, 1 * unit));
 998           __ str(t1, Address(__ pre(d, 2 * unit)));
 999         } else {
1000           __ str(t1, Address(d, 1 * unit));
1001           __ str(t0, Address(__ pre(d, 2 * unit)));
1002         }
1003         __ bind(L2);
1004 
1005        // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written
1007 
1008        if (direction == copy_forwards) {
1009          __ add(s, s, 16);
1010          __ add(d, d, 8);
1011        }
1012 
1013       }
1014 
1015       __ ret(lr);
1016       }
1017   }
1018 
1019   // Small copy: less than 16 bytes.
1020   //
1021   // NB: Ignores all of the bits of count which represent more than 15
1022   // bytes, so a caller doesn't have to mask them.
1023 
1024   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1025     bool is_backwards = step < 0;
1026     size_t granularity = uabs(step);
1027     int direction = is_backwards ? -1 : 1;
1028     int unit = wordSize * direction;
1029 
1030     Label Lword, Lint, Lshort, Lbyte;
1031 
1032     assert(granularity
1033            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1034 
1035     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1036 
1037     // ??? I don't know if this bit-test-and-branch is the right thing
1038     // to do.  It does a lot of jumping, resulting in several
1039     // mispredicted branches.  It might make more sense to do this
1040     // with something like Duff's device with a single computed branch.
1041 
1042     __ tbz(count, 3 - exact_log2(granularity), Lword);
1043     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1044     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1045     __ bind(Lword);
1046 
1047     if (granularity <= sizeof (jint)) {
1048       __ tbz(count, 2 - exact_log2(granularity), Lint);
1049       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1050       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1051       __ bind(Lint);
1052     }
1053 
1054     if (granularity <= sizeof (jshort)) {
1055       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1056       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1057       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1058       __ bind(Lshort);
1059     }
1060 
1061     if (granularity <= sizeof (jbyte)) {
1062       __ tbz(count, 0, Lbyte);
1063       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1064       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1065       __ bind(Lbyte);
1066     }
1067   }
1068 
1069   Label copy_f, copy_b;
1070 
1071   // All-singing all-dancing memory copy.
1072   //
1073   // Copy count units of memory from s to d.  The size of a unit is
1074   // step, which can be positive or negative depending on the direction
1075   // of copy.  If is_aligned is false, we align the source address.
1076   //
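  // The stub below has three parts: an inline path for copies of at most
  // 80 bytes (96 with SIMD), a step that aligns the source address to a
  // 2-word boundary, and a bulk word copy through copy_f/copy_b followed
  // by copy_memory_small for any leftover bytes.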
1077 
1078   void copy_memory(bool is_aligned, Register s, Register d,
1079                    Register count, Register tmp, int step) {
1080     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1081     bool is_backwards = step < 0;
1082     int granularity = uabs(step);
1083     const Register t0 = r3, t1 = r4;
1084 
1085     // <= 96 bytes do inline. Direction doesn't matter because we always
1086     // load all the data before writing anything
1087     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1088     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1089     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1090     const Register send = r17, dend = r18;
1091 
1092     if (PrefetchCopyIntervalInBytes > 0)
1093       __ prfm(Address(s, 0), PLDL1KEEP);
1094     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1095     __ br(Assembler::HI, copy_big);
1096 
1097     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1098     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1099 
1100     __ cmp(count, u1(16/granularity));
1101     __ br(Assembler::LS, copy16);
1102 
1103     __ cmp(count, u1(64/granularity));
1104     __ br(Assembler::HI, copy80);
1105 
1106     __ cmp(count, u1(32/granularity));
1107     __ br(Assembler::LS, copy32);
1108 
1109     // 33..64 bytes
1110     if (UseSIMDForMemoryOps) {
1111       __ ldpq(v0, v1, Address(s, 0));
1112       __ ldpq(v2, v3, Address(send, -32));
1113       __ stpq(v0, v1, Address(d, 0));
1114       __ stpq(v2, v3, Address(dend, -32));
1115     } else {
1116       __ ldp(t0, t1, Address(s, 0));
1117       __ ldp(t2, t3, Address(s, 16));
1118       __ ldp(t4, t5, Address(send, -32));
1119       __ ldp(t6, t7, Address(send, -16));
1120 
1121       __ stp(t0, t1, Address(d, 0));
1122       __ stp(t2, t3, Address(d, 16));
1123       __ stp(t4, t5, Address(dend, -32));
1124       __ stp(t6, t7, Address(dend, -16));
1125     }
1126     __ b(finish);
1127 
1128     // 17..32 bytes
1129     __ bind(copy32);
1130     __ ldp(t0, t1, Address(s, 0));
1131     __ ldp(t2, t3, Address(send, -16));
1132     __ stp(t0, t1, Address(d, 0));
1133     __ stp(t2, t3, Address(dend, -16));
1134     __ b(finish);
1135 
1136     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
1138     __ bind(copy80);
1139     if (UseSIMDForMemoryOps) {
1140       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1141       __ ldpq(v4, v5, Address(send, -32));
1142       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1143       __ stpq(v4, v5, Address(dend, -32));
1144     } else {
1145       __ ldp(t0, t1, Address(s, 0));
1146       __ ldp(t2, t3, Address(s, 16));
1147       __ ldp(t4, t5, Address(s, 32));
1148       __ ldp(t6, t7, Address(s, 48));
1149       __ ldp(t8, t9, Address(send, -16));
1150 
1151       __ stp(t0, t1, Address(d, 0));
1152       __ stp(t2, t3, Address(d, 16));
1153       __ stp(t4, t5, Address(d, 32));
1154       __ stp(t6, t7, Address(d, 48));
1155       __ stp(t8, t9, Address(dend, -16));
1156     }
1157     __ b(finish);
1158 
1159     // 0..16 bytes
1160     __ bind(copy16);
1161     __ cmp(count, u1(8/granularity));
1162     __ br(Assembler::LO, copy8);
1163 
1164     // 8..16 bytes
1165     __ ldr(t0, Address(s, 0));
1166     __ ldr(t1, Address(send, -8));
1167     __ str(t0, Address(d, 0));
1168     __ str(t1, Address(dend, -8));
1169     __ b(finish);
1170 
1171     if (granularity < 8) {
1172       // 4..7 bytes
1173       __ bind(copy8);
1174       __ tbz(count, 2 - exact_log2(granularity), copy4);
1175       __ ldrw(t0, Address(s, 0));
1176       __ ldrw(t1, Address(send, -4));
1177       __ strw(t0, Address(d, 0));
1178       __ strw(t1, Address(dend, -4));
1179       __ b(finish);
1180       if (granularity < 4) {
1181         // 0..3 bytes
1182         __ bind(copy4);
1183         __ cbz(count, finish); // get rid of 0 case
1184         if (granularity == 2) {
1185           __ ldrh(t0, Address(s, 0));
1186           __ strh(t0, Address(d, 0));
1187         } else { // granularity == 1
1188           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1189           // the first and last byte.
1190           // Handle the 3 byte case by loading and storing base + count/2
1191           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
1193           // byte 3 times.
1194           __ lsr(count, count, 1);
1195           __ ldrb(t0, Address(s, 0));
1196           __ ldrb(t1, Address(send, -1));
1197           __ ldrb(t2, Address(s, count));
1198           __ strb(t0, Address(d, 0));
1199           __ strb(t1, Address(dend, -1));
1200           __ strb(t2, Address(d, count));
1201         }
1202         __ b(finish);
1203       }
1204     }
1205 
1206     __ bind(copy_big);
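    // For a backwards copy, start from one element past the end of each
    // array so that the negative-step addressing below walks down memory.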
1207     if (is_backwards) {
1208       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1209       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1210     }
1211 
1212     // Now we've got the small case out of the way we can align the
1213     // source address on a 2-word boundary.
1214 
1215     Label aligned;
1216 
1217     if (is_aligned) {
1218       // We may have to adjust by 1 word to get s 2-word-aligned.
1219       __ tbz(s, exact_log2(wordSize), aligned);
1220       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1221       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1222       __ sub(count, count, wordSize/granularity);
1223     } else {
1224       if (is_backwards) {
1225         __ andr(rscratch2, s, 2 * wordSize - 1);
1226       } else {
1227         __ neg(rscratch2, s);
1228         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1229       }
1230       // rscratch2 is the byte adjustment needed to align s.
1231       __ cbz(rscratch2, aligned);
1232       int shift = exact_log2(granularity);
1233       if (shift)  __ lsr(rscratch2, rscratch2, shift);
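      // rscratch2 is now an element count rather than a byte count; it is
      // removed from count here and those elements are copied via
      // copy_memory_small below.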
1234       __ sub(count, count, rscratch2);
1235 
1236 #if 0
1237       // ?? This code is only correct for a disjoint copy.  It may or
1238       // may not make sense to use it in that case.
1239 
1240       // Copy the first pair; s and d may not be aligned.
1241       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1242       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1243 
1244       // Align s and d, adjust count
1245       if (is_backwards) {
1246         __ sub(s, s, rscratch2);
1247         __ sub(d, d, rscratch2);
1248       } else {
1249         __ add(s, s, rscratch2);
1250         __ add(d, d, rscratch2);
1251       }
1252 #else
1253       copy_memory_small(s, d, rscratch2, rscratch1, step);
1254 #endif
1255     }
1256 
1257     __ bind(aligned);
1258 
1259     // s is now 2-word-aligned.
1260 
1261     // We have a count of units and some trailing bytes.  Adjust the
1262     // count and do a bulk copy of words.
1263     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1264     if (direction == copy_forwards)
1265       __ bl(copy_f);
1266     else
1267       __ bl(copy_b);
1268 
1269     // And the tail.
1270     copy_memory_small(s, d, count, tmp, step);
1271 
1272     if (granularity >= 8) __ bind(copy8);
1273     if (granularity >= 4) __ bind(copy4);
1274     __ bind(finish);
1275   }
1276 
1277 
1278   void clobber_registers() {
1279 #ifdef ASSERT
1280     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1281     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1282     for (Register r = r3; r <= r18; r++)
1283       if (r != rscratch1) __ mov(r, rscratch1);
1284 #endif
1285   }
1286 
1287   // Scan over array at a for count oops, verifying each one.
1288   // Preserves a and count, clobbers rscratch1 and rscratch2.
1289   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1290     Label loop, end;
1291     __ mov(rscratch1, a);
1292     __ mov(rscratch2, zr);
1293     __ bind(loop);
1294     __ cmp(rscratch2, count);
1295     __ br(Assembler::HS, end);
1296     if (size == (size_t)wordSize) {
1297       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1298       __ verify_oop(temp);
1299     } else {
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1301       __ decode_heap_oop(temp); // calls verify_oop
1302     }
1303     __ add(rscratch2, rscratch2, size);
1304     __ b(loop);
1305     __ bind(end);
1306   }
1307 
1308   // Arguments:
1309   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1310   //             ignored
1311   //   is_oop  - true => oop array, so generate store check code
1312   //   name    - stub name string
1313   //
1314   // Inputs:
1315   //   c_rarg0   - source array address
1316   //   c_rarg1   - destination array address
1317   //   c_rarg2   - element count, treated as ssize_t, can be zero
1318   //
1319   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1320   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1322   //
1323   // Side Effects:
1324   //   disjoint_int_copy_entry is set to the no-overlap entry point
1325   //   used by generate_conjoint_int_oop_copy().
1326   //
1327   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1328                                   const char *name, bool dest_uninitialized = false) {
1329     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1330     RegSet saved_reg = RegSet::of(s, d, count);
1331     __ align(CodeEntryAlignment);
1332     StubCodeMark mark(this, "StubRoutines", name);
1333     address start = __ pc();
1334     __ enter();
1335 
1336     if (entry != NULL) {
1337       *entry = __ pc();
1338       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1339       BLOCK_COMMENT("Entry:");
1340     }
1341 
1342     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1343     if (dest_uninitialized) {
1344       decorators |= IS_DEST_UNINITIALIZED;
1345     }
1346     if (aligned) {
1347       decorators |= ARRAYCOPY_ALIGNED;
1348     }
1349 
1350     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1351     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1352 
1353     if (is_oop) {
1354       // save regs before copy_memory
1355       __ push(RegSet::of(d, count), sp);
1356     }
1357     {
1358       // UnsafeCopyMemory page error: continue after ucm
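      // Only primitive (non-oop) copies are registered; the mark records
      // this PC range so that a fault taken while copying a truncated
      // memory-mapped region can continue after the copy instead of
      // crashing the VM.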
1359       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1360       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1361       copy_memory(aligned, s, d, count, rscratch1, size);
1362     }
1363 
1364     if (is_oop) {
1365       __ pop(RegSet::of(d, count), sp);
1366       if (VerifyOops)
1367         verify_oop_array(size, d, count, r16);
1368     }
1369 
1370     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1371 
1372     __ leave();
1373     __ mov(r0, zr); // return 0
1374     __ ret(lr);
1375     return start;
1376   }
1377 
1378   // Arguments:
1379   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1380   //             ignored
1381   //   is_oop  - true => oop array, so generate store check code
1382   //   name    - stub name string
1383   //
1384   // Inputs:
1385   //   c_rarg0   - source array address
1386   //   c_rarg1   - destination array address
1387   //   c_rarg2   - element count, treated as ssize_t, can be zero
1388   //
1389   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1390   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1392   //
1393   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1394                                  address *entry, const char *name,
1395                                  bool dest_uninitialized = false) {
1396     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1397     RegSet saved_regs = RegSet::of(s, d, count);
1398     StubCodeMark mark(this, "StubRoutines", name);
1399     address start = __ pc();
1400     __ enter();
1401 
1402     if (entry != NULL) {
1403       *entry = __ pc();
1404       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1405       BLOCK_COMMENT("Entry:");
1406     }
1407 
1408     // use fwd copy when (d-s) above_equal (count*size)
1409     __ sub(rscratch1, d, s);
1410     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1411     __ br(Assembler::HS, nooverlap_target);
1412 
1413     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1414     if (dest_uninitialized) {
1415       decorators |= IS_DEST_UNINITIALIZED;
1416     }
1417     if (aligned) {
1418       decorators |= ARRAYCOPY_ALIGNED;
1419     }
1420 
1421     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1422     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1423 
1424     if (is_oop) {
1425       // save regs before copy_memory
1426       __ push(RegSet::of(d, count), sp);
1427     }
1428     {
1429       // UnsafeCopyMemory page error: continue after ucm
1430       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1431       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1432       copy_memory(aligned, s, d, count, rscratch1, -size);
1433     }
1434     if (is_oop) {
1435       __ pop(RegSet::of(d, count), sp);
1436       if (VerifyOops)
1437         verify_oop_array(size, d, count, r16);
1438     }
1439     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1440     __ leave();
1441     __ mov(r0, zr); // return 0
1442     __ ret(lr);
1443     return start;
  }
1445 
1446   // Arguments:
1447   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1448   //             ignored
1449   //   name    - stub name string
1450   //
1451   // Inputs:
1452   //   c_rarg0   - source array address
1453   //   c_rarg1   - destination array address
1454   //   c_rarg2   - element count, treated as ssize_t, can be zero
1455   //
1456   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1457   // we let the hardware handle it.  The one to eight bytes within words,
1458   // dwords or qwords that span cache line boundaries will still be loaded
1459   // and stored atomically.
1460   //
1468   // Side Effects:
1469   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1470   //   used by generate_conjoint_byte_copy().
1471   //
1472   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1473     const bool not_oop = false;
1474     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1475   }
1476 
1477   // Arguments:
1478   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1479   //             ignored
1480   //   name    - stub name string
1481   //
1482   // Inputs:
1483   //   c_rarg0   - source array address
1484   //   c_rarg1   - destination array address
1485   //   c_rarg2   - element count, treated as ssize_t, can be zero
1486   //
1487   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1488   // we let the hardware handle it.  The one to eight bytes within words,
1489   // dwords or qwords that span cache line boundaries will still be loaded
1490   // and stored atomically.
1491   //
1492   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1493                                       address* entry, const char *name) {
1494     const bool not_oop = false;
1495     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1496   }
1497 
1498   // Arguments:
1499   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1500   //             ignored
1501   //   name    - stub name string
1502   //
1503   // Inputs:
1504   //   c_rarg0   - source array address
1505   //   c_rarg1   - destination array address
1506   //   c_rarg2   - element count, treated as ssize_t, can be zero
1507   //
1508   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1509   // let the hardware handle it.  The two or four words within dwords
1510   // or qwords that span cache line boundaries will still be loaded
1511   // and stored atomically.
1512   //
1513   // Side Effects:
1514   //   disjoint_short_copy_entry is set to the no-overlap entry point
1515   //   used by generate_conjoint_short_copy().
1516   //
1517   address generate_disjoint_short_copy(bool aligned,
1518                                        address* entry, const char *name) {
1519     const bool not_oop = false;
1520     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1521   }
1522 
1523   // Arguments:
1524   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1525   //             ignored
1526   //   name    - stub name string
1527   //
1528   // Inputs:
1529   //   c_rarg0   - source array address
1530   //   c_rarg1   - destination array address
1531   //   c_rarg2   - element count, treated as ssize_t, can be zero
1532   //
1533   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1534   // let the hardware handle it.  The two or four words within dwords
1535   // or qwords that span cache line boundaries will still be loaded
1536   // and stored atomically.
1537   //
1538   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1539                                        address *entry, const char *name) {
1540     const bool not_oop = false;
1541     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1542   }
1543 
1544   // Arguments:
1545   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1546   //             ignored
1547   //   name    - stub name string
1548   //
1549   // Inputs:
1550   //   c_rarg0   - source array address
1551   //   c_rarg1   - destination array address
1552   //   c_rarg2   - element count, treated as ssize_t, can be zero
1553   //
1554   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1555   // the hardware handle it.  The two dwords within qwords that span
1556   // cache line boundaries will still be loaded and stored atomically.
1557   //
1558   // Side Effects:
1559   //   disjoint_int_copy_entry is set to the no-overlap entry point
1560   //   used by generate_conjoint_int_oop_copy().
1561   //
1562   address generate_disjoint_int_copy(bool aligned, address *entry,
1563                                          const char *name, bool dest_uninitialized = false) {
1564     const bool not_oop = false;
1565     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1566   }
1567 
1568   // Arguments:
1569   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1570   //             ignored
1571   //   name    - stub name string
1572   //
1573   // Inputs:
1574   //   c_rarg0   - source array address
1575   //   c_rarg1   - destination array address
1576   //   c_rarg2   - element count, treated as ssize_t, can be zero
1577   //
1578   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1579   // the hardware handle it.  The two dwords within qwords that span
1580   // cache line boundaries will still be loaded and stored atomically.
1581   //
1582   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1583                                      address *entry, const char *name,
1584                                      bool dest_uninitialized = false) {
1585     const bool not_oop = false;
1586     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1587   }
1588 
1589 
1590   // Arguments:
1591   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1592   //             ignored
1593   //   name    - stub name string
1594   //
1595   // Inputs:
1596   //   c_rarg0   - source array address
1597   //   c_rarg1   - destination array address
1598   //   c_rarg2   - element count, treated as size_t, can be zero
1599   //
1600   // Side Effects:
1601   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1602   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1603   //
1604   address generate_disjoint_long_copy(bool aligned, address *entry,
1605                                           const char *name, bool dest_uninitialized = false) {
1606     const bool not_oop = false;
1607     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1608   }
1609 
1610   // Arguments:
1611   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1612   //             ignored
1613   //   name    - stub name string
1614   //
1615   // Inputs:
1616   //   c_rarg0   - source array address
1617   //   c_rarg1   - destination array address
1618   //   c_rarg2   - element count, treated as size_t, can be zero
1619   //
1620   address generate_conjoint_long_copy(bool aligned,
1621                                       address nooverlap_target, address *entry,
1622                                       const char *name, bool dest_uninitialized = false) {
1623     const bool not_oop = false;
1624     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1625   }
1626 
1627   // Arguments:
1628   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1629   //             ignored
1630   //   name    - stub name string
1631   //
1632   // Inputs:
1633   //   c_rarg0   - source array address
1634   //   c_rarg1   - destination array address
1635   //   c_rarg2   - element count, treated as size_t, can be zero
1636   //
1637   // Side Effects:
1638   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1639   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1640   //
1641   address generate_disjoint_oop_copy(bool aligned, address *entry,
1642                                      const char *name, bool dest_uninitialized) {
1643     const bool is_oop = true;
1644     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1645     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1646   }
1647 
1648   // Arguments:
1649   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1650   //             ignored
1651   //   name    - stub name string
1652   //
1653   // Inputs:
1654   //   c_rarg0   - source array address
1655   //   c_rarg1   - destination array address
1656   //   c_rarg2   - element count, treated as size_t, can be zero
1657   //
1658   address generate_conjoint_oop_copy(bool aligned,
1659                                      address nooverlap_target, address *entry,
1660                                      const char *name, bool dest_uninitialized) {
1661     const bool is_oop = true;
1662     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1663     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1664                                   name, dest_uninitialized);
1665   }
1666 
1667 
1668   // Helper for generating a dynamic type check.
1669   // Smashes rscratch1, rscratch2.
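       // Conceptually the emitted check is roughly (hedged sketch; the real fast
       // and slow paths in MacroAssembler also handle the secondary-super cache):
       //   if (sub_klass == super_klass)                              goto L_success;
       //   if (*(Klass**)(sub_klass + super_check_offset) == super_klass)
       //                                                              goto L_success;
       //   if (super_klass is in sub_klass's secondary supers array)  goto L_success;
       //   /* otherwise fall through to L_miss */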
1670   void generate_type_check(Register sub_klass,
1671                            Register super_check_offset,
1672                            Register super_klass,
1673                            Label& L_success) {
1674     assert_different_registers(sub_klass, super_check_offset, super_klass);
1675 
1676     BLOCK_COMMENT("type_check:");
1677 
1678     Label L_miss;
1679 
1680     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1681                                      super_check_offset);
1682     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1683 
1684     // Fall through on failure!
1685     __ BIND(L_miss);
1686   }
1687 
1688   //
1689   //  Generate checkcasting array copy stub
1690   //
1691   //  Input:
1692   //    c_rarg0   - source array address
1693   //    c_rarg1   - destination array address
1694   //    c_rarg2   - element count, treated as ssize_t, can be zero
1695   //    c_rarg3   - size_t ckoff (super_check_offset)
1696   //    c_rarg4   - oop ckval (super_klass)
1697   //
1698   //  Output:
1699   //    r0 ==  0  -  success
1700   //    r0 == -1^K - failure, where K is partial transfer count
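       //  For example, if 3 oops were stored before an element failed the type
       //  check, the stub returns -1^3 == ~3 == -4, and the caller recovers the
       //  partial transfer count as ~r0.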
1701   //
1702   address generate_checkcast_copy(const char *name, address *entry,
1703                                   bool dest_uninitialized = false) {
1704 
1705     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1706 
1707     // Input registers (after setup_arg_regs)
1708     const Register from        = c_rarg0;   // source array address
1709     const Register to          = c_rarg1;   // destination array address
1710     const Register count       = c_rarg2;   // elements count
1711     const Register ckoff       = c_rarg3;   // super_check_offset
1712     const Register ckval       = c_rarg4;   // super_klass
1713 
1714     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1715     RegSet wb_post_saved_regs = RegSet::of(count);
1716 
1717     // Registers used as temps (r18, r19, r20 are save-on-entry)
1718     const Register count_save  = r21;       // orig elements count
1719     const Register start_to    = r20;       // destination array start address
1720     const Register copied_oop  = r18;       // actual oop copied
1721     const Register r19_klass   = r19;       // oop._klass
1722 
1723     //---------------------------------------------------------------
1724     // Assembler stub will be used for this call to arraycopy
1725     // if the two arrays are subtypes of Object[] but the
1726     // destination array type is not equal to or a supertype
1727     // of the source type.  Each element must be separately
1728     // checked.
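         // (For example, copying the elements of an Object[] that happen to all be
         //  Strings into a String[]: the copy can succeed, but only if each element
         //  is checked against the destination element klass as it is moved.)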
1729 
1730     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1731                                copied_oop, r19_klass, count_save);
1732 
1733     __ align(CodeEntryAlignment);
1734     StubCodeMark mark(this, "StubRoutines", name);
1735     address start = __ pc();
1736 
1737     __ enter(); // required for proper stackwalking of RuntimeStub frame
1738 
1739 #ifdef ASSERT
1740     // caller guarantees that the arrays really are different
1741     // otherwise, we would have to make conjoint checks
1742     { Label L;
1743       array_overlap_test(L, TIMES_OOP);
1744       __ stop("checkcast_copy within a single array");
1745       __ bind(L);
1746     }
1747 #endif //ASSERT
1748 
1749     // Caller of this entry point must set up the argument registers.
1750     if (entry != NULL) {
1751       *entry = __ pc();
1752       BLOCK_COMMENT("Entry:");
1753     }
1754 
1755      // Empty array:  Nothing to do.
1756     __ cbz(count, L_done);
1757 
1758     __ push(RegSet::of(r18, r19, r20, r21), sp);
1759 
1760 #ifdef ASSERT
1761     BLOCK_COMMENT("assert consistent ckoff/ckval");
1762     // The ckoff and ckval must be mutually consistent,
1763     // even though caller generates both.
1764     { Label L;
1765       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1766       __ ldrw(start_to, Address(ckval, sco_offset));
1767       __ cmpw(ckoff, start_to);
1768       __ br(Assembler::EQ, L);
1769       __ stop("super_check_offset inconsistent");
1770       __ bind(L);
1771     }
1772 #endif //ASSERT
1773 
1774     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1775     bool is_oop = true;
1776     if (dest_uninitialized) {
1777       decorators |= IS_DEST_UNINITIALIZED;
1778     }
1779 
1780     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1781     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1782 
1783     // save the original count
1784     __ mov(count_save, count);
1785 
1786     // Copy from low to high addresses
1787     __ mov(start_to, to);              // Save destination array start address
1788     __ b(L_load_element);
1789 
1790     // ======== begin loop ========
1791     // (Loop is rotated; its entry is L_load_element.)
1792     // Loop control:
1793     //   for (; count != 0; count--) {
1794     //     copied_oop = load_heap_oop(from++);
1795     //     ... generate_type_check ...;
1796     //     store_heap_oop(to++, copied_oop);
1797     //   }
1798     __ align(OptoLoopAlignment);
1799 
1800     __ BIND(L_store_element);
1801     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1802     __ sub(count, count, 1);
1803     __ cbz(count, L_do_card_marks);
1804 
1805     // ======== loop entry is here ========
1806     __ BIND(L_load_element);
1807     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1808     __ cbz(copied_oop, L_store_element);
1809 
1810     __ load_klass(r19_klass, copied_oop);// query the object klass
1811     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1812     // ======== end loop ========
1813 
1814     // It was a real error; we must depend on the caller to finish the job.
1815     // Register count = remaining oops, count_orig = total oops.
1816     // Emit GC store barriers for the oops we have copied and report
1817     // their number to the caller.
1818 
1819     __ subs(count, count_save, count);     // K = partially copied oop count
1820     __ eon(count, count, zr);                   // report (-1^K) to caller
1821     __ br(Assembler::EQ, L_done_pop);
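         // Note: the EQ branch above tests the flags set by 'subs' (eon does not
         // update them), so the card-mark epilogue is skipped when K == 0, i.e.
         // when no oops were copied before the failure.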
1822 
1823     __ BIND(L_do_card_marks);
1824     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1825 
1826     __ bind(L_done_pop);
1827     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1828     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1829 
1830     __ bind(L_done);
1831     __ mov(r0, count);
1832     __ leave();
1833     __ ret(lr);
1834 
1835     return start;
1836   }
1837 
1838   // Perform range checks on the proposed arraycopy.
1839   // Kills temp, but nothing else.
1840   // Also, clean the sign bits of src_pos and dst_pos.
1841   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1842                               Register src_pos, // source position (c_rarg1)
1843                               Register dst,     // destination array oop (c_rarg2)
1844                               Register dst_pos, // destination position (c_rarg3)
1845                               Register length,
1846                               Register temp,
1847                               Label& L_failed) {
1848     BLOCK_COMMENT("arraycopy_range_checks:");
1849 
1850     assert_different_registers(rscratch1, temp);
1851 
1852     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1853     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1854     __ addw(temp, length, src_pos);
1855     __ cmpw(temp, rscratch1);
1856     __ br(Assembler::HI, L_failed);
1857 
1858     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1859     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1860     __ addw(temp, length, dst_pos);
1861     __ cmpw(temp, rscratch1);
1862     __ br(Assembler::HI, L_failed);
1863 
1864     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1865     __ movw(src_pos, src_pos);
1866     __ movw(dst_pos, dst_pos);
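         // (Writing a w-register zero-extends into the full 64-bit register, so
         //  these 32-bit moves clear bits 63:32.)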
1867 
1868     BLOCK_COMMENT("arraycopy_range_checks done");
1869   }
1870 
1871   // These stubs get called from some dumb test routine.
1872   // I'll write them properly when they're called from
1873   // something that's actually doing something.
1874   static void fake_arraycopy_stub(address src, address dst, int count) {
1875     assert(count == 0, "huh?");
1876   }
1877 
1878 
1879   //
1880   //  Generate 'unsafe' array copy stub
1881   //  Though just as safe as the other stubs, it takes an unscaled
1882   //  size_t argument instead of an element count.
1883   //
1884   //  Input:
1885   //    c_rarg0   - source array address
1886   //    c_rarg1   - destination array address
1887   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1888   //
1889   // Examines the alignment of the operands and dispatches
1890   // to a long, int, short, or byte copy loop.
1891   //
1892   address generate_unsafe_copy(const char *name,
1893                                address byte_copy_entry,
1894                                address short_copy_entry,
1895                                address int_copy_entry,
1896                                address long_copy_entry) {
1897     Label L_long_aligned, L_int_aligned, L_short_aligned;
1898     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1899 
1900     __ align(CodeEntryAlignment);
1901     StubCodeMark mark(this, "StubRoutines", name);
1902     address start = __ pc();
1903     __ enter(); // required for proper stackwalking of RuntimeStub frame
1904 
1905     // bump this on entry, not on exit:
1906     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1907 
1908     __ orr(rscratch1, s, d);
1909     __ orr(rscratch1, rscratch1, count);
1910 
1911     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1912     __ cbz(rscratch1, L_long_aligned);
1913     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1914     __ cbz(rscratch1, L_int_aligned);
1915     __ tbz(rscratch1, 0, L_short_aligned);
1916     __ b(RuntimeAddress(byte_copy_entry));
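         // A rough C sketch of the alignment dispatch above (illustrative only):
         //   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
         //   if      ((bits & (BytesPerLong - 1)) == 0) goto long_copy;  // all 8-byte aligned
         //   else if ((bits & (BytesPerInt - 1)) == 0)  goto int_copy;   // all 4-byte aligned
         //   else if ((bits & 1) == 0)                  goto short_copy; // all 2-byte aligned
         //   else                                       goto byte_copy;
         // The short/int/long targets first scale the byte count to an element count.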
1917 
1918     __ BIND(L_short_aligned);
1919     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1920     __ b(RuntimeAddress(short_copy_entry));
1921     __ BIND(L_int_aligned);
1922     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1923     __ b(RuntimeAddress(int_copy_entry));
1924     __ BIND(L_long_aligned);
1925     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1926     __ b(RuntimeAddress(long_copy_entry));
1927 
1928     return start;
1929   }
1930 
1931   //
1932   //  Generate generic array copy stubs
1933   //
1934   //  Input:
1935   //    c_rarg0    -  src oop
1936   //    c_rarg1    -  src_pos (32-bits)
1937   //    c_rarg2    -  dst oop
1938   //    c_rarg3    -  dst_pos (32-bits)
1939   //    c_rarg4    -  element count (32-bits)
1940   //
1941   //  Output:
1942   //    r0 ==  0  -  success
1943   //    r0 == -1^K - failure, where K is partial transfer count
1944   //
1945   address generate_generic_copy(const char *name,
1946                                 address byte_copy_entry, address short_copy_entry,
1947                                 address int_copy_entry, address oop_copy_entry,
1948                                 address long_copy_entry, address checkcast_copy_entry) {
1949 
1950     Label L_failed, L_objArray;
1951     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1952 
1953     // Input registers
1954     const Register src        = c_rarg0;  // source array oop
1955     const Register src_pos    = c_rarg1;  // source position
1956     const Register dst        = c_rarg2;  // destination array oop
1957     const Register dst_pos    = c_rarg3;  // destination position
1958     const Register length     = c_rarg4;
1959 
1960 
1961     // Registers used as temps
1962     const Register dst_klass  = c_rarg5;
1963 
1964     __ align(CodeEntryAlignment);
1965 
1966     StubCodeMark mark(this, "StubRoutines", name);
1967 
1968     address start = __ pc();
1969 
1970     __ enter(); // required for proper stackwalking of RuntimeStub frame
1971 
1972     // bump this on entry, not on exit:
1973     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1974 
1975     //-----------------------------------------------------------------------
1976     // Assembler stub will be used for this call to arraycopy
1977     // if the following conditions are met:
1978     //
1979     // (1) src and dst must not be null.
1980     // (2) src_pos must not be negative.
1981     // (3) dst_pos must not be negative.
1982     // (4) length  must not be negative.
1983     // (5) src klass and dst klass should be the same and not NULL.
1984     // (6) src and dst should be arrays.
1985     // (7) src_pos + length must not exceed length of src.
1986     // (8) dst_pos + length must not exceed length of dst.
1987     //
1988 
1989     //  if (src == NULL) return -1;
1990     __ cbz(src, L_failed);
1991 
1992     //  if (src_pos < 0) return -1;
1993     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
1994 
1995     //  if (dst == NULL) return -1;
1996     __ cbz(dst, L_failed);
1997 
1998     //  if (dst_pos < 0) return -1;
1999     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2000 
2001     // registers used as temp
2002     const Register scratch_length    = r16; // elements count to copy
2003     const Register scratch_src_klass = r17; // array klass
2004     const Register lh                = r18; // layout helper
2005 
2006     //  if (length < 0) return -1;
2007     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2008     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2009 
2010     __ load_klass(scratch_src_klass, src);
2011 #ifdef ASSERT
2012     //  assert(src->klass() != NULL);
2013     {
2014       BLOCK_COMMENT("assert klasses not null {");
2015       Label L1, L2;
2016       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2017       __ bind(L1);
2018       __ stop("broken null klass");
2019       __ bind(L2);
2020       __ load_klass(rscratch1, dst);
2021       __ cbz(rscratch1, L1);     // this would be broken also
2022       BLOCK_COMMENT("} assert klasses not null done");
2023     }
2024 #endif
2025 
2026     // Load layout helper (32-bits)
2027     //
2028     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2029     // 32        30    24            16              8     2                 0
2030     //
2031     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2032     //
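         // Note: the array tag occupies the sign bits, so the layout helper of any
         // array klass is negative; the 'tbz(lh, 31, L_failed)' test below relies
         // on this to reject non-array klasses.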
2033 
2034     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2035 
2036     // Handle objArrays completely differently...
2037     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2038     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2039     __ movw(rscratch1, objArray_lh);
2040     __ eorw(rscratch2, lh, rscratch1);
2041     __ cbzw(rscratch2, L_objArray);
2042 
2043     //  if (src->klass() != dst->klass()) return -1;
2044     __ load_klass(rscratch2, dst);
2045     __ eor(rscratch2, rscratch2, scratch_src_klass);
2046     __ cbnz(rscratch2, L_failed);
2047 
2048     //  if (!src->is_Array()) return -1;
2049     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2050 
2051     // At this point, it is known to be a typeArray (array_tag 0x3).
2052 #ifdef ASSERT
2053     {
2054       BLOCK_COMMENT("assert primitive array {");
2055       Label L;
2056       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2057       __ cmpw(lh, rscratch2);
2058       __ br(Assembler::GE, L);
2059       __ stop("must be a primitive array");
2060       __ bind(L);
2061       BLOCK_COMMENT("} assert primitive array done");
2062     }
2063 #endif
2064 
2065     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2066                            rscratch2, L_failed);
2067 
2068     // TypeArrayKlass
2069     //
2070     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2071     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2072     //
2073 
2074     const Register rscratch1_offset = rscratch1;    // array offset
2075     const Register r18_elsize = lh; // element size
2076 
2077     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2078            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2079     __ add(src, src, rscratch1_offset);           // src array offset
2080     __ add(dst, dst, rscratch1_offset);           // dst array offset
2081     BLOCK_COMMENT("choose copy loop based on element size");
2082 
2083     // next registers should be set before the jump to corresponding stub
2084     const Register from     = c_rarg0;  // source array address
2085     const Register to       = c_rarg1;  // destination array address
2086     const Register count    = c_rarg2;  // elements count
2087 
2088     // 'from', 'to', 'count' must be set in this order, since they alias the
2089     // incoming 'src', 'src_pos', 'dst' registers (c_rarg0..c_rarg2).
2090 
2091     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2092 
2093     // The possible values of elsize are 0-3, i.e. exact_log2(element
2094     // size in bytes).  We do a simple bitwise binary search.
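         // Decoding of the two low bits of r18_elsize (the log2 element size):
         //   bit1 bit0   element size   target stub
         //    0    0        1 byte      byte copy
         //    0    1        2 bytes     short copy
         //    1    0        4 bytes     int copy
         //    1    1        8 bytes     long copy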
2095   __ BIND(L_copy_bytes);
2096     __ tbnz(r18_elsize, 1, L_copy_ints);
2097     __ tbnz(r18_elsize, 0, L_copy_shorts);
2098     __ lea(from, Address(src, src_pos));// src_addr
2099     __ lea(to,   Address(dst, dst_pos));// dst_addr
2100     __ movw(count, scratch_length); // length
2101     __ b(RuntimeAddress(byte_copy_entry));
2102 
2103   __ BIND(L_copy_shorts);
2104     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2105     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2106     __ movw(count, scratch_length); // length
2107     __ b(RuntimeAddress(short_copy_entry));
2108 
2109   __ BIND(L_copy_ints);
2110     __ tbnz(r18_elsize, 0, L_copy_longs);
2111     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2112     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2113     __ movw(count, scratch_length); // length
2114     __ b(RuntimeAddress(int_copy_entry));
2115 
2116   __ BIND(L_copy_longs);
2117 #ifdef ASSERT
2118     {
2119       BLOCK_COMMENT("assert long copy {");
2120       Label L;
2121       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2122       __ cmpw(r18_elsize, LogBytesPerLong);
2123       __ br(Assembler::EQ, L);
2124       __ stop("must be long copy, but elsize is wrong");
2125       __ bind(L);
2126       BLOCK_COMMENT("} assert long copy done");
2127     }
2128 #endif
2129     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2130     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2131     __ movw(count, scratch_length); // length
2132     __ b(RuntimeAddress(long_copy_entry));
2133 
2134     // ObjArrayKlass
2135   __ BIND(L_objArray);
2136     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2137 
2138     Label L_plain_copy, L_checkcast_copy;
2139     //  test array classes for subtyping
2140     __ load_klass(r18, dst);
2141     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2142     __ br(Assembler::NE, L_checkcast_copy);
2143 
2144     // Identically typed arrays can be copied without element-wise checks.
2145     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2146                            rscratch2, L_failed);
2147 
2148     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2149     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2150     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2151     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2152     __ movw(count, scratch_length); // length
2153   __ BIND(L_plain_copy);
2154     __ b(RuntimeAddress(oop_copy_entry));
2155 
2156   __ BIND(L_checkcast_copy);
2157     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2158     {
2159       // Before looking at dst.length, make sure dst is also an objArray.
2160       __ ldrw(rscratch1, Address(r18, lh_offset));
2161       __ movw(rscratch2, objArray_lh);
2162       __ eorw(rscratch1, rscratch1, rscratch2);
2163       __ cbnzw(rscratch1, L_failed);
2164 
2165       // It is safe to examine both src.length and dst.length.
2166       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2167                              r18, L_failed);
2168 
2169       __ load_klass(dst_klass, dst); // reload
2170 
2171       // Marshal the base address arguments now, freeing registers.
2172       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2173       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2174       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2175       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2176       __ movw(count, length);           // length (reloaded)
2177       Register sco_temp = c_rarg3;      // this register is free now
2178       assert_different_registers(from, to, count, sco_temp,
2179                                  dst_klass, scratch_src_klass);
2180       // assert_clean_int(count, sco_temp);
2181 
2182       // Generate the type check.
2183       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2184       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2185 
2186       // Smashes rscratch1, rscratch2
2187       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2188 
2189       // Fetch destination element klass from the ObjArrayKlass header.
2190       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2191       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2192       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2193 
2194       // the checkcast_copy loop needs two extra arguments:
2195       assert(c_rarg3 == sco_temp, "#3 already in place");
2196       // Set up arguments for checkcast_copy_entry.
2197       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2198       __ b(RuntimeAddress(checkcast_copy_entry));
2199     }
2200 
2201   __ BIND(L_failed);
2202     __ mov(r0, -1);
2203     __ leave();   // required for proper stackwalking of RuntimeStub frame
2204     __ ret(lr);
2205 
2206     return start;
2207   }
2208 
2209   //
2210   // Generate stub for array fill. If "aligned" is true, the
2211   // "to" address is assumed to be heapword aligned.
2212   //
2213   // Arguments for generated stub:
2214   //   to:    c_rarg0
2215   //   value: c_rarg1
2216   //   count: c_rarg2 treated as signed
2217   //
2218   address generate_fill(BasicType t, bool aligned, const char *name) {
2219     __ align(CodeEntryAlignment);
2220     StubCodeMark mark(this, "StubRoutines", name);
2221     address start = __ pc();
2222 
2223     BLOCK_COMMENT("Entry:");
2224 
2225     const Register to        = c_rarg0;  // source array address
2226     const Register value     = c_rarg1;  // value
2227     const Register count     = c_rarg2;  // elements count
2228 
2229     const Register bz_base = r10;        // base for block_zero routine
2230     const Register cnt_words = r11;      // temp register
2231 
2232     __ enter();
2233 
2234     Label L_fill_elements, L_exit1;
2235 
2236     int shift = -1;
2237     switch (t) {
2238       case T_BYTE:
2239         shift = 0;
2240         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2241         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2242         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2243         __ br(Assembler::LO, L_fill_elements);
2244         break;
2245       case T_SHORT:
2246         shift = 1;
2247         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2248         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2249         __ br(Assembler::LO, L_fill_elements);
2250         break;
2251       case T_INT:
2252         shift = 2;
2253         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2254         __ br(Assembler::LO, L_fill_elements);
2255         break;
2256       default: ShouldNotReachHere();
2257     }
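         // Example (byte fill, illustrative): a fill value of 0xAB becomes
         // 0xABABABAB after the two 'bfi' steps above; the later
         // 'bfi(value, value, 32, 32)' widens it to 0xABABABABABABABAB so that a
         // single 64-bit store writes eight elements at a time.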
2258 
2259     // Align source address at 8 bytes address boundary.
2260     Label L_skip_align1, L_skip_align2, L_skip_align4;
2261     if (!aligned) {
2262       switch (t) {
2263         case T_BYTE:
2264           // One byte misalignment happens only for byte arrays.
2265           __ tbz(to, 0, L_skip_align1);
2266           __ strb(value, Address(__ post(to, 1)));
2267           __ subw(count, count, 1);
2268           __ bind(L_skip_align1);
2269           // Fallthrough
2270         case T_SHORT:
2271           // Two bytes misalignment happens only for byte and short (char) arrays.
2272           __ tbz(to, 1, L_skip_align2);
2273           __ strh(value, Address(__ post(to, 2)));
2274           __ subw(count, count, 2 >> shift);
2275           __ bind(L_skip_align2);
2276           // Fallthrough
2277         case T_INT:
2278           // Align to 8 bytes, we know we are 4 byte aligned to start.
2279           __ tbz(to, 2, L_skip_align4);
2280           __ strw(value, Address(__ post(to, 4)));
2281           __ subw(count, count, 4 >> shift);
2282           __ bind(L_skip_align4);
2283           break;
2284         default: ShouldNotReachHere();
2285       }
2286     }
2287 
2288     //
2289     //  Fill large chunks
2290     //
2291     __ lsrw(cnt_words, count, 3 - shift); // number of words
2292     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2293     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
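         // At this point cnt_words is the number of whole 8-byte words to fill and
         // count is the residue (less than 8 bytes' worth of elements); e.g. if 25
         // byte elements remained before this block, cnt_words == 3 and count == 1.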
2294     if (UseBlockZeroing) {
2295       Label non_block_zeroing, rest;
2296       // If the fill value is zero we can use the fast zero_words().
2297       __ cbnz(value, non_block_zeroing);
2298       __ mov(bz_base, to);
2299       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2300       __ zero_words(bz_base, cnt_words);
2301       __ b(rest);
2302       __ bind(non_block_zeroing);
2303       __ fill_words(to, cnt_words, value);
2304       __ bind(rest);
2305     } else {
2306       __ fill_words(to, cnt_words, value);
2307     }
2308 
2309     // Remaining count is less than 8 bytes. Fill it by a single store.
2310     // Note that the total length is no less than 8 bytes.
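         // For example, with a byte fill and 3 elements left over, 'to' is advanced
         // past the last element and the 8-byte store at (to - 8) rewrites the last
         // five already-filled bytes together with the three remaining ones.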
2311     if (t == T_BYTE || t == T_SHORT) {
2312       Label L_exit1;
2313       __ cbzw(count, L_exit1);
2314       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2315       __ str(value, Address(to, -8));    // overwrite some elements
2316       __ bind(L_exit1);
2317       __ leave();
2318       __ ret(lr);
2319     }
2320 
2321     // Handle copies less than 8 bytes.
2322     Label L_fill_2, L_fill_4, L_exit2;
2323     __ bind(L_fill_elements);
2324     switch (t) {
2325       case T_BYTE:
2326         __ tbz(count, 0, L_fill_2);
2327         __ strb(value, Address(__ post(to, 1)));
2328         __ bind(L_fill_2);
2329         __ tbz(count, 1, L_fill_4);
2330         __ strh(value, Address(__ post(to, 2)));
2331         __ bind(L_fill_4);
2332         __ tbz(count, 2, L_exit2);
2333         __ strw(value, Address(to));
2334         break;
2335       case T_SHORT:
2336         __ tbz(count, 0, L_fill_4);
2337         __ strh(value, Address(__ post(to, 2)));
2338         __ bind(L_fill_4);
2339         __ tbz(count, 1, L_exit2);
2340         __ strw(value, Address(to));
2341         break;
2342       case T_INT:
2343         __ cbzw(count, L_exit2);
2344         __ strw(value, Address(to));
2345         break;
2346       default: ShouldNotReachHere();
2347     }
2348     __ bind(L_exit2);
2349     __ leave();
2350     __ ret(lr);
2351     return start;
2352   }
2353 
2354   address generate_data_cache_writeback() {
2355     const Register line        = c_rarg0;  // address of line to write back
2356 
2357     __ align(CodeEntryAlignment);
2358 
2359     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2360 
2361     address start = __ pc();
2362     __ enter();
2363     __ cache_wb(Address(line, 0));
2364     __ leave();
2365     __ ret(lr);
2366 
2367     return start;
2368   }
2369 
2370   address generate_data_cache_writeback_sync() {
2371     const Register is_pre     = c_rarg0;  // pre or post sync
2372 
2373     __ align(CodeEntryAlignment);
2374 
2375     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2376 
2377     // pre wbsync is a no-op
2378     // post wbsync emits a store-ordering memory barrier (the aarch64 analogue of x86 sfence)
2379 
2380     Label skip;
2381     address start = __ pc();
2382     __ enter();
2383     __ cbnz(is_pre, skip);
2384     __ cache_wbsync(false);
2385     __ bind(skip);
2386     __ leave();
2387     __ ret(lr);
2388 
2389     return start;
2390   }
2391 
2392   void generate_arraycopy_stubs() {
2393     address entry;
2394     address entry_jbyte_arraycopy;
2395     address entry_jshort_arraycopy;
2396     address entry_jint_arraycopy;
2397     address entry_oop_arraycopy;
2398     address entry_jlong_arraycopy;
2399     address entry_checkcast_arraycopy;
2400 
2401     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2402     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2403 
2404     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2405 
2406     //*** jbyte
2407     // Always need aligned and unaligned versions
2408     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2409                                                                                   "jbyte_disjoint_arraycopy");
2410     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2411                                                                                   &entry_jbyte_arraycopy,
2412                                                                                   "jbyte_arraycopy");
2413     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2414                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2415     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2416                                                                                   "arrayof_jbyte_arraycopy");
2417 
2418     //*** jshort
2419     // Always need aligned and unaligned versions
2420     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2421                                                                                     "jshort_disjoint_arraycopy");
2422     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2423                                                                                     &entry_jshort_arraycopy,
2424                                                                                     "jshort_arraycopy");
2425     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2426                                                                                     "arrayof_jshort_disjoint_arraycopy");
2427     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2428                                                                                     "arrayof_jshort_arraycopy");
2429 
2430     //*** jint
2431     // Aligned versions
2432     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2433                                                                                 "arrayof_jint_disjoint_arraycopy");
2434     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2435                                                                                 "arrayof_jint_arraycopy");
2436     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2437     // entry_jint_arraycopy always points to the unaligned version
2438     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2439                                                                                 "jint_disjoint_arraycopy");
2440     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2441                                                                                 &entry_jint_arraycopy,
2442                                                                                 "jint_arraycopy");
2443 
2444     //*** jlong
2445     // It is always aligned
2446     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2447                                                                                   "arrayof_jlong_disjoint_arraycopy");
2448     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2449                                                                                   "arrayof_jlong_arraycopy");
2450     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2451     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2452 
2453     //*** oops
2454     {
2455       // With compressed oops we need unaligned versions; notice that
2456       // we overwrite entry_oop_arraycopy.
2457       bool aligned = !UseCompressedOops;
2458 
2459       StubRoutines::_arrayof_oop_disjoint_arraycopy
2460         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2461                                      /*dest_uninitialized*/false);
2462       StubRoutines::_arrayof_oop_arraycopy
2463         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2464                                      /*dest_uninitialized*/false);
2465       // Aligned versions without pre-barriers
2466       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2467         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2468                                      /*dest_uninitialized*/true);
2469       StubRoutines::_arrayof_oop_arraycopy_uninit
2470         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2471                                      /*dest_uninitialized*/true);
2472     }
2473 
2474     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2475     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2476     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2477     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2478 
2479     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2480     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2481                                                                         /*dest_uninitialized*/true);
2482 
2483     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2484                                                               entry_jbyte_arraycopy,
2485                                                               entry_jshort_arraycopy,
2486                                                               entry_jint_arraycopy,
2487                                                               entry_jlong_arraycopy);
2488 
2489     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2490                                                                entry_jbyte_arraycopy,
2491                                                                entry_jshort_arraycopy,
2492                                                                entry_jint_arraycopy,
2493                                                                entry_oop_arraycopy,
2494                                                                entry_jlong_arraycopy,
2495                                                                entry_checkcast_arraycopy);
2496 
2497     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2498     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2499     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2500     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2501     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2502     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2503   }
2504 
2505   void generate_math_stubs() { Unimplemented(); }
2506 
2507   // Arguments:
2508   //
2509   // Inputs:
2510   //   c_rarg0   - source byte array address
2511   //   c_rarg1   - destination byte array address
2512   //   c_rarg2   - K (key) in little endian int array
2513   //
2514   address generate_aescrypt_encryptBlock() {
2515     __ align(CodeEntryAlignment);
2516     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2517 
2518     Label L_doLast;
2519 
2520     const Register from        = c_rarg0;  // source array address
2521     const Register to          = c_rarg1;  // destination array address
2522     const Register key         = c_rarg2;  // key array address
2523     const Register keylen      = rscratch1;
2524 
2525     address start = __ pc();
2526     __ enter();
2527 
2528     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2529 
2530     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2531 
2532     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2533     __ rev32(v1, __ T16B, v1);
2534     __ rev32(v2, __ T16B, v2);
2535     __ rev32(v3, __ T16B, v3);
2536     __ rev32(v4, __ T16B, v4);
2537     __ aese(v0, v1);
2538     __ aesmc(v0, v0);
2539     __ aese(v0, v2);
2540     __ aesmc(v0, v0);
2541     __ aese(v0, v3);
2542     __ aesmc(v0, v0);
2543     __ aese(v0, v4);
2544     __ aesmc(v0, v0);
2545 
2546     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2547     __ rev32(v1, __ T16B, v1);
2548     __ rev32(v2, __ T16B, v2);
2549     __ rev32(v3, __ T16B, v3);
2550     __ rev32(v4, __ T16B, v4);
2551     __ aese(v0, v1);
2552     __ aesmc(v0, v0);
2553     __ aese(v0, v2);
2554     __ aesmc(v0, v0);
2555     __ aese(v0, v3);
2556     __ aesmc(v0, v0);
2557     __ aese(v0, v4);
2558     __ aesmc(v0, v0);
2559 
2560     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2561     __ rev32(v1, __ T16B, v1);
2562     __ rev32(v2, __ T16B, v2);
2563 
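         // The expanded key length in ints is 44, 52 or 60 for AES-128, AES-192 and
         // AES-256 respectively, hence the comparisons against 44 and 52 below to
         // decide how many extra rounds to run.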
2564     __ cmpw(keylen, 44);
2565     __ br(Assembler::EQ, L_doLast);
2566 
2567     __ aese(v0, v1);
2568     __ aesmc(v0, v0);
2569     __ aese(v0, v2);
2570     __ aesmc(v0, v0);
2571 
2572     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2573     __ rev32(v1, __ T16B, v1);
2574     __ rev32(v2, __ T16B, v2);
2575 
2576     __ cmpw(keylen, 52);
2577     __ br(Assembler::EQ, L_doLast);
2578 
2579     __ aese(v0, v1);
2580     __ aesmc(v0, v0);
2581     __ aese(v0, v2);
2582     __ aesmc(v0, v0);
2583 
2584     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2585     __ rev32(v1, __ T16B, v1);
2586     __ rev32(v2, __ T16B, v2);
2587 
2588     __ BIND(L_doLast);
2589 
2590     __ aese(v0, v1);
2591     __ aesmc(v0, v0);
2592     __ aese(v0, v2);
2593 
2594     __ ld1(v1, __ T16B, key);
2595     __ rev32(v1, __ T16B, v1);
2596     __ eor(v0, __ T16B, v0, v1);
2597 
2598     __ st1(v0, __ T16B, to);
2599 
2600     __ mov(r0, 0);
2601 
2602     __ leave();
2603     __ ret(lr);
2604 
2605     return start;
2606   }
2607 
2608   // Arguments:
2609   //
2610   // Inputs:
2611   //   c_rarg0   - source byte array address
2612   //   c_rarg1   - destination byte array address
2613   //   c_rarg2   - K (key) in little endian int array
2614   //
2615   address generate_aescrypt_decryptBlock() {
2616     assert(UseAES, "need AES instructions");
2617     __ align(CodeEntryAlignment);
2618     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2619     Label L_doLast;
2620 
2621     const Register from        = c_rarg0;  // source array address
2622     const Register to          = c_rarg1;  // destination array address
2623     const Register key         = c_rarg2;  // key array address
2624     const Register keylen      = rscratch1;
2625 
2626     address start = __ pc();
2627     __ enter(); // required for proper stackwalking of RuntimeStub frame
2628 
2629     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2630 
2631     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2632 
2633     __ ld1(v5, __ T16B, __ post(key, 16));
2634     __ rev32(v5, __ T16B, v5);
2635 
2636     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2637     __ rev32(v1, __ T16B, v1);
2638     __ rev32(v2, __ T16B, v2);
2639     __ rev32(v3, __ T16B, v3);
2640     __ rev32(v4, __ T16B, v4);
2641     __ aesd(v0, v1);
2642     __ aesimc(v0, v0);
2643     __ aesd(v0, v2);
2644     __ aesimc(v0, v0);
2645     __ aesd(v0, v3);
2646     __ aesimc(v0, v0);
2647     __ aesd(v0, v4);
2648     __ aesimc(v0, v0);
2649 
2650     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2651     __ rev32(v1, __ T16B, v1);
2652     __ rev32(v2, __ T16B, v2);
2653     __ rev32(v3, __ T16B, v3);
2654     __ rev32(v4, __ T16B, v4);
2655     __ aesd(v0, v1);
2656     __ aesimc(v0, v0);
2657     __ aesd(v0, v2);
2658     __ aesimc(v0, v0);
2659     __ aesd(v0, v3);
2660     __ aesimc(v0, v0);
2661     __ aesd(v0, v4);
2662     __ aesimc(v0, v0);
2663 
2664     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2665     __ rev32(v1, __ T16B, v1);
2666     __ rev32(v2, __ T16B, v2);
2667 
2668     __ cmpw(keylen, 44);
2669     __ br(Assembler::EQ, L_doLast);
2670 
2671     __ aesd(v0, v1);
2672     __ aesimc(v0, v0);
2673     __ aesd(v0, v2);
2674     __ aesimc(v0, v0);
2675 
2676     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2677     __ rev32(v1, __ T16B, v1);
2678     __ rev32(v2, __ T16B, v2);
2679 
2680     __ cmpw(keylen, 52);
2681     __ br(Assembler::EQ, L_doLast);
2682 
2683     __ aesd(v0, v1);
2684     __ aesimc(v0, v0);
2685     __ aesd(v0, v2);
2686     __ aesimc(v0, v0);
2687 
2688     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2689     __ rev32(v1, __ T16B, v1);
2690     __ rev32(v2, __ T16B, v2);
2691 
2692     __ BIND(L_doLast);
2693 
2694     __ aesd(v0, v1);
2695     __ aesimc(v0, v0);
2696     __ aesd(v0, v2);
2697 
2698     __ eor(v0, __ T16B, v0, v5);
2699 
2700     __ st1(v0, __ T16B, to);
2701 
2702     __ mov(r0, 0);
2703 
2704     __ leave();
2705     __ ret(lr);
2706 
2707     return start;
2708   }
2709 
2710   // Arguments:
2711   //
2712   // Inputs:
2713   //   c_rarg0   - source byte array address
2714   //   c_rarg1   - destination byte array address
2715   //   c_rarg2   - K (key) in little endian int array
2716   //   c_rarg3   - r vector byte array address
2717   //   c_rarg4   - input length
2718   //
2719   // Output:
2720   //   x0        - input length
2721   //
2722   address generate_cipherBlockChaining_encryptAESCrypt() {
2723     assert(UseAES, "need AES instructions");
2724     __ align(CodeEntryAlignment);
2725     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2726 
2727     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2728 
2729     const Register from        = c_rarg0;  // source array address
2730     const Register to          = c_rarg1;  // destination array address
2731     const Register key         = c_rarg2;  // key array address
2732     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2733                                            // and left with the results of the last encryption block
2734     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2735     const Register keylen      = rscratch1;
2736 
2737     address start = __ pc();
2738 
2739       __ enter();
2740 
2741       __ movw(rscratch2, len_reg);
2742 
2743       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2744 
2745       __ ld1(v0, __ T16B, rvec);
2746 
2747       __ cmpw(keylen, 52);
2748       __ br(Assembler::CC, L_loadkeys_44);
2749       __ br(Assembler::EQ, L_loadkeys_52);
2750 
2751       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2752       __ rev32(v17, __ T16B, v17);
2753       __ rev32(v18, __ T16B, v18);
2754     __ BIND(L_loadkeys_52);
2755       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2756       __ rev32(v19, __ T16B, v19);
2757       __ rev32(v20, __ T16B, v20);
2758     __ BIND(L_loadkeys_44);
2759       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2760       __ rev32(v21, __ T16B, v21);
2761       __ rev32(v22, __ T16B, v22);
2762       __ rev32(v23, __ T16B, v23);
2763       __ rev32(v24, __ T16B, v24);
2764       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2765       __ rev32(v25, __ T16B, v25);
2766       __ rev32(v26, __ T16B, v26);
2767       __ rev32(v27, __ T16B, v27);
2768       __ rev32(v28, __ T16B, v28);
2769       __ ld1(v29, v30, v31, __ T16B, key);
2770       __ rev32(v29, __ T16B, v29);
2771       __ rev32(v30, __ T16B, v30);
2772       __ rev32(v31, __ T16B, v31);
2773 
2774     __ BIND(L_aes_loop);
2775       __ ld1(v1, __ T16B, __ post(from, 16));
2776       __ eor(v0, __ T16B, v0, v1);
2777 
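           // The CC/EQ branches below reuse the condition flags set by the
           // 'cmpw(keylen, 52)' before the loop; nothing in the loop body
           // (ld1/eor/aese/aesmc/st1/subw/cbnzw) modifies the flags.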
2778       __ br(Assembler::CC, L_rounds_44);
2779       __ br(Assembler::EQ, L_rounds_52);
2780 
2781       __ aese(v0, v17); __ aesmc(v0, v0);
2782       __ aese(v0, v18); __ aesmc(v0, v0);
2783     __ BIND(L_rounds_52);
2784       __ aese(v0, v19); __ aesmc(v0, v0);
2785       __ aese(v0, v20); __ aesmc(v0, v0);
2786     __ BIND(L_rounds_44);
2787       __ aese(v0, v21); __ aesmc(v0, v0);
2788       __ aese(v0, v22); __ aesmc(v0, v0);
2789       __ aese(v0, v23); __ aesmc(v0, v0);
2790       __ aese(v0, v24); __ aesmc(v0, v0);
2791       __ aese(v0, v25); __ aesmc(v0, v0);
2792       __ aese(v0, v26); __ aesmc(v0, v0);
2793       __ aese(v0, v27); __ aesmc(v0, v0);
2794       __ aese(v0, v28); __ aesmc(v0, v0);
2795       __ aese(v0, v29); __ aesmc(v0, v0);
2796       __ aese(v0, v30);
2797       __ eor(v0, __ T16B, v0, v31);
2798 
2799       __ st1(v0, __ T16B, __ post(to, 16));
2800 
2801       __ subw(len_reg, len_reg, 16);
2802       __ cbnzw(len_reg, L_aes_loop);
2803 
2804       __ st1(v0, __ T16B, rvec);
2805 
2806       __ mov(r0, rscratch2);
2807 
2808       __ leave();
2809       __ ret(lr);
2810 
2811       return start;
2812   }
2813 
2814   // Arguments:
2815   //
2816   // Inputs:
2817   //   c_rarg0   - source byte array address
2818   //   c_rarg1   - destination byte array address
2819   //   c_rarg2   - K (key) in little endian int array
2820   //   c_rarg3   - r vector byte array address
2821   //   c_rarg4   - input length
2822   //
2823   // Output:
2824   //   r0        - input length
2825   //
2826   address generate_cipherBlockChaining_decryptAESCrypt() {
2827     assert(UseAES, "need AES cryptographic extension support");
2828     __ align(CodeEntryAlignment);
2829     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2830 
2831     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2832 
2833     const Register from        = c_rarg0;  // source array address
2834     const Register to          = c_rarg1;  // destination array address
2835     const Register key         = c_rarg2;  // key array address
2836     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2837                                            // and left with the results of the last encryption block
2838     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2839     const Register keylen      = rscratch1;
2840 
2841     address start = __ pc();
2842 
2843       __ enter();
2844 
2845       __ movw(rscratch2, len_reg);
2846 
2847       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2848 
2849       __ ld1(v2, __ T16B, rvec);
2850 
2851       __ ld1(v31, __ T16B, __ post(key, 16));
2852       __ rev32(v31, __ T16B, v31);
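           // v31 holds the first key block from the array; it is applied by the eor
           // at the end of the round sequence below.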
2853 
2854       __ cmpw(keylen, 52);
2855       __ br(Assembler::CC, L_loadkeys_44);
2856       __ br(Assembler::EQ, L_loadkeys_52);
2857 
2858       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2859       __ rev32(v17, __ T16B, v17);
2860       __ rev32(v18, __ T16B, v18);
2861     __ BIND(L_loadkeys_52);
2862       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2863       __ rev32(v19, __ T16B, v19);
2864       __ rev32(v20, __ T16B, v20);
2865     __ BIND(L_loadkeys_44);
2866       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2867       __ rev32(v21, __ T16B, v21);
2868       __ rev32(v22, __ T16B, v22);
2869       __ rev32(v23, __ T16B, v23);
2870       __ rev32(v24, __ T16B, v24);
2871       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2872       __ rev32(v25, __ T16B, v25);
2873       __ rev32(v26, __ T16B, v26);
2874       __ rev32(v27, __ T16B, v27);
2875       __ rev32(v28, __ T16B, v28);
2876       __ ld1(v29, v30, __ T16B, key);
2877       __ rev32(v29, __ T16B, v29);
2878       __ rev32(v30, __ T16B, v30);
2879 
2880     __ BIND(L_aes_loop);
2881       __ ld1(v0, __ T16B, __ post(from, 16));
2882       __ orr(v1, __ T16B, v0, v0);
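           // keep an unmodified copy of the ciphertext block in v1; after the rounds
           // it becomes the chaining value (v2) for the next block.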
2883 
2884       __ br(Assembler::CC, L_rounds_44);
2885       __ br(Assembler::EQ, L_rounds_52);
2886 
2887       __ aesd(v0, v17); __ aesimc(v0, v0);
2888       __ aesd(v0, v18); __ aesimc(v0, v0);
2889     __ BIND(L_rounds_52);
2890       __ aesd(v0, v19); __ aesimc(v0, v0);
2891       __ aesd(v0, v20); __ aesimc(v0, v0);
2892     __ BIND(L_rounds_44);
2893       __ aesd(v0, v21); __ aesimc(v0, v0);
2894       __ aesd(v0, v22); __ aesimc(v0, v0);
2895       __ aesd(v0, v23); __ aesimc(v0, v0);
2896       __ aesd(v0, v24); __ aesimc(v0, v0);
2897       __ aesd(v0, v25); __ aesimc(v0, v0);
2898       __ aesd(v0, v26); __ aesimc(v0, v0);
2899       __ aesd(v0, v27); __ aesimc(v0, v0);
2900       __ aesd(v0, v28); __ aesimc(v0, v0);
2901       __ aesd(v0, v29); __ aesimc(v0, v0);
2902       __ aesd(v0, v30);
2903       __ eor(v0, __ T16B, v0, v31);
2904       __ eor(v0, __ T16B, v0, v2);
2905 
2906       __ st1(v0, __ T16B, __ post(to, 16));
2907       __ orr(v2, __ T16B, v1, v1);
2908 
2909       __ subw(len_reg, len_reg, 16);
2910       __ cbnzw(len_reg, L_aes_loop);
2911 
2912       __ st1(v2, __ T16B, rvec);
2913 
2914       __ mov(r0, rscratch2);
2915 
2916       __ leave();
2917       __ ret(lr);
2918 
2919     return start;
2920   }
2921 
2922   // Arguments:
2923   //
2924   // Inputs:
2925   //   c_rarg0   - byte[]  source+offset
2926   //   c_rarg1   - int[]   SHA.state
2927   //   c_rarg2   - int     offset
2928   //   c_rarg3   - int     limit
2929   //
2930   address generate_sha1_implCompress(bool multi_block, const char *name) {
2931     __ align(CodeEntryAlignment);
2932     StubCodeMark mark(this, "StubRoutines", name);
2933     address start = __ pc();
2934 
2935     Register buf   = c_rarg0;
2936     Register state = c_rarg1;
2937     Register ofs   = c_rarg2;
2938     Register limit = c_rarg3;
2939 
2940     Label keys;
2941     Label sha1_loop;
2942 
2943     // load the keys into v0..v3
2944     __ adr(rscratch1, keys);
2945     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2946     // load the 5-word state into v6, v7
2947     __ ldrq(v6, Address(state, 0));
2948     __ ldrs(v7, Address(state, 16));
2949 
2950 
2951     __ BIND(sha1_loop);
2952     // load 64 bytes of data into v16..v19
2953     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2954     __ rev32(v16, __ T16B, v16);
2955     __ rev32(v17, __ T16B, v17);
2956     __ rev32(v18, __ T16B, v18);
2957     __ rev32(v19, __ T16B, v19);
2958 
2959     // do the sha1
2960     __ addv(v4, __ T4S, v16, v0);
2961     __ orr(v20, __ T16B, v6, v6);
2962 
2963     FloatRegister d0 = v16;
2964     FloatRegister d1 = v17;
2965     FloatRegister d2 = v18;
2966     FloatRegister d3 = v19;
2967 
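         // Each iteration processes 4 of the 80 SHA-1 steps: sha1c (Ch) for rounds 0-4,
         // sha1p (Parity) for rounds 5-9 and 15-19, sha1m (Maj) for rounds 10-14.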
2968     for (int round = 0; round < 20; round++) {
2969       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2970       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2971       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2972       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2973       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2974 
2975       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2976       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2977       __ sha1h(tmp2, __ T4S, v20);
2978       if (round < 5)
2979         __ sha1c(v20, __ T4S, tmp3, tmp4);
2980       else if (round < 10 || round >= 15)
2981         __ sha1p(v20, __ T4S, tmp3, tmp4);
2982       else
2983         __ sha1m(v20, __ T4S, tmp3, tmp4);
2984       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2985 
2986       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2987     }
2988 
2989     __ addv(v7, __ T2S, v7, v21);
2990     __ addv(v6, __ T4S, v6, v20);
2991 
2992     if (multi_block) {
2993       __ add(ofs, ofs, 64);
2994       __ cmp(ofs, limit);
2995       __ br(Assembler::LE, sha1_loop);
2996       __ mov(c_rarg0, ofs); // return ofs
2997     }
2998 
2999     __ strq(v6, Address(state, 0));
3000     __ strs(v7, Address(state, 16));
3001 
3002     __ ret(lr);
3003 
3004     __ bind(keys);
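         // SHA-1 round constants K0..K3, one for each group of 20 rounds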
3005     __ emit_int32(0x5a827999);
3006     __ emit_int32(0x6ed9eba1);
3007     __ emit_int32(0x8f1bbcdc);
3008     __ emit_int32(0xca62c1d6);
3009 
3010     return start;
3011   }
3012 
3013 
3014   // Arguments:
3015   //
3016   // Inputs:
3017   //   c_rarg0   - byte[]  source+offset
3018   //   c_rarg1   - int[]   SHA.state
3019   //   c_rarg2   - int     offset
3020   //   c_rarg3   - int     limit
3021   //
3022   address generate_sha256_implCompress(bool multi_block, const char *name) {
3023     static const uint32_t round_consts[64] = {
3024       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3025       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3026       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3027       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3028       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3029       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3030       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3031       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3032       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3033       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3034       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3035       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3036       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3037       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3038       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3039       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3040     };
3041     __ align(CodeEntryAlignment);
3042     StubCodeMark mark(this, "StubRoutines", name);
3043     address start = __ pc();
3044 
3045     Register buf   = c_rarg0;
3046     Register state = c_rarg1;
3047     Register ofs   = c_rarg2;
3048     Register limit = c_rarg3;
3049 
3050     Label sha256_loop;
3051 
3052     __ stpd(v8, v9, __ pre(sp, -32));
3053     __ stpd(v10, v11, Address(sp, 16));
3054 
3055     // dga == v0
3056     // dgb == v1
3057     // dg0 == v2
3058     // dg1 == v3
3059     // dg2 == v4
3060     // t0 == v6
3061     // t1 == v7
3062 
3063     // load 16 keys to v16..v31
3064     __ lea(rscratch1, ExternalAddress((address)round_consts));
3065     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3066     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3067     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3068     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3069 
3070     // load 8 words (256 bits) state
3071     __ ldpq(v0, v1, state);
3072 
3073     __ BIND(sha256_loop);
3074     // load 64 bytes of data into v8..v11
3075     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3076     __ rev32(v8, __ T16B, v8);
3077     __ rev32(v9, __ T16B, v9);
3078     __ rev32(v10, __ T16B, v10);
3079     __ rev32(v11, __ T16B, v11);
3080 
3081     __ addv(v6, __ T4S, v8, v16);
3082     __ orr(v2, __ T16B, v0, v0);
3083     __ orr(v3, __ T16B, v1, v1);
3084 
3085     FloatRegister d0 = v8;
3086     FloatRegister d1 = v9;
3087     FloatRegister d2 = v10;
3088     FloatRegister d3 = v11;
3089 
3090 
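         // 16 iterations, each performing 4 of the 64 SHA-256 rounds; the round
         // constants live in v16..v31, and v(round + 17) pre-adds the constants
         // for the next iteration.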
3091     for (int round = 0; round < 16; round++) {
3092       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3093       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3094       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3095       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3096 
3097       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3098        __ orr(v4, __ T16B, v2, v2);
3099       if (round < 15)
3100         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3101       __ sha256h(v2, __ T4S, v3, tmp2);
3102       __ sha256h2(v3, __ T4S, v4, tmp2);
3103       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3104 
3105       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3106     }
3107 
3108     __ addv(v0, __ T4S, v0, v2);
3109     __ addv(v1, __ T4S, v1, v3);
3110 
3111     if (multi_block) {
3112       __ add(ofs, ofs, 64);
3113       __ cmp(ofs, limit);
3114       __ br(Assembler::LE, sha256_loop);
3115       __ mov(c_rarg0, ofs); // return ofs
3116     }
3117 
3118     __ ldpd(v10, v11, Address(sp, 16));
3119     __ ldpd(v8, v9, __ post(sp, 32));
3120 
3121     __ stpq(v0, v1, state);
3122 
3123     __ ret(lr);
3124 
3125     return start;
3126   }
3127 
3128   // Arguments:
3129   //
3130   // Inputs:
3131   //   c_rarg0   - byte[]  source+offset
3132   //   c_rarg1   - int[]   SHA.state
3133   //   c_rarg2   - int     offset
3134   //   c_rarg3   - int     limit
3135   //
3136   address generate_sha512_implCompress(bool multi_block, const char *name) {
3137     static const uint64_t round_consts[80] = {
3138       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3139       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3140       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3141       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3142       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3143       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3144       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3145       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3146       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3147       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3148       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3149       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3150       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3151       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3152       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3153       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3154       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3155       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3156       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3157       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3158       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3159       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3160       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3161       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3162       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3163       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3164       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3165     };
3166 
3167     // Double rounds for sha512.
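         // dr       - index of the double round (0..39)
         // i0..i4   - indices (into v0..v4) of the rotating working-state registers
         // rc0, rc1 - current and next round-constant registers (v20..v31; rc1 is 0
         //            once no further constants need to be loaded)
         // in0..in4 - message-schedule registers (v12..v19; 0 once the schedule ends)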
3168     #define sha512_dround(dr, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \
3169       if (dr < 36)                                                                   \
3170         __ ld1(v##rc1, __ T2D, __ post(rscratch2, 16));                              \
3171       __ addv(v5, __ T2D, v##rc0, v##in0);                                           \
3172       __ ext(v6, __ T16B, v##i2, v##i3, 8);                                          \
3173       __ ext(v5, __ T16B, v5, v5, 8);                                                \
3174       __ ext(v7, __ T16B, v##i1, v##i2, 8);                                          \
3175       __ addv(v##i3, __ T2D, v##i3, v5);                                             \
3176       if (dr < 32) {                                                                 \
3177         __ ext(v5, __ T16B, v##in3, v##in4, 8);                                      \
3178         __ sha512su0(v##in0, __ T2D, v##in1);                                        \
3179       }                                                                              \
3180       __ sha512h(v##i3, __ T2D, v6, v7);                                             \
3181       if (dr < 32)                                                                   \
3182         __ sha512su1(v##in0, __ T2D, v##in2, v5);                                    \
3183       __ addv(v##i4, __ T2D, v##i1, v##i3);                                          \
3184       __ sha512h2(v##i3, __ T2D, v##i1, v##i0);                                      \
3185 
3186     __ align(CodeEntryAlignment);
3187     StubCodeMark mark(this, "StubRoutines", name);
3188     address start = __ pc();
3189 
3190     Register buf   = c_rarg0;
3191     Register state = c_rarg1;
3192     Register ofs   = c_rarg2;
3193     Register limit = c_rarg3;
3194 
3195     __ stpd(v8, v9, __ pre(sp, -64));
3196     __ stpd(v10, v11, Address(sp, 16));
3197     __ stpd(v12, v13, Address(sp, 32));
3198     __ stpd(v14, v15, Address(sp, 48));
3199 
3200     Label sha512_loop;
3201 
3202     // load state
3203     __ ld1(v8, v9, v10, v11, __ T2D, state);
3204 
3205     // load first 4 round constants
3206     __ lea(rscratch1, ExternalAddress((address)round_consts));
3207     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3208 
3209     __ BIND(sha512_loop);
3210     // load 128B of data into v12..v19
3211     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3212     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3213     __ rev64(v12, __ T16B, v12);
3214     __ rev64(v13, __ T16B, v13);
3215     __ rev64(v14, __ T16B, v14);
3216     __ rev64(v15, __ T16B, v15);
3217     __ rev64(v16, __ T16B, v16);
3218     __ rev64(v17, __ T16B, v17);
3219     __ rev64(v18, __ T16B, v18);
3220     __ rev64(v19, __ T16B, v19);
3221 
3222     __ mov(rscratch2, rscratch1);
3223 
3224     __ mov(v0, __ T16B, v8);
3225     __ mov(v1, __ T16B, v9);
3226     __ mov(v2, __ T16B, v10);
3227     __ mov(v3, __ T16B, v11);
3228 
3229     sha512_dround( 0, 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17);
3230     sha512_dround( 1, 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18);
3231     sha512_dround( 2, 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19);
3232     sha512_dround( 3, 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12);
3233     sha512_dround( 4, 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13);
3234     sha512_dround( 5, 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14);
3235     sha512_dround( 6, 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15);
3236     sha512_dround( 7, 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16);
3237     sha512_dround( 8, 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17);
3238     sha512_dround( 9, 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18);
3239     sha512_dround(10, 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19);
3240     sha512_dround(11, 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12);
3241     sha512_dround(12, 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13);
3242     sha512_dround(13, 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14);
3243     sha512_dround(14, 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15);
3244     sha512_dround(15, 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16);
3245     sha512_dround(16, 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17);
3246     sha512_dround(17, 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18);
3247     sha512_dround(18, 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19);
3248     sha512_dround(19, 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12);
3249     sha512_dround(20, 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13);
3250     sha512_dround(21, 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14);
3251     sha512_dround(22, 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15);
3252     sha512_dround(23, 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16);
3253     sha512_dround(24, 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17);
3254     sha512_dround(25, 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18);
3255     sha512_dround(26, 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19);
3256     sha512_dround(27, 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12);
3257     sha512_dround(28, 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13);
3258     sha512_dround(29, 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14);
3259     sha512_dround(30, 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15);
3260     sha512_dround(31, 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16);
3261     sha512_dround(32, 2, 3, 1, 4, 0, 28, 24, 12,  0,  0,  0,  0);
3262     sha512_dround(33, 4, 2, 0, 1, 3, 29, 25, 13,  0,  0,  0,  0);
3263     sha512_dround(34, 1, 4, 3, 0, 2, 30, 26, 14,  0,  0,  0,  0);
3264     sha512_dround(35, 0, 1, 2, 3, 4, 31, 27, 15,  0,  0,  0,  0);
3265     sha512_dround(36, 3, 0, 4, 2, 1, 24,  0, 16,  0,  0,  0,  0);
3266     sha512_dround(37, 2, 3, 1, 4, 0, 25,  0, 17,  0,  0,  0,  0);
3267     sha512_dround(38, 4, 2, 0, 1, 3, 26,  0, 18,  0,  0,  0,  0);
3268     sha512_dround(39, 1, 4, 3, 0, 2, 27,  0, 19,  0,  0,  0,  0);
3269 
3270     __ addv(v8, __ T2D, v8, v0);
3271     __ addv(v9, __ T2D, v9, v1);
3272     __ addv(v10, __ T2D, v10, v2);
3273     __ addv(v11, __ T2D, v11, v3);
3274 
3275     if (multi_block) {
3276       __ add(ofs, ofs, 128);
3277       __ cmp(ofs, limit);
3278       __ br(Assembler::LE, sha512_loop);
3279       __ mov(c_rarg0, ofs); // return ofs
3280     }
3281 
3282     __ st1(v8, v9, v10, v11, __ T2D, state);
3283 
3284     __ ldpd(v14, v15, Address(sp, 48));
3285     __ ldpd(v12, v13, Address(sp, 32));
3286     __ ldpd(v10, v11, Address(sp, 16));
3287     __ ldpd(v8, v9, __ post(sp, 64));
3288 
3289     __ ret(lr);
3290 
3291     return start;
3292   }
3293 
3294   // Arguments:
3295   //
3296   // Inputs:
3297   //   c_rarg0   - byte[]  source+offset
3298   //   c_rarg1   - byte[]   SHA.state
3299   //   c_rarg2   - int     digest_length
3300   //   c_rarg3   - int     offset
3301   //   c_rarg4   - int     limit
3302   //
3303   address generate_sha3_implCompress(bool multi_block, const char *name) {
3304     static const uint64_t round_consts[24] = {
3305       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
3306       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
3307       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
3308       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
3309       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
3310       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
3311       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
3312       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
3313     };
3314 
3315     __ align(CodeEntryAlignment);
3316     StubCodeMark mark(this, "StubRoutines", name);
3317     address start = __ pc();
3318 
3319     Register buf           = c_rarg0;
3320     Register state         = c_rarg1;
3321     Register digest_length = c_rarg2;
3322     Register ofs           = c_rarg3;
3323     Register limit         = c_rarg4;
3324 
3325     Label sha3_loop, rounds24_loop;
3326     Label sha3_512, sha3_384_or_224, sha3_256;
3327 
3328     __ stpd(v8, v9, __ pre(sp, -64));
3329     __ stpd(v10, v11, Address(sp, 16));
3330     __ stpd(v12, v13, Address(sp, 32));
3331     __ stpd(v14, v15, Address(sp, 48));
3332 
3333     // load state
3334     __ add(rscratch1, state, 32);
3335     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
3336     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
3337     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
3338     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
3339     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
3340     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
3341     __ ld1(v24, __ T1D, rscratch1);
3342 
3343     __ BIND(sha3_loop);
3344 
3345     // 24 keccak rounds
3346     __ movw(rscratch2, 24);
3347 
3348     // load round_constants base
3349     __ lea(rscratch1, ExternalAddress((address) round_consts));
3350 
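         // Absorb one block: the rate is 200 - 2 * digest_length bytes
         // (72/104/136/144 for SHA3-512/384/256/224). The first 56 bytes are common
         // to all variants; the remainder is absorbed per variant below.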
3351     // load input
3352     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3353     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
3354     __ eor(v0, __ T8B, v0, v25);
3355     __ eor(v1, __ T8B, v1, v26);
3356     __ eor(v2, __ T8B, v2, v27);
3357     __ eor(v3, __ T8B, v3, v28);
3358     __ eor(v4, __ T8B, v4, v29);
3359     __ eor(v5, __ T8B, v5, v30);
3360     __ eor(v6, __ T8B, v6, v31);
3361 
3362     // digest_length == 64, SHA3-512
3363     __ tbnz(digest_length, 6, sha3_512);
3364 
3365     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3366     __ ld1(v29, v30, __ T8B, __ post(buf, 16));
3367     __ eor(v7, __ T8B, v7, v25);
3368     __ eor(v8, __ T8B, v8, v26);
3369     __ eor(v9, __ T8B, v9, v27);
3370     __ eor(v10, __ T8B, v10, v28);
3371     __ eor(v11, __ T8B, v11, v29);
3372     __ eor(v12, __ T8B, v12, v30);
3373 
3374     // digest_length == 28, SHA3-224;  digest_length == 48, SHA3-384
3375     __ tbnz(digest_length, 4, sha3_384_or_224);
3376 
3377     // SHA3-256
3378     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3379     __ eor(v13, __ T8B, v13, v25);
3380     __ eor(v14, __ T8B, v14, v26);
3381     __ eor(v15, __ T8B, v15, v27);
3382     __ eor(v16, __ T8B, v16, v28);
3383     __ b(rounds24_loop);
3384 
3385     __ BIND(sha3_384_or_224);
3386     __ tbz(digest_length, 2, rounds24_loop); // bit 2 cleared? SHA-384
3387 
3388     // SHA3-224
3389     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3390     __ ld1(v29, __ T8B, __ post(buf, 8));
3391     __ eor(v13, __ T8B, v13, v25);
3392     __ eor(v14, __ T8B, v14, v26);
3393     __ eor(v15, __ T8B, v15, v27);
3394     __ eor(v16, __ T8B, v16, v28);
3395     __ eor(v17, __ T8B, v17, v29);
3396     __ b(rounds24_loop);
3397 
3398     __ BIND(sha3_512);
3399     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
3400     __ eor(v7, __ T8B, v7, v25);
3401     __ eor(v8, __ T8B, v8, v26);
3402 
3403     __ BIND(rounds24_loop);
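         // One Keccak-f[1600] round per iteration: theta (eor3 column parities + rax1),
         // the combined rho/pi rotations (xar), chi (bcax), and iota (the final eor
         // with the round constant loaded into v31).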
3404     __ subw(rscratch2, rscratch2, 1);
3405 
3406     __ eor3(v29, __ T16B, v4, v9, v14);
3407     __ eor3(v26, __ T16B, v1, v6, v11);
3408     __ eor3(v28, __ T16B, v3, v8, v13);
3409     __ eor3(v25, __ T16B, v0, v5, v10);
3410     __ eor3(v27, __ T16B, v2, v7, v12);
3411     __ eor3(v29, __ T16B, v29, v19, v24);
3412     __ eor3(v26, __ T16B, v26, v16, v21);
3413     __ eor3(v28, __ T16B, v28, v18, v23);
3414     __ eor3(v25, __ T16B, v25, v15, v20);
3415     __ eor3(v27, __ T16B, v27, v17, v22);
3416 
3417     __ rax1(v30, __ T2D, v29, v26);
3418     __ rax1(v26, __ T2D, v26, v28);
3419     __ rax1(v28, __ T2D, v28, v25);
3420     __ rax1(v25, __ T2D, v25, v27);
3421     __ rax1(v27, __ T2D, v27, v29);
3422 
3423     __ eor(v0, __ T16B, v0, v30);
3424     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
3425     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
3426     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
3427     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
3428     __ xar(v22, __ T2D, v14, v28, (64 - 39));
3429     __ xar(v14, __ T2D, v20, v30, (64 - 18));
3430     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
3431     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
3432     __ xar(v12, __ T2D, v13, v27, (64 - 25));
3433     __ xar(v13, __ T2D, v19, v28, (64 - 8));
3434     __ xar(v19, __ T2D, v23, v27, (64 - 56));
3435     __ xar(v23, __ T2D, v15, v30, (64 - 41));
3436     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
3437     __ xar(v28, __ T2D, v24, v28, (64 - 14));
3438     __ xar(v24, __ T2D, v21, v25, (64 - 2));
3439     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
3440     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
3441     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
3442     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
3443     __ xar(v27, __ T2D, v18, v27, (64 - 21));
3444     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
3445     __ xar(v25, __ T2D, v11, v25, (64 - 10));
3446     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
3447     __ xar(v30, __ T2D, v10, v30, (64 - 3));
3448 
3449     __ bcax(v20, __ T16B, v31, v22, v8);
3450     __ bcax(v21, __ T16B, v8,  v23, v22);
3451     __ bcax(v22, __ T16B, v22, v24, v23);
3452     __ bcax(v23, __ T16B, v23, v31, v24);
3453     __ bcax(v24, __ T16B, v24, v8,  v31);
3454 
3455     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
3456 
3457     __ bcax(v17, __ T16B, v25, v19, v3);
3458     __ bcax(v18, __ T16B, v3,  v15, v19);
3459     __ bcax(v19, __ T16B, v19, v16, v15);
3460     __ bcax(v15, __ T16B, v15, v25, v16);
3461     __ bcax(v16, __ T16B, v16, v3,  v25);
3462 
3463     __ bcax(v10, __ T16B, v29, v12, v26);
3464     __ bcax(v11, __ T16B, v26, v13, v12);
3465     __ bcax(v12, __ T16B, v12, v14, v13);
3466     __ bcax(v13, __ T16B, v13, v29, v14);
3467     __ bcax(v14, __ T16B, v14, v26, v29);
3468 
3469     __ bcax(v7, __ T16B, v30, v9,  v4);
3470     __ bcax(v8, __ T16B, v4,  v5,  v9);
3471     __ bcax(v9, __ T16B, v9,  v6,  v5);
3472     __ bcax(v5, __ T16B, v5,  v30, v6);
3473     __ bcax(v6, __ T16B, v6,  v4,  v30);
3474 
3475     __ bcax(v3, __ T16B, v27, v0,  v28);
3476     __ bcax(v4, __ T16B, v28, v1,  v0);
3477     __ bcax(v0, __ T16B, v0,  v2,  v1);
3478     __ bcax(v1, __ T16B, v1,  v27, v2);
3479     __ bcax(v2, __ T16B, v2,  v28, v27);
3480 
3481     __ eor(v0, __ T16B, v0, v31);
3482 
3483     __ cbnzw(rscratch2, rounds24_loop);
3484 
3485     if (multi_block) {
3486       // block_size =  200 - 2 * digest_length, ofs += block_size
3487       __ add(ofs, ofs, 200);
3488       __ sub(ofs, ofs, digest_length, Assembler::LSL, 1);
3489 
3490       __ cmp(ofs, limit);
3491       __ br(Assembler::LE, sha3_loop);
3492       __ mov(c_rarg0, ofs); // return ofs
3493     }
3494 
3495     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
3496     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
3497     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
3498     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
3499     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
3500     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
3501     __ st1(v24, __ T1D, state);
3502 
3503     __ ldpd(v14, v15, Address(sp, 48));
3504     __ ldpd(v12, v13, Address(sp, 32));
3505     __ ldpd(v10, v11, Address(sp, 16));
3506     __ ldpd(v8, v9, __ post(sp, 64));
3507 
3508     __ ret(lr);
3509 
3510     return start;
3511   }
3512 
3513   // Safefetch stubs.
3514   void generate_safefetch(const char* name, int size, address* entry,
3515                           address* fault_pc, address* continuation_pc) {
3516     // safefetch signatures:
3517     //   int      SafeFetch32(int*      adr, int      errValue);
3518     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3519     //
3520     // arguments:
3521     //   c_rarg0 = adr
3522     //   c_rarg1 = errValue
3523     //
3524     // result:
3525     //   r0       = *adr or errValue
3526 
3527     StubCodeMark mark(this, "StubRoutines", name);
3528 
3529     // Entry point, pc or function descriptor.
3530     *entry = __ pc();
3531 
3532     // Load *adr into c_rarg1, may fault.
3533     *fault_pc = __ pc();
3534     switch (size) {
3535       case 4:
3536         // int32_t
3537         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3538         break;
3539       case 8:
3540         // int64_t
3541         __ ldr(c_rarg1, Address(c_rarg0, 0));
3542         break;
3543       default:
3544         ShouldNotReachHere();
3545     }
3546 
3547     // return errValue or *adr
3548     *continuation_pc = __ pc();
3549     __ mov(r0, c_rarg1);
3550     __ ret(lr);
3551   }
3552 
3553   /**
3554    *  Arguments:
3555    *
3556    * Inputs:
3557    *   c_rarg0   - int crc
3558    *   c_rarg1   - byte* buf
3559    *   c_rarg2   - int length
3560    *
3561    * Output:
3562    *       r0   - int crc result
3563    */
3564   address generate_updateBytesCRC32() {
3565     assert(UseCRC32Intrinsics, "what are we doing here?");
3566 
3567     __ align(CodeEntryAlignment);
3568     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3569 
3570     address start = __ pc();
3571 
3572     const Register crc   = c_rarg0;  // crc
3573     const Register buf   = c_rarg1;  // source java byte array address
3574     const Register len   = c_rarg2;  // length
3575     const Register table0 = c_rarg3; // crc_table address
3576     const Register table1 = c_rarg4;
3577     const Register table2 = c_rarg5;
3578     const Register table3 = c_rarg6;
3579     const Register tmp3 = c_rarg7;
3580 
3581     BLOCK_COMMENT("Entry:");
3582     __ enter(); // required for proper stackwalking of RuntimeStub frame
3583 
3584     __ kernel_crc32(crc, buf, len,
3585               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3586 
3587     __ leave(); // required for proper stackwalking of RuntimeStub frame
3588     __ ret(lr);
3589 
3590     return start;
3591   }
3592 
3593   /**
3594    *  Arguments:
3595    *
3596    * Inputs:
3597    *   c_rarg0   - int crc
3598    *   c_rarg1   - byte* buf
3599    *   c_rarg2   - int length
3600    *   c_rarg3   - int* table
3601    *
3602    * Output:
3603    *       r0   - int crc result
3604    */
3605   address generate_updateBytesCRC32C() {
3606     assert(UseCRC32CIntrinsics, "what are we doing here?");
3607 
3608     __ align(CodeEntryAlignment);
3609     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3610 
3611     address start = __ pc();
3612 
3613     const Register crc   = c_rarg0;  // crc
3614     const Register buf   = c_rarg1;  // source java byte array address
3615     const Register len   = c_rarg2;  // length
3616     const Register table0 = c_rarg3; // crc_table address
3617     const Register table1 = c_rarg4;
3618     const Register table2 = c_rarg5;
3619     const Register table3 = c_rarg6;
3620     const Register tmp3 = c_rarg7;
3621 
3622     BLOCK_COMMENT("Entry:");
3623     __ enter(); // required for proper stackwalking of RuntimeStub frame
3624 
3625     __ kernel_crc32c(crc, buf, len,
3626               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3627 
3628     __ leave(); // required for proper stackwalking of RuntimeStub frame
3629     __ ret(lr);
3630 
3631     return start;
3632   }
3633 
3634   /***
3635    *  Arguments:
3636    *
3637    *  Inputs:
3638    *   c_rarg0   - int   adler
3639    *   c_rarg1   - byte* buff
3640    *   c_rarg2   - int   len
3641    *
3642    * Output:
3643    *   c_rarg0   - int adler result
3644    */
3645   address generate_updateBytesAdler32() {
3646     __ align(CodeEntryAlignment);
3647     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3648     address start = __ pc();
3649 
3650     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3651 
3652     // Aliases
3653     Register adler  = c_rarg0;
3654     Register s1     = c_rarg0;
3655     Register s2     = c_rarg3;
3656     Register buff   = c_rarg1;
3657     Register len    = c_rarg2;
3658     Register nmax  = r4;
3659     Register base  = r5;
3660     Register count = r6;
3661     Register temp0 = rscratch1;
3662     Register temp1 = rscratch2;
3663     FloatRegister vbytes = v0;
3664     FloatRegister vs1acc = v1;
3665     FloatRegister vs2acc = v2;
3666     FloatRegister vtable = v3;
3667 
3668     // Max number of bytes we can process before having to take the mod
3669     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3670     uint64_t BASE = 0xfff1;
3671     uint64_t NMAX = 0x15B0;
3672 
3673     __ mov(base, BASE);
3674     __ mov(nmax, NMAX);
3675 
3676     // Load accumulation coefficients for the upper 16 bits
3677     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3678     __ ld1(vtable, __ T16B, Address(temp0));
3679 
3680     // s1 is initialized to the lower 16 bits of adler
3681     // s2 is initialized to the upper 16 bits of adler
3682     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3683     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3684 
3685     // The pipelined loop needs at least 16 elements for one iteration.
3686     // It does check this, but it is more efficient to skip to the cleanup loop here.
3687     __ cmp(len, (u1)16);
3688     __ br(Assembler::HS, L_nmax);
3689     __ cbz(len, L_combine);
3690 
3691     __ bind(L_simple_by1_loop);
3692     __ ldrb(temp0, Address(__ post(buff, 1)));
3693     __ add(s1, s1, temp0);
3694     __ add(s2, s2, s1);
3695     __ subs(len, len, 1);
3696     __ br(Assembler::HI, L_simple_by1_loop);
3697 
3698     // s1 = s1 % BASE
3699     __ subs(temp0, s1, base);
3700     __ csel(s1, temp0, s1, Assembler::HS);
3701 
3702     // s2 = s2 % BASE
3703     __ lsr(temp0, s2, 16);
3704     __ lsl(temp1, temp0, 4);
3705     __ sub(temp1, temp1, temp0);
3706     __ add(s2, temp1, s2, ext::uxth);
3707 
3708     __ subs(temp0, s2, base);
3709     __ csel(s2, temp0, s2, Assembler::HS);
3710 
3711     __ b(L_combine);
3712 
3713     __ bind(L_nmax);
3714     __ subs(len, len, nmax);
3715     __ sub(count, nmax, 16);
3716     __ br(Assembler::LO, L_by16);
3717 
3718     __ bind(L_nmax_loop);
3719 
3720     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3721                                       vbytes, vs1acc, vs2acc, vtable);
3722 
3723     __ subs(count, count, 16);
3724     __ br(Assembler::HS, L_nmax_loop);
3725 
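         // Reduce modulo BASE = 65521 using 65536 == 15 (mod BASE): replace x by
         // 15 * (x >> 16) + (x & 0xffff), apply that twice, then do a final
         // conditional subtract of BASE.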
3726     // s1 = s1 % BASE
3727     __ lsr(temp0, s1, 16);
3728     __ lsl(temp1, temp0, 4);
3729     __ sub(temp1, temp1, temp0);
3730     __ add(temp1, temp1, s1, ext::uxth);
3731 
3732     __ lsr(temp0, temp1, 16);
3733     __ lsl(s1, temp0, 4);
3734     __ sub(s1, s1, temp0);
3735     __ add(s1, s1, temp1, ext::uxth);
3736 
3737     __ subs(temp0, s1, base);
3738     __ csel(s1, temp0, s1, Assembler::HS);
3739 
3740     // s2 = s2 % BASE
3741     __ lsr(temp0, s2, 16);
3742     __ lsl(temp1, temp0, 4);
3743     __ sub(temp1, temp1, temp0);
3744     __ add(temp1, temp1, s2, ext::uxth);
3745 
3746     __ lsr(temp0, temp1, 16);
3747     __ lsl(s2, temp0, 4);
3748     __ sub(s2, s2, temp0);
3749     __ add(s2, s2, temp1, ext::uxth);
3750 
3751     __ subs(temp0, s2, base);
3752     __ csel(s2, temp0, s2, Assembler::HS);
3753 
3754     __ subs(len, len, nmax);
3755     __ sub(count, nmax, 16);
3756     __ br(Assembler::HS, L_nmax_loop);
3757 
3758     __ bind(L_by16);
3759     __ adds(len, len, count);
3760     __ br(Assembler::LO, L_by1);
3761 
3762     __ bind(L_by16_loop);
3763 
3764     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3765                                       vbytes, vs1acc, vs2acc, vtable);
3766 
3767     __ subs(len, len, 16);
3768     __ br(Assembler::HS, L_by16_loop);
3769 
3770     __ bind(L_by1);
3771     __ adds(len, len, 15);
3772     __ br(Assembler::LO, L_do_mod);
3773 
3774     __ bind(L_by1_loop);
3775     __ ldrb(temp0, Address(__ post(buff, 1)));
3776     __ add(s1, temp0, s1);
3777     __ add(s2, s2, s1);
3778     __ subs(len, len, 1);
3779     __ br(Assembler::HS, L_by1_loop);
3780 
3781     __ bind(L_do_mod);
3782     // s1 = s1 % BASE
3783     __ lsr(temp0, s1, 16);
3784     __ lsl(temp1, temp0, 4);
3785     __ sub(temp1, temp1, temp0);
3786     __ add(temp1, temp1, s1, ext::uxth);
3787 
3788     __ lsr(temp0, temp1, 16);
3789     __ lsl(s1, temp0, 4);
3790     __ sub(s1, s1, temp0);
3791     __ add(s1, s1, temp1, ext::uxth);
3792 
3793     __ subs(temp0, s1, base);
3794     __ csel(s1, temp0, s1, Assembler::HS);
3795 
3796     // s2 = s2 % BASE
3797     __ lsr(temp0, s2, 16);
3798     __ lsl(temp1, temp0, 4);
3799     __ sub(temp1, temp1, temp0);
3800     __ add(temp1, temp1, s2, ext::uxth);
3801 
3802     __ lsr(temp0, temp1, 16);
3803     __ lsl(s2, temp0, 4);
3804     __ sub(s2, s2, temp0);
3805     __ add(s2, s2, temp1, ext::uxth);
3806 
3807     __ subs(temp0, s2, base);
3808     __ csel(s2, temp0, s2, Assembler::HS);
3809 
3810     // Combine lower bits and higher bits
3811     __ bind(L_combine);
3812     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3813 
3814     __ ret(lr);
3815 
3816     return start;
3817   }
3818 
3819   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
3820           Register temp0, Register temp1, FloatRegister vbytes,
3821           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
3822     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
3823     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
3824     // In non-vectorized code, we update s1 and s2 as:
3825     //   s1 <- s1 + b1
3826     //   s2 <- s2 + s1
3827     //   s1 <- s1 + b2
3828     //   s2 <- s2 + s1
3829     //   ...
3830     //   s1 <- s1 + b16
3831     //   s2 <- s2 + s1
3832     // Putting above assignments together, we have:
3833     //   s1_new = s1 + b1 + b2 + ... + b16
3834     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
3835     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
3836     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
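         // As a scalar reference (illustration only, not generated code), one call
         // performs:  for (int i = 0; i < 16; i++) { s1 += b[i]; s2 += s1; }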
3837     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
3838 
3839     // s2 = s2 + s1 * 16
3840     __ add(s2, s2, s1, Assembler::LSL, 4);
3841 
3842     // vs1acc = b1 + b2 + b3 + ... + b16
3843     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
3844     __ umullv(vs2acc, __ T8B, vtable, vbytes);
3845     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
3846     __ uaddlv(vs1acc, __ T16B, vbytes);
3847     __ uaddlv(vs2acc, __ T8H, vs2acc);
3848 
3849     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
3850     __ fmovd(temp0, vs1acc);
3851     __ fmovd(temp1, vs2acc);
3852     __ add(s1, s1, temp0);
3853     __ add(s2, s2, temp1);
3854   }
3855 
3856   /**
3857    *  Arguments:
3858    *
3859    *  Input:
3860    *    c_rarg0   - x address
3861    *    c_rarg1   - x length
3862    *    c_rarg2   - y address
3863    *   c_rarg3   - y length
3864    *    c_rarg4   - z address
3865    *    c_rarg5   - z length
3866    */
3867   address generate_multiplyToLen() {
3868     __ align(CodeEntryAlignment);
3869     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3870 
3871     address start = __ pc();
3872     const Register x     = r0;
3873     const Register xlen  = r1;
3874     const Register y     = r2;
3875     const Register ylen  = r3;
3876     const Register z     = r4;
3877     const Register zlen  = r5;
3878 
3879     const Register tmp1  = r10;
3880     const Register tmp2  = r11;
3881     const Register tmp3  = r12;
3882     const Register tmp4  = r13;
3883     const Register tmp5  = r14;
3884     const Register tmp6  = r15;
3885     const Register tmp7  = r16;
3886 
3887     BLOCK_COMMENT("Entry:");
3888     __ enter(); // required for proper stackwalking of RuntimeStub frame
3889     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3890     __ leave(); // required for proper stackwalking of RuntimeStub frame
3891     __ ret(lr);
3892 
3893     return start;
3894   }
3895 
3896   address generate_squareToLen() {
3897     // The squareToLen algorithm for sizes 1..127 described in the Java code is
3898     // faster than multiply_to_len on some CPUs and slower on others, but
3899     // multiply_to_len shows slightly better results overall.
3900     __ align(CodeEntryAlignment);
3901     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3902     address start = __ pc();
3903 
3904     const Register x     = r0;
3905     const Register xlen  = r1;
3906     const Register z     = r2;
3907     const Register zlen  = r3;
3908     const Register y     = r4; // == x
3909     const Register ylen  = r5; // == xlen
3910 
3911     const Register tmp1  = r10;
3912     const Register tmp2  = r11;
3913     const Register tmp3  = r12;
3914     const Register tmp4  = r13;
3915     const Register tmp5  = r14;
3916     const Register tmp6  = r15;
3917     const Register tmp7  = r16;
3918 
3919     RegSet spilled_regs = RegSet::of(y, ylen);
3920     BLOCK_COMMENT("Entry:");
3921     __ enter();
3922     __ push(spilled_regs, sp);
3923     __ mov(y, x);
3924     __ mov(ylen, xlen);
3925     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3926     __ pop(spilled_regs, sp);
3927     __ leave();
3928     __ ret(lr);
3929     return start;
3930   }
3931 
3932   address generate_mulAdd() {
3933     __ align(CodeEntryAlignment);
3934     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3935 
3936     address start = __ pc();
3937 
3938     const Register out     = r0;
3939     const Register in      = r1;
3940     const Register offset  = r2;
3941     const Register len     = r3;
3942     const Register k       = r4;
3943 
3944     BLOCK_COMMENT("Entry:");
3945     __ enter();
3946     __ mul_add(out, in, offset, len, k);
3947     __ leave();
3948     __ ret(lr);
3949 
3950     return start;
3951   }
3952 
3953   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3954                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3955                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3956     // Karatsuba multiplication performs a 128*128 -> 256-bit
3957     // multiplication in three 128-bit multiplications and a few
3958     // additions.
3959     //
3960     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3961     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3962     //
3963     // Inputs:
3964     //
3965     // A0 in a.d[0]     (subkey)
3966     // A1 in a.d[1]
3967     // (A1+A0) in a1_xor_a0.d[0]
3968     //
3969     // B0 in b.d[0]     (state)
3970     // B1 in b.d[1]
3971 
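         // tmp2 below accumulates the two middle 64-bit words of the product,
         // which are then inserted as the high half of result_lo and the low
         // half of result_hi.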
3972     __ ext(tmp1, __ T16B, b, b, 0x08);
3973     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3974     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3975     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3976     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3977 
3978     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3979     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3980     __ eor(tmp2, __ T16B, tmp2, tmp4);
3981     __ eor(tmp2, __ T16B, tmp2, tmp3);
3982 
3983     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3984     __ ins(result_hi, __ D, tmp2, 0, 1);
3985     __ ins(result_lo, __ D, tmp2, 1, 0);
3986   }
3987 
3988   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3989                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3990     const FloatRegister t0 = result;
3991 
3992     // The GCM field polynomial f is z^128 + p(z), where p =
3993     // z^7+z^2+z+1.
3994     //
3995     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3996     //
3997     // so, given that the product we're reducing is
3998     //    a == lo + hi * z^128
3999     // substituting,
4000     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
4001     //
4002     // we reduce by multiplying hi by p(z) and subtracting the result
4003     // from (i.e. XORing it with) lo.  Because p has no nonzero high
4004     // bits we can do this with two 64-bit multiplications, lo*p and
4005     // hi*p.
4006 
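         // The reduction folds hi into lo in two steps: first hi.d[1] * p (pmull2),
         // shifted into place with ext, then hi.d[0] * p (pmull).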
4007     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
4008     __ ext(t1, __ T16B, t0, z, 8);
4009     __ eor(hi, __ T16B, hi, t1);
4010     __ ext(t1, __ T16B, z, t0, 8);
4011     __ eor(lo, __ T16B, lo, t1);
4012     __ pmull(t0, __ T1Q, hi, p, __ T1D);
4013     __ eor(result, __ T16B, lo, t0);
4014   }
4015 
4016   address generate_has_negatives(address &has_negatives_long) {
4017     const u1 large_loop_size = 64;
4018     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
4019     int dcache_line = VM_Version::dcache_line_size();
4020 
4021     Register ary1 = r1, len = r2, result = r0;
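         // Returns 1 in result (r0) if any byte in ary1[0, len) has its top (sign)
         // bit set, and 0 otherwise.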
4022 
4023     __ align(CodeEntryAlignment);
4024 
4025     StubCodeMark mark(this, "StubRoutines", "has_negatives");
4026 
4027     address entry = __ pc();
4028 
4029     __ enter();
4030 
4031   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
4032         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
4033 
4034   __ cmp(len, (u1)15);
4035   __ br(Assembler::GT, LEN_OVER_15);
4036   // Execution falls into this code only when the pointer is near the end of a
4037   // memory page and we have to avoid reading the next page.
4038   __ add(ary1, ary1, len);
4039   __ subs(len, len, 8);
4040   __ br(Assembler::GT, LEN_OVER_8);
4041   __ ldr(rscratch2, Address(ary1, -8));
4042   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
4043   __ lsrv(rscratch2, rscratch2, rscratch1);
4044   __ tst(rscratch2, UPPER_BIT_MASK);
4045   __ cset(result, Assembler::NE);
4046   __ leave();
4047   __ ret(lr);
4048   __ bind(LEN_OVER_8);
4049   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
4050   __ sub(len, len, 8); // no data dependency, so the sub can execute while loading
4051   __ tst(rscratch2, UPPER_BIT_MASK);
4052   __ br(Assembler::NE, RET_TRUE_NO_POP);
4053   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
4054   __ lsrv(rscratch1, rscratch1, rscratch2);
4055   __ tst(rscratch1, UPPER_BIT_MASK);
4056   __ cset(result, Assembler::NE);
4057   __ leave();
4058   __ ret(lr);
4059 
4060   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
4061   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
4062 
4063   has_negatives_long = __ pc(); // 2nd entry point
4064 
4065   __ enter();
4066 
4067   __ bind(LEN_OVER_15);
4068     __ push(spilled_regs, sp);
4069     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
4070     __ cbz(rscratch2, ALIGNED);
4071     __ ldp(tmp6, tmp1, Address(ary1));
4072     __ mov(tmp5, 16);
4073     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
4074     __ add(ary1, ary1, rscratch1);
4075     __ sub(len, len, rscratch1);
4076     __ orr(tmp6, tmp6, tmp1);
4077     __ tst(tmp6, UPPER_BIT_MASK);
4078     __ br(Assembler::NE, RET_TRUE);
4079 
4080   __ bind(ALIGNED);
4081     __ cmp(len, large_loop_size);
4082     __ br(Assembler::LT, CHECK_16);
4083     // Perform a 16-byte load in the pre-loop as an early-return check. This handles
4084     // the case where an initially aligned large array has negative values in its
4085     // starting bytes, in which LARGE_LOOP would otherwise do 4 reads instead of 1
4086     // (in the worst case), which is slower. Cases with negative bytes further ahead
4087     // are barely affected; in fact they become faster due to the early loads and the
4088     // fewer instructions and branches in LARGE_LOOP.
4089     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
4090     __ sub(len, len, 16);
4091     __ orr(tmp6, tmp6, tmp1);
4092     __ tst(tmp6, UPPER_BIT_MASK);
4093     __ br(Assembler::NE, RET_TRUE);
4094     __ cmp(len, large_loop_size);
4095     __ br(Assembler::LT, CHECK_16);
4096 
4097     if (SoftwarePrefetchHintDistance >= 0
4098         && SoftwarePrefetchHintDistance >= dcache_line) {
4099       // initial prefetch
4100       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
4101     }
4102   __ bind(LARGE_LOOP);
4103     if (SoftwarePrefetchHintDistance >= 0) {
4104       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
4105     }
4106     // Issue the load instructions first, since this can save a few CPU/memory
4107     // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one
4108     // per ldp) it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...),
4109     // which saves 3 instructions and has fewer branches; the trade-off is that the
4110     // early return is disabled, so all 64 bytes are loaded and checked every time.
4111     __ ldp(tmp2, tmp3, Address(ary1));
4112     __ ldp(tmp4, tmp5, Address(ary1, 16));
4113     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
4114     __ ldp(tmp6, tmp1, Address(ary1, 48));
4115     __ add(ary1, ary1, large_loop_size);
4116     __ sub(len, len, large_loop_size);
4117     __ orr(tmp2, tmp2, tmp3);
4118     __ orr(tmp4, tmp4, tmp5);
4119     __ orr(rscratch1, rscratch1, rscratch2);
4120     __ orr(tmp6, tmp6, tmp1);
4121     __ orr(tmp2, tmp2, tmp4);
4122     __ orr(rscratch1, rscratch1, tmp6);
4123     __ orr(tmp2, tmp2, rscratch1);
4124     __ tst(tmp2, UPPER_BIT_MASK);
4125     __ br(Assembler::NE, RET_TRUE);
4126     __ cmp(len, large_loop_size);
4127     __ br(Assembler::GE, LARGE_LOOP);
4128 
4129   __ bind(CHECK_16); // small 16-byte load pre-loop
4130     __ cmp(len, (u1)16);
4131     __ br(Assembler::LT, POST_LOOP16);
4132 
4133   __ bind(LOOP16); // small 16-byte load loop
4134     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
4135     __ sub(len, len, 16);
4136     __ orr(tmp2, tmp2, tmp3);
4137     __ tst(tmp2, UPPER_BIT_MASK);
4138     __ br(Assembler::NE, RET_TRUE);
4139     __ cmp(len, (u1)16);
4140     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
4141 
4142   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
4143     __ cmp(len, (u1)8);
4144     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
4145     __ ldr(tmp3, Address(__ post(ary1, 8)));
4146     __ sub(len, len, 8);
4147     __ tst(tmp3, UPPER_BIT_MASK);
4148     __ br(Assembler::NE, RET_TRUE);
4149 
4150   __ bind(POST_LOOP16_LOAD_TAIL);
4151     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
4152     __ ldr(tmp1, Address(ary1));
4153     __ mov(tmp2, 64);
4154     __ sub(tmp4, tmp2, len, __ LSL, 3);
4155     __ lslv(tmp1, tmp1, tmp4);
4156     __ tst(tmp1, UPPER_BIT_MASK);
4157     __ br(Assembler::NE, RET_TRUE);
4158     // Fallthrough
4159 
4160   __ bind(RET_FALSE);
4161     __ pop(spilled_regs, sp);
4162     __ leave();
4163     __ mov(result, zr);
4164     __ ret(lr);
4165 
4166   __ bind(RET_TRUE);
4167     __ pop(spilled_regs, sp);
4168   __ bind(RET_TRUE_NO_POP);
4169     __ leave();
4170     __ mov(result, 1);
4171     __ ret(lr);
4172 
4173   __ bind(DONE);
4174     __ pop(spilled_regs, sp);
4175     __ leave();
4176     __ ret(lr);
4177     return entry;
4178   }
4179 
4180   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
4181         bool usePrefetch, Label &NOT_EQUAL) {
4182     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4183         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4184         tmp7 = r12, tmp8 = r13;
4185     Label LOOP;
4186 
4187     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4188     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4189     __ bind(LOOP);
4190     if (usePrefetch) {
4191       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4192       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4193     }
4194     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4195     __ eor(tmp1, tmp1, tmp2);
4196     __ eor(tmp3, tmp3, tmp4);
4197     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4198     __ orr(tmp1, tmp1, tmp3);
4199     __ cbnz(tmp1, NOT_EQUAL);
4200     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4201     __ eor(tmp5, tmp5, tmp6);
4202     __ eor(tmp7, tmp7, tmp8);
4203     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4204     __ orr(tmp5, tmp5, tmp7);
4205     __ cbnz(tmp5, NOT_EQUAL);
4206     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4207     __ eor(tmp1, tmp1, tmp2);
4208     __ eor(tmp3, tmp3, tmp4);
4209     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4210     __ orr(tmp1, tmp1, tmp3);
4211     __ cbnz(tmp1, NOT_EQUAL);
4212     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4213     __ eor(tmp5, tmp5, tmp6);
4214     __ sub(cnt1, cnt1, 8 * wordSize);
4215     __ eor(tmp7, tmp7, tmp8);
4216     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4217     // tmp6 is not used. MacroAssembler::subs is used here (rather than
4218     // cmp) because subs allows an unlimited range for the immediate operand.
4219     __ subs(tmp6, cnt1, loopThreshold);
4220     __ orr(tmp5, tmp5, tmp7);
4221     __ cbnz(tmp5, NOT_EQUAL);
4222     __ br(__ GE, LOOP);
4223     // post-loop
4224     __ eor(tmp1, tmp1, tmp2);
4225     __ eor(tmp3, tmp3, tmp4);
4226     __ orr(tmp1, tmp1, tmp3);
4227     __ sub(cnt1, cnt1, 2 * wordSize);
4228     __ cbnz(tmp1, NOT_EQUAL);
4229   }
4230 
4231   void generate_large_array_equals_loop_simd(int loopThreshold,
4232         bool usePrefetch, Label &NOT_EQUAL) {
4233     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4234         tmp2 = rscratch2;
4235     Label LOOP;
4236 
4237     __ bind(LOOP);
4238     if (usePrefetch) {
4239       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4240       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4241     }
4242     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
4243     __ sub(cnt1, cnt1, 8 * wordSize);
4244     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
4245     __ subs(tmp1, cnt1, loopThreshold);
4246     __ eor(v0, __ T16B, v0, v4);
4247     __ eor(v1, __ T16B, v1, v5);
4248     __ eor(v2, __ T16B, v2, v6);
4249     __ eor(v3, __ T16B, v3, v7);
4250     __ orr(v0, __ T16B, v0, v1);
4251     __ orr(v1, __ T16B, v2, v3);
4252     __ orr(v0, __ T16B, v0, v1);
4253     __ umov(tmp1, v0, __ D, 0);
4254     __ umov(tmp2, v0, __ D, 1);
4255     __ orr(tmp1, tmp1, tmp2);
4256     __ cbnz(tmp1, NOT_EQUAL);
4257     __ br(__ GE, LOOP);
4258   }
4259 
4260   // a1 = r1 - array1 address
4261   // a2 = r2 - array2 address
4262   // result = r0 - return value. Already contains "false"
4263   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
4264   // r3-r5 are reserved temporary registers
4265   address generate_large_array_equals() {
4266     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4267         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4268         tmp7 = r12, tmp8 = r13;
4269     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
4270         SMALL_LOOP, POST_LOOP;
4271     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
4272     // threshold chosen so that at least 32 prefetched bytes are actually used
4273     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
4274     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
4275     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
4276     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
4277         tmp5, tmp6, tmp7, tmp8);
4278 
4279     __ align(CodeEntryAlignment);
4280 
4281     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
4282 
4283     address entry = __ pc();
4284     __ enter();
4285     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
4286     // also advance pointers to use post-increment instead of pre-increment
4287     __ add(a1, a1, wordSize);
4288     __ add(a2, a2, wordSize);
4289     if (AvoidUnalignedAccesses) {
4290       // Both implementations (SIMD/non-SIMD) use relatively large load
4291       // instructions (ld1/ldp), which carry a big penalty (up to 2x execution
4292       // time) on some CPUs when the address is not at least 16-byte aligned.
4293       // Arrays are currently 8-byte aligned, so, if needed, we do one extra
4294       // 8-byte load for the first address to make it 16-byte aligned.
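           // In C terms (an illustrative sketch only, not generated code):
           //   if ((uintptr_t)a1 & 8) {             // a1 is only 8-byte aligned
           //     if (*a1++ != *a2++) return false;  // one scalar word compare
           //     cnt1 -= 8;
           //   }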
4295       Label ALIGNED16;
4296       __ tbz(a1, 3, ALIGNED16);
4297       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4298       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4299       __ sub(cnt1, cnt1, wordSize);
4300       __ eor(tmp1, tmp1, tmp2);
4301       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
4302       __ bind(ALIGNED16);
4303     }
4304     if (UseSIMDForArrayEquals) {
4305       if (SoftwarePrefetchHintDistance >= 0) {
4306         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4307         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4308         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
4309             /* prfm = */ true, NOT_EQUAL);
4310         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4311         __ br(__ LT, TAIL);
4312       }
4313       __ bind(NO_PREFETCH_LARGE_LOOP);
4314       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
4315           /* prfm = */ false, NOT_EQUAL);
4316     } else {
4317       __ push(spilled_regs, sp);
4318       if (SoftwarePrefetchHintDistance >= 0) {
4319         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4320         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4321         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
4322             /* prfm = */ true, NOT_EQUAL);
4323         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4324         __ br(__ LT, TAIL);
4325       }
4326       __ bind(NO_PREFETCH_LARGE_LOOP);
4327       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
4328           /* prfm = */ false, NOT_EQUAL);
4329     }
4330     __ bind(TAIL);
4331       __ cbz(cnt1, EQUAL);
4332       __ subs(cnt1, cnt1, wordSize);
4333       __ br(__ LE, POST_LOOP);
4334     __ bind(SMALL_LOOP);
4335       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4336       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4337       __ subs(cnt1, cnt1, wordSize);
4338       __ eor(tmp1, tmp1, tmp2);
4339       __ cbnz(tmp1, NOT_EQUAL);
4340       __ br(__ GT, SMALL_LOOP);
4341     __ bind(POST_LOOP);
4342       __ ldr(tmp1, Address(a1, cnt1));
4343       __ ldr(tmp2, Address(a2, cnt1));
4344       __ eor(tmp1, tmp1, tmp2);
4345       __ cbnz(tmp1, NOT_EQUAL);
4346     __ bind(EQUAL);
4347       __ mov(result, true);
4348     __ bind(NOT_EQUAL);
4349       if (!UseSIMDForArrayEquals) {
4350         __ pop(spilled_regs, sp);
4351       }
4352     __ bind(NOT_EQUAL_NO_POP);
4353     __ leave();
4354     __ ret(lr);
4355     return entry;
4356   }
4357 
4358   address generate_dsin_dcos(bool isCos) {
4359     __ align(CodeEntryAlignment);
4360     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
4361     address start = __ pc();
4362     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
4363         (address)StubRoutines::aarch64::_two_over_pi,
4364         (address)StubRoutines::aarch64::_pio2,
4365         (address)StubRoutines::aarch64::_dsin_coef,
4366         (address)StubRoutines::aarch64::_dcos_coef);
4367     return start;
4368   }
4369 
4370   address generate_dlog() {
4371     __ align(CodeEntryAlignment);
4372     StubCodeMark mark(this, "StubRoutines", "dlog");
4373     address entry = __ pc();
4374     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
4375         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
4376     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
4377     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
4378         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
4379     return entry;
4380   }
4381 
4382   // code for comparing 16 bytes of strings with same encoding
4383   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
4384     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
4385     __ ldr(rscratch1, Address(__ post(str1, 8)));
4386     __ eor(rscratch2, tmp1, tmp2);
4387     __ ldr(cnt1, Address(__ post(str2, 8)));
4388     __ cbnz(rscratch2, DIFF1);
4389     __ ldr(tmp1, Address(__ post(str1, 8)));
4390     __ eor(rscratch2, rscratch1, cnt1);
4391     __ ldr(tmp2, Address(__ post(str2, 8)));
4392     __ cbnz(rscratch2, DIFF2);
4393   }
4394 
4395   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
4396   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4397       Label &DIFF2) {
4398     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
4399     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
4400 
4401     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4402     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4403     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4404     // vtmp3 now holds the low 8 chars converted to UTF-16; zip2 below converts the rest
4405 
4406     __ fmovd(tmpL, vtmp3);
4407     __ eor(rscratch2, tmp3, tmpL);
4408     __ cbnz(rscratch2, DIFF2);
4409 
4410     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4411     __ umov(tmpL, vtmp3, __ D, 1);
4412     __ eor(rscratch2, tmpU, tmpL);
4413     __ cbnz(rscratch2, DIFF1);
4414 
4415     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4416     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4417     __ fmovd(tmpL, vtmp);
4418     __ eor(rscratch2, tmp3, tmpL);
4419     __ cbnz(rscratch2, DIFF2);
4420 
4421     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4422     __ umov(tmpL, vtmp, __ D, 1);
4423     __ eor(rscratch2, tmpU, tmpL);
4424     __ cbnz(rscratch2, DIFF1);
4425   }
4426 
4427   // r0  = result
4428   // r1  = str1
4429   // r2  = cnt1
4430   // r3  = str2
4431   // r4  = cnt2
4432   // r10 = tmp1
4433   // r11 = tmp2
4434   address generate_compare_long_string_different_encoding(bool isLU) {
4435     __ align(CodeEntryAlignment);
4436     StubCodeMark mark(this, "StubRoutines", isLU
4437         ? "compare_long_string_different_encoding LU"
4438         : "compare_long_string_different_encoding UL");
4439     address entry = __ pc();
4440     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4441         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
4442         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4443     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4444         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4445     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4446     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4447 
4448     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
4449 
4450     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4451     // cnt2 == number of characters left to compare
4452     // Check the first 4 symbols, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
4453     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4454     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4455     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4456     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4457     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols; the last 4 are a special case.
4458     __ eor(rscratch2, tmp1, tmp2);
4459     __ mov(rscratch1, tmp2);
4460     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4461     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4462              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4463     __ push(spilled_regs, sp);
4464     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
4465     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
4466 
4467     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4468 
4469     if (SoftwarePrefetchHintDistance >= 0) {
4470       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4471       __ br(__ LT, NO_PREFETCH);
4472       __ bind(LARGE_LOOP_PREFETCH);
4473         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4474         __ mov(tmp4, 2);
4475         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4476         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4477           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4478           __ subs(tmp4, tmp4, 1);
4479           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4480           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4481           __ mov(tmp4, 2);
4482         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4483           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4484           __ subs(tmp4, tmp4, 1);
4485           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4486           __ sub(cnt2, cnt2, 64);
4487           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4488           __ br(__ GE, LARGE_LOOP_PREFETCH);
4489     }
4490     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4491     __ bind(NO_PREFETCH);
4492     __ subs(cnt2, cnt2, 16);
4493     __ br(__ LT, TAIL);
4494     __ align(OptoLoopAlignment);
4495     __ bind(SMALL_LOOP); // smaller loop
4496       __ subs(cnt2, cnt2, 16);
4497       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4498       __ br(__ GE, SMALL_LOOP);
4499       __ cmn(cnt2, (u1)16);
4500       __ br(__ EQ, LOAD_LAST);
4501     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
4502       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
4503       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
4504       __ ldr(tmp3, Address(cnt1, -8));
4505       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
4506       __ b(LOAD_LAST);
4507     __ bind(DIFF2);
4508       __ mov(tmpU, tmp3);
4509     __ bind(DIFF1);
4510       __ pop(spilled_regs, sp);
4511       __ b(CALCULATE_DIFFERENCE);
4512     __ bind(LOAD_LAST);
4513       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by
4514       // compare_string_16_x_LU, so there is no need to load them again.
4515       __ mov(tmpU, tmp3);
4516       __ pop(spilled_regs, sp);
4517 
4518       // tmp2 points to the address of the last 4 Latin1 characters right now
4519       __ ldrs(vtmp, Address(tmp2));
4520       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4521       __ fmovd(tmpL, vtmp);
4522 
4523       __ eor(rscratch2, tmpU, tmpL);
4524       __ cbz(rscratch2, DONE);
4525 
4526     // Find the first different characters in the longwords and
4527     // compute their difference.
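         // A rough C-style sketch of the computation below; count_leading_zeros
         // and byte_swap are illustrative helper names, not stub code:
         //
         //   int bit = count_leading_zeros(byte_swap(x)) & ~15;  // x = rscratch2, xor of the words
         //   result  = (uint16_t)(tmp1 >> bit) - (uint16_t)(rscratch1 >> bit);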
4528     __ bind(CALCULATE_DIFFERENCE);
4529       __ rev(rscratch2, rscratch2);
4530       __ clz(rscratch2, rscratch2);
4531       __ andr(rscratch2, rscratch2, -16);
4532       __ lsrv(tmp1, tmp1, rscratch2);
4533       __ uxthw(tmp1, tmp1);
4534       __ lsrv(rscratch1, rscratch1, rscratch2);
4535       __ uxthw(rscratch1, rscratch1);
4536       __ subw(result, tmp1, rscratch1);
4537     __ bind(DONE);
4538       __ ret(lr);
4539     return entry;
4540   }
4541 
4542   address generate_method_entry_barrier() {
4543     __ align(CodeEntryAlignment);
4544     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
4545 
4546     Label deoptimize_label;
4547 
4548     address start = __ pc();
4549 
4550     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
4551 
4552     __ enter();
4553     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
4554 
4555     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
4556 
4557     __ push_call_clobbered_registers();
4558 
4559     __ mov(c_rarg0, rscratch2);
4560     __ call_VM_leaf
4561          (CAST_FROM_FN_PTR
4562           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
4563 
4564     __ reset_last_Java_frame(true);
4565 
4566     __ mov(rscratch1, r0);
4567 
4568     __ pop_call_clobbered_registers();
4569 
4570     __ cbnz(rscratch1, deoptimize_label);
4571 
4572     __ leave();
4573     __ ret(lr);
4574 
4575     __ BIND(deoptimize_label);
4576 
4577     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
4578     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
4579 
4580     __ mov(sp, rscratch1);
4581     __ br(rscratch2);
4582 
4583     return start;
4584   }
4585 
4586   // r0  = result
4587   // r1  = str1
4588   // r2  = cnt1
4589   // r3  = str2
4590   // r4  = cnt2
4591   // r10 = tmp1
4592   // r11 = tmp2
4593   address generate_compare_long_string_same_encoding(bool isLL) {
4594     __ align(CodeEntryAlignment);
4595     StubCodeMark mark(this, "StubRoutines", isLL
4596         ? "compare_long_string_same_encoding LL"
4597         : "compare_long_string_same_encoding UU");
4598     address entry = __ pc();
4599     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4600         tmp1 = r10, tmp2 = r11;
4601     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4602         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4603         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4604     // exit from the large loop when less than 64 bytes are left to read or
4605     // we're about to prefetch memory past the array boundary
4606     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4607     // cnt1/cnt2 contain the number of characters to compare; cnt1 can be re-used.
4608     // Update the cnt2 counter to account for the 8 bytes already loaded.
4609     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4610     // update pointers, because of previous read
4611     __ add(str1, str1, wordSize);
4612     __ add(str2, str2, wordSize);
4613     if (SoftwarePrefetchHintDistance >= 0) {
4614       __ bind(LARGE_LOOP_PREFETCH);
4615         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4616         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4617         compare_string_16_bytes_same(DIFF, DIFF2);
4618         compare_string_16_bytes_same(DIFF, DIFF2);
4619         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4620         compare_string_16_bytes_same(DIFF, DIFF2);
4621         __ subs(rscratch2, cnt2, largeLoopExitCondition);
4622         compare_string_16_bytes_same(DIFF, DIFF2);
4623         __ br(__ GT, LARGE_LOOP_PREFETCH);
4624         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4625     }
4626     // less than 16 bytes left?
4627     __ subs(cnt2, cnt2, isLL ? 16 : 8);
4628     __ br(__ LT, TAIL);
4629     __ align(OptoLoopAlignment);
4630     __ bind(SMALL_LOOP);
4631       compare_string_16_bytes_same(DIFF, DIFF2);
4632       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4633       __ br(__ GE, SMALL_LOOP);
4634     __ bind(TAIL);
4635       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4636       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4637       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4638       __ br(__ LE, CHECK_LAST);
4639       __ eor(rscratch2, tmp1, tmp2);
4640       __ cbnz(rscratch2, DIFF);
4641       __ ldr(tmp1, Address(__ post(str1, 8)));
4642       __ ldr(tmp2, Address(__ post(str2, 8)));
4643       __ sub(cnt2, cnt2, isLL ? 8 : 4);
4644     __ bind(CHECK_LAST);
4645       if (!isLL) {
4646         __ add(cnt2, cnt2, cnt2); // now in bytes
4647       }
4648       __ eor(rscratch2, tmp1, tmp2);
4649       __ cbnz(rscratch2, DIFF);
4650       __ ldr(rscratch1, Address(str1, cnt2));
4651       __ ldr(cnt1, Address(str2, cnt2));
4652       __ eor(rscratch2, rscratch1, cnt1);
4653       __ cbz(rscratch2, LENGTH_DIFF);
4654       // Find the first different characters in the longwords and
4655       // compute their difference.
4656     __ bind(DIFF2);
4657       __ rev(rscratch2, rscratch2);
4658       __ clz(rscratch2, rscratch2);
4659       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4660       __ lsrv(rscratch1, rscratch1, rscratch2);
4661       if (isLL) {
4662         __ lsrv(cnt1, cnt1, rscratch2);
4663         __ uxtbw(rscratch1, rscratch1);
4664         __ uxtbw(cnt1, cnt1);
4665       } else {
4666         __ lsrv(cnt1, cnt1, rscratch2);
4667         __ uxthw(rscratch1, rscratch1);
4668         __ uxthw(cnt1, cnt1);
4669       }
4670       __ subw(result, rscratch1, cnt1);
4671       __ b(LENGTH_DIFF);
4672     __ bind(DIFF);
4673       __ rev(rscratch2, rscratch2);
4674       __ clz(rscratch2, rscratch2);
4675       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4676       __ lsrv(tmp1, tmp1, rscratch2);
4677       if (isLL) {
4678         __ lsrv(tmp2, tmp2, rscratch2);
4679         __ uxtbw(tmp1, tmp1);
4680         __ uxtbw(tmp2, tmp2);
4681       } else {
4682         __ lsrv(tmp2, tmp2, rscratch2);
4683         __ uxthw(tmp1, tmp1);
4684         __ uxthw(tmp2, tmp2);
4685       }
4686       __ subw(result, tmp1, tmp2);
4687       __ b(LENGTH_DIFF);
4688     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4689       __ eor(rscratch2, tmp1, tmp2);
4690       __ cbnz(rscratch2, DIFF);
4691     __ bind(LENGTH_DIFF);
4692       __ ret(lr);
4693     return entry;
4694   }
4695 
4696   void generate_compare_long_strings() {
4697       StubRoutines::aarch64::_compare_long_string_LL
4698           = generate_compare_long_string_same_encoding(true);
4699       StubRoutines::aarch64::_compare_long_string_UU
4700           = generate_compare_long_string_same_encoding(false);
4701       StubRoutines::aarch64::_compare_long_string_LU
4702           = generate_compare_long_string_different_encoding(true);
4703       StubRoutines::aarch64::_compare_long_string_UL
4704           = generate_compare_long_string_different_encoding(false);
4705   }
4706 
4707   // R0 = result
4708   // R1 = str2
4709   // R2 = cnt1
4710   // R3 = str1
4711   // R4 = cnt2
4712   // This generic linear code uses a few additional ideas that make it faster:
4713   // 1) we can safely keep at least the 1st register of the pattern (since
4714   // length >= 8) to skip the initial load (helps on systems with 1 ld pipeline)
4715   // 2) we use a "fast" algorithm to find the pattern's first character with
4716   // fewer branches (one branch per loaded register instead of one per
4717   // character); this is where constants like 0x0101...01, 0x00010001...0001,
4718   // 0x7f7f...7f and 0x7fff7fff...7fff come from (see the C-style sketch below)
4719   // 3) after loading and analyzing the 1st register of the source string, it
4720   // can be reused to search for every occurrence of the 1st character, saving
4721   // a few loads compared with a simpler-but-slower implementation
4722   // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
4723   // re-initializes and compresses register values, which makes the code
4724   // larger and a bit less readable; however, most of the extra operations are
4725   // issued during loads or branches, so the penalty is minimal
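       // A rough, illustrative sketch of the "fast" first-character search from
       // idea 2) above, shown for the LL case (the UU/UL cases use 16-bit lanes
       // and the 0x0001.../0x7fff... constants instead). The name has_first is
       // just for exposition and does not exist in the stub:
       //
       //   uint64_t pattern   = first_char * 0x0101010101010101ULL; // splat 1st pattern char
       //   uint64_t x         = src_word ^ pattern;                 // zero byte == match
       //   uint64_t has_first = (x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
       //
       // The lowest byte lane of has_first with its top bit set corresponds to the
       // first occurrence of the character; the stub locates it with rbit + clz.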
4726   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4727     const char* stubName = str1_isL
4728         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4729         : "indexof_linear_uu";
4730     __ align(CodeEntryAlignment);
4731     StubCodeMark mark(this, "StubRoutines", stubName);
4732     address entry = __ pc();
4733 
4734     int str1_chr_size = str1_isL ? 1 : 2;
4735     int str2_chr_size = str2_isL ? 1 : 2;
4736     int str1_chr_shift = str1_isL ? 0 : 1;
4737     int str2_chr_shift = str2_isL ? 0 : 1;
4738     bool isL = str1_isL && str2_isL;
4739     // parameters
4740     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4741     // temporary registers
4742     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4743     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4744     // redefinitions
4745     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4746 
4747     __ push(spilled_regs, sp);
4748     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
4749         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4750         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4751         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4752         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4753         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
4754     // Read whole register from str1. It is safe, because length >=8 here
4755     __ ldr(ch1, Address(str1));
4756     // Read whole register from str2. It is safe, because length >=8 here
4757     __ ldr(ch2, Address(str2));
4758     __ sub(cnt2, cnt2, cnt1);
4759     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4760     if (str1_isL != str2_isL) {
4761       __ eor(v0, __ T16B, v0, v0);
4762     }
4763     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4764     __ mul(first, first, tmp1);
4765     // check if fewer characters are left than fit in one register
4766     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4767     if (str1_isL != str2_isL) {
4768       __ fmovd(v1, ch1);
4769     }
4770     __ br(__ LE, L_SMALL);
4771     __ eor(ch2, first, ch2);
4772     if (str1_isL != str2_isL) {
4773       __ zip1(v1, __ T16B, v1, v0);
4774     }
4775     __ sub(tmp2, ch2, tmp1);
4776     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4777     __ bics(tmp2, tmp2, ch2);
4778     if (str1_isL != str2_isL) {
4779       __ fmovd(ch1, v1);
4780     }
4781     __ br(__ NE, L_HAS_ZERO);
4782     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4783     __ add(result, result, wordSize/str2_chr_size);
4784     __ add(str2, str2, wordSize);
4785     __ br(__ LT, L_POST_LOOP);
4786     __ BIND(L_LOOP);
4787       __ ldr(ch2, Address(str2));
4788       __ eor(ch2, first, ch2);
4789       __ sub(tmp2, ch2, tmp1);
4790       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4791       __ bics(tmp2, tmp2, ch2);
4792       __ br(__ NE, L_HAS_ZERO);
4793     __ BIND(L_LOOP_PROCEED);
4794       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4795       __ add(str2, str2, wordSize);
4796       __ add(result, result, wordSize/str2_chr_size);
4797       __ br(__ GE, L_LOOP);
4798     __ BIND(L_POST_LOOP);
4799       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
4800       __ br(__ LE, NOMATCH);
4801       __ ldr(ch2, Address(str2));
4802       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4803       __ eor(ch2, first, ch2);
4804       __ sub(tmp2, ch2, tmp1);
4805       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4806       __ mov(tmp4, -1); // all bits set
4807       __ b(L_SMALL_PROCEED);
4808     __ align(OptoLoopAlignment);
4809     __ BIND(L_SMALL);
4810       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4811       __ eor(ch2, first, ch2);
4812       if (str1_isL != str2_isL) {
4813         __ zip1(v1, __ T16B, v1, v0);
4814       }
4815       __ sub(tmp2, ch2, tmp1);
4816       __ mov(tmp4, -1); // all bits set
4817       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4818       if (str1_isL != str2_isL) {
4819         __ fmovd(ch1, v1); // move converted 4 symbols
4820       }
4821     __ BIND(L_SMALL_PROCEED);
4822       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
4823       __ bic(tmp2, tmp2, ch2);
4824       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
4825       __ rbit(tmp2, tmp2);
4826       __ br(__ EQ, NOMATCH);
4827     __ BIND(L_SMALL_HAS_ZERO_LOOP);
4828       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
4829       __ cmp(cnt1, u1(wordSize/str2_chr_size));
4830       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4831       if (str2_isL) { // LL
4832         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4833         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4834         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4835         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4836         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4837       } else {
4838         __ mov(ch2, 0xE); // all bits in byte set except last one
4839         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4840         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4841         __ lslv(tmp2, tmp2, tmp4);
4842         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4843         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4844         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4845         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4846       }
4847       __ cmp(ch1, ch2);
4848       __ mov(tmp4, wordSize/str2_chr_size);
4849       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4850     __ BIND(L_SMALL_CMP_LOOP);
4851       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4852                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4853       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4854                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4855       __ add(tmp4, tmp4, 1);
4856       __ cmp(tmp4, cnt1);
4857       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4858       __ cmp(first, ch2);
4859       __ br(__ EQ, L_SMALL_CMP_LOOP);
4860     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4861       __ cbz(tmp2, NOMATCH); // no more matches. exit
4862       __ clz(tmp4, tmp2);
4863       __ add(result, result, 1); // advance index
4864       __ add(str2, str2, str2_chr_size); // advance pointer
4865       __ b(L_SMALL_HAS_ZERO_LOOP);
4866     __ align(OptoLoopAlignment);
4867     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4868       __ cmp(first, ch2);
4869       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4870       __ b(DONE);
4871     __ align(OptoLoopAlignment);
4872     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4873       if (str2_isL) { // LL
4874         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4875         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4876         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4877         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4878         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4879       } else {
4880         __ mov(ch2, 0xE); // all bits in byte set except last one
4881         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4882         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4883         __ lslv(tmp2, tmp2, tmp4);
4884         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4885         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4886         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4887         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4888       }
4889       __ cmp(ch1, ch2);
4890       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4891       __ b(DONE);
4892     __ align(OptoLoopAlignment);
4893     __ BIND(L_HAS_ZERO);
4894       __ rbit(tmp2, tmp2);
4895       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
4896       // Now compress the counters (cnt2 and cnt1) into one register. This is
4897       // fine because both counters are 32-bit and are not changed in this
4898       // loop; they are restored on exit, so cnt1 can be re-used in this loop.
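           // Concretely: cnt2 = (cnt1 << 32) | cnt2; it is unpacked again in
           // L_HAS_ZERO_LOOP_NOMATCH as cnt1 = cnt2 >> 32, cnt2 = (uint32_t)cnt2.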
4899       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
4900       __ sub(result, result, 1);
4901     __ BIND(L_HAS_ZERO_LOOP);
4902       __ mov(cnt1, wordSize/str2_chr_size);
4903       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4904       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4905       if (str2_isL) {
4906         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4907         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4908         __ lslv(tmp2, tmp2, tmp4);
4909         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4910         __ add(tmp4, tmp4, 1);
4911         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4912         __ lsl(tmp2, tmp2, 1);
4913         __ mov(tmp4, wordSize/str2_chr_size);
4914       } else {
4915         __ mov(ch2, 0xE);
4916         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4917         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4918         __ lslv(tmp2, tmp2, tmp4);
4919         __ add(tmp4, tmp4, 1);
4920         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4921         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4922         __ lsl(tmp2, tmp2, 1);
4923         __ mov(tmp4, wordSize/str2_chr_size);
4924         __ sub(str2, str2, str2_chr_size);
4925       }
4926       __ cmp(ch1, ch2);
4927       __ mov(tmp4, wordSize/str2_chr_size);
4928       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4929     __ BIND(L_CMP_LOOP);
4930       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4931                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4932       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4933                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4934       __ add(tmp4, tmp4, 1);
4935       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4936       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4937       __ cmp(cnt1, ch2);
4938       __ br(__ EQ, L_CMP_LOOP);
4939     __ BIND(L_CMP_LOOP_NOMATCH);
4940       // here we're not matched
4941       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4942       __ clz(tmp4, tmp2);
4943       __ add(str2, str2, str2_chr_size); // advance pointer
4944       __ b(L_HAS_ZERO_LOOP);
4945     __ align(OptoLoopAlignment);
4946     __ BIND(L_CMP_LOOP_LAST_CMP);
4947       __ cmp(cnt1, ch2);
4948       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4949       __ b(DONE);
4950     __ align(OptoLoopAlignment);
4951     __ BIND(L_CMP_LOOP_LAST_CMP2);
4952       if (str2_isL) {
4953         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4954         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4955         __ lslv(tmp2, tmp2, tmp4);
4956         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4957         __ add(tmp4, tmp4, 1);
4958         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4959         __ lsl(tmp2, tmp2, 1);
4960       } else {
4961         __ mov(ch2, 0xE);
4962         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4963         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4964         __ lslv(tmp2, tmp2, tmp4);
4965         __ add(tmp4, tmp4, 1);
4966         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4967         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4968         __ lsl(tmp2, tmp2, 1);
4969         __ sub(str2, str2, str2_chr_size);
4970       }
4971       __ cmp(ch1, ch2);
4972       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4973       __ b(DONE);
4974     __ align(OptoLoopAlignment);
4975     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
4976       // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
4977       // until the L_HAS_ZERO block. A byte octet was analyzed in L_HAS_ZERO_LOOP,
4978       // so result was increased by at most wordSize/str2_chr_size - 1 and the
4979       // respective high bits were unchanged. L_LOOP_PROCEED will increase result
4980       // by the number of analyzed characters, so we can simply reset the lower
4981       // bits of result here: clear the 2 lower bits for UU/UL and 3 bits for LL.
4982       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
4983       // 3) Advance str2 to the next str2 octet. result & 7 (or 3) is the index of
4984       // the last analyzed substring inside the current octet, so str2 is at the
4985       // respective start address and needs to be advanced to the next octet.
4986       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4987       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4988       __ bfm(result, zr, 0, 2 - str2_chr_shift);
4989       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4990       __ movw(cnt2, cnt2);
4991       __ b(L_LOOP_PROCEED);
4992     __ align(OptoLoopAlignment);
4993     __ BIND(NOMATCH);
4994       __ mov(result, -1);
4995     __ BIND(DONE);
4996       __ pop(spilled_regs, sp);
4997       __ ret(lr);
4998     return entry;
4999   }
5000 
5001   void generate_string_indexof_stubs() {
5002     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
5003     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
5004     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
5005   }
5006 
5007   void inflate_and_store_2_fp_registers(bool generatePrfm,
5008       FloatRegister src1, FloatRegister src2) {
5009     Register dst = r1;
5010     __ zip1(v1, __ T16B, src1, v0);
5011     __ zip2(v2, __ T16B, src1, v0);
5012     if (generatePrfm) {
5013       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
5014     }
5015     __ zip1(v3, __ T16B, src2, v0);
5016     __ zip2(v4, __ T16B, src2, v0);
5017     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
5018   }
5019 
5020   // R0 = src
5021   // R1 = dst
5022   // R2 = len
5023   // R3 = len >> 3
5024   // V0 = 0
5025   // v1 = loaded 8 bytes
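       // A rough scalar sketch of what "inflate" means here (the stub does it 16
       // bytes at a time with zip1/zip2 against the zero register v0):
       //
       //   // src: const uint8_t*, dst: uint16_t*, len: number of bytes
       //   for (size_t i = 0; i < len; i++)
       //     dst[i] = src[i];   // zero-extend each Latin-1 byte to a UTF-16 char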
5026   address generate_large_byte_array_inflate() {
5027     __ align(CodeEntryAlignment);
5028     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
5029     address entry = __ pc();
5030     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
5031     Register src = r0, dst = r1, len = r2, octetCounter = r3;
5032     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
5033 
5034     // do one more 8-byte read so that the address is 16-byte aligned in most
5035     // cases; this also lets us use a single store instruction below
5036     __ ldrd(v2, __ post(src, 8));
5037     __ sub(octetCounter, octetCounter, 2);
5038     __ zip1(v1, __ T16B, v1, v0);
5039     __ zip1(v2, __ T16B, v2, v0);
5040     __ st1(v1, v2, __ T16B, __ post(dst, 32));
5041     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5042     __ subs(rscratch1, octetCounter, large_loop_threshold);
5043     __ br(__ LE, LOOP_START);
5044     __ b(LOOP_PRFM_START);
5045     __ bind(LOOP_PRFM);
5046       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5047     __ bind(LOOP_PRFM_START);
5048       __ prfm(Address(src, SoftwarePrefetchHintDistance));
5049       __ sub(octetCounter, octetCounter, 8);
5050       __ subs(rscratch1, octetCounter, large_loop_threshold);
5051       inflate_and_store_2_fp_registers(true, v3, v4);
5052       inflate_and_store_2_fp_registers(true, v5, v6);
5053       __ br(__ GT, LOOP_PRFM);
5054       __ cmp(octetCounter, (u1)8);
5055       __ br(__ LT, DONE);
5056     __ bind(LOOP);
5057       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5058       __ bind(LOOP_START);
5059       __ sub(octetCounter, octetCounter, 8);
5060       __ cmp(octetCounter, (u1)8);
5061       inflate_and_store_2_fp_registers(false, v3, v4);
5062       inflate_and_store_2_fp_registers(false, v5, v6);
5063       __ br(__ GE, LOOP);
5064     __ bind(DONE);
5065       __ ret(lr);
5066     return entry;
5067   }
5068 
5069   /**
5070    *  Arguments:
5071    *
5072    *  Input:
5073    *  c_rarg0   - current state address
5074    *  c_rarg1   - H key address
5075    *  c_rarg2   - data address
5076    *  c_rarg3   - number of blocks
5077    *
5078    *  Output:
5079    *  Updated state at c_rarg0
5080    */
5081   address generate_ghash_processBlocks() {
5082     // Bafflingly, GCM uses little-endian for the byte order, but
5083     // big-endian for the bit order.  For example, the polynomial 1 is
5084     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
5085     //
5086     // So, we must either reverse the bytes in each word and do
5087     // everything big-endian or reverse the bits in each byte and do
5088     // it little-endian.  On AArch64 it's more idiomatic to reverse
5089     // the bits in each byte (we have an instruction, RBIT, to do
5090     // that) and keep the data in little-endian bit order throughout the
5091     // calculation, bit-reversing the inputs and outputs.
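         // For reference, each iteration of the loop below performs the standard
         // GHASH update
         //
         //   state = (state ^ data_block) * H    in GF(2^128),
         //
         // reducing the product modulo z^128 + z^7 + z^2 + z + 1; the 0x87
         // constant emitted just below encodes the low-order bits of that
         // polynomial (z^7 + z^2 + z + 1).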
5092 
5093     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5094     __ align(wordSize * 2);
5095     address p = __ pc();
5096     __ emit_int64(0x87);  // The low-order bits of the field
5097                           // polynomial (i.e. p = z^7+z^2+z+1)
5098                           // repeated in the low and high parts of a
5099                           // 128-bit vector
5100     __ emit_int64(0x87);
5101 
5102     __ align(CodeEntryAlignment);
5103     address start = __ pc();
5104 
5105     Register state   = c_rarg0;
5106     Register subkeyH = c_rarg1;
5107     Register data    = c_rarg2;
5108     Register blocks  = c_rarg3;
5109 
5110     FloatRegister vzr = v30;
5111     __ eor(vzr, __ T16B, vzr, vzr); // zero register
5112 
5113     __ ldrq(v0, Address(state));
5114     __ ldrq(v1, Address(subkeyH));
5115 
5116     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
5117     __ rbit(v0, __ T16B, v0);
5118     __ rev64(v1, __ T16B, v1);
5119     __ rbit(v1, __ T16B, v1);
5120 
5121     __ ldrq(v26, p);
5122 
5123     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
5124     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
5125 
5126     {
5127       Label L_ghash_loop;
5128       __ bind(L_ghash_loop);
5129 
5130       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
5131                                                  // reversing each byte
5132       __ rbit(v2, __ T16B, v2);
5133       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
5134 
5135       // Multiply state in v2 by subkey in v1
5136       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
5137                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
5138                      /*temps*/v6, v20, v18, v21);
5139       // Reduce v7:v5 by the field polynomial
5140       ghash_reduce(v0, v5, v7, v26, vzr, v20);
5141 
5142       __ sub(blocks, blocks, 1);
5143       __ cbnz(blocks, L_ghash_loop);
5144     }
5145 
5146     // The bit-reversed result is at this point in v0
5147     __ rev64(v1, __ T16B, v0);
5148     __ rbit(v1, __ T16B, v1);
5149 
5150     __ st1(v1, __ T16B, state);
5151     __ ret(lr);
5152 
5153     return start;
5154   }
5155 
5156   // Continuation point for throwing of implicit exceptions that are
5157   // not handled in the current activation. Fabricates an exception
5158   // oop and initiates normal exception dispatching in this
5159   // frame. Since we need to preserve callee-saved values (currently
5160   // only for C2, but done for C1 as well) we need a callee-saved oop
5161   // map and therefore have to make these stubs into RuntimeStubs
5162   // rather than BufferBlobs.  If the compiler needs all registers to
5163   // be preserved between the fault point and the exception handler
5164   // then it must assume responsibility for that in
5165   // AbstractCompiler::continuation_for_implicit_null_exception or
5166   // continuation_for_implicit_division_by_zero_exception. All other
5167   // implicit exceptions (e.g., NullPointerException or
5168   // AbstractMethodError on entry) are either at call sites or
5169   // otherwise assume that stack unwinding will be initiated, so
5170   // caller saved registers were assumed volatile in the compiler.
5171 
5172 #undef __
5173 #define __ masm->
5174 
5175   address generate_throw_exception(const char* name,
5176                                    address runtime_entry,
5177                                    Register arg1 = noreg,
5178                                    Register arg2 = noreg) {
5179     // Information about frame layout at time of blocking runtime call.
5180     // Note that we only have to preserve callee-saved registers since
5181     // the compilers are responsible for supplying a continuation point
5182     // if they expect all registers to be preserved.
5183     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
5184     enum layout {
5185       rfp_off = 0,
5186       rfp_off2,
5187       return_off,
5188       return_off2,
5189       framesize // inclusive of return address
5190     };
5191 
5192     int insts_size = 512;
5193     int locs_size  = 64;
5194 
5195     CodeBuffer code(name, insts_size, locs_size);
5196     OopMapSet* oop_maps  = new OopMapSet();
5197     MacroAssembler* masm = new MacroAssembler(&code);
5198 
5199     address start = __ pc();
5200 
5201     // This is an inlined and slightly modified version of call_VM
5202     // which has the ability to fetch the return PC out of
5203     // thread-local storage and also sets up last_Java_sp slightly
5204     // differently than the real call_VM
5205 
5206     __ enter(); // Save FP and LR before call
5207 
5208     assert(is_even(framesize/2), "sp not 16-byte aligned");
5209 
5210     // lr and fp are already in place
5211     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
5212 
5213     int frame_complete = __ pc() - start;
5214 
5215     // Set up last_Java_sp and last_Java_fp
5216     address the_pc = __ pc();
5217     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
5218 
5219     // Call runtime
5220     if (arg1 != noreg) {
5221       assert(arg2 != c_rarg1, "clobbered");
5222       __ mov(c_rarg1, arg1);
5223     }
5224     if (arg2 != noreg) {
5225       __ mov(c_rarg2, arg2);
5226     }
5227     __ mov(c_rarg0, rthread);
5228     BLOCK_COMMENT("call runtime_entry");
5229     __ mov(rscratch1, runtime_entry);
5230     __ blr(rscratch1);
5231 
5232     // Generate oop map
5233     OopMap* map = new OopMap(framesize, 0);
5234 
5235     oop_maps->add_gc_map(the_pc - start, map);
5236 
5237     __ reset_last_Java_frame(true);
5238     __ maybe_isb();
5239 
5240     __ leave();
5241 
5242     // check for pending exceptions
5243 #ifdef ASSERT
5244     Label L;
5245     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
5246     __ cbnz(rscratch1, L);
5247     __ should_not_reach_here();
5248     __ bind(L);
5249 #endif // ASSERT
5250     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
5251 
5252 
5253     // codeBlob framesize is in words (not VMRegImpl::slot_size)
5254     RuntimeStub* stub =
5255       RuntimeStub::new_runtime_stub(name,
5256                                     &code,
5257                                     frame_complete,
5258                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
5259                                     oop_maps, false);
5260     return stub->entry_point();
5261   }
5262 
5263   class MontgomeryMultiplyGenerator : public MacroAssembler {
5264 
5265     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
5266       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
5267 
5268     RegSet _toSave;
5269     bool _squaring;
5270 
5271   public:
5272     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
5273       : MacroAssembler(as->code()), _squaring(squaring) {
5274 
5275       // Register allocation
5276 
5277       Register reg = c_rarg0;
5278       Pa_base = reg;       // Argument registers
5279       if (squaring)
5280         Pb_base = Pa_base;
5281       else
5282         Pb_base = ++reg;
5283       Pn_base = ++reg;
5284       Rlen= ++reg;
5285       inv = ++reg;
5286       Pm_base = ++reg;
5287 
5288                           // Working registers:
5289       Ra =  ++reg;        // The current digit of a, b, n, and m.
5290       Rb =  ++reg;
5291       Rm =  ++reg;
5292       Rn =  ++reg;
5293 
5294       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
5295       Pb =  ++reg;
5296       Pm =  ++reg;
5297       Pn =  ++reg;
5298 
5299       t0 =  ++reg;        // Three registers which form a
5300       t1 =  ++reg;        // triple-precision accumulator.
5301       t2 =  ++reg;
5302 
5303       Ri =  ++reg;        // Inner and outer loop indexes.
5304       Rj =  ++reg;
5305 
5306       Rhi_ab = ++reg;     // Product registers: low and high parts
5307       Rlo_ab = ++reg;     // of a*b and m*n.
5308       Rhi_mn = ++reg;
5309       Rlo_mn = ++reg;
5310 
5311       // r19 and up are callee-saved.
5312       _toSave = RegSet::range(r19, reg) + Pm_base;
5313     }
5314 
5315   private:
5316     void save_regs() {
5317       push(_toSave, sp);
5318     }
5319 
5320     void restore_regs() {
5321       pop(_toSave, sp);
5322     }
5323 
5324     template <typename T>
5325     void unroll_2(Register count, T block) {
5326       Label loop, end, odd;
5327       tbnz(count, 0, odd);
5328       cbz(count, end);
5329       align(16);
5330       bind(loop);
5331       (this->*block)();
5332       bind(odd);
5333       (this->*block)();
5334       subs(count, count, 2);
5335       br(Assembler::GT, loop);
5336       bind(end);
5337     }
5338 
5339     template <typename T>
5340     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
5341       Label loop, end, odd;
5342       tbnz(count, 0, odd);
5343       cbz(count, end);
5344       align(16);
5345       bind(loop);
5346       (this->*block)(d, s, tmp);
5347       bind(odd);
5348       (this->*block)(d, s, tmp);
5349       subs(count, count, 2);
5350       br(Assembler::GT, loop);
5351       bind(end);
5352     }
5353 
5354     void pre1(RegisterOrConstant i) {
5355       block_comment("pre1");
5356       // Pa = Pa_base;
5357       // Pb = Pb_base + i;
5358       // Pm = Pm_base;
5359       // Pn = Pn_base + i;
5360       // Ra = *Pa;
5361       // Rb = *Pb;
5362       // Rm = *Pm;
5363       // Rn = *Pn;
5364       ldr(Ra, Address(Pa_base));
5365       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
5366       ldr(Rm, Address(Pm_base));
5367       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5368       lea(Pa, Address(Pa_base));
5369       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
5370       lea(Pm, Address(Pm_base));
5371       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5372 
5373       // Zero the m*n result.
5374       mov(Rhi_mn, zr);
5375       mov(Rlo_mn, zr);
5376     }
5377 
5378     // The core multiply-accumulate step of a Montgomery
5379     // multiplication.  The idea is to schedule operations as a
5380     // pipeline so that instructions with long latencies (loads and
5381     // multiplies) have time to complete before their results are
5382     // used.  This most benefits in-order implementations of the
5383     // architecture but out-of-order ones also benefit.
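         // In the commented pseudocode here and below, MACC(A, B, t0, t1, t2)
         // denotes the multiply-accumulate step
         //
         //   t2:t1:t0 += A * B;   // full 128-bit product of A and B
         //
         // i.e. the product is added into the triple-precision accumulator
         // t0 (low), t1, t2 (high), which is what the umulh/mul pair followed
         // by acc() implements.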
5384     void step() {
5385       block_comment("step");
5386       // MACC(Ra, Rb, t0, t1, t2);
5387       // Ra = *++Pa;
5388       // Rb = *--Pb;
5389       umulh(Rhi_ab, Ra, Rb);
5390       mul(Rlo_ab, Ra, Rb);
5391       ldr(Ra, pre(Pa, wordSize));
5392       ldr(Rb, pre(Pb, -wordSize));
5393       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
5394                                        // previous iteration.
5395       // MACC(Rm, Rn, t0, t1, t2);
5396       // Rm = *++Pm;
5397       // Rn = *--Pn;
5398       umulh(Rhi_mn, Rm, Rn);
5399       mul(Rlo_mn, Rm, Rn);
5400       ldr(Rm, pre(Pm, wordSize));
5401       ldr(Rn, pre(Pn, -wordSize));
5402       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5403     }
5404 
5405     void post1() {
5406       block_comment("post1");
5407 
5408       // MACC(Ra, Rb, t0, t1, t2);
5409       // Ra = *++Pa;
5410       // Rb = *--Pb;
5411       umulh(Rhi_ab, Ra, Rb);
5412       mul(Rlo_ab, Ra, Rb);
5413       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5414       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5415 
5416       // *Pm = Rm = t0 * inv;
5417       mul(Rm, t0, inv);
5418       str(Rm, Address(Pm));
5419 
5420       // MACC(Rm, Rn, t0, t1, t2);
5421       // t0 = t1; t1 = t2; t2 = 0;
5422       umulh(Rhi_mn, Rm, Rn);
5423 
5424 #ifndef PRODUCT
5425       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5426       {
5427         mul(Rlo_mn, Rm, Rn);
5428         add(Rlo_mn, t0, Rlo_mn);
5429         Label ok;
5430         cbz(Rlo_mn, ok); {
5431           stop("broken Montgomery multiply");
5432         } bind(ok);
5433       }
5434 #endif
5435       // We have very carefully set things up so that
5436       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5437       // the lower half of Rm * Rn because we know the result already:
5438       // it must be -t0.  t0 + (-t0) must generate a carry iff
5439       // t0 != 0.  So, rather than do a mul and an adds we just set
5440       // the carry flag iff t0 is nonzero.
5441       //
5442       // mul(Rlo_mn, Rm, Rn);
5443       // adds(zr, t0, Rlo_mn);
5444       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5445       adcs(t0, t1, Rhi_mn);
5446       adc(t1, t2, zr);
5447       mov(t2, zr);
5448     }
5449 
5450     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
5451       block_comment("pre2");
5452       // Pa = Pa_base + i-len;
5453       // Pb = Pb_base + len;
5454       // Pm = Pm_base + i-len;
5455       // Pn = Pn_base + len;
5456 
5457       if (i.is_register()) {
5458         sub(Rj, i.as_register(), len);
5459       } else {
5460         mov(Rj, i.as_constant());
5461         sub(Rj, Rj, len);
5462       }
5463       // Rj == i-len
5464 
5465       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5466       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5467       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5468       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5469 
5470       // Ra = *++Pa;
5471       // Rb = *--Pb;
5472       // Rm = *++Pm;
5473       // Rn = *--Pn;
5474       ldr(Ra, pre(Pa, wordSize));
5475       ldr(Rb, pre(Pb, -wordSize));
5476       ldr(Rm, pre(Pm, wordSize));
5477       ldr(Rn, pre(Pn, -wordSize));
5478 
5479       mov(Rhi_mn, zr);
5480       mov(Rlo_mn, zr);
5481     }
5482 
5483     void post2(RegisterOrConstant i, RegisterOrConstant len) {
5484       block_comment("post2");
5485       if (i.is_constant()) {
5486         mov(Rj, i.as_constant()-len.as_constant());
5487       } else {
5488         sub(Rj, i.as_register(), len);
5489       }
5490 
5491       adds(t0, t0, Rlo_mn); // The pending m*n, low part
5492 
5493       // As soon as we know the least significant digit of our result,
5494       // store it.
5495       // Pm_base[i-len] = t0;
5496       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5497 
5498       // t0 = t1; t1 = t2; t2 = 0;
5499       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5500       adc(t1, t2, zr);
5501       mov(t2, zr);
5502     }
5503 
5504     // A carry in t0 after Montgomery multiplication means that we
5505     // should subtract multiples of n from our result in m.  We'll
5506     // keep doing that until there is no carry.
5507     void normalize(RegisterOrConstant len) {
5508       block_comment("normalize");
5509       // while (t0)
5510       //   t0 = sub(Pm_base, Pn_base, t0, len);
5511       Label loop, post, again;
5512       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5513       cbz(t0, post); {
5514         bind(again); {
5515           mov(i, zr);
5516           mov(cnt, len);
5517           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5518           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5519           subs(zr, zr, zr); // set carry flag, i.e. no borrow
5520           align(16);
5521           bind(loop); {
5522             sbcs(Rm, Rm, Rn);
5523             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5524             add(i, i, 1);
5525             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5526             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5527             sub(cnt, cnt, 1);
5528           } cbnz(cnt, loop);
5529           sbc(t0, t0, zr);
5530         } cbnz(t0, again);
5531       } bind(post);
5532     }
5533 
5534     // Move memory at s to d, reversing words.
5535     //    Increments d to end of copied memory
5536     //    Destroys tmp1, tmp2
5537     //    Preserves len
5538     //    Leaves s pointing to the address which was in d at start
5539     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
5540       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5541 
5542       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
5543       mov(tmp1, len);
5544       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
5545       sub(s, d, len, ext::uxtw, LogBytesPerWord);
5546     }
5547     // where
5548     void reverse1(Register d, Register s, Register tmp) {
5549       ldr(tmp, pre(s, -wordSize));
5550       ror(tmp, tmp, 32);
5551       str(tmp, post(d, wordSize));
5552     }
5553 
5554     void step_squaring() {
5555       // An extra ACC
5556       step();
5557       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5558     }
5559 
5560     void last_squaring(RegisterOrConstant i) {
5561       Label dont;
5562       // if ((i & 1) == 0) {
5563       tbnz(i.as_register(), 0, dont); {
5564         // MACC(Ra, Rb, t0, t1, t2);
5565         // Ra = *++Pa;
5566         // Rb = *--Pb;
5567         umulh(Rhi_ab, Ra, Rb);
5568         mul(Rlo_ab, Ra, Rb);
5569         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5570       } bind(dont);
5571     }
5572 
5573     void extra_step_squaring() {
5574       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5575 
5576       // MACC(Rm, Rn, t0, t1, t2);
5577       // Rm = *++Pm;
5578       // Rn = *--Pn;
5579       umulh(Rhi_mn, Rm, Rn);
5580       mul(Rlo_mn, Rm, Rn);
5581       ldr(Rm, pre(Pm, wordSize));
5582       ldr(Rn, pre(Pn, -wordSize));
5583     }
5584 
5585     void post1_squaring() {
5586       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5587 
5588       // *Pm = Rm = t0 * inv;
5589       mul(Rm, t0, inv);
5590       str(Rm, Address(Pm));
5591 
5592       // MACC(Rm, Rn, t0, t1, t2);
5593       // t0 = t1; t1 = t2; t2 = 0;
5594       umulh(Rhi_mn, Rm, Rn);
5595 
5596 #ifndef PRODUCT
5597       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5598       {
5599         mul(Rlo_mn, Rm, Rn);
5600         add(Rlo_mn, t0, Rlo_mn);
5601         Label ok;
5602         cbz(Rlo_mn, ok); {
5603           stop("broken Montgomery multiply");
5604         } bind(ok);
5605       }
5606 #endif
5607       // We have very carefully set things up so that
5608       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5609       // the lower half of Rm * Rn because we know the result already:
5610       // it must be -t0.  t0 + (-t0) must generate a carry iff
5611       // t0 != 0.  So, rather than do a mul and an adds we just set
5612       // the carry flag iff t0 is nonzero.
5613       //
5614       // mul(Rlo_mn, Rm, Rn);
5615       // adds(zr, t0, Rlo_mn);
5616       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5617       adcs(t0, t1, Rhi_mn);
5618       adc(t1, t2, zr);
5619       mov(t2, zr);
5620     }
5621 
5622     void acc(Register Rhi, Register Rlo,
5623              Register t0, Register t1, Register t2) {
5624       adds(t0, t0, Rlo);
5625       adcs(t1, t1, Rhi);
5626       adc(t2, t2, zr);
5627     }
5628 
5629   public:
5630     /**
5631      * Fast Montgomery multiplication.  The derivation of the
5632      * algorithm is in A Cryptographic Library for the Motorola
5633      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5634      *
5635      * Arguments:
5636      *
5637      * Inputs for multiplication:
5638      *   c_rarg0   - int array elements a
5639      *   c_rarg1   - int array elements b
5640      *   c_rarg2   - int array elements n (the modulus)
5641      *   c_rarg3   - int length
5642      *   c_rarg4   - int inv
5643      *   c_rarg5   - int array elements m (the result)
5644      *
5645      * Inputs for squaring:
5646      *   c_rarg0   - int array elements a
5647      *   c_rarg1   - int array elements n (the modulus)
5648      *   c_rarg2   - int length
5649      *   c_rarg3   - int inv
5650      *   c_rarg4   - int array elements m (the result)
5651      *
5652      */
5653     address generate_multiply() {
5654       Label argh, nothing;
5655       bind(argh);
5656       stop("MontgomeryMultiply total_allocation must be <= 8192");
5657 
5658       align(CodeEntryAlignment);
5659       address entry = pc();
5660 
5661       cbzw(Rlen, nothing);
5662 
5663       enter();
5664 
5665       // Make room.
5666       cmpw(Rlen, 512);
5667       br(Assembler::HI, argh);
5668       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5669       andr(sp, Ra, -2 * wordSize);
5670 
5671       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5672 
5673       {
5674         // Copy input args, reversing as we go.  We use Ra as a
5675         // temporary variable.
5676         reverse(Ra, Pa_base, Rlen, t0, t1);
5677         if (!_squaring)
5678           reverse(Ra, Pb_base, Rlen, t0, t1);
5679         reverse(Ra, Pn_base, Rlen, t0, t1);
5680       }
5681 
5682       // Push all callee-saved registers and also Pm_base which we'll need
5683       // at the end.
5684       save_regs();
5685 
5686 #ifndef PRODUCT
5687       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
5688       {
5689         ldr(Rn, Address(Pn_base, 0));
5690         mul(Rlo_mn, Rn, inv);
5691         subs(zr, Rlo_mn, -1);
5692         Label ok;
5693         br(EQ, ok); {
5694           stop("broken inverse in Montgomery multiply");
5695         } bind(ok);
5696       }
5697 #endif
5698 
5699       mov(Pm_base, Ra);
5700 
5701       mov(t0, zr);
5702       mov(t1, zr);
5703       mov(t2, zr);
5704 
5705       block_comment("for (int i = 0; i < len; i++) {");
5706       mov(Ri, zr); {
5707         Label loop, end;
5708         cmpw(Ri, Rlen);
5709         br(Assembler::GE, end);
5710 
5711         bind(loop);
5712         pre1(Ri);
5713 
5714         block_comment("  for (j = i; j; j--) {"); {
5715           movw(Rj, Ri);
5716           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5717         } block_comment("  } // j");
5718 
5719         post1();
5720         addw(Ri, Ri, 1);
5721         cmpw(Ri, Rlen);
5722         br(Assembler::LT, loop);
5723         bind(end);
5724         block_comment("} // i");
5725       }
5726 
5727       block_comment("for (int i = len; i < 2*len; i++) {");
5728       mov(Ri, Rlen); {
5729         Label loop, end;
5730         cmpw(Ri, Rlen, Assembler::LSL, 1);
5731         br(Assembler::GE, end);
5732 
5733         bind(loop);
5734         pre2(Ri, Rlen);
5735 
5736         block_comment("  for (j = len*2-i-1; j; j--) {"); {
5737           lslw(Rj, Rlen, 1);
5738           subw(Rj, Rj, Ri);
5739           subw(Rj, Rj, 1);
5740           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5741         } block_comment("  } // j");
5742 
5743         post2(Ri, Rlen);
5744         addw(Ri, Ri, 1);
5745         cmpw(Ri, Rlen, Assembler::LSL, 1);
5746         br(Assembler::LT, loop);
5747         bind(end);
5748       }
5749       block_comment("} // i");
5750 
5751       normalize(Rlen);
5752 
5753       mov(Ra, Pm_base);  // Save Pm_base in Ra
5754       restore_regs();  // Restore caller's Pm_base
5755 
5756       // Copy our result into caller's Pm_base
5757       reverse(Pm_base, Ra, Rlen, t0, t1);
5758 
5759       leave();
5760       bind(nothing);
5761       ret(lr);
5762 
5763       return entry;
5764     }
5765     // In C, approximately:
5766 
5767     // void
5768     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
5769     //                     julong Pn_base[], julong Pm_base[],
5770     //                     julong inv, int len) {
5771     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5772     //   julong *Pa, *Pb, *Pn, *Pm;
5773     //   julong Ra, Rb, Rn, Rm;
5774 
5775     //   int i;
5776 
5777     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5778 
5779     //   for (i = 0; i < len; i++) {
5780     //     int j;
5781 
5782     //     Pa = Pa_base;
5783     //     Pb = Pb_base + i;
5784     //     Pm = Pm_base;
5785     //     Pn = Pn_base + i;
5786 
5787     //     Ra = *Pa;
5788     //     Rb = *Pb;
5789     //     Rm = *Pm;
5790     //     Rn = *Pn;
5791 
5792     //     int iters = i;
5793     //     for (j = 0; iters--; j++) {
5794     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5795     //       MACC(Ra, Rb, t0, t1, t2);
5796     //       Ra = *++Pa;
5797     //       Rb = *--Pb;
5798     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5799     //       MACC(Rm, Rn, t0, t1, t2);
5800     //       Rm = *++Pm;
5801     //       Rn = *--Pn;
5802     //     }
5803 
5804     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
5805     //     MACC(Ra, Rb, t0, t1, t2);
5806     //     *Pm = Rm = t0 * inv;
5807     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5808     //     MACC(Rm, Rn, t0, t1, t2);
5809 
5810     //     assert(t0 == 0, "broken Montgomery multiply");
5811 
5812     //     t0 = t1; t1 = t2; t2 = 0;
5813     //   }
5814 
5815     //   for (i = len; i < 2*len; i++) {
5816     //     int j;
5817 
5818     //     Pa = Pa_base + i-len;
5819     //     Pb = Pb_base + len;
5820     //     Pm = Pm_base + i-len;
5821     //     Pn = Pn_base + len;
5822 
5823     //     Ra = *++Pa;
5824     //     Rb = *--Pb;
5825     //     Rm = *++Pm;
5826     //     Rn = *--Pn;
5827 
5828     //     int iters = len*2-i-1;
5829     //     for (j = i-len+1; iters--; j++) {
5830     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5831     //       MACC(Ra, Rb, t0, t1, t2);
5832     //       Ra = *++Pa;
5833     //       Rb = *--Pb;
5834     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5835     //       MACC(Rm, Rn, t0, t1, t2);
5836     //       Rm = *++Pm;
5837     //       Rn = *--Pn;
5838     //     }
5839 
5840     //     Pm_base[i-len] = t0;
5841     //     t0 = t1; t1 = t2; t2 = 0;
5842     //   }
5843 
5844     //   while (t0)
5845     //     t0 = sub(Pm_base, Pn_base, t0, len);
5846     // }
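    // MACC and MACC2 above are shorthand in these C models.  As used in the
    // models above and below, they are, approximately, triple-precision
    // multiply-accumulates:
    //
    //   MACC(A, B, t0, t1, t2):   t2:t1:t0 += (unsigned __int128) A * B
    //   MACC2(A, B, t0, t1, t2):  t2:t1:t0 += 2 * (unsigned __int128) A * B
    //
    // i.e. the full 128-bit product is added into the accumulator once (or
    // twice, for the doubled cross terms of a square), with carries
    // propagated as in acc() above.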
5847 
5848     /**
5849      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
5850      * multiplies than Montgomery multiplication so it should be up to
5851      * 25% faster.  However, its loop control is more complex and it
5852      * may actually run slower on some machines.
5853      *
5854      * Arguments:
5855      *
5856      * Inputs:
5857      *   c_rarg0   - int array elements a
5858      *   c_rarg1   - int array elements n (the modulus)
5859      *   c_rarg2   - int length
5860      *   c_rarg3   - int inv
5861      *   c_rarg4   - int array elements m (the result)
5862      *
5863      */
5864     address generate_square() {
5865       Label argh;
5866       bind(argh);
5867       stop("MontgomeryMultiply total_allocation must be <= 8192");
5868 
5869       align(CodeEntryAlignment);
5870       address entry = pc();
5871 
5872       enter();
5873 
5874       // Make room.
5875       cmpw(Rlen, 512);
5876       br(Assembler::HI, argh);
5877       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5878       andr(sp, Ra, -2 * wordSize);
5879 
5880       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5881 
5882       {
5883         // Copy input args, reversing as we go.  We use Ra as a
5884         // temporary variable.
5885         reverse(Ra, Pa_base, Rlen, t0, t1);
5886         reverse(Ra, Pn_base, Rlen, t0, t1);
5887       }
5888 
5889       // Push all callee-saved registers and also Pm_base which we'll need
5890       // at the end.
5891       save_regs();
5892 
5893       mov(Pm_base, Ra);
5894 
5895       mov(t0, zr);
5896       mov(t1, zr);
5897       mov(t2, zr);
5898 
5899       block_comment("for (int i = 0; i < len; i++) {");
5900       mov(Ri, zr); {
5901         Label loop, end;
5902         bind(loop);
5903         cmp(Ri, Rlen);
5904         br(Assembler::GE, end);
5905 
5906         pre1(Ri);
5907 
5908         block_comment("for (j = (i+1)/2; j; j--) {"); {
5909           add(Rj, Ri, 1);
5910           lsr(Rj, Rj, 1);
5911           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5912         } block_comment("  } // j");
5913 
5914         last_squaring(Ri);
5915 
5916         block_comment("  for (j = i/2; j; j--) {"); {
5917           lsr(Rj, Ri, 1);
5918           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5919         } block_comment("  } // j");
5920 
5921         post1_squaring();
5922         add(Ri, Ri, 1);
5923         cmp(Ri, Rlen);
5924         br(Assembler::LT, loop);
5925 
5926         bind(end);
5927         block_comment("} // i");
5928       }
5929 
5930       block_comment("for (int i = len; i < 2*len; i++) {");
5931       mov(Ri, Rlen); {
5932         Label loop, end;
5933         bind(loop);
5934         cmp(Ri, Rlen, Assembler::LSL, 1);
5935         br(Assembler::GE, end);
5936 
5937         pre2(Ri, Rlen);
5938 
5939         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
5940           lsl(Rj, Rlen, 1);
5941           sub(Rj, Rj, Ri);
5942           sub(Rj, Rj, 1);
5943           lsr(Rj, Rj, 1);
5944           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5945         } block_comment("  } // j");
5946 
5947         last_squaring(Ri);
5948 
5949         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
5950           lsl(Rj, Rlen, 1);
5951           sub(Rj, Rj, Ri);
5952           lsr(Rj, Rj, 1);
5953           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5954         } block_comment("  } // j");
5955 
5956         post2(Ri, Rlen);
5957         add(Ri, Ri, 1);
5958         cmp(Ri, Rlen, Assembler::LSL, 1);
5959 
5960         br(Assembler::LT, loop);
5961         bind(end);
5962         block_comment("} // i");
5963       }
5964 
5965       normalize(Rlen);
5966 
5967       mov(Ra, Pm_base);  // Save Pm_base in Ra
5968       restore_regs();  // Restore caller's Pm_base
5969 
5970       // Copy our result into caller's Pm_base
5971       reverse(Pm_base, Ra, Rlen, t0, t1);
5972 
5973       leave();
5974       ret(lr);
5975 
5976       return entry;
5977     }
5978     // In C, approximately:
5979 
5980     // void
5981     // montgomery_square(julong Pa_base[], julong Pn_base[],
5982     //                   julong Pm_base[], julong inv, int len) {
5983     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5984     //   julong *Pa, *Pb, *Pn, *Pm;
5985     //   julong Ra, Rb, Rn, Rm;
5986 
5987     //   int i;
5988 
5989     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5990 
5991     //   for (i = 0; i < len; i++) {
5992     //     int j;
5993 
5994     //     Pa = Pa_base;
5995     //     Pb = Pa_base + i;
5996     //     Pm = Pm_base;
5997     //     Pn = Pn_base + i;
5998 
5999     //     Ra = *Pa;
6000     //     Rb = *Pb;
6001     //     Rm = *Pm;
6002     //     Rn = *Pn;
6003 
6004     //     int iters = (i+1)/2;
6005     //     for (j = 0; iters--; j++) {
6006     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
6007     //       MACC2(Ra, Rb, t0, t1, t2);
6008     //       Ra = *++Pa;
6009     //       Rb = *--Pb;
6010     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
6011     //       MACC(Rm, Rn, t0, t1, t2);
6012     //       Rm = *++Pm;
6013     //       Rn = *--Pn;
6014     //     }
6015     //     if ((i & 1) == 0) {
6016     //       assert(Ra == Pa_base[j], "must be");
6017     //       MACC(Ra, Ra, t0, t1, t2);
6018     //     }
6019     //     iters = i/2;
6020     //     assert(iters == i-j, "must be");
6021     //     for (; iters--; j++) {
6022     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
6023     //       MACC(Rm, Rn, t0, t1, t2);
6024     //       Rm = *++Pm;
6025     //       Rn = *--Pn;
6026     //     }
6027 
6028     //     *Pm = Rm = t0 * inv;
6029     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
6030     //     MACC(Rm, Rn, t0, t1, t2);
6031 
6032     //     assert(t0 == 0, "broken Montgomery multiply");
6033 
6034     //     t0 = t1; t1 = t2; t2 = 0;
6035     //   }
6036 
6037     //   for (i = len; i < 2*len; i++) {
6038     //     int start = i-len+1;
6039     //     int end = start + (len - start)/2;
6040     //     int j;
6041 
6042     //     Pa = Pa_base + i-len;
6043     //     Pb = Pa_base + len;
6044     //     Pm = Pm_base + i-len;
6045     //     Pn = Pn_base + len;
6046 
6047     //     Ra = *++Pa;
6048     //     Rb = *--Pb;
6049     //     Rm = *++Pm;
6050     //     Rn = *--Pn;
6051 
6052     //     int iters = (2*len-i-1)/2;
6053     //     assert(iters == end-start, "must be");
6054     //     for (j = start; iters--; j++) {
6055     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
6056     //       MACC2(Ra, Rb, t0, t1, t2);
6057     //       Ra = *++Pa;
6058     //       Rb = *--Pb;
6059     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
6060     //       MACC(Rm, Rn, t0, t1, t2);
6061     //       Rm = *++Pm;
6062     //       Rn = *--Pn;
6063     //     }
6064     //     if ((i & 1) == 0) {
6065     //       assert(Ra == Pa_base[j], "must be");
6066     //       MACC(Ra, Ra, t0, t1, t2);
6067     //     }
6068     //     iters =  (2*len-i)/2;
6069     //     assert(iters == len-j, "must be");
6070     //     for (; iters--; j++) {
6071     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
6072     //       MACC(Rm, Rn, t0, t1, t2);
6073     //       Rm = *++Pm;
6074     //       Rn = *--Pn;
6075     //     }
6076     //     Pm_base[i-len] = t0;
6077     //     t0 = t1; t1 = t2; t2 = 0;
6078     //   }
6079 
6080     //   while (t0)
6081     //     t0 = sub(Pm_base, Pn_base, t0, len);
6082     // }
6083   };
6084 
6085 
6086   // Initialization
6087   void generate_initial() {
6088     // Generate the initial stubs and initialize the entry points
6089 
6090     // Entry points that exist on all platforms.  Note: this is code
6091     // that could be shared among different platforms - however, the
6092     // benefit seems to be smaller than the disadvantage of having a
6093     // much more complicated generator structure.  See also the comment
6094     // in stubRoutines.hpp.
6095 
6096     StubRoutines::_forward_exception_entry = generate_forward_exception();
6097 
6098     StubRoutines::_call_stub_entry =
6099       generate_call_stub(StubRoutines::_call_stub_return_address);
6100 
6101     // Referenced by megamorphic calls
6102     StubRoutines::_catch_exception_entry = generate_catch_exception();
6103 
6104     // Build this early so it's available for the interpreter.
6105     StubRoutines::_throw_StackOverflowError_entry =
6106       generate_throw_exception("StackOverflowError throw_exception",
6107                                CAST_FROM_FN_PTR(address,
6108                                                 SharedRuntime::throw_StackOverflowError));
6109     StubRoutines::_throw_delayed_StackOverflowError_entry =
6110       generate_throw_exception("delayed StackOverflowError throw_exception",
6111                                CAST_FROM_FN_PTR(address,
6112                                                 SharedRuntime::throw_delayed_StackOverflowError));
6113     if (UseCRC32Intrinsics) {
6114       // Set the CRC table address before generating the stubs that use it
6115       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
6116       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
6117     }
6118 
6119     if (UseCRC32CIntrinsics) {
6120       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
6121     }
6122 
6123     // Disabled until JDK-8210858 is fixed
6124     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
6125     //   StubRoutines::_dlog = generate_dlog();
6126     // }
6127 
6128     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
6129       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
6130     }
6131 
6132     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
6133       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
6134     }
6135 
6136     // Safefetch stubs.
6137     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
6138                                                        &StubRoutines::_safefetch32_fault_pc,
6139                                                        &StubRoutines::_safefetch32_continuation_pc);
6140     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
6141                                                        &StubRoutines::_safefetchN_fault_pc,
6142                                                        &StubRoutines::_safefetchN_continuation_pc);
6143   }
6144 
6145   void generate_all() {
6146     // support for verify_oop (must happen after universe_init)
6147     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
6148     StubRoutines::_throw_AbstractMethodError_entry =
6149       generate_throw_exception("AbstractMethodError throw_exception",
6150                                CAST_FROM_FN_PTR(address,
6151                                                 SharedRuntime::
6152                                                 throw_AbstractMethodError));
6153 
6154     StubRoutines::_throw_IncompatibleClassChangeError_entry =
6155       generate_throw_exception("IncompatibleClassChangeError throw_exception",
6156                                CAST_FROM_FN_PTR(address,
6157                                                 SharedRuntime::
6158                                                 throw_IncompatibleClassChangeError));
6159 
6160     StubRoutines::_throw_NullPointerException_at_call_entry =
6161       generate_throw_exception("NullPointerException at call throw_exception",
6162                                CAST_FROM_FN_PTR(address,
6163                                                 SharedRuntime::
6164                                                 throw_NullPointerException_at_call));
6165 
6166     // arraycopy stubs used by compilers
6167     generate_arraycopy_stubs();
6168 
6169     // has negatives stub for large arrays.
6170     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
6171 
6172     // array equals stub for large arrays.
6173     if (!UseSimpleArrayEquals) {
6174       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
6175     }
6176 
6177     generate_compare_long_strings();
6178 
6179     generate_string_indexof_stubs();
6180 
6181     // byte_array_inflate stub for large arrays.
6182     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
6183 
6184     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
6185     if (bs_nm != NULL) {
6186       StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
6187     }
6188 #ifdef COMPILER2
6189     if (UseMultiplyToLenIntrinsic) {
6190       StubRoutines::_multiplyToLen = generate_multiplyToLen();
6191     }
6192 
6193     if (UseSquareToLenIntrinsic) {
6194       StubRoutines::_squareToLen = generate_squareToLen();
6195     }
6196 
6197     if (UseMulAddIntrinsic) {
6198       StubRoutines::_mulAdd = generate_mulAdd();
6199     }
6200 
6201     if (UseMontgomeryMultiplyIntrinsic) {
6202       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
6203       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
6204       StubRoutines::_montgomeryMultiply = g.generate_multiply();
6205     }
6206 
6207     if (UseMontgomerySquareIntrinsic) {
6208       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
6209       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
6210       // We use generate_multiply() rather than generate_square()
6211       // because it's faster for the sizes of modulus we care about.
6212       StubRoutines::_montgomerySquare = g.generate_multiply();
6213     }
6214 #endif // COMPILER2
6215 
6216     // generate GHASH intrinsics code
6217     if (UseGHASHIntrinsics) {
6218       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
6219     }
6220 
6221     // data cache line writeback
6222     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
6223     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
6224 
6225     if (UseAESIntrinsics) {
6226       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
6227       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
6228       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
6229       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
6230     }
6231 
6232     if (UseSHA1Intrinsics) {
6233       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
6234       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
6235     }
6236     if (UseSHA256Intrinsics) {
6237       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
6238       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
6239     }
6240     if (UseSHA512Intrinsics) {
6241       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
6242       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
6243     }
6244     if (UseSHA3Intrinsics) {
6245       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
6246       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
6247     }
6248 
6249     // generate Adler32 intrinsics code
6250     if (UseAdler32Intrinsics) {
6251       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
6252     }
6253 
6254     StubRoutines::aarch64::set_completed();
6255   }
6256 
6257  public:
6258   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
6259     if (all) {
6260       generate_all();
6261     } else {
6262       generate_initial();
6263     }
6264   }
6265 }; // end class declaration
6266 
6267 #define UCM_TABLE_MAX_ENTRIES 8
6268 void StubGenerator_generate(CodeBuffer* code, bool all) {
6269   if (UnsafeCopyMemory::_table == NULL) {
6270     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
6271   }
6272   StubGenerator g(code, all);
6273 }