1 /*
   2  * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "memory/universe.hpp"
  33 #include "nativeInst_aarch64.hpp"
  34 #include "oops/instanceOop.hpp"
  35 #include "oops/method.hpp"
  36 #include "oops/objArrayKlass.hpp"
  37 #include "oops/oop.inline.hpp"
  38 #include "prims/methodHandles.hpp"
  39 #include "runtime/frame.inline.hpp"
  40 #include "runtime/handles.inline.hpp"
  41 #include "runtime/sharedRuntime.hpp"
  42 #include "runtime/stubCodeGenerator.hpp"
  43 #include "runtime/stubRoutines.hpp"
  44 #include "runtime/thread.inline.hpp"
  45 #include "utilities/align.hpp"
  46 #include "utilities/powerOfTwo.hpp"
  47 #ifdef COMPILER2
  48 #include "opto/runtime.hpp"
  49 #endif
  50 #if INCLUDE_ZGC
  51 #include "gc/z/zThreadLocalData.hpp"
  52 #endif
  53 
  54 // Declaration and definition of StubGenerator (no .hpp file).
  55 // For a more detailed description of the stub routine structure
  56 // see the comment in stubRoutines.hpp
  57 
  58 #undef __
  59 #define __ _masm->
  60 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  61 
  62 #ifdef PRODUCT
  63 #define BLOCK_COMMENT(str) /* nothing */
  64 #else
  65 #define BLOCK_COMMENT(str) __ block_comment(str)
  66 #endif
  67 
  68 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  69 
  70 // Stub Code definitions
  71 
  72 class StubGenerator: public StubCodeGenerator {
  73  private:
  74 
  75 #ifdef PRODUCT
  76 #define inc_counter_np(counter) ((void)0)
  77 #else
  78   void inc_counter_np_(int& counter) {
  79     __ lea(rscratch2, ExternalAddress((address)&counter));
  80     __ ldrw(rscratch1, Address(rscratch2));
  81     __ addw(rscratch1, rscratch1, 1);
  82     __ strw(rscratch1, Address(rscratch2));
  83   }
  84 #define inc_counter_np(counter) \
  85   BLOCK_COMMENT("inc_counter " #counter); \
  86   inc_counter_np_(counter);
  87 #endif
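
       // Illustrative use (a sketch, not code taken from this file): a copy stub
       // could bump one of the non-product SharedRuntime counters with, e.g.,
       //
       //   inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr);
       //
       // In PRODUCT builds the macro expands to a no-op, so the counter
       // load/add/store sequence (and its use of rscratch1/rscratch2) disappears.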
  88 
  89   // Call stubs are used to call Java from C
  90   //
  91   // Arguments:
  92   //    c_rarg0:   call wrapper address                   address
  93   //    c_rarg1:   result                                 address
  94   //    c_rarg2:   result type                            BasicType
  95   //    c_rarg3:   method                                 Method*
  96   //    c_rarg4:   (interpreter) entry point              address
  97   //    c_rarg5:   parameters                             intptr_t*
  98   //    c_rarg6:   parameter size (in words)              int
  99   //    c_rarg7:   thread                                 Thread*
 100   //
 101   // There is no return from the stub itself as any Java result
 102   // is written to result
 103   //
 104   // we save r30 (lr) as the return PC at the base of the frame and
 105   // link r29 (fp) below it as the frame pointer, installing sp (r31)
 106   // into fp.
 107   //
 108   // we save r0-r7, which accounts for all the c arguments.
 109   //
 110   // TODO: strictly do we need to save them all? they are treated as
 111   // volatile by C so could we omit saving the ones we are going to
 112   // place in global registers (thread? method?) or those we only use
 113   // during setup of the Java call?
 114   //
 115   // we don't need to save r8 which C uses as an indirect result location
 116   // return register.
 117   //
 118   // we don't need to save r9-r15 which both C and Java treat as
 119   // volatile
 120   //
 121   // we don't need to save r16-r18 because Java does not use them
 122   //
 123   // we save r19-r28 which Java uses as scratch registers and C
 124   // expects to be callee-save
 125   //
 126   // we save the bottom 64 bits of each value stored in v8-v15; it is
 127   // the responsibility of the caller to preserve larger values.
 128   //
 129   // so the stub frame looks like this when we enter Java code
 130   //
 131   //     [ return_from_Java     ] <--- sp
 132   //     [ argument word n      ]
 133   //      ...
 134   // -27 [ argument word 1      ]
 135   // -26 [ saved v15            ] <--- sp_after_call
 136   // -25 [ saved v14            ]
 137   // -24 [ saved v13            ]
 138   // -23 [ saved v12            ]
 139   // -22 [ saved v11            ]
 140   // -21 [ saved v10            ]
 141   // -20 [ saved v9             ]
 142   // -19 [ saved v8             ]
 143   // -18 [ saved r28            ]
 144   // -17 [ saved r27            ]
 145   // -16 [ saved r26            ]
 146   // -15 [ saved r25            ]
 147   // -14 [ saved r24            ]
 148   // -13 [ saved r23            ]
 149   // -12 [ saved r22            ]
 150   // -11 [ saved r21            ]
 151   // -10 [ saved r20            ]
 152   //  -9 [ saved r19            ]
 153   //  -8 [ call wrapper    (r0) ]
 154   //  -7 [ result          (r1) ]
 155   //  -6 [ result type     (r2) ]
 156   //  -5 [ method          (r3) ]
 157   //  -4 [ entry point     (r4) ]
 158   //  -3 [ parameters      (r5) ]
 159   //  -2 [ parameter size  (r6) ]
 160   //  -1 [ thread (r7)          ]
 161   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 162   //   1 [ saved lr       (r30) ]
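       //
       // A worked example of the layout above (assuming 8-byte words): the save
       // area ends at fp - 26 * wordSize, which is exactly what the frame set-up
       // code below computes with
       //
       //   __ sub(sp, rfp, -sp_after_call_off * wordSize);  // sp = fp - 26 * 8
       //
       // and each saved slot is addressed as Address(rfp, <off> * wordSize),
       // e.g. Address(rfp, thread_off * wordSize) == fp - 8 for the saved thread.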
 163 
 164   // Call stub stack layout word offsets from fp
 165   enum call_stub_layout {
 166     sp_after_call_off = -26,
 167 
 168     d15_off            = -26,
 169     d13_off            = -24,
 170     d11_off            = -22,
 171     d9_off             = -20,
 172 
 173     r28_off            = -18,
 174     r26_off            = -16,
 175     r24_off            = -14,
 176     r22_off            = -12,
 177     r20_off            = -10,
 178     call_wrapper_off   =  -8,
 179     result_off         =  -7,
 180     result_type_off    =  -6,
 181     method_off         =  -5,
 182     entry_point_off    =  -4,
 183     parameter_size_off =  -2,
 184     thread_off         =  -1,
 185     fp_f               =   0,
 186     retaddr_off        =   1,
 187   };
 188 
 189   address generate_call_stub(address& return_address) {
 190     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 191            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 192            "adjust this code");
 193 
 194     StubCodeMark mark(this, "StubRoutines", "call_stub");
 195     address start = __ pc();
 196 
 197     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 198 
 199     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 200     const Address result        (rfp, result_off         * wordSize);
 201     const Address result_type   (rfp, result_type_off    * wordSize);
 202     const Address method        (rfp, method_off         * wordSize);
 203     const Address entry_point   (rfp, entry_point_off    * wordSize);
 204     const Address parameter_size(rfp, parameter_size_off * wordSize);
 205 
 206     const Address thread        (rfp, thread_off         * wordSize);
 207 
 208     const Address d15_save      (rfp, d15_off * wordSize);
 209     const Address d13_save      (rfp, d13_off * wordSize);
 210     const Address d11_save      (rfp, d11_off * wordSize);
 211     const Address d9_save       (rfp, d9_off * wordSize);
 212 
 213     const Address r28_save      (rfp, r28_off * wordSize);
 214     const Address r26_save      (rfp, r26_off * wordSize);
 215     const Address r24_save      (rfp, r24_off * wordSize);
 216     const Address r22_save      (rfp, r22_off * wordSize);
 217     const Address r20_save      (rfp, r20_off * wordSize);
 218 
 219     // stub code
 220 
 221     address aarch64_entry = __ pc();
 222 
 223     // set up frame and move sp to end of save area
 224     __ enter();
 225     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 226 
 227     // save register parameters and Java scratch/global registers
 228     // n.b. we save thread even though it gets installed in
 229     // rthread because we want to sanity check rthread later
 230     __ str(c_rarg7,  thread);
 231     __ strw(c_rarg6, parameter_size);
 232     __ stp(c_rarg4, c_rarg5,  entry_point);
 233     __ stp(c_rarg2, c_rarg3,  result_type);
 234     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 235 
 236     __ stp(r20, r19,   r20_save);
 237     __ stp(r22, r21,   r22_save);
 238     __ stp(r24, r23,   r24_save);
 239     __ stp(r26, r25,   r26_save);
 240     __ stp(r28, r27,   r28_save);
 241 
 242     __ stpd(v9,  v8,   d9_save);
 243     __ stpd(v11, v10,  d11_save);
 244     __ stpd(v13, v12,  d13_save);
 245     __ stpd(v15, v14,  d15_save);
 246 
 247     // install Java thread in global register now we have saved
 248     // whatever value it held
 249     __ mov(rthread, c_rarg7);
 250     // And method
 251     __ mov(rmethod, c_rarg3);
 252 
 253     // set up the heapbase register
 254     __ reinit_heapbase();
 255 
 256 #ifdef ASSERT
 257     // make sure we have no pending exceptions
 258     {
 259       Label L;
 260       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 261       __ cmp(rscratch1, (u1)NULL_WORD);
 262       __ br(Assembler::EQ, L);
 263       __ stop("StubRoutines::call_stub: entered with pending exception");
 264       __ BIND(L);
 265     }
 266 #endif
 267     // pass parameters if any
 268     __ mov(esp, sp);
 269     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 270     __ andr(sp, rscratch1, -2 * wordSize);
 271 
 272     BLOCK_COMMENT("pass parameters if any");
 273     Label parameters_done;
 274     // parameter count is still in c_rarg6
 275     // and parameter pointer identifying param 1 is in c_rarg5
 276     __ cbzw(c_rarg6, parameters_done);
 277 
 278     address loop = __ pc();
 279     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 280     __ subsw(c_rarg6, c_rarg6, 1);
 281     __ push(rscratch1);
 282     __ br(Assembler::GT, loop);
 283 
 284     __ BIND(parameters_done);
 285 
 286     // call Java entry -- passing Method* and current sp
 287     //      rmethod: Method*
 288     //      r13: sender sp
 289     BLOCK_COMMENT("call Java function");
 290     __ mov(r13, sp);
 291     __ blr(c_rarg4);
 292 
 293     // we do this here because the notify will already have been done
 294     // if we get to the next instruction via an exception
 295     //
 296     // n.b. adding this instruction here affects the calculation of
 297     // whether or not a routine returns to the call stub (used when
 298     // doing stack walks) since the normal test is to check the return
 299     // pc against the address saved below. so we may need to allow for
 300     // this extra instruction in the check.
 301 
 302     // save current address for use by exception handling code
 303 
 304     return_address = __ pc();
 305 
 306     // store result depending on type (everything that is not
 307     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 308     // n.b. this assumes Java returns an integral result in r0
 309     // and a floating result in j_farg0
 310     __ ldr(j_rarg2, result);
 311     Label is_long, is_float, is_double, exit;
 312     __ ldr(j_rarg1, result_type);
 313     __ cmp(j_rarg1, (u1)T_OBJECT);
 314     __ br(Assembler::EQ, is_long);
 315     __ cmp(j_rarg1, (u1)T_LONG);
 316     __ br(Assembler::EQ, is_long);
 317     __ cmp(j_rarg1, (u1)T_FLOAT);
 318     __ br(Assembler::EQ, is_float);
 319     __ cmp(j_rarg1, (u1)T_DOUBLE);
 320     __ br(Assembler::EQ, is_double);
 321 
 322     // handle T_INT case
 323     __ strw(r0, Address(j_rarg2));
 324 
 325     __ BIND(exit);
 326 
 327     // pop parameters
 328     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 329 
 330 #ifdef ASSERT
 331     // verify that threads correspond
 332     {
 333       Label L, S;
 334       __ ldr(rscratch1, thread);
 335       __ cmp(rthread, rscratch1);
 336       __ br(Assembler::NE, S);
 337       __ get_thread(rscratch1);
 338       __ cmp(rthread, rscratch1);
 339       __ br(Assembler::EQ, L);
 340       __ BIND(S);
 341       __ stop("StubRoutines::call_stub: threads must correspond");
 342       __ BIND(L);
 343     }
 344 #endif
 345 
 346     // restore callee-save registers
 347     __ ldpd(v15, v14,  d15_save);
 348     __ ldpd(v13, v12,  d13_save);
 349     __ ldpd(v11, v10,  d11_save);
 350     __ ldpd(v9,  v8,   d9_save);
 351 
 352     __ ldp(r28, r27,   r28_save);
 353     __ ldp(r26, r25,   r26_save);
 354     __ ldp(r24, r23,   r24_save);
 355     __ ldp(r22, r21,   r22_save);
 356     __ ldp(r20, r19,   r20_save);
 357 
 358     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 359     __ ldrw(c_rarg2, result_type);
 360     __ ldr(c_rarg3,  method);
 361     __ ldp(c_rarg4, c_rarg5,  entry_point);
 362     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 363 
 364     // leave frame and return to caller
 365     __ leave();
 366     __ ret(lr);
 367 
 368     // handle return types different from T_INT
 369 
 370     __ BIND(is_long);
 371     __ str(r0, Address(j_rarg2, 0));
 372     __ br(Assembler::AL, exit);
 373 
 374     __ BIND(is_float);
 375     __ strs(j_farg0, Address(j_rarg2, 0));
 376     __ br(Assembler::AL, exit);
 377 
 378     __ BIND(is_double);
 379     __ strd(j_farg0, Address(j_rarg2, 0));
 380     __ br(Assembler::AL, exit);
 381 
 382     return start;
 383   }
 384 
 385   // Return point for a Java call if there's an exception thrown in
 386   // Java code.  The exception is caught and transformed into a
 387   // pending exception stored in JavaThread that can be tested from
 388   // within the VM.
 389   //
 390   // Note: Usually the parameters are removed by the callee. In case
 391   // of an exception crossing an activation frame boundary, that is
 392   // not the case if the callee is compiled code => need to setup the
 393   // rsp.
 394   //
 395   // r0: exception oop
 396 
 397   address generate_catch_exception() {
 398     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 399     address start = __ pc();
 400 
 401     // same as in generate_call_stub():
 402     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 403     const Address thread        (rfp, thread_off         * wordSize);
 404 
 405 #ifdef ASSERT
 406     // verify that threads correspond
 407     {
 408       Label L, S;
 409       __ ldr(rscratch1, thread);
 410       __ cmp(rthread, rscratch1);
 411       __ br(Assembler::NE, S);
 412       __ get_thread(rscratch1);
 413       __ cmp(rthread, rscratch1);
 414       __ br(Assembler::EQ, L);
 415       __ bind(S);
 416       __ stop("StubRoutines::catch_exception: threads must correspond");
 417       __ bind(L);
 418     }
 419 #endif
 420 
 421     // set pending exception
 422     __ verify_oop(r0);
 423 
 424     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 425     __ mov(rscratch1, (address)__FILE__);
 426     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 427     __ movw(rscratch1, (int)__LINE__);
 428     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 429 
 430     // complete return to VM
 431     assert(StubRoutines::_call_stub_return_address != NULL,
 432            "_call_stub_return_address must have been generated before");
 433     __ b(StubRoutines::_call_stub_return_address);
 434 
 435     return start;
 436   }
 437 
 438   // Continuation point for runtime calls returning with a pending
 439   // exception.  The pending exception check happened in the runtime
 440   // or native call stub.  The pending exception in Thread is
 441   // converted into a Java-level exception.
 442   //
 443   // Contract with Java-level exception handlers:
 444   // r0: exception
 445   // r3: throwing pc
 446   //
 447   // NOTE: At entry of this stub, exception-pc must be in LR !!
 448 
 449   // NOTE: this is always used as a jump target within generated code
 450   // so it just needs to be generated code with no prolog
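       //
       // In other words (a sketch of typical use, not code from this file): a
       // stub that has just returned from a VM call loads the value at
       // Thread::pending_exception_offset() from rthread and, if it is non-zero,
       // branches to StubRoutines::forward_exception_entry() with lr still
       // holding the return address into the Java caller.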
 451 
 452   address generate_forward_exception() {
 453     StubCodeMark mark(this, "StubRoutines", "forward exception");
 454     address start = __ pc();
 455 
 456     // Upon entry, LR points to the return address returning into
 457     // Java (interpreted or compiled) code; i.e., the return address
 458     // becomes the throwing pc.
 459     //
 460     // Arguments pushed before the runtime call are still on the stack
 461     // but the exception handler will reset the stack pointer ->
 462     // ignore them.  A potential result in registers can be ignored as
 463     // well.
 464 
 465 #ifdef ASSERT
 466     // make sure this code is only executed if there is a pending exception
 467     {
 468       Label L;
 469       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 470       __ cbnz(rscratch1, L);
 471       __ stop("StubRoutines::forward exception: no pending exception (1)");
 472       __ bind(L);
 473     }
 474 #endif
 475 
 476     // compute exception handler into r19
 477 
 478     // call the VM to find the handler address associated with the
 479     // caller address. pass thread in r0 and caller pc (ret address)
 480     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 481     // the stack.
 482     __ mov(c_rarg1, lr);
 483     // lr will be trashed by the VM call so we move it to R19
 484     // (callee-saved) because we also need to pass it to the handler
 485     // returned by this call.
 486     __ mov(r19, lr);
 487     BLOCK_COMMENT("call exception_handler_for_return_address");
 488     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 489                          SharedRuntime::exception_handler_for_return_address),
 490                     rthread, c_rarg1);
 491     // we should not really care that lr is no longer the callee
 492     // address. we saved the value the handler needs in r19 so we can
 493     // just copy it to r3. however, the C2 handler will push its own
 494     // frame and then calls into the VM and the VM code asserts that
 495     // the PC for the frame above the handler belongs to a compiled
 496     // Java method. So, we restore lr here to satisfy that assert.
 497     __ mov(lr, r19);
 498     // setup r0 & r3 & clear pending exception
 499     __ mov(r3, r19);
 500     __ mov(r19, r0);
 501     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 502     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 503 
 504 #ifdef ASSERT
 505     // make sure exception is set
 506     {
 507       Label L;
 508       __ cbnz(r0, L);
 509       __ stop("StubRoutines::forward exception: no pending exception (2)");
 510       __ bind(L);
 511     }
 512 #endif
 513 
 514     // continue at exception handler
 515     // r0: exception
 516     // r3: throwing pc
 517     // r19: exception handler
 518     __ verify_oop(r0);
 519     __ br(r19);
 520 
 521     return start;
 522   }
 523 
 524   // Non-destructive plausibility checks for oops
 525   //
 526   // Arguments:
 527   //    r0: oop to verify
 528   //    rscratch1: error message
 529   //
 530   // Stack after saving c_rarg3:
 531   //    [tos + 0]: saved c_rarg3
 532   //    [tos + 1]: saved c_rarg2
 533   //    [tos + 2]: saved lr
 534   //    [tos + 3]: saved rscratch2
 535   //    [tos + 4]: saved r0
 536   //    [tos + 5]: saved rscratch1
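       //
       // A minimal (hypothetical) call site honouring this contract might look
       // like the sketch below; oop_to_check, msg and verify_oop_stub are
       // placeholders, not names used elsewhere in this file:
       //
       //   __ mov(r0, oop_to_check);                            // oop under test
       //   __ lea(rscratch1, ExternalAddress((address)msg));    // error message
       //   __ lea(rscratch2, ExternalAddress(verify_oop_stub)); // this stub's entry
       //   __ blr(rscratch2);                      // returns here if the oop looks sane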
 537   address generate_verify_oop() {
 538 
 539     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 540     address start = __ pc();
 541 
 542     Label exit, error;
 543 
 544     // save c_rarg2 and c_rarg3
 545     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 546 
 547     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 548     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 549     __ ldr(c_rarg3, Address(c_rarg2));
 550     __ add(c_rarg3, c_rarg3, 1);
 551     __ str(c_rarg3, Address(c_rarg2));
 552 
 553     // object is in r0
 554     // make sure object is 'reasonable'
 555     __ cbz(r0, exit); // if obj is NULL it is OK
 556 
 557 #if INCLUDE_ZGC
 558     if (UseZGC) {
 559       // Check if mask is good.
 560       // verifies that ZAddressBadMask & r0 == 0
 561       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 562       __ andr(c_rarg2, r0, c_rarg3);
 563       __ cbnz(c_rarg2, error);
 564     }
 565 #endif
 566 
 567     // Check if the oop is in the right area of memory
 568     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 569     __ andr(c_rarg2, r0, c_rarg3);
 570     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 571 
 572     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 573     // instruction here because the flags register is live.
 574     __ eor(c_rarg2, c_rarg2, c_rarg3);
 575     __ cbnz(c_rarg2, error);
 576 
 577     // make sure klass is 'reasonable', i.e. not NULL.
 578     __ load_klass(r0, r0);  // get klass
 579     __ cbz(r0, error);      // if klass is NULL it is broken
 580 
 581     // return if everything seems ok
 582     __ bind(exit);
 583 
 584     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 585     __ ret(lr);
 586 
 587     // handle errors
 588     __ bind(error);
 589     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 590 
 591     __ push(RegSet::range(r0, r29), sp);
 592     // debug(char* msg, int64_t pc, int64_t regs[])
 593     __ mov(c_rarg0, rscratch1);      // pass address of error message
 594     __ mov(c_rarg1, lr);             // pass return address
 595     __ mov(c_rarg2, sp);             // pass address of regs on stack
 596 #ifndef PRODUCT
 597     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 598 #endif
 599     BLOCK_COMMENT("call MacroAssembler::debug");
 600     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 601     __ blr(rscratch1);
 602     __ hlt(0);
 603 
 604     return start;
 605   }
 606 
 607   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 608 
 609   // The inner part of zero_words().  This is the bulk operation,
 610   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 611   // caller is responsible for zeroing the last few words.
 612   //
 613   // Inputs:
 614   // r10: the HeapWord-aligned base address of an array to zero.
 615   // r11: the count in HeapWords, r11 > 0.
 616   //
 617   // Returns r10 and r11, adjusted for the caller to clear.
 618   // r10: the base address of the tail of words left to clear.
 619   // r11: the number of words in the tail.
 620   //      r11 < MacroAssembler::zero_words_block_size.
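       //
       // A hedged sketch of a caller honouring this contract (the real caller is
       // MacroAssembler::zero_words; base/count below stand for whatever values
       // the caller has, they are not registers named here):
       //
       //   __ mov(r10, base);     // HeapWord-aligned base address
       //   __ mov(r11, count);    // word count, > 0
       //   __ far_call(RuntimeAddress(StubRoutines::aarch64::zero_blocks()));
       //   // on return, r10/r11 describe a tail of fewer than
       //   // MacroAssembler::zero_words_block_size words to clear inline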
 621 
 622   address generate_zero_blocks() {
 623     Label done;
 624     Label base_aligned;
 625 
 626     Register base = r10, cnt = r11;
 627 
 628     __ align(CodeEntryAlignment);
 629     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 630     address start = __ pc();
 631 
 632     if (UseBlockZeroing) {
 633       int zva_length = VM_Version::zva_length();
 634 
 635       // Ensure ZVA length can be divided by 16. This is required by
 636       // the subsequent operations.
 637       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 638 
 639       __ tbz(base, 3, base_aligned);
 640       __ str(zr, Address(__ post(base, 8)));
 641       __ sub(cnt, cnt, 1);
 642       __ bind(base_aligned);
 643 
 644       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 645       // alignment.
 646       Label small;
 647       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 648       __ subs(rscratch1, cnt, low_limit >> 3);
 649       __ br(Assembler::LT, small);
 650       __ zero_dcache_blocks(base, cnt);
 651       __ bind(small);
 652     }
 653 
 654     {
 655       // Number of stp instructions we'll unroll
 656       const int unroll =
 657         MacroAssembler::zero_words_block_size / 2;
 658       // Clear the remaining blocks.
 659       Label loop;
 660       __ subs(cnt, cnt, unroll * 2);
 661       __ br(Assembler::LT, done);
 662       __ bind(loop);
 663       for (int i = 0; i < unroll; i++)
 664         __ stp(zr, zr, __ post(base, 16));
 665       __ subs(cnt, cnt, unroll * 2);
 666       __ br(Assembler::GE, loop);
 667       __ bind(done);
 668       __ add(cnt, cnt, unroll * 2);
 669     }
 670 
 671     __ ret(lr);
 672 
 673     return start;
 674   }
 675 
 676 
 677   typedef enum {
 678     copy_forwards = 1,
 679     copy_backwards = -1
 680   } copy_direction;
 681 
 682   // Bulk copy of blocks of 8 words.
 683   //
 684   // count is a count of words.
 685   //
 686   // Precondition: count >= 8
 687   //
 688   // Postconditions:
 689   //
 690   // The least significant bit of count contains the remaining count
 691   // of words to copy.  The rest of count is trash.
 692   //
 693   // s and d are adjusted to point to the remaining words to copy
 694   //
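       // Worked example of the contract (arithmetic only): with count == 13 the
       // first subs(count, count, 16) borrows, so we fall straight into the
       // drain and store the 8 words already loaded; bit 2 of the now-negative
       // count is set, so 4 more words are copied, and bit 1 is clear, so the
       // 2-word step is skipped.  That leaves bit 0 set: one word remains for
       // the caller, with s and d already advanced past the 12 words copied.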
 695   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 696                            copy_direction direction) {
 697     int unit = wordSize * direction;
 698     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 699 
 700     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 701       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 702     const Register stride = r13;
 703 
 704     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 705     assert_different_registers(s, d, count, rscratch1);
 706 
 707     Label again, drain;
 708     const char *stub_name;
 709     if (direction == copy_forwards)
 710       stub_name = "forward_copy_longs";
 711     else
 712       stub_name = "backward_copy_longs";
 713 
 714     __ align(CodeEntryAlignment);
 715 
 716     StubCodeMark mark(this, "StubRoutines", stub_name);
 717 
 718     __ bind(start);
 719 
 720     Label unaligned_copy_long;
 721     if (AvoidUnalignedAccesses) {
 722       __ tbnz(d, 3, unaligned_copy_long);
 723     }
 724 
 725     if (direction == copy_forwards) {
 726       __ sub(s, s, bias);
 727       __ sub(d, d, bias);
 728     }
 729 
 730 #ifdef ASSERT
 731     // Make sure we are never given < 8 words
 732     {
 733       Label L;
 734       __ cmp(count, (u1)8);
 735       __ br(Assembler::GE, L);
 736       __ stop("generate_copy_longs called with < 8 words");
 737       __ bind(L);
 738     }
 739 #endif
 740 
 741     // Fill 8 registers
 742     if (UseSIMDForMemoryOps) {
 743       __ ldpq(v0, v1, Address(s, 4 * unit));
 744       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 745     } else {
 746       __ ldp(t0, t1, Address(s, 2 * unit));
 747       __ ldp(t2, t3, Address(s, 4 * unit));
 748       __ ldp(t4, t5, Address(s, 6 * unit));
 749       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 750     }
 751 
 752     __ subs(count, count, 16);
 753     __ br(Assembler::LO, drain);
 754 
 755     int prefetch = PrefetchCopyIntervalInBytes;
 756     bool use_stride = false;
 757     if (direction == copy_backwards) {
 758        use_stride = prefetch > 256;
 759        prefetch = -prefetch;
 760        if (use_stride) __ mov(stride, prefetch);
 761     }
 762 
 763     __ bind(again);
 764 
 765     if (PrefetchCopyIntervalInBytes > 0)
 766       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 767 
 768     if (UseSIMDForMemoryOps) {
 769       __ stpq(v0, v1, Address(d, 4 * unit));
 770       __ ldpq(v0, v1, Address(s, 4 * unit));
 771       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 772       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 773     } else {
 774       __ stp(t0, t1, Address(d, 2 * unit));
 775       __ ldp(t0, t1, Address(s, 2 * unit));
 776       __ stp(t2, t3, Address(d, 4 * unit));
 777       __ ldp(t2, t3, Address(s, 4 * unit));
 778       __ stp(t4, t5, Address(d, 6 * unit));
 779       __ ldp(t4, t5, Address(s, 6 * unit));
 780       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 781       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 782     }
 783 
 784     __ subs(count, count, 8);
 785     __ br(Assembler::HS, again);
 786 
 787     // Drain
 788     __ bind(drain);
 789     if (UseSIMDForMemoryOps) {
 790       __ stpq(v0, v1, Address(d, 4 * unit));
 791       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 792     } else {
 793       __ stp(t0, t1, Address(d, 2 * unit));
 794       __ stp(t2, t3, Address(d, 4 * unit));
 795       __ stp(t4, t5, Address(d, 6 * unit));
 796       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 797     }
 798 
 799     {
 800       Label L1, L2;
 801       __ tbz(count, exact_log2(4), L1);
 802       if (UseSIMDForMemoryOps) {
 803         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 804         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 805       } else {
 806         __ ldp(t0, t1, Address(s, 2 * unit));
 807         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 808         __ stp(t0, t1, Address(d, 2 * unit));
 809         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 810       }
 811       __ bind(L1);
 812 
 813       if (direction == copy_forwards) {
 814         __ add(s, s, bias);
 815         __ add(d, d, bias);
 816       }
 817 
 818       __ tbz(count, 1, L2);
 819       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 820       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 821       __ bind(L2);
 822     }
 823 
 824     __ ret(lr);
 825 
 826     if (AvoidUnalignedAccesses) {
 827       Label drain, again;
 828       // Register order for storing. Order is different for backward copy.
 829 
 830       __ bind(unaligned_copy_long);
 831 
 832       // source address is even aligned, target odd aligned
 833       //
 834       // when forward copying word pairs we read long pairs at offsets
 835       // {0, 2, 4, 6} (in long words). when backwards copying we read
 836       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 837       // address by -2 in the forwards case so we can compute the
 838       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 839       // or -1.
 840       //
 841       // when forward copying we need to store 1 word, 3 pairs and
 842       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 843       // zero offset we adjust the destination by -1, which means we
 844       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 845       //
 846       // When backwards copying we need to store 1 word, 3 pairs and
 847       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 848       // offsets {1, 3, 5, 7, 8} * unit.
 849 
 850       if (direction == copy_forwards) {
 851         __ sub(s, s, 16);
 852         __ sub(d, d, 8);
 853       }
 854 
 855       // Fill 8 registers
 856       //
 857       // for forwards copy s was offset by -16 from the original input
 858       // value of s so the register contents are at these offsets
 859       // relative to the 64 bit block addressed by that original input
 860       // and so on for each successive 64 byte block when s is updated
 861       //
 862       // t0 at offset 0,  t1 at offset 8
 863       // t2 at offset 16, t3 at offset 24
 864       // t4 at offset 32, t5 at offset 40
 865       // t6 at offset 48, t7 at offset 56
 866 
 867       // for backwards copy s was not offset so the register contents
 868       // are at these offsets into the preceding 64 byte block
 869       // relative to that original input and so on for each successive
 870       // preceding 64 byte block when s is updated. this explains the
 871       // slightly counter-intuitive looking pattern of register usage
 872       // in the stp instructions for backwards copy.
 873       //
 874       // t0 at offset -16, t1 at offset -8
 875       // t2 at offset -32, t3 at offset -24
 876       // t4 at offset -48, t5 at offset -40
 877       // t6 at offset -64, t7 at offset -56
 878 
 879       __ ldp(t0, t1, Address(s, 2 * unit));
 880       __ ldp(t2, t3, Address(s, 4 * unit));
 881       __ ldp(t4, t5, Address(s, 6 * unit));
 882       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 883 
 884       __ subs(count, count, 16);
 885       __ br(Assembler::LO, drain);
 886 
 887       int prefetch = PrefetchCopyIntervalInBytes;
 888       bool use_stride = false;
 889       if (direction == copy_backwards) {
 890          use_stride = prefetch > 256;
 891          prefetch = -prefetch;
 892          if (use_stride) __ mov(stride, prefetch);
 893       }
 894 
 895       __ bind(again);
 896 
 897       if (PrefetchCopyIntervalInBytes > 0)
 898         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 899 
 900       if (direction == copy_forwards) {
 901        // allowing for the offset of -8 the store instructions place
 902        // registers into the target 64 bit block at the following
 903        // offsets
 904        //
 905        // t0 at offset 0
 906        // t1 at offset 8,  t2 at offset 16
 907        // t3 at offset 24, t4 at offset 32
 908        // t5 at offset 40, t6 at offset 48
 909        // t7 at offset 56
 910 
 911         __ str(t0, Address(d, 1 * unit));
 912         __ stp(t1, t2, Address(d, 2 * unit));
 913         __ ldp(t0, t1, Address(s, 2 * unit));
 914         __ stp(t3, t4, Address(d, 4 * unit));
 915         __ ldp(t2, t3, Address(s, 4 * unit));
 916         __ stp(t5, t6, Address(d, 6 * unit));
 917         __ ldp(t4, t5, Address(s, 6 * unit));
 918         __ str(t7, Address(__ pre(d, 8 * unit)));
 919         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 920       } else {
 921        // d was not offset when we started so the registers are
 922        // written into the 64 bit block preceding d with the following
 923        // offsets
 924        //
 925        // t1 at offset -8
 926        // t3 at offset -24, t0 at offset -16
 927        // t5 at offset -40, t2 at offset -32
 928        // t7 at offset -56, t4 at offset -48
 929        //                   t6 at offset -64
 930        //
 931        // note that this matches the offsets previously noted for the
 932        // loads
 933 
 934         __ str(t1, Address(d, 1 * unit));
 935         __ stp(t3, t0, Address(d, 3 * unit));
 936         __ ldp(t0, t1, Address(s, 2 * unit));
 937         __ stp(t5, t2, Address(d, 5 * unit));
 938         __ ldp(t2, t3, Address(s, 4 * unit));
 939         __ stp(t7, t4, Address(d, 7 * unit));
 940         __ ldp(t4, t5, Address(s, 6 * unit));
 941         __ str(t6, Address(__ pre(d, 8 * unit)));
 942         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 943       }
 944 
 945       __ subs(count, count, 8);
 946       __ br(Assembler::HS, again);
 947 
 948       // Drain
 949       //
 950       // this uses the same pattern of offsets and register arguments
 951       // as above
 952       __ bind(drain);
 953       if (direction == copy_forwards) {
 954         __ str(t0, Address(d, 1 * unit));
 955         __ stp(t1, t2, Address(d, 2 * unit));
 956         __ stp(t3, t4, Address(d, 4 * unit));
 957         __ stp(t5, t6, Address(d, 6 * unit));
 958         __ str(t7, Address(__ pre(d, 8 * unit)));
 959       } else {
 960         __ str(t1, Address(d, 1 * unit));
 961         __ stp(t3, t0, Address(d, 3 * unit));
 962         __ stp(t5, t2, Address(d, 5 * unit));
 963         __ stp(t7, t4, Address(d, 7 * unit));
 964         __ str(t6, Address(__ pre(d, 8 * unit)));
 965       }
 966       // now we need to copy any remaining partial block, which may
 967       // include a 4 word subblock and/or a 2 word subblock.
 968       // bits 2 and 1 in the count are the tell-tale for whether we
 969       // have each such subblock
 970       {
 971         Label L1, L2;
 972         __ tbz(count, exact_log2(4), L1);
 973        // this is the same as above but copying only 4 longs hence
 974        // with only one intervening stp between the str instructions
 975        // but note that the offsets and registers still follow the
 976        // same pattern
 977         __ ldp(t0, t1, Address(s, 2 * unit));
 978         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 979         if (direction == copy_forwards) {
 980           __ str(t0, Address(d, 1 * unit));
 981           __ stp(t1, t2, Address(d, 2 * unit));
 982           __ str(t3, Address(__ pre(d, 4 * unit)));
 983         } else {
 984           __ str(t1, Address(d, 1 * unit));
 985           __ stp(t3, t0, Address(d, 3 * unit));
 986           __ str(t2, Address(__ pre(d, 4 * unit)));
 987         }
 988         __ bind(L1);
 989 
 990         __ tbz(count, 1, L2);
 991        // this is the same as above but copying only 2 longs hence
 992        // there is no intervening stp between the str instructions
 993        // but note that the offset and register patterns are still
 994        // the same
 995         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
 996         if (direction == copy_forwards) {
 997           __ str(t0, Address(d, 1 * unit));
 998           __ str(t1, Address(__ pre(d, 2 * unit)));
 999         } else {
1000           __ str(t1, Address(d, 1 * unit));
1001           __ str(t0, Address(__ pre(d, 2 * unit)));
1002         }
1003         __ bind(L2);
1004 
1005        // for forwards copy we need to re-adjust the offsets we
1006        // applied so that s and d follow the last words written
1007 
1008        if (direction == copy_forwards) {
1009          __ add(s, s, 16);
1010          __ add(d, d, 8);
1011        }
1012 
1013       }
1014 
1015       __ ret(lr);
1016       }
1017   }
1018 
1019   // Small copy: less than 16 bytes.
1020   //
1021   // NB: Ignores all of the bits of count which represent more than 15
1022   // bytes, so a caller doesn't have to mask them.
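       //
       // For example, for a byte copy (step == +/-1) the low four bits of count
       // act as chunk selectors: count == 11 (0b1011) copies an 8-byte, a 2-byte
       // and a 1-byte chunk and skips the 4-byte one.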
1023 
1024   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1025     bool is_backwards = step < 0;
1026     size_t granularity = uabs(step);
1027     int direction = is_backwards ? -1 : 1;
1028     int unit = wordSize * direction;
1029 
1030     Label Lword, Lint, Lshort, Lbyte;
1031 
1032     assert(granularity
1033            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1034 
1035     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1036 
1037     // ??? I don't know if this bit-test-and-branch is the right thing
1038     // to do.  It does a lot of jumping, resulting in several
1039     // mispredicted branches.  It might make more sense to do this
1040     // with something like Duff's device with a single computed branch.
1041 
1042     __ tbz(count, 3 - exact_log2(granularity), Lword);
1043     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1044     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1045     __ bind(Lword);
1046 
1047     if (granularity <= sizeof (jint)) {
1048       __ tbz(count, 2 - exact_log2(granularity), Lint);
1049       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1050       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1051       __ bind(Lint);
1052     }
1053 
1054     if (granularity <= sizeof (jshort)) {
1055       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1056       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1057       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1058       __ bind(Lshort);
1059     }
1060 
1061     if (granularity <= sizeof (jbyte)) {
1062       __ tbz(count, 0, Lbyte);
1063       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1064       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1065       __ bind(Lbyte);
1066     }
1067   }
1068 
1069   Label copy_f, copy_b;
1070 
1071   // All-singing all-dancing memory copy.
1072   //
1073   // Copy count units of memory from s to d.  The size of a unit is
1074   // step, which can be positive or negative depending on the direction
1075   // of copy.  If is_aligned is false, we align the source address.
1076   //
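       // The arraycopy stubs below drive it in both directions, e.g. (as in
       // generate_disjoint_copy and generate_conjoint_copy further down):
       //
       //   copy_memory(aligned, s, d, count, rscratch1, size);   // forwards
       //   copy_memory(aligned, s, d, count, rscratch1, -size);  // backwards
       //
       // where a negative step selects the descending-address copy.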
1077 
1078   void copy_memory(bool is_aligned, Register s, Register d,
1079                    Register count, Register tmp, int step) {
1080     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1081     bool is_backwards = step < 0;
1082     int granularity = uabs(step);
1083     const Register t0 = r3, t1 = r4;
1084 
1085     // For <= 96 bytes we do the copy inline. Direction doesn't matter
1086     // because we always load all the data before writing anything
1087     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1088     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1089     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1090     const Register send = r17, dend = r18;
1091 
1092     if (PrefetchCopyIntervalInBytes > 0)
1093       __ prfm(Address(s, 0), PLDL1KEEP);
1094     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1095     __ br(Assembler::HI, copy_big);
1096 
1097     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1098     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1099 
1100     __ cmp(count, u1(16/granularity));
1101     __ br(Assembler::LS, copy16);
1102 
1103     __ cmp(count, u1(64/granularity));
1104     __ br(Assembler::HI, copy80);
1105 
1106     __ cmp(count, u1(32/granularity));
1107     __ br(Assembler::LS, copy32);
1108 
1109     // 33..64 bytes
1110     if (UseSIMDForMemoryOps) {
1111       __ ldpq(v0, v1, Address(s, 0));
1112       __ ldpq(v2, v3, Address(send, -32));
1113       __ stpq(v0, v1, Address(d, 0));
1114       __ stpq(v2, v3, Address(dend, -32));
1115     } else {
1116       __ ldp(t0, t1, Address(s, 0));
1117       __ ldp(t2, t3, Address(s, 16));
1118       __ ldp(t4, t5, Address(send, -32));
1119       __ ldp(t6, t7, Address(send, -16));
1120 
1121       __ stp(t0, t1, Address(d, 0));
1122       __ stp(t2, t3, Address(d, 16));
1123       __ stp(t4, t5, Address(dend, -32));
1124       __ stp(t6, t7, Address(dend, -16));
1125     }
1126     __ b(finish);
1127 
1128     // 17..32 bytes
1129     __ bind(copy32);
1130     __ ldp(t0, t1, Address(s, 0));
1131     __ ldp(t2, t3, Address(send, -16));
1132     __ stp(t0, t1, Address(d, 0));
1133     __ stp(t2, t3, Address(dend, -16));
1134     __ b(finish);
1135 
1136     // 65..80/96 bytes
1137     // (96 bytes if SIMD because we do 32 bytes per instruction)
1138     __ bind(copy80);
1139     if (UseSIMDForMemoryOps) {
1140       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1141       __ ldpq(v4, v5, Address(send, -32));
1142       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1143       __ stpq(v4, v5, Address(dend, -32));
1144     } else {
1145       __ ldp(t0, t1, Address(s, 0));
1146       __ ldp(t2, t3, Address(s, 16));
1147       __ ldp(t4, t5, Address(s, 32));
1148       __ ldp(t6, t7, Address(s, 48));
1149       __ ldp(t8, t9, Address(send, -16));
1150 
1151       __ stp(t0, t1, Address(d, 0));
1152       __ stp(t2, t3, Address(d, 16));
1153       __ stp(t4, t5, Address(d, 32));
1154       __ stp(t6, t7, Address(d, 48));
1155       __ stp(t8, t9, Address(dend, -16));
1156     }
1157     __ b(finish);
1158 
1159     // 0..16 bytes
1160     __ bind(copy16);
1161     __ cmp(count, u1(8/granularity));
1162     __ br(Assembler::LO, copy8);
1163 
1164     // 8..16 bytes
1165     __ ldr(t0, Address(s, 0));
1166     __ ldr(t1, Address(send, -8));
1167     __ str(t0, Address(d, 0));
1168     __ str(t1, Address(dend, -8));
1169     __ b(finish);
1170 
1171     if (granularity < 8) {
1172       // 4..7 bytes
1173       __ bind(copy8);
1174       __ tbz(count, 2 - exact_log2(granularity), copy4);
1175       __ ldrw(t0, Address(s, 0));
1176       __ ldrw(t1, Address(send, -4));
1177       __ strw(t0, Address(d, 0));
1178       __ strw(t1, Address(dend, -4));
1179       __ b(finish);
1180       if (granularity < 4) {
1181         // 0..3 bytes
1182         __ bind(copy4);
1183         __ cbz(count, finish); // get rid of 0 case
1184         if (granularity == 2) {
1185           __ ldrh(t0, Address(s, 0));
1186           __ strh(t0, Address(d, 0));
1187         } else { // granularity == 1
1188           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1189           // the first and last byte.
1190           // Handle the 3 byte case by loading and storing base + count/2
1191           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1192           // This does mean that in the 1 byte case we load/store the same
1193           // byte 3 times.
1194           __ lsr(count, count, 1);
1195           __ ldrb(t0, Address(s, 0));
1196           __ ldrb(t1, Address(send, -1));
1197           __ ldrb(t2, Address(s, count));
1198           __ strb(t0, Address(d, 0));
1199           __ strb(t1, Address(dend, -1));
1200           __ strb(t2, Address(d, count));
1201         }
1202         __ b(finish);
1203       }
1204     }
1205 
1206     __ bind(copy_big);
1207     if (is_backwards) {
1208       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1209       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1210     }
1211 
1212     // Now we've got the small case out of the way, we can align the
1213     // source address on a 2-word boundary.
1214 
1215     Label aligned;
1216 
1217     if (is_aligned) {
1218       // We may have to adjust by 1 word to get s 2-word-aligned.
1219       __ tbz(s, exact_log2(wordSize), aligned);
1220       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1221       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1222       __ sub(count, count, wordSize/granularity);
1223     } else {
1224       if (is_backwards) {
1225         __ andr(rscratch2, s, 2 * wordSize - 1);
1226       } else {
1227         __ neg(rscratch2, s);
1228         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1229       }
1230       // rscratch2 is the byte adjustment needed to align s.
1231       __ cbz(rscratch2, aligned);
1232       int shift = exact_log2(granularity);
1233       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1234       __ sub(count, count, rscratch2);
1235 
1236 #if 0
1237       // ?? This code is only correct for a disjoint copy.  It may or
1238       // may not make sense to use it in that case.
1239 
1240       // Copy the first pair; s and d may not be aligned.
1241       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1242       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1243 
1244       // Align s and d, adjust count
1245       if (is_backwards) {
1246         __ sub(s, s, rscratch2);
1247         __ sub(d, d, rscratch2);
1248       } else {
1249         __ add(s, s, rscratch2);
1250         __ add(d, d, rscratch2);
1251       }
1252 #else
1253       copy_memory_small(s, d, rscratch2, rscratch1, step);
1254 #endif
1255     }
1256 
1257     __ bind(aligned);
1258 
1259     // s is now 2-word-aligned.
1260 
1261     // We have a count of units and some trailing bytes.  Adjust the
1262     // count and do a bulk copy of words.
1263     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1264     if (direction == copy_forwards)
1265       __ bl(copy_f);
1266     else
1267       __ bl(copy_b);
1268 
1269     // And the tail.
1270     copy_memory_small(s, d, count, tmp, step);
1271 
1272     if (granularity >= 8) __ bind(copy8);
1273     if (granularity >= 4) __ bind(copy4);
1274     __ bind(finish);
1275   }
1276 
1277 
1278   void clobber_registers() {
1279 #ifdef ASSERT
1280     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1281     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1282     for (Register r = r3; r <= r18; r++)
1283       if (r != rscratch1) __ mov(r, rscratch1);
1284 #endif
1285   }
1286 
1287   // Scan over array at a for count oops, verifying each one.
1288   // Preserves a and count, clobbers rscratch1 and rscratch2.
1289   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1290     Label loop, end;
1291     __ mov(rscratch1, a);
1292     __ mov(rscratch2, zr);
1293     __ bind(loop);
1294     __ cmp(rscratch2, count);
1295     __ br(Assembler::HS, end);
1296     if (size == (size_t)wordSize) {
1297       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1298       __ verify_oop(temp);
1299     } else {
1300       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1301       __ decode_heap_oop(temp); // calls verify_oop
1302     }
1303     __ add(rscratch2, rscratch2, size);
1304     __ b(loop);
1305     __ bind(end);
1306   }
1307 
1308   // Arguments:
1309   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1310   //             ignored
1311   //   is_oop  - true => oop array, so generate store check code
1312   //   name    - stub name string
1313   //
1314   // Inputs:
1315   //   c_rarg0   - source array address
1316   //   c_rarg1   - destination array address
1317   //   c_rarg2   - element count, treated as ssize_t, can be zero
1318   //
1319   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1320   // the hardware handle it.  The two dwords within qwords that span
1321   // cache line boundaries will still be loaded and stored atomically.
1322   //
1323   // Side Effects:
1324   //   disjoint_int_copy_entry is set to the no-overlap entry point
1325   //   used by generate_conjoint_int_oop_copy().
1326   //
1327   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1328                                   const char *name, bool dest_uninitialized = false) {
1329     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1330     RegSet saved_reg = RegSet::of(s, d, count);
1331     __ align(CodeEntryAlignment);
1332     StubCodeMark mark(this, "StubRoutines", name);
1333     address start = __ pc();
1334     __ enter();
1335 
1336     if (entry != NULL) {
1337       *entry = __ pc();
1338       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1339       BLOCK_COMMENT("Entry:");
1340     }
1341 
1342     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1343     if (dest_uninitialized) {
1344       decorators |= IS_DEST_UNINITIALIZED;
1345     }
1346     if (aligned) {
1347       decorators |= ARRAYCOPY_ALIGNED;
1348     }
1349 
1350     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1351     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1352 
1353     if (is_oop) {
1354       // save regs before copy_memory
1355       __ push(RegSet::of(d, count), sp);
1356     }
1357     {
1358       // UnsafeCopyMemory page error: continue after ucm
1359       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1360       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1361       copy_memory(aligned, s, d, count, rscratch1, size);
1362     }
1363 
1364     if (is_oop) {
1365       __ pop(RegSet::of(d, count), sp);
1366       if (VerifyOops)
1367         verify_oop_array(size, d, count, r16);
1368     }
1369 
1370     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1371 
1372     __ leave();
1373     __ mov(r0, zr); // return 0
1374     __ ret(lr);
1375     return start;
1376   }
1377 
1378   // Arguments:
1379   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1380   //             ignored
1381   //   is_oop  - true => oop array, so generate store check code
1382   //   name    - stub name string
1383   //
1384   // Inputs:
1385   //   c_rarg0   - source array address
1386   //   c_rarg1   - destination array address
1387   //   c_rarg2   - element count, treated as ssize_t, can be zero
1388   //
1389   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1390   // the hardware handle it.  The two dwords within qwords that span
1391   // cache line boundaries will still be loaded and stored atomically.
1392   //
1393   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1394                                  address *entry, const char *name,
1395                                  bool dest_uninitialized = false) {
1396     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1397     RegSet saved_regs = RegSet::of(s, d, count);
1398     StubCodeMark mark(this, "StubRoutines", name);
1399     address start = __ pc();
1400     __ enter();
1401 
1402     if (entry != NULL) {
1403       *entry = __ pc();
1404       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1405       BLOCK_COMMENT("Entry:");
1406     }
1407 
1408     // use fwd copy when (d-s) above_equal (count*size)
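         // (e.g. for an int copy: if d - s >= 4 * count bytes, the source is
         //  never overwritten before it is read, so the forward-copying stub
         //  at nooverlap_target is safe)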
1409     __ sub(rscratch1, d, s);
1410     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1411     __ br(Assembler::HS, nooverlap_target);
1412 
1413     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1414     if (dest_uninitialized) {
1415       decorators |= IS_DEST_UNINITIALIZED;
1416     }
1417     if (aligned) {
1418       decorators |= ARRAYCOPY_ALIGNED;
1419     }
1420 
1421     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1422     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1423 
1424     if (is_oop) {
1425       // save regs before copy_memory
1426       __ push(RegSet::of(d, count), sp);
1427     }
1428     {
1429       // UnsafeCopyMemory page error: continue after ucm
1430       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1431       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1432       copy_memory(aligned, s, d, count, rscratch1, -size);
1433     }
1434     if (is_oop) {
1435       __ pop(RegSet::of(d, count), sp);
1436       if (VerifyOops)
1437         verify_oop_array(size, d, count, r16);
1438     }
1439     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1440     __ leave();
1441     __ mov(r0, zr); // return 0
1442     __ ret(lr);
1443     return start;
1444   }
1445 
1446   // Arguments:
1447   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1448   //             ignored
1449   //   name    - stub name string
1450   //
1451   // Inputs:
1452   //   c_rarg0   - source array address
1453   //   c_rarg1   - destination array address
1454   //   c_rarg2   - element count, treated as ssize_t, can be zero
1455   //
1456   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1457   // we let the hardware handle it.  The one to eight bytes within words,
1458   // dwords or qwords that span cache line boundaries will still be loaded
1459   // and stored atomically.
1460   //
1468   // Side Effects:
1469   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1470   //   used by generate_conjoint_byte_copy().
1471   //
1472   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1473     const bool not_oop = false;
1474     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1475   }
1476 
1477   // Arguments:
1478   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1479   //             ignored
1480   //   name    - stub name string
1481   //
1482   // Inputs:
1483   //   c_rarg0   - source array address
1484   //   c_rarg1   - destination array address
1485   //   c_rarg2   - element count, treated as ssize_t, can be zero
1486   //
1487   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1488   // we let the hardware handle it.  The one to eight bytes within words,
1489   // dwords or qwords that span cache line boundaries will still be loaded
1490   // and stored atomically.
1491   //
1492   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1493                                       address* entry, const char *name) {
1494     const bool not_oop = false;
1495     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1496   }
1497 
1498   // Arguments:
1499   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1500   //             ignored
1501   //   name    - stub name string
1502   //
1503   // Inputs:
1504   //   c_rarg0   - source array address
1505   //   c_rarg1   - destination array address
1506   //   c_rarg2   - element count, treated as ssize_t, can be zero
1507   //
1508   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1509   // let the hardware handle it.  The two or four words within dwords
1510   // or qwords that span cache line boundaries will still be loaded
1511   // and stored atomically.
1512   //
1513   // Side Effects:
1514   //   disjoint_short_copy_entry is set to the no-overlap entry point
1515   //   used by generate_conjoint_short_copy().
1516   //
1517   address generate_disjoint_short_copy(bool aligned,
1518                                        address* entry, const char *name) {
1519     const bool not_oop = false;
1520     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1521   }
1522 
1523   // Arguments:
1524   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1525   //             ignored
1526   //   name    - stub name string
1527   //
1528   // Inputs:
1529   //   c_rarg0   - source array address
1530   //   c_rarg1   - destination array address
1531   //   c_rarg2   - element count, treated as ssize_t, can be zero
1532   //
1533   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1534   // let the hardware handle it.  The two or four words within dwords
1535   // or qwords that span cache line boundaries will still be loaded
1536   // and stored atomically.
1537   //
1538   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1539                                        address *entry, const char *name) {
1540     const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
1545   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1546   //             ignored
1547   //   name    - stub name string
1548   //
1549   // Inputs:
1550   //   c_rarg0   - source array address
1551   //   c_rarg1   - destination array address
1552   //   c_rarg2   - element count, treated as ssize_t, can be zero
1553   //
1554   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1555   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1557   //
1558   // Side Effects:
1559   //   disjoint_int_copy_entry is set to the no-overlap entry point
1560   //   used by generate_conjoint_int_oop_copy().
1561   //
1562   address generate_disjoint_int_copy(bool aligned, address *entry,
1563                                          const char *name, bool dest_uninitialized = false) {
1564     const bool not_oop = false;
1565     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1566   }
1567 
1568   // Arguments:
1569   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1570   //             ignored
1571   //   name    - stub name string
1572   //
1573   // Inputs:
1574   //   c_rarg0   - source array address
1575   //   c_rarg1   - destination array address
1576   //   c_rarg2   - element count, treated as ssize_t, can be zero
1577   //
1578   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1579   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1581   //
1582   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1583                                      address *entry, const char *name,
1584                                      bool dest_uninitialized = false) {
1585     const bool not_oop = false;
1586     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1587   }
1588 
1589 
1590   // Arguments:
1591   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1592   //             ignored
1593   //   name    - stub name string
1594   //
1595   // Inputs:
1596   //   c_rarg0   - source array address
1597   //   c_rarg1   - destination array address
1598   //   c_rarg2   - element count, treated as size_t, can be zero
1599   //
1600   // Side Effects:
1601   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1602   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1603   //
1604   address generate_disjoint_long_copy(bool aligned, address *entry,
1605                                           const char *name, bool dest_uninitialized = false) {
1606     const bool not_oop = false;
1607     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1608   }
1609 
1610   // Arguments:
1611   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1612   //             ignored
1613   //   name    - stub name string
1614   //
1615   // Inputs:
1616   //   c_rarg0   - source array address
1617   //   c_rarg1   - destination array address
1618   //   c_rarg2   - element count, treated as size_t, can be zero
1619   //
1620   address generate_conjoint_long_copy(bool aligned,
1621                                       address nooverlap_target, address *entry,
1622                                       const char *name, bool dest_uninitialized = false) {
1623     const bool not_oop = false;
1624     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1625   }
1626 
1627   // Arguments:
1628   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1629   //             ignored
1630   //   name    - stub name string
1631   //
1632   // Inputs:
1633   //   c_rarg0   - source array address
1634   //   c_rarg1   - destination array address
1635   //   c_rarg2   - element count, treated as size_t, can be zero
1636   //
1637   // Side Effects:
1638   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1639   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1640   //
1641   address generate_disjoint_oop_copy(bool aligned, address *entry,
1642                                      const char *name, bool dest_uninitialized) {
1643     const bool is_oop = true;
1644     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1645     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1646   }
1647 
1648   // Arguments:
1649   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1650   //             ignored
1651   //   name    - stub name string
1652   //
1653   // Inputs:
1654   //   c_rarg0   - source array address
1655   //   c_rarg1   - destination array address
1656   //   c_rarg2   - element count, treated as size_t, can be zero
1657   //
1658   address generate_conjoint_oop_copy(bool aligned,
1659                                      address nooverlap_target, address *entry,
1660                                      const char *name, bool dest_uninitialized) {
1661     const bool is_oop = true;
1662     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1663     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1664                                   name, dest_uninitialized);
1665   }
1666 
1667 
1668   // Helper for generating a dynamic type check.
1669   // Smashes rscratch1, rscratch2.
1670   void generate_type_check(Register sub_klass,
1671                            Register super_check_offset,
1672                            Register super_klass,
1673                            Label& L_success) {
1674     assert_different_registers(sub_klass, super_check_offset, super_klass);
1675 
1676     BLOCK_COMMENT("type_check:");
1677 
1678     Label L_miss;
1679 
1680     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1681                                      super_check_offset);
1682     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1683 
1684     // Fall through on failure!
1685     __ BIND(L_miss);
1686   }
1687 
1688   //
1689   //  Generate checkcasting array copy stub
1690   //
1691   //  Input:
1692   //    c_rarg0   - source array address
1693   //    c_rarg1   - destination array address
1694   //    c_rarg2   - element count, treated as ssize_t, can be zero
1695   //    c_rarg3   - size_t ckoff (super_check_offset)
1696   //    c_rarg4   - oop ckval (super_klass)
1697   //
1698   //  Output:
1699   //    r0 ==  0  -  success
1700   //    r0 == -1^K - failure, where K is partial transfer count
1701   //
1702   address generate_checkcast_copy(const char *name, address *entry,
1703                                   bool dest_uninitialized = false) {
1704 
1705     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1706 
1707     // Input registers (after setup_arg_regs)
1708     const Register from        = c_rarg0;   // source array address
1709     const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
1711     const Register ckoff       = c_rarg3;   // super_check_offset
1712     const Register ckval       = c_rarg4;   // super_klass
1713 
1714     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1715     RegSet wb_post_saved_regs = RegSet::of(count);
1716 
1717     // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // orig elements count
1719     const Register start_to    = r20;       // destination array start address
1720     const Register copied_oop  = r18;       // actual oop copied
1721     const Register r19_klass   = r19;       // oop._klass
1722 
1723     //---------------------------------------------------------------
1724     // Assembler stub will be used for this call to arraycopy
1725     // if the two arrays are subtypes of Object[] but the
1726     // destination array type is not equal to or a supertype
1727     // of the source type.  Each element must be separately
1728     // checked.
1729 
1730     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1731                                copied_oop, r19_klass, count_save);
1732 
1733     __ align(CodeEntryAlignment);
1734     StubCodeMark mark(this, "StubRoutines", name);
1735     address start = __ pc();
1736 
1737     __ enter(); // required for proper stackwalking of RuntimeStub frame
1738 
1739 #ifdef ASSERT
1740     // caller guarantees that the arrays really are different
1741     // otherwise, we would have to make conjoint checks
1742     { Label L;
1743       array_overlap_test(L, TIMES_OOP);
1744       __ stop("checkcast_copy within a single array");
1745       __ bind(L);
1746     }
1747 #endif //ASSERT
1748 
1749     // Caller of this entry point must set up the argument registers.
1750     if (entry != NULL) {
1751       *entry = __ pc();
1752       BLOCK_COMMENT("Entry:");
1753     }
1754 
    // Empty array:  Nothing to do.
1756     __ cbz(count, L_done);
1757 
1758     __ push(RegSet::of(r18, r19, r20, r21), sp);
1759 
1760 #ifdef ASSERT
1761     BLOCK_COMMENT("assert consistent ckoff/ckval");
1762     // The ckoff and ckval must be mutually consistent,
1763     // even though caller generates both.
1764     { Label L;
1765       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1766       __ ldrw(start_to, Address(ckval, sco_offset));
1767       __ cmpw(ckoff, start_to);
1768       __ br(Assembler::EQ, L);
1769       __ stop("super_check_offset inconsistent");
1770       __ bind(L);
1771     }
1772 #endif //ASSERT
1773 
1774     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1775     bool is_oop = true;
1776     if (dest_uninitialized) {
1777       decorators |= IS_DEST_UNINITIALIZED;
1778     }
1779 
1780     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1781     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1782 
1783     // save the original count
1784     __ mov(count_save, count);
1785 
1786     // Copy from low to high addresses
1787     __ mov(start_to, to);              // Save destination array start address
1788     __ b(L_load_element);
1789 
1790     // ======== begin loop ========
1791     // (Loop is rotated; its entry is L_load_element.)
1792     // Loop control:
1793     //   for (; count != 0; count--) {
1794     //     copied_oop = load_heap_oop(from++);
1795     //     ... generate_type_check ...;
1796     //     store_heap_oop(to++, copied_oop);
1797     //   }
1798     __ align(OptoLoopAlignment);
1799 
1800     __ BIND(L_store_element);
1801     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1802     __ sub(count, count, 1);
1803     __ cbz(count, L_do_card_marks);
1804 
1805     // ======== loop entry is here ========
1806     __ BIND(L_load_element);
1807     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1808     __ cbz(copied_oop, L_store_element);
1809 
1810     __ load_klass(r19_klass, copied_oop);// query the object klass
1811     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1812     // ======== end loop ========
1813 
1814     // It was a real error; we must depend on the caller to finish the job.
1815     // Register count = remaining oops, count_orig = total oops.
1816     // Emit GC store barriers for the oops we have copied and report
1817     // their number to the caller.
1818 
1819     __ subs(count, count_save, count);     // K = partially copied oop count
1820     __ eon(count, count, zr);                   // report (-1^K) to caller
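    // eon with zr is a bitwise NOT, so count now holds ~K == -1 ^ K; the
    // caller recovers the partial transfer count as K = -1 ^ r0.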
1821     __ br(Assembler::EQ, L_done_pop);
1822 
1823     __ BIND(L_do_card_marks);
1824     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1825 
1826     __ bind(L_done_pop);
1827     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1828     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1829 
1830     __ bind(L_done);
1831     __ mov(r0, count);
1832     __ leave();
1833     __ ret(lr);
1834 
1835     return start;
1836   }
1837 
1838   // Perform range checks on the proposed arraycopy.
1839   // Kills temp, but nothing else.
1840   // Also, clean the sign bits of src_pos and dst_pos.
1841   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1842                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
1844                               Register dst_pos, // destination position (c_rarg3)
1845                               Register length,
1846                               Register temp,
1847                               Label& L_failed) {
1848     BLOCK_COMMENT("arraycopy_range_checks:");
1849 
1850     assert_different_registers(rscratch1, temp);
1851 
1852     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1853     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1854     __ addw(temp, length, src_pos);
1855     __ cmpw(temp, rscratch1);
1856     __ br(Assembler::HI, L_failed);
1857 
1858     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1859     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1860     __ addw(temp, length, dst_pos);
1861     __ cmpw(temp, rscratch1);
1862     __ br(Assembler::HI, L_failed);
1863 
1864     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1865     __ movw(src_pos, src_pos);
1866     __ movw(dst_pos, dst_pos);
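    // (A 32-bit register write zero-extends on AArch64, so "movw wN, wN"
    //  clears bits 63:32 of the corresponding X register.)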
1867 
1868     BLOCK_COMMENT("arraycopy_range_checks done");
1869   }
1870 
1871   // These stubs get called from some dumb test routine.
1872   // I'll write them properly when they're called from
1873   // something that's actually doing something.
1874   static void fake_arraycopy_stub(address src, address dst, int count) {
1875     assert(count == 0, "huh?");
1876   }
1877 
1878 
1879   //
1880   //  Generate 'unsafe' array copy stub
1881   //  Though just as safe as the other stubs, it takes an unscaled
1882   //  size_t argument instead of an element count.
1883   //
1884   //  Input:
1885   //    c_rarg0   - source array address
1886   //    c_rarg1   - destination array address
1887   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1888   //
1889   // Examines the alignment of the operands and dispatches
1890   // to a long, int, short, or byte copy loop.
1891   //
1892   address generate_unsafe_copy(const char *name,
1893                                address byte_copy_entry,
1894                                address short_copy_entry,
1895                                address int_copy_entry,
1896                                address long_copy_entry) {
1897     Label L_long_aligned, L_int_aligned, L_short_aligned;
1898     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1899 
1900     __ align(CodeEntryAlignment);
1901     StubCodeMark mark(this, "StubRoutines", name);
1902     address start = __ pc();
1903     __ enter(); // required for proper stackwalking of RuntimeStub frame
1904 
1905     // bump this on entry, not on exit:
1906     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1907 
1908     __ orr(rscratch1, s, d);
1909     __ orr(rscratch1, rscratch1, count);
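    // The OR of s, d and count has a given low bit set iff at least one of
    // the three has it set, so the alignment tests below cover all of them
    // at once.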
1910 
1911     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1912     __ cbz(rscratch1, L_long_aligned);
1913     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1914     __ cbz(rscratch1, L_int_aligned);
1915     __ tbz(rscratch1, 0, L_short_aligned);
1916     __ b(RuntimeAddress(byte_copy_entry));
1917 
1918     __ BIND(L_short_aligned);
1919     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1920     __ b(RuntimeAddress(short_copy_entry));
1921     __ BIND(L_int_aligned);
1922     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1923     __ b(RuntimeAddress(int_copy_entry));
1924     __ BIND(L_long_aligned);
1925     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1926     __ b(RuntimeAddress(long_copy_entry));
1927 
1928     return start;
1929   }
1930 
1931   //
1932   //  Generate generic array copy stubs
1933   //
1934   //  Input:
1935   //    c_rarg0    -  src oop
1936   //    c_rarg1    -  src_pos (32-bits)
1937   //    c_rarg2    -  dst oop
1938   //    c_rarg3    -  dst_pos (32-bits)
1939   //    c_rarg4    -  element count (32-bits)
1940   //
1941   //  Output:
1942   //    r0 ==  0  -  success
1943   //    r0 == -1^K - failure, where K is partial transfer count
1944   //
1945   address generate_generic_copy(const char *name,
1946                                 address byte_copy_entry, address short_copy_entry,
1947                                 address int_copy_entry, address oop_copy_entry,
1948                                 address long_copy_entry, address checkcast_copy_entry) {
1949 
1950     Label L_failed, L_objArray;
1951     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1952 
1953     // Input registers
1954     const Register src        = c_rarg0;  // source array oop
1955     const Register src_pos    = c_rarg1;  // source position
1956     const Register dst        = c_rarg2;  // destination array oop
1957     const Register dst_pos    = c_rarg3;  // destination position
1958     const Register length     = c_rarg4;
1959 
1960 
1961     // Registers used as temps
1962     const Register dst_klass  = c_rarg5;
1963 
1964     __ align(CodeEntryAlignment);
1965 
1966     StubCodeMark mark(this, "StubRoutines", name);
1967 
1968     address start = __ pc();
1969 
1970     __ enter(); // required for proper stackwalking of RuntimeStub frame
1971 
1972     // bump this on entry, not on exit:
1973     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1974 
1975     //-----------------------------------------------------------------------
1976     // Assembler stub will be used for this call to arraycopy
1977     // if the following conditions are met:
1978     //
1979     // (1) src and dst must not be null.
1980     // (2) src_pos must not be negative.
1981     // (3) dst_pos must not be negative.
1982     // (4) length  must not be negative.
1983     // (5) src klass and dst klass should be the same and not NULL.
1984     // (6) src and dst should be arrays.
1985     // (7) src_pos + length must not exceed length of src.
1986     // (8) dst_pos + length must not exceed length of dst.
1987     //
1988 
1989     //  if (src == NULL) return -1;
1990     __ cbz(src, L_failed);
1991 
1992     //  if (src_pos < 0) return -1;
1993     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
1994 
1995     //  if (dst == NULL) return -1;
1996     __ cbz(dst, L_failed);
1997 
1998     //  if (dst_pos < 0) return -1;
1999     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2000 
2001     // registers used as temp
2002     const Register scratch_length    = r16; // elements count to copy
2003     const Register scratch_src_klass = r17; // array klass
2004     const Register lh                = r18; // layout helper
2005 
2006     //  if (length < 0) return -1;
2007     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2008     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2009 
2010     __ load_klass(scratch_src_klass, src);
2011 #ifdef ASSERT
2012     //  assert(src->klass() != NULL);
2013     {
2014       BLOCK_COMMENT("assert klasses not null {");
2015       Label L1, L2;
2016       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2017       __ bind(L1);
2018       __ stop("broken null klass");
2019       __ bind(L2);
2020       __ load_klass(rscratch1, dst);
2021       __ cbz(rscratch1, L1);     // this would be broken also
2022       BLOCK_COMMENT("} assert klasses not null done");
2023     }
2024 #endif
2025 
2026     // Load layout helper (32-bits)
2027     //
2028     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2029     // 32        30    24            16              8     2                 0
2030     //
2031     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2032     //
2033 
2034     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2035 
2036     // Handle objArrays completely differently...
2037     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2038     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2039     __ movw(rscratch1, objArray_lh);
2040     __ eorw(rscratch2, lh, rscratch1);
2041     __ cbzw(rscratch2, L_objArray);
2042 
2043     //  if (src->klass() != dst->klass()) return -1;
2044     __ load_klass(rscratch2, dst);
2045     __ eor(rscratch2, rscratch2, scratch_src_klass);
2046     __ cbnz(rscratch2, L_failed);
2047 
2048     //  if (!src->is_Array()) return -1;
2049     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2050 
2051     // At this point, it is known to be a typeArray (array_tag 0x3).
2052 #ifdef ASSERT
2053     {
2054       BLOCK_COMMENT("assert primitive array {");
2055       Label L;
2056       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2057       __ cmpw(lh, rscratch2);
2058       __ br(Assembler::GE, L);
2059       __ stop("must be a primitive array");
2060       __ bind(L);
2061       BLOCK_COMMENT("} assert primitive array done");
2062     }
2063 #endif
2064 
2065     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2066                            rscratch2, L_failed);
2067 
2068     // TypeArrayKlass
2069     //
2070     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2071     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2072     //
2073 
2074     const Register rscratch1_offset = rscratch1;    // array offset
2075     const Register r18_elsize = lh; // element size
2076 
2077     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2078            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2079     __ add(src, src, rscratch1_offset);           // src array offset
2080     __ add(dst, dst, rscratch1_offset);           // dst array offset
2081     BLOCK_COMMENT("choose copy loop based on element size");
2082 
2083     // next registers should be set before the jump to corresponding stub
2084     const Register from     = c_rarg0;  // source array address
2085     const Register to       = c_rarg1;  // destination array address
2086     const Register count    = c_rarg2;  // elements count
2087 
    // 'from', 'to' and 'count' must be set in this order, since they occupy
    // the same registers as 'src', 'src_pos' and 'dst'; each input is only
    // overwritten once it is no longer needed.
2090 
2091     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2092 
2093     // The possible values of elsize are 0-3, i.e. exact_log2(element
2094     // size in bytes).  We do a simple bitwise binary search.
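    // (tbnz on bit 1 separates {byte, short} from {int, long}; tbnz on bit 0
    //  then picks within each pair.)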
2095   __ BIND(L_copy_bytes);
2096     __ tbnz(r18_elsize, 1, L_copy_ints);
2097     __ tbnz(r18_elsize, 0, L_copy_shorts);
2098     __ lea(from, Address(src, src_pos));// src_addr
2099     __ lea(to,   Address(dst, dst_pos));// dst_addr
2100     __ movw(count, scratch_length); // length
2101     __ b(RuntimeAddress(byte_copy_entry));
2102 
2103   __ BIND(L_copy_shorts);
2104     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2105     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2106     __ movw(count, scratch_length); // length
2107     __ b(RuntimeAddress(short_copy_entry));
2108 
2109   __ BIND(L_copy_ints);
2110     __ tbnz(r18_elsize, 0, L_copy_longs);
2111     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2112     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2113     __ movw(count, scratch_length); // length
2114     __ b(RuntimeAddress(int_copy_entry));
2115 
2116   __ BIND(L_copy_longs);
2117 #ifdef ASSERT
2118     {
2119       BLOCK_COMMENT("assert long copy {");
2120       Label L;
2121       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2122       __ cmpw(r18_elsize, LogBytesPerLong);
2123       __ br(Assembler::EQ, L);
2124       __ stop("must be long copy, but elsize is wrong");
2125       __ bind(L);
2126       BLOCK_COMMENT("} assert long copy done");
2127     }
2128 #endif
2129     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2130     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2131     __ movw(count, scratch_length); // length
2132     __ b(RuntimeAddress(long_copy_entry));
2133 
2134     // ObjArrayKlass
2135   __ BIND(L_objArray);
2136     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2137 
2138     Label L_plain_copy, L_checkcast_copy;
2139     //  test array classes for subtyping
2140     __ load_klass(r18, dst);
2141     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2142     __ br(Assembler::NE, L_checkcast_copy);
2143 
2144     // Identically typed arrays can be copied without element-wise checks.
2145     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2146                            rscratch2, L_failed);
2147 
2148     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2149     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2150     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2151     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2152     __ movw(count, scratch_length); // length
2153   __ BIND(L_plain_copy);
2154     __ b(RuntimeAddress(oop_copy_entry));
2155 
2156   __ BIND(L_checkcast_copy);
2157     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2158     {
2159       // Before looking at dst.length, make sure dst is also an objArray.
2160       __ ldrw(rscratch1, Address(r18, lh_offset));
2161       __ movw(rscratch2, objArray_lh);
2162       __ eorw(rscratch1, rscratch1, rscratch2);
2163       __ cbnzw(rscratch1, L_failed);
2164 
2165       // It is safe to examine both src.length and dst.length.
2166       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2167                              r18, L_failed);
2168 
2169       __ load_klass(dst_klass, dst); // reload
2170 
2171       // Marshal the base address arguments now, freeing registers.
2172       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2173       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2174       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2175       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2176       __ movw(count, length);           // length (reloaded)
2177       Register sco_temp = c_rarg3;      // this register is free now
2178       assert_different_registers(from, to, count, sco_temp,
2179                                  dst_klass, scratch_src_klass);
2180       // assert_clean_int(count, sco_temp);
2181 
2182       // Generate the type check.
2183       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2184       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2185 
2186       // Smashes rscratch1, rscratch2
2187       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2188 
2189       // Fetch destination element klass from the ObjArrayKlass header.
2190       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2191       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2192       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2193 
2194       // the checkcast_copy loop needs two extra arguments:
2195       assert(c_rarg3 == sco_temp, "#3 already in place");
2196       // Set up arguments for checkcast_copy_entry.
2197       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2198       __ b(RuntimeAddress(checkcast_copy_entry));
2199     }
2200 
2201   __ BIND(L_failed);
2202     __ mov(r0, -1);
2203     __ leave();   // required for proper stackwalking of RuntimeStub frame
2204     __ ret(lr);
2205 
2206     return start;
2207   }
2208 
2209   //
2210   // Generate stub for array fill. If "aligned" is true, the
2211   // "to" address is assumed to be heapword aligned.
2212   //
2213   // Arguments for generated stub:
2214   //   to:    c_rarg0
2215   //   value: c_rarg1
2216   //   count: c_rarg2 treated as signed
2217   //
2218   address generate_fill(BasicType t, bool aligned, const char *name) {
2219     __ align(CodeEntryAlignment);
2220     StubCodeMark mark(this, "StubRoutines", name);
2221     address start = __ pc();
2222 
2223     BLOCK_COMMENT("Entry:");
2224 
2225     const Register to        = c_rarg0;  // source array address
2226     const Register value     = c_rarg1;  // value
2227     const Register count     = c_rarg2;  // elements count
2228 
2229     const Register bz_base = r10;        // base for block_zero routine
2230     const Register cnt_words = r11;      // temp register
2231 
2232     __ enter();
2233 
2234     Label L_fill_elements, L_exit1;
2235 
2236     int shift = -1;
2237     switch (t) {
2238       case T_BYTE:
2239         shift = 0;
2240         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2241         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2242         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2243         __ br(Assembler::LO, L_fill_elements);
2244         break;
2245       case T_SHORT:
2246         shift = 1;
2247         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2248         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2249         __ br(Assembler::LO, L_fill_elements);
2250         break;
2251       case T_INT:
2252         shift = 2;
2253         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2254         __ br(Assembler::LO, L_fill_elements);
2255         break;
2256       default: ShouldNotReachHere();
2257     }
2258 
2259     // Align source address at 8 bytes address boundary.
2260     Label L_skip_align1, L_skip_align2, L_skip_align4;
2261     if (!aligned) {
2262       switch (t) {
2263         case T_BYTE:
2264           // One byte misalignment happens only for byte arrays.
2265           __ tbz(to, 0, L_skip_align1);
2266           __ strb(value, Address(__ post(to, 1)));
2267           __ subw(count, count, 1);
2268           __ bind(L_skip_align1);
2269           // Fallthrough
2270         case T_SHORT:
2271           // Two bytes misalignment happens only for byte and short (char) arrays.
2272           __ tbz(to, 1, L_skip_align2);
2273           __ strh(value, Address(__ post(to, 2)));
2274           __ subw(count, count, 2 >> shift);
2275           __ bind(L_skip_align2);
2276           // Fallthrough
2277         case T_INT:
2278           // Align to 8 bytes, we know we are 4 byte aligned to start.
2279           __ tbz(to, 2, L_skip_align4);
2280           __ strw(value, Address(__ post(to, 4)));
2281           __ subw(count, count, 4 >> shift);
2282           __ bind(L_skip_align4);
2283           break;
2284         default: ShouldNotReachHere();
2285       }
2286     }
2287 
2288     //
2289     //  Fill large chunks
2290     //
2291     __ lsrw(cnt_words, count, 3 - shift); // number of words
2292     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2293     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2294     if (UseBlockZeroing) {
2295       Label non_block_zeroing, rest;
2296       // If the fill value is zero we can use the fast zero_words().
2297       __ cbnz(value, non_block_zeroing);
2298       __ mov(bz_base, to);
2299       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2300       __ zero_words(bz_base, cnt_words);
2301       __ b(rest);
2302       __ bind(non_block_zeroing);
2303       __ fill_words(to, cnt_words, value);
2304       __ bind(rest);
2305     } else {
2306       __ fill_words(to, cnt_words, value);
2307     }
2308 
2309     // Remaining count is less than 8 bytes. Fill it by a single store.
2310     // Note that the total length is no less than 8 bytes.
2311     if (t == T_BYTE || t == T_SHORT) {
2312       Label L_exit1;
2313       __ cbzw(count, L_exit1);
2314       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2315       __ str(value, Address(to, -8));    // overwrite some elements
2316       __ bind(L_exit1);
2317       __ leave();
2318       __ ret(lr);
2319     }
2320 
2321     // Handle copies less than 8 bytes.
2322     Label L_fill_2, L_fill_4, L_exit2;
2323     __ bind(L_fill_elements);
2324     switch (t) {
2325       case T_BYTE:
2326         __ tbz(count, 0, L_fill_2);
2327         __ strb(value, Address(__ post(to, 1)));
2328         __ bind(L_fill_2);
2329         __ tbz(count, 1, L_fill_4);
2330         __ strh(value, Address(__ post(to, 2)));
2331         __ bind(L_fill_4);
2332         __ tbz(count, 2, L_exit2);
2333         __ strw(value, Address(to));
2334         break;
2335       case T_SHORT:
2336         __ tbz(count, 0, L_fill_4);
2337         __ strh(value, Address(__ post(to, 2)));
2338         __ bind(L_fill_4);
2339         __ tbz(count, 1, L_exit2);
2340         __ strw(value, Address(to));
2341         break;
2342       case T_INT:
2343         __ cbzw(count, L_exit2);
2344         __ strw(value, Address(to));
2345         break;
2346       default: ShouldNotReachHere();
2347     }
2348     __ bind(L_exit2);
2349     __ leave();
2350     __ ret(lr);
2351     return start;
2352   }
2353 
2354   address generate_data_cache_writeback() {
2355     const Register line        = c_rarg0;  // address of line to write back
2356 
2357     __ align(CodeEntryAlignment);
2358 
2359     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2360 
2361     address start = __ pc();
2362     __ enter();
2363     __ cache_wb(Address(line, 0));
2364     __ leave();
2365     __ ret(lr);
2366 
2367     return start;
2368   }
2369 
2370   address generate_data_cache_writeback_sync() {
2371     const Register is_pre     = c_rarg0;  // pre or post sync
2372 
2373     __ align(CodeEntryAlignment);
2374 
2375     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2376 
    // pre wbsync is a no-op
    // post wbsync translates to a store barrier
2379 
2380     Label skip;
2381     address start = __ pc();
2382     __ enter();
2383     __ cbnz(is_pre, skip);
2384     __ cache_wbsync(false);
2385     __ bind(skip);
2386     __ leave();
2387     __ ret(lr);
2388 
2389     return start;
2390   }
2391 
2392   void generate_arraycopy_stubs() {
2393     address entry;
2394     address entry_jbyte_arraycopy;
2395     address entry_jshort_arraycopy;
2396     address entry_jint_arraycopy;
2397     address entry_oop_arraycopy;
2398     address entry_jlong_arraycopy;
2399     address entry_checkcast_arraycopy;
2400 
2401     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2402     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2403 
2404     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2405 
2406     //*** jbyte
2407     // Always need aligned and unaligned versions
2408     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2409                                                                                   "jbyte_disjoint_arraycopy");
2410     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2411                                                                                   &entry_jbyte_arraycopy,
2412                                                                                   "jbyte_arraycopy");
2413     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2414                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2415     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2416                                                                                   "arrayof_jbyte_arraycopy");
2417 
2418     //*** jshort
2419     // Always need aligned and unaligned versions
2420     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2421                                                                                     "jshort_disjoint_arraycopy");
2422     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2423                                                                                     &entry_jshort_arraycopy,
2424                                                                                     "jshort_arraycopy");
2425     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2426                                                                                     "arrayof_jshort_disjoint_arraycopy");
2427     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2428                                                                                     "arrayof_jshort_arraycopy");
2429 
2430     //*** jint
2431     // Aligned versions
2432     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2433                                                                                 "arrayof_jint_disjoint_arraycopy");
2434     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2435                                                                                 "arrayof_jint_arraycopy");
2436     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2437     // entry_jint_arraycopy always points to the unaligned version
2438     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2439                                                                                 "jint_disjoint_arraycopy");
2440     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2441                                                                                 &entry_jint_arraycopy,
2442                                                                                 "jint_arraycopy");
2443 
2444     //*** jlong
2445     // It is always aligned
2446     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2447                                                                                   "arrayof_jlong_disjoint_arraycopy");
2448     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2449                                                                                   "arrayof_jlong_arraycopy");
2450     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2451     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2452 
2453     //*** oops
2454     {
2455       // With compressed oops we need unaligned versions; notice that
2456       // we overwrite entry_oop_arraycopy.
2457       bool aligned = !UseCompressedOops;
2458 
2459       StubRoutines::_arrayof_oop_disjoint_arraycopy
2460         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2461                                      /*dest_uninitialized*/false);
2462       StubRoutines::_arrayof_oop_arraycopy
2463         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2464                                      /*dest_uninitialized*/false);
2465       // Aligned versions without pre-barriers
2466       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2467         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2468                                      /*dest_uninitialized*/true);
2469       StubRoutines::_arrayof_oop_arraycopy_uninit
2470         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2471                                      /*dest_uninitialized*/true);
2472     }
2473 
2474     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2475     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2476     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2477     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2478 
2479     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2480     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2481                                                                         /*dest_uninitialized*/true);
2482 
2483     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2484                                                               entry_jbyte_arraycopy,
2485                                                               entry_jshort_arraycopy,
2486                                                               entry_jint_arraycopy,
2487                                                               entry_jlong_arraycopy);
2488 
2489     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2490                                                                entry_jbyte_arraycopy,
2491                                                                entry_jshort_arraycopy,
2492                                                                entry_jint_arraycopy,
2493                                                                entry_oop_arraycopy,
2494                                                                entry_jlong_arraycopy,
2495                                                                entry_checkcast_arraycopy);
2496 
2497     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2498     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2499     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2500     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2501     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2502     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2503   }
2504 
2505   void generate_math_stubs() { Unimplemented(); }
2506 
2507   // Arguments:
2508   //
2509   // Inputs:
2510   //   c_rarg0   - source byte array address
2511   //   c_rarg1   - destination byte array address
2512   //   c_rarg2   - K (key) in little endian int array
2513   //
2514   address generate_aescrypt_encryptBlock() {
2515     __ align(CodeEntryAlignment);
2516     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2517 
2518     Label L_doLast;
2519 
2520     const Register from        = c_rarg0;  // source array address
2521     const Register to          = c_rarg1;  // destination array address
2522     const Register key         = c_rarg2;  // key array address
2523     const Register keylen      = rscratch1;
2524 
2525     address start = __ pc();
2526     __ enter();
2527 
2528     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
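    // keylen is the expanded key length in 32-bit ints: 44, 52 or 60 for
    // AES-128, AES-192 and AES-256 respectively (tested against 44/52 below).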
2529 
2530     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2531 
2532     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2533     __ rev32(v1, __ T16B, v1);
2534     __ rev32(v2, __ T16B, v2);
2535     __ rev32(v3, __ T16B, v3);
2536     __ rev32(v4, __ T16B, v4);
2537     __ aese(v0, v1);
2538     __ aesmc(v0, v0);
2539     __ aese(v0, v2);
2540     __ aesmc(v0, v0);
2541     __ aese(v0, v3);
2542     __ aesmc(v0, v0);
2543     __ aese(v0, v4);
2544     __ aesmc(v0, v0);
2545 
2546     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2547     __ rev32(v1, __ T16B, v1);
2548     __ rev32(v2, __ T16B, v2);
2549     __ rev32(v3, __ T16B, v3);
2550     __ rev32(v4, __ T16B, v4);
2551     __ aese(v0, v1);
2552     __ aesmc(v0, v0);
2553     __ aese(v0, v2);
2554     __ aesmc(v0, v0);
2555     __ aese(v0, v3);
2556     __ aesmc(v0, v0);
2557     __ aese(v0, v4);
2558     __ aesmc(v0, v0);
2559 
2560     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2561     __ rev32(v1, __ T16B, v1);
2562     __ rev32(v2, __ T16B, v2);
2563 
2564     __ cmpw(keylen, 44);
2565     __ br(Assembler::EQ, L_doLast);
2566 
2567     __ aese(v0, v1);
2568     __ aesmc(v0, v0);
2569     __ aese(v0, v2);
2570     __ aesmc(v0, v0);
2571 
2572     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2573     __ rev32(v1, __ T16B, v1);
2574     __ rev32(v2, __ T16B, v2);
2575 
2576     __ cmpw(keylen, 52);
2577     __ br(Assembler::EQ, L_doLast);
2578 
2579     __ aese(v0, v1);
2580     __ aesmc(v0, v0);
2581     __ aese(v0, v2);
2582     __ aesmc(v0, v0);
2583 
2584     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2585     __ rev32(v1, __ T16B, v1);
2586     __ rev32(v2, __ T16B, v2);
2587 
2588     __ BIND(L_doLast);
2589 
2590     __ aese(v0, v1);
2591     __ aesmc(v0, v0);
2592     __ aese(v0, v2);
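    // Last round: no aesmc (MixColumns) after the final aese; the remaining
    // 16 bytes at 'key' are the last round key, applied below with a plain eor.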
2593 
2594     __ ld1(v1, __ T16B, key);
2595     __ rev32(v1, __ T16B, v1);
2596     __ eor(v0, __ T16B, v0, v1);
2597 
2598     __ st1(v0, __ T16B, to);
2599 
2600     __ mov(r0, 0);
2601 
2602     __ leave();
2603     __ ret(lr);
2604 
2605     return start;
2606   }
2607 
2608   // Arguments:
2609   //
2610   // Inputs:
2611   //   c_rarg0   - source byte array address
2612   //   c_rarg1   - destination byte array address
2613   //   c_rarg2   - K (key) in little endian int array
2614   //
2615   address generate_aescrypt_decryptBlock() {
2616     assert(UseAES, "need AES instructions and misaligned SSE support");
2617     __ align(CodeEntryAlignment);
2618     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2619     Label L_doLast;
2620 
2621     const Register from        = c_rarg0;  // source array address
2622     const Register to          = c_rarg1;  // destination array address
2623     const Register key         = c_rarg2;  // key array address
2624     const Register keylen      = rscratch1;
2625 
2626     address start = __ pc();
2627     __ enter(); // required for proper stackwalking of RuntimeStub frame
2628 
2629     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2630 
2631     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2632 
2633     __ ld1(v5, __ T16B, __ post(key, 16));
2634     __ rev32(v5, __ T16B, v5);
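    // v5 (the first 16 bytes of the expanded key) is set aside here and only
    // applied at the very end, by the eor after the final aesd.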
2635 
2636     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2637     __ rev32(v1, __ T16B, v1);
2638     __ rev32(v2, __ T16B, v2);
2639     __ rev32(v3, __ T16B, v3);
2640     __ rev32(v4, __ T16B, v4);
2641     __ aesd(v0, v1);
2642     __ aesimc(v0, v0);
2643     __ aesd(v0, v2);
2644     __ aesimc(v0, v0);
2645     __ aesd(v0, v3);
2646     __ aesimc(v0, v0);
2647     __ aesd(v0, v4);
2648     __ aesimc(v0, v0);
2649 
2650     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2651     __ rev32(v1, __ T16B, v1);
2652     __ rev32(v2, __ T16B, v2);
2653     __ rev32(v3, __ T16B, v3);
2654     __ rev32(v4, __ T16B, v4);
2655     __ aesd(v0, v1);
2656     __ aesimc(v0, v0);
2657     __ aesd(v0, v2);
2658     __ aesimc(v0, v0);
2659     __ aesd(v0, v3);
2660     __ aesimc(v0, v0);
2661     __ aesd(v0, v4);
2662     __ aesimc(v0, v0);
2663 
2664     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2665     __ rev32(v1, __ T16B, v1);
2666     __ rev32(v2, __ T16B, v2);
2667 
2668     __ cmpw(keylen, 44);
2669     __ br(Assembler::EQ, L_doLast);
2670 
2671     __ aesd(v0, v1);
2672     __ aesimc(v0, v0);
2673     __ aesd(v0, v2);
2674     __ aesimc(v0, v0);
2675 
2676     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2677     __ rev32(v1, __ T16B, v1);
2678     __ rev32(v2, __ T16B, v2);
2679 
2680     __ cmpw(keylen, 52);
2681     __ br(Assembler::EQ, L_doLast);
2682 
2683     __ aesd(v0, v1);
2684     __ aesimc(v0, v0);
2685     __ aesd(v0, v2);
2686     __ aesimc(v0, v0);
2687 
2688     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2689     __ rev32(v1, __ T16B, v1);
2690     __ rev32(v2, __ T16B, v2);
2691 
2692     __ BIND(L_doLast);
2693 
2694     __ aesd(v0, v1);
2695     __ aesimc(v0, v0);
2696     __ aesd(v0, v2);
2697 
2698     __ eor(v0, __ T16B, v0, v5);
2699 
2700     __ st1(v0, __ T16B, to);
2701 
2702     __ mov(r0, 0);
2703 
2704     __ leave();
2705     __ ret(lr);
2706 
2707     return start;
2708   }
2709 
2710   // Arguments:
2711   //
2712   // Inputs:
2713   //   c_rarg0   - source byte array address
2714   //   c_rarg1   - destination byte array address
2715   //   c_rarg2   - K (key) in little endian int array
2716   //   c_rarg3   - r vector byte array address
2717   //   c_rarg4   - input length
2718   //
2719   // Output:
2720   //   x0        - input length
2721   //
2722   address generate_cipherBlockChaining_encryptAESCrypt() {
2723     assert(UseAES, "need AES instructions and misaligned SSE support");
2724     __ align(CodeEntryAlignment);
2725     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2726 
2727     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2728 
2729     const Register from        = c_rarg0;  // source array address
2730     const Register to          = c_rarg1;  // destination array address
2731     const Register key         = c_rarg2;  // key array address
2732     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2733                                            // and left with the results of the last encryption block
2734     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2735     const Register keylen      = rscratch1;
2736 
2737     address start = __ pc();
2738 
2739       __ enter();
2740 
2741       __ movw(rscratch2, len_reg);
2742 
2743       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2744 
2745       __ ld1(v0, __ T16B, rvec);
2746 
2747       __ cmpw(keylen, 52);
2748       __ br(Assembler::CC, L_loadkeys_44);
2749       __ br(Assembler::EQ, L_loadkeys_52);
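      // Dispatch on key size: keylen < 52 (AES-128) loads only v21..v31,
      // keylen == 52 (AES-192) also loads v19/v20, and otherwise (AES-256)
      // all of v17..v31 are loaded.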
2750 
2751       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2752       __ rev32(v17, __ T16B, v17);
2753       __ rev32(v18, __ T16B, v18);
2754     __ BIND(L_loadkeys_52);
2755       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2756       __ rev32(v19, __ T16B, v19);
2757       __ rev32(v20, __ T16B, v20);
2758     __ BIND(L_loadkeys_44);
2759       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2760       __ rev32(v21, __ T16B, v21);
2761       __ rev32(v22, __ T16B, v22);
2762       __ rev32(v23, __ T16B, v23);
2763       __ rev32(v24, __ T16B, v24);
2764       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2765       __ rev32(v25, __ T16B, v25);
2766       __ rev32(v26, __ T16B, v26);
2767       __ rev32(v27, __ T16B, v27);
2768       __ rev32(v28, __ T16B, v28);
2769       __ ld1(v29, v30, v31, __ T16B, key);
2770       __ rev32(v29, __ T16B, v29);
2771       __ rev32(v30, __ T16B, v30);
2772       __ rev32(v31, __ T16B, v31);
2773 
2774     __ BIND(L_aes_loop);
2775       __ ld1(v1, __ T16B, __ post(from, 16));
2776       __ eor(v0, __ T16B, v0, v1);
2777 
2778       __ br(Assembler::CC, L_rounds_44);
2779       __ br(Assembler::EQ, L_rounds_52);
2780 
2781       __ aese(v0, v17); __ aesmc(v0, v0);
2782       __ aese(v0, v18); __ aesmc(v0, v0);
2783     __ BIND(L_rounds_52);
2784       __ aese(v0, v19); __ aesmc(v0, v0);
2785       __ aese(v0, v20); __ aesmc(v0, v0);
2786     __ BIND(L_rounds_44);
2787       __ aese(v0, v21); __ aesmc(v0, v0);
2788       __ aese(v0, v22); __ aesmc(v0, v0);
2789       __ aese(v0, v23); __ aesmc(v0, v0);
2790       __ aese(v0, v24); __ aesmc(v0, v0);
2791       __ aese(v0, v25); __ aesmc(v0, v0);
2792       __ aese(v0, v26); __ aesmc(v0, v0);
2793       __ aese(v0, v27); __ aesmc(v0, v0);
2794       __ aese(v0, v28); __ aesmc(v0, v0);
2795       __ aese(v0, v29); __ aesmc(v0, v0);
2796       __ aese(v0, v30);
2797       __ eor(v0, __ T16B, v0, v31);
2798 
2799       __ st1(v0, __ T16B, __ post(to, 16));
2800 
2801       __ subw(len_reg, len_reg, 16);
2802       __ cbnzw(len_reg, L_aes_loop);
2803 
2804       __ st1(v0, __ T16B, rvec);
2805 
2806       __ mov(r0, rscratch2);
2807 
2808       __ leave();
2809       __ ret(lr);
2810 
2811       return start;
2812   }
2813 
2814   // Arguments:
2815   //
2816   // Inputs:
2817   //   c_rarg0   - source byte array address
2818   //   c_rarg1   - destination byte array address
2819   //   c_rarg2   - K (key) in little endian int array
2820   //   c_rarg3   - r vector byte array address
2821   //   c_rarg4   - input length
2822   //
2823   // Output:
2824   //   r0        - input length
2825   //
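       //
       // CBC decryption computes P[i] = D_K(C[i]) ^ C[i-1], with C[-1] taken from
       // the r vector (the IV). An illustrative scalar sketch of what the stub
       // below does (helper names are for exposition only, not code in this file):
       //
       //   for (int i = 0; i < len; i += 16) {
       //     memcpy(saved, from + i, 16);                   // keep C[i] for chaining
       //     aes_raw_decrypt_block(to + i, from + i, key);  // D_K(C[i])
       //     xor_16_bytes(to + i, prev);                    // ^ C[i-1] (or IV)
       //     memcpy(prev, saved, 16);
       //   }
       //   memcpy(rvec, prev, 16);                          // rvec keeps the last C[i]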
2826   address generate_cipherBlockChaining_decryptAESCrypt() {
2827     assert(UseAES, "need AES cryptographic extension support");
2828     __ align(CodeEntryAlignment);
2829     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2830 
2831     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2832 
2833     const Register from        = c_rarg0;  // source array address
2834     const Register to          = c_rarg1;  // destination array address
2835     const Register key         = c_rarg2;  // key array address
2836     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2837                                            // and left holding the last ciphertext block processed
2838     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2839     const Register keylen      = rscratch1;
2840 
2841     address start = __ pc();
2842 
2843       __ enter();
2844 
2845       __ movw(rscratch2, len_reg);
2846 
2847       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2848 
2849       __ ld1(v2, __ T16B, rvec);
2850 
2851       __ ld1(v31, __ T16B, __ post(key, 16));
2852       __ rev32(v31, __ T16B, v31);
2853 
2854       __ cmpw(keylen, 52);
2855       __ br(Assembler::CC, L_loadkeys_44);
2856       __ br(Assembler::EQ, L_loadkeys_52);
2857 
2858       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2859       __ rev32(v17, __ T16B, v17);
2860       __ rev32(v18, __ T16B, v18);
2861     __ BIND(L_loadkeys_52);
2862       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2863       __ rev32(v19, __ T16B, v19);
2864       __ rev32(v20, __ T16B, v20);
2865     __ BIND(L_loadkeys_44);
2866       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2867       __ rev32(v21, __ T16B, v21);
2868       __ rev32(v22, __ T16B, v22);
2869       __ rev32(v23, __ T16B, v23);
2870       __ rev32(v24, __ T16B, v24);
2871       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2872       __ rev32(v25, __ T16B, v25);
2873       __ rev32(v26, __ T16B, v26);
2874       __ rev32(v27, __ T16B, v27);
2875       __ rev32(v28, __ T16B, v28);
2876       __ ld1(v29, v30, __ T16B, key);
2877       __ rev32(v29, __ T16B, v29);
2878       __ rev32(v30, __ T16B, v30);
2879 
2880     __ BIND(L_aes_loop);
2881       __ ld1(v0, __ T16B, __ post(from, 16));
2882       __ orr(v1, __ T16B, v0, v0);
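           // v1 keeps a copy of the ciphertext block just loaded: CBC needs it as
           // the chaining value for the next block, and it is moved into v2 after
           // the store below.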
2883 
2884       __ br(Assembler::CC, L_rounds_44);
2885       __ br(Assembler::EQ, L_rounds_52);
2886 
2887       __ aesd(v0, v17); __ aesimc(v0, v0);
2888       __ aesd(v0, v18); __ aesimc(v0, v0);
2889     __ BIND(L_rounds_52);
2890       __ aesd(v0, v19); __ aesimc(v0, v0);
2891       __ aesd(v0, v20); __ aesimc(v0, v0);
2892     __ BIND(L_rounds_44);
2893       __ aesd(v0, v21); __ aesimc(v0, v0);
2894       __ aesd(v0, v22); __ aesimc(v0, v0);
2895       __ aesd(v0, v23); __ aesimc(v0, v0);
2896       __ aesd(v0, v24); __ aesimc(v0, v0);
2897       __ aesd(v0, v25); __ aesimc(v0, v0);
2898       __ aesd(v0, v26); __ aesimc(v0, v0);
2899       __ aesd(v0, v27); __ aesimc(v0, v0);
2900       __ aesd(v0, v28); __ aesimc(v0, v0);
2901       __ aesd(v0, v29); __ aesimc(v0, v0);
2902       __ aesd(v0, v30);
2903       __ eor(v0, __ T16B, v0, v31);
2904       __ eor(v0, __ T16B, v0, v2);
2905 
2906       __ st1(v0, __ T16B, __ post(to, 16));
2907       __ orr(v2, __ T16B, v1, v1);
2908 
2909       __ subw(len_reg, len_reg, 16);
2910       __ cbnzw(len_reg, L_aes_loop);
2911 
2912       __ st1(v2, __ T16B, rvec);
2913 
2914       __ mov(r0, rscratch2);
2915 
2916       __ leave();
2917       __ ret(lr);
2918 
2919     return start;
2920   }
2921 
2922   // Arguments:
2923   //
2924   // Inputs:
2925   //   c_rarg0   - byte[]  source+offset
2926   //   c_rarg1   - int[]   SHA.state
2927   //   c_rarg2   - int     offset
2928   //   c_rarg3   - int     limit
2929   //
2930   address generate_sha1_implCompress(bool multi_block, const char *name) {
2931     __ align(CodeEntryAlignment);
2932     StubCodeMark mark(this, "StubRoutines", name);
2933     address start = __ pc();
2934 
2935     Register buf   = c_rarg0;
2936     Register state = c_rarg1;
2937     Register ofs   = c_rarg2;
2938     Register limit = c_rarg3;
2939 
2940     Label keys;
2941     Label sha1_loop;
2942 
2943     // load the keys into v0..v3
2944     __ adr(rscratch1, keys);
2945     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2946     // load the 5-word state into v6, v7
2947     __ ldrq(v6, Address(state, 0));
2948     __ ldrs(v7, Address(state, 16));
2949 
2950 
2951     __ BIND(sha1_loop);
2952     // load 64 bytes of data into v16..v19
2953     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2954     __ rev32(v16, __ T16B, v16);
2955     __ rev32(v17, __ T16B, v17);
2956     __ rev32(v18, __ T16B, v18);
2957     __ rev32(v19, __ T16B, v19);
2958 
2959     // do the sha1
2960     __ addv(v4, __ T4S, v16, v0);
2961     __ orr(v20, __ T16B, v6, v6);
2962 
2963     FloatRegister d0 = v16;
2964     FloatRegister d1 = v17;
2965     FloatRegister d2 = v18;
2966     FloatRegister d3 = v19;
2967 
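         // The 80 SHA-1 rounds are performed as 20 iterations of 4 rounds each.
         // v0..v3 hold the four round constants (one per 20-round group, broadcast
         // by ld4r above). sha1c, sha1p and sha1m implement the Ch, Parity and Maj
         // round functions used for rounds 0-19, 20-39/60-79 and 40-59. The addv at
         // the top of an iteration pre-computes W+K for the following iteration,
         // and sha1su0/sha1su1 extend the message schedule while more W words are
         // still needed.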
2968     for (int round = 0; round < 20; round++) {
2969       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2970       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2971       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2972       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2973       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2974 
2975       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2976       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2977       __ sha1h(tmp2, __ T4S, v20);
2978       if (round < 5)
2979         __ sha1c(v20, __ T4S, tmp3, tmp4);
2980       else if (round < 10 || round >= 15)
2981         __ sha1p(v20, __ T4S, tmp3, tmp4);
2982       else
2983         __ sha1m(v20, __ T4S, tmp3, tmp4);
2984       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2985 
2986       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2987     }
2988 
2989     __ addv(v7, __ T2S, v7, v21);
2990     __ addv(v6, __ T4S, v6, v20);
2991 
2992     if (multi_block) {
2993       __ add(ofs, ofs, 64);
2994       __ cmp(ofs, limit);
2995       __ br(Assembler::LE, sha1_loop);
2996       __ mov(c_rarg0, ofs); // return ofs
2997     }
2998 
2999     __ strq(v6, Address(state, 0));
3000     __ strs(v7, Address(state, 16));
3001 
3002     __ ret(lr);
3003 
3004     __ bind(keys);
3005     __ emit_int32(0x5a827999);
3006     __ emit_int32(0x6ed9eba1);
3007     __ emit_int32(0x8f1bbcdc);
3008     __ emit_int32(0xca62c1d6);
3009 
3010     return start;
3011   }
3012 
3013 
3014   // Arguments:
3015   //
3016   // Inputs:
3017   //   c_rarg0   - byte[]  source+offset
3018   //   c_rarg1   - int[]   SHA.state
3019   //   c_rarg2   - int     offset
3020   //   c_rarg3   - int     limit
3021   //
3022   address generate_sha256_implCompress(bool multi_block, const char *name) {
3023     static const uint32_t round_consts[64] = {
3024       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3025       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3026       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3027       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3028       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3029       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3030       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3031       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3032       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3033       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3034       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3035       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3036       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3037       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3038       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3039       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3040     };
3041     __ align(CodeEntryAlignment);
3042     StubCodeMark mark(this, "StubRoutines", name);
3043     address start = __ pc();
3044 
3045     Register buf   = c_rarg0;
3046     Register state = c_rarg1;
3047     Register ofs   = c_rarg2;
3048     Register limit = c_rarg3;
3049 
3050     Label sha1_loop;
3051 
3052     __ stpd(v8, v9, __ pre(sp, -32));
3053     __ stpd(v10, v11, Address(sp, 16));
3054 
3055 // dga == v0
3056 // dgb == v1
3057 // dg0 == v2
3058 // dg1 == v3
3059 // dg2 == v4
3060 // t0 == v6
3061 // t1 == v7
3062 
3063     // load the 64 round constants into v16..v31 (four per register)
3064     __ lea(rscratch1, ExternalAddress((address)round_consts));
3065     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3066     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3067     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3068     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3069 
3070     // load 8 words (256 bits) state
3071     __ ldpq(v0, v1, state);
3072 
3073     __ BIND(sha1_loop);
3074     // load 64 bytes of data into v8..v11
3075     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3076     __ rev32(v8, __ T16B, v8);
3077     __ rev32(v9, __ T16B, v9);
3078     __ rev32(v10, __ T16B, v10);
3079     __ rev32(v11, __ T16B, v11);
3080 
3081     __ addv(v6, __ T4S, v8, v16);
3082     __ orr(v2, __ T16B, v0, v0);
3083     __ orr(v3, __ T16B, v1, v1);
3084 
3085     FloatRegister d0 = v8;
3086     FloatRegister d1 = v9;
3087     FloatRegister d2 = v10;
3088     FloatRegister d3 = v11;
3089 
3090 
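         // The 64 SHA-256 rounds are performed as 16 iterations of 4 rounds each.
         // v16..v31 hold the round constants loaded above (four per register);
         // as_FloatRegister(round + 17) selects the constant block whose W+K sum
         // feeds the next iteration (v16 was consumed by the addv above the loop).
         // sha256h/sha256h2 update the two 128-bit state halves, while
         // sha256su0/sha256su1 extend the message schedule for as long as new W
         // words are still required (the first 12 iterations).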
3091     for (int round = 0; round < 16; round++) {
3092       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3093       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3094       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3095       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3096 
3097       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3098        __ orr(v4, __ T16B, v2, v2);
3099       if (round < 15)
3100         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3101       __ sha256h(v2, __ T4S, v3, tmp2);
3102       __ sha256h2(v3, __ T4S, v4, tmp2);
3103       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3104 
3105       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3106     }
3107 
3108     __ addv(v0, __ T4S, v0, v2);
3109     __ addv(v1, __ T4S, v1, v3);
3110 
3111     if (multi_block) {
3112       __ add(ofs, ofs, 64);
3113       __ cmp(ofs, limit);
3114       __ br(Assembler::LE, sha1_loop);
3115       __ mov(c_rarg0, ofs); // return ofs
3116     }
3117 
3118     __ ldpd(v10, v11, Address(sp, 16));
3119     __ ldpd(v8, v9, __ post(sp, 32));
3120 
3121     __ stpq(v0, v1, state);
3122 
3123     __ ret(lr);
3124 
3125     return start;
3126   }
3127 
3128   // Arguments:
3129   //
3130   // Inputs:
3131   //   c_rarg0   - byte[]  source+offset
3132   //   c_rarg1   - int[]   SHA.state
3133   //   c_rarg2   - int     offset
3134   //   c_rarg3   - int     limit
3135   //
3136   address generate_sha512_implCompress(bool multi_block, const char *name) {
3137     static const uint64_t round_consts[80] = {
3138       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3139       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3140       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3141       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3142       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3143       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3144       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3145       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3146       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3147       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3148       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3149       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3150       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3151       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3152       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3153       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3154       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3155       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3156       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3157       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3158       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3159       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3160       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3161       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3162       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3163       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3164       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3165     };
3166 
3167     // Double rounds for sha512.
3168     #define sha512_dround(dr, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \
3169       if (dr < 36)                                                                   \
3170         __ ld1(v##rc1, __ T2D, __ post(rscratch2, 16));                              \
3171       __ addv(v5, __ T2D, v##rc0, v##in0);                                           \
3172       __ ext(v6, __ T16B, v##i2, v##i3, 8);                                          \
3173       __ ext(v5, __ T16B, v5, v5, 8);                                                \
3174       __ ext(v7, __ T16B, v##i1, v##i2, 8);                                          \
3175       __ addv(v##i3, __ T2D, v##i3, v5);                                             \
3176       if (dr < 32) {                                                                 \
3177         __ ext(v5, __ T16B, v##in3, v##in4, 8);                                      \
3178         __ sha512su0(v##in0, __ T2D, v##in1);                                        \
3179       }                                                                              \
3180       __ sha512h(v##i3, __ T2D, v6, v7);                                             \
3181       if (dr < 32)                                                                   \
3182         __ sha512su1(v##in0, __ T2D, v##in2, v5);                                    \
3183       __ addv(v##i4, __ T2D, v##i1, v##i3);                                          \
3184       __ sha512h2(v##i3, __ T2D, v##i1, v##i0);                                      \
3185 
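         // Each sha512_dround invocation performs two SHA-512 rounds, so the 80
         // rounds of one block are covered by the 40 invocations below. i0..i4
         // name the rotating working-state registers (v0..v4) and in0..in4 the
         // rotating message-schedule registers (v12..v19). rc0 holds the round
         // constants consumed by this double round; rc1 is the next constant
         // register, reloaded from the table while dr < 36. sha512su0/sha512su1
         // extend the message schedule while dr < 32.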
3186     __ align(CodeEntryAlignment);
3187     StubCodeMark mark(this, "StubRoutines", name);
3188     address start = __ pc();
3189 
3190     Register buf   = c_rarg0;
3191     Register state = c_rarg1;
3192     Register ofs   = c_rarg2;
3193     Register limit = c_rarg3;
3194 
3195     __ stpd(v8, v9, __ pre(sp, -64));
3196     __ stpd(v10, v11, Address(sp, 16));
3197     __ stpd(v12, v13, Address(sp, 32));
3198     __ stpd(v14, v15, Address(sp, 48));
3199 
3200     Label sha512_loop;
3201 
3202     // load state
3203     __ ld1(v8, v9, v10, v11, __ T2D, state);
3204 
3205     // load the first four round-constant registers (v20..v23)
3206     __ lea(rscratch1, ExternalAddress((address)round_consts));
3207     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3208 
3209     __ BIND(sha512_loop);
3210     // load 128B of data into v12..v19
3211     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3212     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3213     __ rev64(v12, __ T16B, v12);
3214     __ rev64(v13, __ T16B, v13);
3215     __ rev64(v14, __ T16B, v14);
3216     __ rev64(v15, __ T16B, v15);
3217     __ rev64(v16, __ T16B, v16);
3218     __ rev64(v17, __ T16B, v17);
3219     __ rev64(v18, __ T16B, v18);
3220     __ rev64(v19, __ T16B, v19);
3221 
3222     __ mov(rscratch2, rscratch1);
3223 
3224     __ mov(v0, __ T16B, v8);
3225     __ mov(v1, __ T16B, v9);
3226     __ mov(v2, __ T16B, v10);
3227     __ mov(v3, __ T16B, v11);
3228 
3229     sha512_dround( 0, 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17);
3230     sha512_dround( 1, 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18);
3231     sha512_dround( 2, 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19);
3232     sha512_dround( 3, 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12);
3233     sha512_dround( 4, 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13);
3234     sha512_dround( 5, 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14);
3235     sha512_dround( 6, 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15);
3236     sha512_dround( 7, 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16);
3237     sha512_dround( 8, 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17);
3238     sha512_dround( 9, 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18);
3239     sha512_dround(10, 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19);
3240     sha512_dround(11, 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12);
3241     sha512_dround(12, 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13);
3242     sha512_dround(13, 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14);
3243     sha512_dround(14, 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15);
3244     sha512_dround(15, 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16);
3245     sha512_dround(16, 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17);
3246     sha512_dround(17, 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18);
3247     sha512_dround(18, 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19);
3248     sha512_dround(19, 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12);
3249     sha512_dround(20, 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13);
3250     sha512_dround(21, 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14);
3251     sha512_dround(22, 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15);
3252     sha512_dround(23, 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16);
3253     sha512_dround(24, 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17);
3254     sha512_dround(25, 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18);
3255     sha512_dround(26, 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19);
3256     sha512_dround(27, 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12);
3257     sha512_dround(28, 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13);
3258     sha512_dround(29, 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14);
3259     sha512_dround(30, 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15);
3260     sha512_dround(31, 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16);
3261     sha512_dround(32, 2, 3, 1, 4, 0, 28, 24, 12,  0,  0,  0,  0);
3262     sha512_dround(33, 4, 2, 0, 1, 3, 29, 25, 13,  0,  0,  0,  0);
3263     sha512_dround(34, 1, 4, 3, 0, 2, 30, 26, 14,  0,  0,  0,  0);
3264     sha512_dround(35, 0, 1, 2, 3, 4, 31, 27, 15,  0,  0,  0,  0);
3265     sha512_dround(36, 3, 0, 4, 2, 1, 24,  0, 16,  0,  0,  0,  0);
3266     sha512_dround(37, 2, 3, 1, 4, 0, 25,  0, 17,  0,  0,  0,  0);
3267     sha512_dround(38, 4, 2, 0, 1, 3, 26,  0, 18,  0,  0,  0,  0);
3268     sha512_dround(39, 1, 4, 3, 0, 2, 27,  0, 19,  0,  0,  0,  0);
3269 
3270     __ addv(v8, __ T2D, v8, v0);
3271     __ addv(v9, __ T2D, v9, v1);
3272     __ addv(v10, __ T2D, v10, v2);
3273     __ addv(v11, __ T2D, v11, v3);
3274 
3275     if (multi_block) {
3276       __ add(ofs, ofs, 128);
3277       __ cmp(ofs, limit);
3278       __ br(Assembler::LE, sha512_loop);
3279       __ mov(c_rarg0, ofs); // return ofs
3280     }
3281 
3282     __ st1(v8, v9, v10, v11, __ T2D, state);
3283 
3284     __ ldpd(v14, v15, Address(sp, 48));
3285     __ ldpd(v12, v13, Address(sp, 32));
3286     __ ldpd(v10, v11, Address(sp, 16));
3287     __ ldpd(v8, v9, __ post(sp, 64));
3288 
3289     __ ret(lr);
3290 
3291     return start;
3292   }
3293 
3294   // Safefetch stubs.
3295   void generate_safefetch(const char* name, int size, address* entry,
3296                           address* fault_pc, address* continuation_pc) {
3297     // safefetch signatures:
3298     //   int      SafeFetch32(int*      adr, int      errValue);
3299     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3300     //
3301     // arguments:
3302     //   c_rarg0 = adr
3303     //   c_rarg1 = errValue
3304     //
3305     // result:
3306     //   r0       = *adr or errValue
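         //
         // If the load at *fault_pc faults, the signal handler resumes execution
         // at *continuation_pc with errValue still in c_rarg1; on the normal path
         // c_rarg1 holds *adr. Either way the value ends up in r0 below.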
3307 
3308     StubCodeMark mark(this, "StubRoutines", name);
3309 
3310     // Entry point, pc or function descriptor.
3311     *entry = __ pc();
3312 
3313     // Load *adr into c_rarg1, may fault.
3314     *fault_pc = __ pc();
3315     switch (size) {
3316       case 4:
3317         // int32_t
3318         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3319         break;
3320       case 8:
3321         // int64_t
3322         __ ldr(c_rarg1, Address(c_rarg0, 0));
3323         break;
3324       default:
3325         ShouldNotReachHere();
3326     }
3327 
3328     // return errValue or *adr
3329     *continuation_pc = __ pc();
3330     __ mov(r0, c_rarg1);
3331     __ ret(lr);
3332   }
3333 
3334   /**
3335    *  Arguments:
3336    *
3337    * Inputs:
3338    *   c_rarg0   - int crc
3339    *   c_rarg1   - byte* buf
3340    *   c_rarg2   - int length
3341    *
3342    * Output:
3343    *       r0    - int crc result
3344    */
3345   address generate_updateBytesCRC32() {
3346     assert(UseCRC32Intrinsics, "what are we doing here?");
3347 
3348     __ align(CodeEntryAlignment);
3349     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3350 
3351     address start = __ pc();
3352 
3353     const Register crc   = c_rarg0;  // crc
3354     const Register buf   = c_rarg1;  // source java byte array address
3355     const Register len   = c_rarg2;  // length
3356     const Register table0 = c_rarg3; // crc_table address
3357     const Register table1 = c_rarg4;
3358     const Register table2 = c_rarg5;
3359     const Register table3 = c_rarg6;
3360     const Register tmp3 = c_rarg7;
3361 
3362     BLOCK_COMMENT("Entry:");
3363     __ enter(); // required for proper stackwalking of RuntimeStub frame
3364 
3365     __ kernel_crc32(crc, buf, len,
3366               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3367 
3368     __ leave(); // required for proper stackwalking of RuntimeStub frame
3369     __ ret(lr);
3370 
3371     return start;
3372   }
3373 
3374   /**
3375    *  Arguments:
3376    *
3377    * Inputs:
3378    *   c_rarg0   - int crc
3379    *   c_rarg1   - byte* buf
3380    *   c_rarg2   - int length
3381    *   c_rarg3   - int* table
3382    *
3383    * Output:
3384    *       r0   - int crc result
3385    */
3386   address generate_updateBytesCRC32C() {
3387     assert(UseCRC32CIntrinsics, "what are we doing here?");
3388 
3389     __ align(CodeEntryAlignment);
3390     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3391 
3392     address start = __ pc();
3393 
3394     const Register crc   = c_rarg0;  // crc
3395     const Register buf   = c_rarg1;  // source java byte array address
3396     const Register len   = c_rarg2;  // length
3397     const Register table0 = c_rarg3; // crc_table address
3398     const Register table1 = c_rarg4;
3399     const Register table2 = c_rarg5;
3400     const Register table3 = c_rarg6;
3401     const Register tmp3 = c_rarg7;
3402 
3403     BLOCK_COMMENT("Entry:");
3404     __ enter(); // required for proper stackwalking of RuntimeStub frame
3405 
3406     __ kernel_crc32c(crc, buf, len,
3407               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3408 
3409     __ leave(); // required for proper stackwalking of RuntimeStub frame
3410     __ ret(lr);
3411 
3412     return start;
3413   }
3414 
3415   /**
3416    *  Arguments:
3417    *
3418    *  Inputs:
3419    *   c_rarg0   - int   adler
3420    *   c_rarg1   - byte* buff
3421    *   c_rarg2   - int   len
3422    *
3423    * Output:
3424    *   c_rarg0   - int adler result
3425    */
3426   address generate_updateBytesAdler32() {
3427     __ align(CodeEntryAlignment);
3428     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3429     address start = __ pc();
3430 
3431     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3432 
3433     // Aliases
3434     Register adler  = c_rarg0;
3435     Register s1     = c_rarg0;
3436     Register s2     = c_rarg3;
3437     Register buff   = c_rarg1;
3438     Register len    = c_rarg2;
3439     Register nmax  = r4;
3440     Register base  = r5;
3441     Register count = r6;
3442     Register temp0 = rscratch1;
3443     Register temp1 = rscratch2;
3444     FloatRegister vbytes = v0;
3445     FloatRegister vs1acc = v1;
3446     FloatRegister vs2acc = v2;
3447     FloatRegister vtable = v3;
3448 
3449     // Max number of bytes we can process before having to take the mod
3450     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3451     uint64_t BASE = 0xfff1;
3452     uint64_t NMAX = 0x15B0;
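         // The "% BASE" reductions below exploit 2^16 mod BASE == 15
         // (65536 - 65521 = 15): s mod BASE is computed as
         // (s >> 16) * 15 + (s & 0xffff), applied once or twice and finished
         // with a conditional subtract of BASE.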
3453 
3454     __ mov(base, BASE);
3455     __ mov(nmax, NMAX);
3456 
3457     // Load accumulation coefficients for the upper 16 bits
3458     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3459     __ ld1(vtable, __ T16B, Address(temp0));
3460 
3461     // s1 is initialized to the lower 16 bits of adler
3462     // s2 is initialized to the upper 16 bits of adler
3463     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3464     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3465 
3466     // The pipelined loop needs at least 16 elements per iteration.
3467     // It checks this itself, but it is more effective to skip straight to the cleanup loop.
3468     __ cmp(len, (u1)16);
3469     __ br(Assembler::HS, L_nmax);
3470     __ cbz(len, L_combine);
3471 
3472     __ bind(L_simple_by1_loop);
3473     __ ldrb(temp0, Address(__ post(buff, 1)));
3474     __ add(s1, s1, temp0);
3475     __ add(s2, s2, s1);
3476     __ subs(len, len, 1);
3477     __ br(Assembler::HI, L_simple_by1_loop);
3478 
3479     // s1 = s1 % BASE
3480     __ subs(temp0, s1, base);
3481     __ csel(s1, temp0, s1, Assembler::HS);
3482 
3483     // s2 = s2 % BASE
3484     __ lsr(temp0, s2, 16);
3485     __ lsl(temp1, temp0, 4);
3486     __ sub(temp1, temp1, temp0);
3487     __ add(s2, temp1, s2, ext::uxth);
3488 
3489     __ subs(temp0, s2, base);
3490     __ csel(s2, temp0, s2, Assembler::HS);
3491 
3492     __ b(L_combine);
3493 
3494     __ bind(L_nmax);
3495     __ subs(len, len, nmax);
3496     __ sub(count, nmax, 16);
3497     __ br(Assembler::LO, L_by16);
3498 
3499     __ bind(L_nmax_loop);
3500 
3501     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3502                                       vbytes, vs1acc, vs2acc, vtable);
3503 
3504     __ subs(count, count, 16);
3505     __ br(Assembler::HS, L_nmax_loop);
3506 
3507     // s1 = s1 % BASE
3508     __ lsr(temp0, s1, 16);
3509     __ lsl(temp1, temp0, 4);
3510     __ sub(temp1, temp1, temp0);
3511     __ add(temp1, temp1, s1, ext::uxth);
3512 
3513     __ lsr(temp0, temp1, 16);
3514     __ lsl(s1, temp0, 4);
3515     __ sub(s1, s1, temp0);
3516     __ add(s1, s1, temp1, ext::uxth);
3517 
3518     __ subs(temp0, s1, base);
3519     __ csel(s1, temp0, s1, Assembler::HS);
3520 
3521     // s2 = s2 % BASE
3522     __ lsr(temp0, s2, 16);
3523     __ lsl(temp1, temp0, 4);
3524     __ sub(temp1, temp1, temp0);
3525     __ add(temp1, temp1, s2, ext::uxth);
3526 
3527     __ lsr(temp0, temp1, 16);
3528     __ lsl(s2, temp0, 4);
3529     __ sub(s2, s2, temp0);
3530     __ add(s2, s2, temp1, ext::uxth);
3531 
3532     __ subs(temp0, s2, base);
3533     __ csel(s2, temp0, s2, Assembler::HS);
3534 
3535     __ subs(len, len, nmax);
3536     __ sub(count, nmax, 16);
3537     __ br(Assembler::HS, L_nmax_loop);
3538 
3539     __ bind(L_by16);
3540     __ adds(len, len, count);
3541     __ br(Assembler::LO, L_by1);
3542 
3543     __ bind(L_by16_loop);
3544 
3545     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3546                                       vbytes, vs1acc, vs2acc, vtable);
3547 
3548     __ subs(len, len, 16);
3549     __ br(Assembler::HS, L_by16_loop);
3550 
3551     __ bind(L_by1);
3552     __ adds(len, len, 15);
3553     __ br(Assembler::LO, L_do_mod);
3554 
3555     __ bind(L_by1_loop);
3556     __ ldrb(temp0, Address(__ post(buff, 1)));
3557     __ add(s1, temp0, s1);
3558     __ add(s2, s2, s1);
3559     __ subs(len, len, 1);
3560     __ br(Assembler::HS, L_by1_loop);
3561 
3562     __ bind(L_do_mod);
3563     // s1 = s1 % BASE
3564     __ lsr(temp0, s1, 16);
3565     __ lsl(temp1, temp0, 4);
3566     __ sub(temp1, temp1, temp0);
3567     __ add(temp1, temp1, s1, ext::uxth);
3568 
3569     __ lsr(temp0, temp1, 16);
3570     __ lsl(s1, temp0, 4);
3571     __ sub(s1, s1, temp0);
3572     __ add(s1, s1, temp1, ext::uxth);
3573 
3574     __ subs(temp0, s1, base);
3575     __ csel(s1, temp0, s1, Assembler::HS);
3576 
3577     // s2 = s2 % BASE
3578     __ lsr(temp0, s2, 16);
3579     __ lsl(temp1, temp0, 4);
3580     __ sub(temp1, temp1, temp0);
3581     __ add(temp1, temp1, s2, ext::uxth);
3582 
3583     __ lsr(temp0, temp1, 16);
3584     __ lsl(s2, temp0, 4);
3585     __ sub(s2, s2, temp0);
3586     __ add(s2, s2, temp1, ext::uxth);
3587 
3588     __ subs(temp0, s2, base);
3589     __ csel(s2, temp0, s2, Assembler::HS);
3590 
3591     // Combine lower bits and higher bits
3592     __ bind(L_combine);
3593     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3594 
3595     __ ret(lr);
3596 
3597     return start;
3598   }
3599 
3600   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
3601           Register temp0, Register temp1, FloatRegister vbytes,
3602           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
3603     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
3604     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
3605     // In non-vectorized code, we update s1 and s2 as:
3606     //   s1 <- s1 + b1
3607     //   s2 <- s2 + s1
3608     //   s1 <- s1 + b2
3609     //   s2 <- s2 + b1
3610     //   ...
3611     //   s1 <- s1 + b16
3612     //   s2 <- s2 + s1
3613     // Putting above assignments together, we have:
3614     //   s1_new = s1 + b1 + b2 + ... + b16
3615     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
3616     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
3617     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
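         // Equivalent scalar form for one 16-byte chunk (illustrative only, b[]
         // being the bytes loaded below):
         //   for (int i = 0; i < 16; i++) { s1 += b[i]; s2 += s1; }
         // vtable holds the weights (16, 15, ..., 1) used for the dot product.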
3618     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
3619 
3620     // s2 = s2 + s1 * 16
3621     __ add(s2, s2, s1, Assembler::LSL, 4);
3622 
3623     // vs1acc = b1 + b2 + b3 + ... + b16
3624     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
3625     __ umullv(vs2acc, __ T8B, vtable, vbytes);
3626     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
3627     __ uaddlv(vs1acc, __ T16B, vbytes);
3628     __ uaddlv(vs2acc, __ T8H, vs2acc);
3629 
3630     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
3631     __ fmovd(temp0, vs1acc);
3632     __ fmovd(temp1, vs2acc);
3633     __ add(s1, s1, temp0);
3634     __ add(s2, s2, temp1);
3635   }
3636 
3637   /**
3638    *  Arguments:
3639    *
3640    *  Input:
3641    *    c_rarg0   - x address
3642    *    c_rarg1   - x length
3643    *    c_rarg2   - y address
3644    *    c_rarg3   - y length
3645    *    c_rarg4   - z address
3646    *    c_rarg5   - z length
3647    */
3648   address generate_multiplyToLen() {
3649     __ align(CodeEntryAlignment);
3650     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3651 
3652     address start = __ pc();
3653     const Register x     = r0;
3654     const Register xlen  = r1;
3655     const Register y     = r2;
3656     const Register ylen  = r3;
3657     const Register z     = r4;
3658     const Register zlen  = r5;
3659 
3660     const Register tmp1  = r10;
3661     const Register tmp2  = r11;
3662     const Register tmp3  = r12;
3663     const Register tmp4  = r13;
3664     const Register tmp5  = r14;
3665     const Register tmp6  = r15;
3666     const Register tmp7  = r16;
3667 
3668     BLOCK_COMMENT("Entry:");
3669     __ enter(); // required for proper stackwalking of RuntimeStub frame
3670     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3671     __ leave(); // required for proper stackwalking of RuntimeStub frame
3672     __ ret(lr);
3673 
3674     return start;
3675   }
3676 
3677   address generate_squareToLen() {
3678     // The squareToLen algorithm for sizes 1..127 described in Java code works
3679     // faster than multiply_to_len on some CPUs and slower on others, but
3680     // multiply_to_len shows somewhat better overall results.
3681     __ align(CodeEntryAlignment);
3682     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3683     address start = __ pc();
3684 
3685     const Register x     = r0;
3686     const Register xlen  = r1;
3687     const Register z     = r2;
3688     const Register zlen  = r3;
3689     const Register y     = r4; // == x
3690     const Register ylen  = r5; // == xlen
3691 
3692     const Register tmp1  = r10;
3693     const Register tmp2  = r11;
3694     const Register tmp3  = r12;
3695     const Register tmp4  = r13;
3696     const Register tmp5  = r14;
3697     const Register tmp6  = r15;
3698     const Register tmp7  = r16;
3699 
3700     RegSet spilled_regs = RegSet::of(y, ylen);
3701     BLOCK_COMMENT("Entry:");
3702     __ enter();
3703     __ push(spilled_regs, sp);
3704     __ mov(y, x);
3705     __ mov(ylen, xlen);
3706     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3707     __ pop(spilled_regs, sp);
3708     __ leave();
3709     __ ret(lr);
3710     return start;
3711   }
3712 
3713   address generate_mulAdd() {
3714     __ align(CodeEntryAlignment);
3715     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3716 
3717     address start = __ pc();
3718 
3719     const Register out     = r0;
3720     const Register in      = r1;
3721     const Register offset  = r2;
3722     const Register len     = r3;
3723     const Register k       = r4;
3724 
3725     BLOCK_COMMENT("Entry:");
3726     __ enter();
3727     __ mul_add(out, in, offset, len, k);
3728     __ leave();
3729     __ ret(lr);
3730 
3731     return start;
3732   }
3733 
3734   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3735                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3736                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3737     // Karatsuba multiplication performs a 128*128 -> 256-bit
3738     // multiplication in three 128-bit multiplications and a few
3739     // additions.
3740     //
3741     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3742     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3743     //
3744     // Inputs:
3745     //
3746     // A0 in a.d[0]     (subkey)
3747     // A1 in a.d[1]
3748     // (A1+A0) in a1_xor_a0.d[0]
3749     //
3750     // B0 in b.d[0]     (state)
3751     // B1 in b.d[1]
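         //
         // Since addition in GF(2)[z] is XOR, the cross term A1*B0 + A0*B1 equals
         // E + C + D; the eor instructions below accumulate exactly that into tmp2
         // before it is split across the two middle 64-bit words of the result.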
3752 
3753     __ ext(tmp1, __ T16B, b, b, 0x08);
3754     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3755     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3756     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3757     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3758 
3759     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3760     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3761     __ eor(tmp2, __ T16B, tmp2, tmp4);
3762     __ eor(tmp2, __ T16B, tmp2, tmp3);
3763 
3764     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3765     __ ins(result_hi, __ D, tmp2, 0, 1);
3766     __ ins(result_lo, __ D, tmp2, 1, 0);
3767   }
3768 
3769   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3770                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3771     const FloatRegister t0 = result;
3772 
3773     // The GCM field polynomial f is z^128 + p(z), where p =
3774     // z^7+z^2+z+1.
3775     //
3776     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3777     //
3778     // so, given that the product we're reducing is
3779     //    a == lo + hi * z^128
3780     // substituting,
3781     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3782     //
3783     // we reduce by multiplying hi by p(z) and subtracting the result
3784     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3785     // bits we can do this with two 64-bit multiplications, lo*p and
3786     // hi*p.
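         //
         // The fold happens in two steps: pmull2 forms hi.d[1]*p; its high half
         // is XORed into hi.d[0] and its low half into lo.d[1]. pmull then forms
         // the updated hi.d[0]*p, which is XORed into lo to give the result.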
3787 
3788     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3789     __ ext(t1, __ T16B, t0, z, 8);
3790     __ eor(hi, __ T16B, hi, t1);
3791     __ ext(t1, __ T16B, z, t0, 8);
3792     __ eor(lo, __ T16B, lo, t1);
3793     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3794     __ eor(result, __ T16B, lo, t0);
3795   }
3796 
3797   address generate_has_negatives(address &has_negatives_long) {
3798     const u1 large_loop_size = 64;
3799     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
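         // UPPER_BIT_MASK selects bit 7 of every byte in a 64-bit word, so
         // (word & UPPER_BIT_MASK) != 0 iff at least one of the 8 bytes is
         // negative when treated as a signed jbyte.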
3800     int dcache_line = VM_Version::dcache_line_size();
3801 
3802     Register ary1 = r1, len = r2, result = r0;
3803 
3804     __ align(CodeEntryAlignment);
3805 
3806     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3807 
3808     address entry = __ pc();
3809 
3810     __ enter();
3811 
3812   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3813         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3814 
3815   __ cmp(len, (u1)15);
3816   __ br(Assembler::GT, LEN_OVER_15);
3817   // The only case in which execution falls into this code is when the pointer is near
3818   // the end of a memory page and we have to avoid reading the next page
3819   __ add(ary1, ary1, len);
3820   __ subs(len, len, 8);
3821   __ br(Assembler::GT, LEN_OVER_8);
3822   __ ldr(rscratch2, Address(ary1, -8));
3823   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3824   __ lsrv(rscratch2, rscratch2, rscratch1);
3825   __ tst(rscratch2, UPPER_BIT_MASK);
3826   __ cset(result, Assembler::NE);
3827   __ leave();
3828   __ ret(lr);
3829   __ bind(LEN_OVER_8);
3830   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3831   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
3832   __ tst(rscratch2, UPPER_BIT_MASK);
3833   __ br(Assembler::NE, RET_TRUE_NO_POP);
3834   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3835   __ lsrv(rscratch1, rscratch1, rscratch2);
3836   __ tst(rscratch1, UPPER_BIT_MASK);
3837   __ cset(result, Assembler::NE);
3838   __ leave();
3839   __ ret(lr);
3840 
3841   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3842   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3843 
3844   has_negatives_long = __ pc(); // 2nd entry point
3845 
3846   __ enter();
3847 
3848   __ bind(LEN_OVER_15);
3849     __ push(spilled_regs, sp);
3850     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3851     __ cbz(rscratch2, ALIGNED);
3852     __ ldp(tmp6, tmp1, Address(ary1));
3853     __ mov(tmp5, 16);
3854     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3855     __ add(ary1, ary1, rscratch1);
3856     __ sub(len, len, rscratch1);
3857     __ orr(tmp6, tmp6, tmp1);
3858     __ tst(tmp6, UPPER_BIT_MASK);
3859     __ br(Assembler::NE, RET_TRUE);
3860 
3861   __ bind(ALIGNED);
3862     __ cmp(len, large_loop_size);
3863     __ br(Assembler::LT, CHECK_16);
3864     // Perform a 16-byte load here as an early return in the pre-loop to handle the
3865     // case when an initially aligned large array has negative values in its leading
3866     // bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in the worst case,
3867     // which is slower. Cases with negative bytes further ahead are not affected
3868     // much; in fact they become faster thanks to the early loads, fewer
3869     // instructions and fewer branches in LARGE_LOOP.
3870     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3871     __ sub(len, len, 16);
3872     __ orr(tmp6, tmp6, tmp1);
3873     __ tst(tmp6, UPPER_BIT_MASK);
3874     __ br(Assembler::NE, RET_TRUE);
3875     __ cmp(len, large_loop_size);
3876     __ br(Assembler::LT, CHECK_16);
3877 
3878     if (SoftwarePrefetchHintDistance >= 0
3879         && SoftwarePrefetchHintDistance >= dcache_line) {
3880       // initial prefetch
3881       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3882     }
3883   __ bind(LARGE_LOOP);
3884     if (SoftwarePrefetchHintDistance >= 0) {
3885       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3886     }
3887     // Issue the load instructions first, since that can save a few CPU/memory cycles.
3888     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);" (one per ldp),
3889     // generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which uses fewer
3890     // instructions and fewer branches. The downside is that early return is
3891     // disabled, so all 64 bytes are loaded and checked every time.
3892     __ ldp(tmp2, tmp3, Address(ary1));
3893     __ ldp(tmp4, tmp5, Address(ary1, 16));
3894     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3895     __ ldp(tmp6, tmp1, Address(ary1, 48));
3896     __ add(ary1, ary1, large_loop_size);
3897     __ sub(len, len, large_loop_size);
3898     __ orr(tmp2, tmp2, tmp3);
3899     __ orr(tmp4, tmp4, tmp5);
3900     __ orr(rscratch1, rscratch1, rscratch2);
3901     __ orr(tmp6, tmp6, tmp1);
3902     __ orr(tmp2, tmp2, tmp4);
3903     __ orr(rscratch1, rscratch1, tmp6);
3904     __ orr(tmp2, tmp2, rscratch1);
3905     __ tst(tmp2, UPPER_BIT_MASK);
3906     __ br(Assembler::NE, RET_TRUE);
3907     __ cmp(len, large_loop_size);
3908     __ br(Assembler::GE, LARGE_LOOP);
3909 
3910   __ bind(CHECK_16); // small 16-byte load pre-loop
3911     __ cmp(len, (u1)16);
3912     __ br(Assembler::LT, POST_LOOP16);
3913 
3914   __ bind(LOOP16); // small 16-byte load loop
3915     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3916     __ sub(len, len, 16);
3917     __ orr(tmp2, tmp2, tmp3);
3918     __ tst(tmp2, UPPER_BIT_MASK);
3919     __ br(Assembler::NE, RET_TRUE);
3920     __ cmp(len, (u1)16);
3921     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3922 
3923   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3924     __ cmp(len, (u1)8);
3925     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3926     __ ldr(tmp3, Address(__ post(ary1, 8)));
3927     __ sub(len, len, 8);
3928     __ tst(tmp3, UPPER_BIT_MASK);
3929     __ br(Assembler::NE, RET_TRUE);
3930 
3931   __ bind(POST_LOOP16_LOAD_TAIL);
3932     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3933     __ ldr(tmp1, Address(ary1));
3934     __ mov(tmp2, 64);
3935     __ sub(tmp4, tmp2, len, __ LSL, 3);
3936     __ lslv(tmp1, tmp1, tmp4);
3937     __ tst(tmp1, UPPER_BIT_MASK);
3938     __ br(Assembler::NE, RET_TRUE);
3939     // Fallthrough
3940 
3941   __ bind(RET_FALSE);
3942     __ pop(spilled_regs, sp);
3943     __ leave();
3944     __ mov(result, zr);
3945     __ ret(lr);
3946 
3947   __ bind(RET_TRUE);
3948     __ pop(spilled_regs, sp);
3949   __ bind(RET_TRUE_NO_POP);
3950     __ leave();
3951     __ mov(result, 1);
3952     __ ret(lr);
3953 
3954   __ bind(DONE);
3955     __ pop(spilled_regs, sp);
3956     __ leave();
3957     __ ret(lr);
3958     return entry;
3959   }
3960 
3961   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3962         bool usePrefetch, Label &NOT_EQUAL) {
3963     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3964         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3965         tmp7 = r12, tmp8 = r13;
3966     Label LOOP;
3967 
3968     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3969     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3970     __ bind(LOOP);
3971     if (usePrefetch) {
3972       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3973       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3974     }
3975     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3976     __ eor(tmp1, tmp1, tmp2);
3977     __ eor(tmp3, tmp3, tmp4);
3978     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3979     __ orr(tmp1, tmp1, tmp3);
3980     __ cbnz(tmp1, NOT_EQUAL);
3981     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3982     __ eor(tmp5, tmp5, tmp6);
3983     __ eor(tmp7, tmp7, tmp8);
3984     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3985     __ orr(tmp5, tmp5, tmp7);
3986     __ cbnz(tmp5, NOT_EQUAL);
3987     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3988     __ eor(tmp1, tmp1, tmp2);
3989     __ eor(tmp3, tmp3, tmp4);
3990     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3991     __ orr(tmp1, tmp1, tmp3);
3992     __ cbnz(tmp1, NOT_EQUAL);
3993     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3994     __ eor(tmp5, tmp5, tmp6);
3995     __ sub(cnt1, cnt1, 8 * wordSize);
3996     __ eor(tmp7, tmp7, tmp8);
3997     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3998     // tmp6 is not used. MacroAssembler::subs is used here (rather than
3999     // cmp) because subs allows an unlimited range of immediate operands.
4000     __ subs(tmp6, cnt1, loopThreshold);
4001     __ orr(tmp5, tmp5, tmp7);
4002     __ cbnz(tmp5, NOT_EQUAL);
4003     __ br(__ GE, LOOP);
4004     // post-loop
4005     __ eor(tmp1, tmp1, tmp2);
4006     __ eor(tmp3, tmp3, tmp4);
4007     __ orr(tmp1, tmp1, tmp3);
4008     __ sub(cnt1, cnt1, 2 * wordSize);
4009     __ cbnz(tmp1, NOT_EQUAL);
4010   }
4011 
4012   void generate_large_array_equals_loop_simd(int loopThreshold,
4013         bool usePrefetch, Label &NOT_EQUAL) {
4014     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4015         tmp2 = rscratch2;
4016     Label LOOP;
4017 
4018     __ bind(LOOP);
4019     if (usePrefetch) {
4020       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4021       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4022     }
4023     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
4024     __ sub(cnt1, cnt1, 8 * wordSize);
4025     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
4026     __ subs(tmp1, cnt1, loopThreshold);
4027     __ eor(v0, __ T16B, v0, v4);
4028     __ eor(v1, __ T16B, v1, v5);
4029     __ eor(v2, __ T16B, v2, v6);
4030     __ eor(v3, __ T16B, v3, v7);
4031     __ orr(v0, __ T16B, v0, v1);
4032     __ orr(v1, __ T16B, v2, v3);
4033     __ orr(v0, __ T16B, v0, v1);
4034     __ umov(tmp1, v0, __ D, 0);
4035     __ umov(tmp2, v0, __ D, 1);
4036     __ orr(tmp1, tmp1, tmp2);
4037     __ cbnz(tmp1, NOT_EQUAL);
4038     __ br(__ GE, LOOP);
4039   }
4040 
4041   // a1 = r1 - array1 address
4042   // a2 = r2 - array2 address
4043   // result = r0 - return value. Already contains "false"
4044   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
4045   // r3-r5 are reserved temporary registers
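       //
       // Conceptually the stub finishes the word-wise comparison started by the
       // caller (illustrative scalar sketch only; the real code overlaps the tail
       // read instead of handling a partial word):
       //   while ((intptr_t)cnt1 > 0) {
       //     if (*(uint64_t*)a1 != *(uint64_t*)a2) return false;
       //     a1 += 8; a2 += 8; cnt1 -= 8;
       //   }
       //   return true;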
4046   address generate_large_array_equals() {
4047     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4048         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4049         tmp7 = r12, tmp8 = r13;
4050     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
4051         SMALL_LOOP, POST_LOOP;
4052     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
4053     // loop threshold chosen so that at least 32 of the prefetched bytes are actually used
4054     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
4055     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
4056     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
4057     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
4058         tmp5, tmp6, tmp7, tmp8);
4059 
4060     __ align(CodeEntryAlignment);
4061 
4062     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
4063 
4064     address entry = __ pc();
4065     __ enter();
4066     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
4067     // also advance pointers to use post-increment instead of pre-increment
4068     __ add(a1, a1, wordSize);
4069     __ add(a2, a2, wordSize);
4070     if (AvoidUnalignedAccesses) {
4071       // Both implementations (SIMD and non-SIMD) use relatively wide load
4072       // instructions (ld1/ldp), which carry a large penalty (up to 2x execution time)
4073       // on some CPUs when the address is not at least 16-byte aligned.
4074       // Arrays are currently 8-byte aligned, so, if needed, we do an additional
4075       // 8-byte load for the first address to make it 16-byte aligned.
4076       Label ALIGNED16;
4077       __ tbz(a1, 3, ALIGNED16);
4078       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4079       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4080       __ sub(cnt1, cnt1, wordSize);
4081       __ eor(tmp1, tmp1, tmp2);
4082       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
4083       __ bind(ALIGNED16);
4084     }
4085     if (UseSIMDForArrayEquals) {
4086       if (SoftwarePrefetchHintDistance >= 0) {
4087         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4088         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4089         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
4090             /* prfm = */ true, NOT_EQUAL);
4091         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4092         __ br(__ LT, TAIL);
4093       }
4094       __ bind(NO_PREFETCH_LARGE_LOOP);
4095       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
4096           /* prfm = */ false, NOT_EQUAL);
4097     } else {
4098       __ push(spilled_regs, sp);
4099       if (SoftwarePrefetchHintDistance >= 0) {
4100         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4101         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4102         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
4103             /* prfm = */ true, NOT_EQUAL);
4104         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4105         __ br(__ LT, TAIL);
4106       }
4107       __ bind(NO_PREFETCH_LARGE_LOOP);
4108       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
4109           /* prfm = */ false, NOT_EQUAL);
4110     }
4111     __ bind(TAIL);
4112       __ cbz(cnt1, EQUAL);
4113       __ subs(cnt1, cnt1, wordSize);
4114       __ br(__ LE, POST_LOOP);
4115     __ bind(SMALL_LOOP);
4116       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4117       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4118       __ subs(cnt1, cnt1, wordSize);
4119       __ eor(tmp1, tmp1, tmp2);
4120       __ cbnz(tmp1, NOT_EQUAL);
4121       __ br(__ GT, SMALL_LOOP);
4122     __ bind(POST_LOOP);
4123       __ ldr(tmp1, Address(a1, cnt1));
4124       __ ldr(tmp2, Address(a2, cnt1));
4125       __ eor(tmp1, tmp1, tmp2);
4126       __ cbnz(tmp1, NOT_EQUAL);
4127     __ bind(EQUAL);
4128       __ mov(result, true);
4129     __ bind(NOT_EQUAL);
4130       if (!UseSIMDForArrayEquals) {
4131         __ pop(spilled_regs, sp);
4132       }
4133     __ bind(NOT_EQUAL_NO_POP);
4134     __ leave();
4135     __ ret(lr);
4136     return entry;
4137   }
4138 
4139   address generate_dsin_dcos(bool isCos) {
4140     __ align(CodeEntryAlignment);
4141     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
4142     address start = __ pc();
4143     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
4144         (address)StubRoutines::aarch64::_two_over_pi,
4145         (address)StubRoutines::aarch64::_pio2,
4146         (address)StubRoutines::aarch64::_dsin_coef,
4147         (address)StubRoutines::aarch64::_dcos_coef);
4148     return start;
4149   }
4150 
4151   address generate_dlog() {
4152     __ align(CodeEntryAlignment);
4153     StubCodeMark mark(this, "StubRoutines", "dlog");
4154     address entry = __ pc();
4155     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
4156         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
4157     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
4158     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
4159         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
4160     return entry;
4161   }
4162 
4163   // code for comparing 16 bytes of strings with the same encoding
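       // The comparison is software-pipelined: tmp1/tmp2 were loaded by the caller
       // or the previous iteration and are checked here while the next 16 bytes are
       // loaded, leaving tmp1/tmp2 pre-loaded again for the following iteration.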
4164   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
4165     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
4166     __ ldr(rscratch1, Address(__ post(str1, 8)));
4167     __ eor(rscratch2, tmp1, tmp2);
4168     __ ldr(cnt1, Address(__ post(str2, 8)));
4169     __ cbnz(rscratch2, DIFF1);
4170     __ ldr(tmp1, Address(__ post(str1, 8)));
4171     __ eor(rscratch2, rscratch1, cnt1);
4172     __ ldr(tmp2, Address(__ post(str2, 8)));
4173     __ cbnz(rscratch2, DIFF2);
4174   }
4175 
4176   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
4177   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4178       Label &DIFF2) {
4179     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
4180     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
4181 
4182     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4183     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4184     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4185     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4186 
4187     __ fmovd(tmpL, vtmp3);
4188     __ eor(rscratch2, tmp3, tmpL);
4189     __ cbnz(rscratch2, DIFF2);
4190 
4191     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4192     __ umov(tmpL, vtmp3, __ D, 1);
4193     __ eor(rscratch2, tmpU, tmpL);
4194     __ cbnz(rscratch2, DIFF1);
4195 
4196     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4197     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4198     __ fmovd(tmpL, vtmp);
4199     __ eor(rscratch2, tmp3, tmpL);
4200     __ cbnz(rscratch2, DIFF2);
4201 
4202     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4203     __ umov(tmpL, vtmp, __ D, 1);
4204     __ eor(rscratch2, tmpU, tmpL);
4205     __ cbnz(rscratch2, DIFF1);
4206   }
4207 
4208   // r0  = result
4209   // r1  = str1
4210   // r2  = cnt1
4211   // r3  = str2
4212   // r4  = cnt2
4213   // r10 = tmp1
4214   // r11 = tmp2
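       // isLU == true  : str1 is Latin1, str2 is UTF-16
       // isLU == false : str1 is UTF-16, str2 is Latin1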
4215   address generate_compare_long_string_different_encoding(bool isLU) {
4216     __ align(CodeEntryAlignment);
4217     StubCodeMark mark(this, "StubRoutines", isLU
4218         ? "compare_long_string_different_encoding LU"
4219         : "compare_long_string_different_encoding UL");
4220     address entry = __ pc();
4221     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4222         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
4223         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4224     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4225         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4226     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4227     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4228 
4229     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
4230 
4231     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4232     // cnt2 == number of characters left to compare
4233     // Check the already-loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
4234     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4235     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4236     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4237     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4238     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4239     __ eor(rscratch2, tmp1, tmp2);
4240     __ mov(rscratch1, tmp2);
4241     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4242     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4243              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4244     __ push(spilled_regs, sp);
4245     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
4246     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
4247 
4248     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4249 
4250     if (SoftwarePrefetchHintDistance >= 0) {
4251       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4252       __ br(__ LT, NO_PREFETCH);
4253       __ bind(LARGE_LOOP_PREFETCH);
4254         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4255         __ mov(tmp4, 2);
4256         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4257         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4258           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4259           __ subs(tmp4, tmp4, 1);
4260           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4261           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4262           __ mov(tmp4, 2);
4263         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4264           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4265           __ subs(tmp4, tmp4, 1);
4266           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4267           __ sub(cnt2, cnt2, 64);
4268           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4269           __ br(__ GE, LARGE_LOOP_PREFETCH);
4270     }
4271     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4272     __ bind(NO_PREFETCH);
4273     __ subs(cnt2, cnt2, 16);
4274     __ br(__ LT, TAIL);
4275     __ align(OptoLoopAlignment);
4276     __ bind(SMALL_LOOP); // smaller loop
4277       __ subs(cnt2, cnt2, 16);
4278       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4279       __ br(__ GE, SMALL_LOOP);
4280       __ cmn(cnt2, (u1)16);
4281       __ br(__ EQ, LOAD_LAST);
4282     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
4283       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
4284       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
4285       __ ldr(tmp3, Address(cnt1, -8));
4286       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
4287       __ b(LOAD_LAST);
4288     __ bind(DIFF2);
4289       __ mov(tmpU, tmp3);
4290     __ bind(DIFF1);
4291       __ pop(spilled_regs, sp);
4292       __ b(CALCULATE_DIFFERENCE);
4293     __ bind(LOAD_LAST);
4294       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
4295       // No need to load them again.
4296       __ mov(tmpU, tmp3);
4297       __ pop(spilled_regs, sp);
4298 
4299       // tmp2 points to the address of the last 4 Latin1 characters right now
4300       __ ldrs(vtmp, Address(tmp2));
4301       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4302       __ fmovd(tmpL, vtmp);
4303 
4304       __ eor(rscratch2, tmpU, tmpL);
4305       __ cbz(rscratch2, DONE);
4306 
4307     // Find the first different characters in the longwords and
4308     // compute their difference.
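         // At this point rscratch2 == tmp1 ^ rscratch1, the XOR of the two (UTF-16) values.
         // rev + clz locate the first differing byte counted from the low (memory-first) end;
         // andr rounds that bit index down to a 16-bit character boundary, after which both
         // values are shifted down and the differing characters subtracted.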
4309     __ bind(CALCULATE_DIFFERENCE);
4310       __ rev(rscratch2, rscratch2);
4311       __ clz(rscratch2, rscratch2);
4312       __ andr(rscratch2, rscratch2, -16);
4313       __ lsrv(tmp1, tmp1, rscratch2);
4314       __ uxthw(tmp1, tmp1);
4315       __ lsrv(rscratch1, rscratch1, rscratch2);
4316       __ uxthw(rscratch1, rscratch1);
4317       __ subw(result, tmp1, rscratch1);
4318     __ bind(DONE);
4319       __ ret(lr);
4320     return entry;
4321   }
4322 
4323   address generate_method_entry_barrier() {
4324     __ align(CodeEntryAlignment);
4325     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
4326 
4327     Label deoptimize_label;
4328 
4329     address start = __ pc();
4330 
4331     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
4332 
4333     __ enter();
4334     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
4335 
4336     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
4337 
4338     __ push_call_clobbered_registers();
4339 
4340     __ mov(c_rarg0, rscratch2);
4341     __ call_VM_leaf
4342          (CAST_FROM_FN_PTR
4343           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
4344 
4345     __ reset_last_Java_frame(true);
4346 
4347     __ mov(rscratch1, r0);
4348 
4349     __ pop_call_clobbered_registers();
4350 
4351     __ cbnz(rscratch1, deoptimize_label);
4352 
4353     __ leave();
4354     __ ret(lr);
4355 
4356     __ BIND(deoptimize_label);
4357 
4358     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
4359     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
4360 
4361     __ mov(sp, rscratch1);
4362     __ br(rscratch2);
4363 
4364     return start;
4365   }
4366 
4367   // r0  = result
4368   // r1  = str1
4369   // r2  = cnt1
4370   // r3  = str2
4371   // r4  = cnt2
4372   // r10 = tmp1
4373   // r11 = tmp2
4374   address generate_compare_long_string_same_encoding(bool isLL) {
4375     __ align(CodeEntryAlignment);
4376     StubCodeMark mark(this, "StubRoutines", isLL
4377         ? "compare_long_string_same_encoding LL"
4378         : "compare_long_string_same_encoding UU");
4379     address entry = __ pc();
4380     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4381         tmp1 = r10, tmp2 = r11;
4382     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4383         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4384         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4385     // Exit from the large loop when fewer than 64 bytes are left to read or we're
4386     // about to prefetch memory beyond the array border.
4387     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4388     // cnt1/cnt2 contain the number of characters to compare; cnt1 can be re-used.
4389     // Update cnt2 to account for the 8 bytes already loaded.
4390     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4391     // update pointers to account for the previous read
4392     __ add(str1, str1, wordSize);
4393     __ add(str2, str2, wordSize);
4394     if (SoftwarePrefetchHintDistance >= 0) {
4395       __ bind(LARGE_LOOP_PREFETCH);
4396         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4397         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4398         compare_string_16_bytes_same(DIFF, DIFF2);
4399         compare_string_16_bytes_same(DIFF, DIFF2);
4400         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4401         compare_string_16_bytes_same(DIFF, DIFF2);
4402         __ subs(rscratch2, cnt2, largeLoopExitCondition);
4403         compare_string_16_bytes_same(DIFF, DIFF2);
4404         __ br(__ GT, LARGE_LOOP_PREFETCH);
4405         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4406     }
4407     // less than 16 bytes left?
4408     __ subs(cnt2, cnt2, isLL ? 16 : 8);
4409     __ br(__ LT, TAIL);
4410     __ align(OptoLoopAlignment);
4411     __ bind(SMALL_LOOP);
4412       compare_string_16_bytes_same(DIFF, DIFF2);
4413       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4414       __ br(__ GE, SMALL_LOOP);
4415     __ bind(TAIL);
4416       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4417       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4418       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4419       __ br(__ LE, CHECK_LAST);
4420       __ eor(rscratch2, tmp1, tmp2);
4421       __ cbnz(rscratch2, DIFF);
4422       __ ldr(tmp1, Address(__ post(str1, 8)));
4423       __ ldr(tmp2, Address(__ post(str2, 8)));
4424       __ sub(cnt2, cnt2, isLL ? 8 : 4);
4425     __ bind(CHECK_LAST);
4426       if (!isLL) {
4427         __ add(cnt2, cnt2, cnt2); // now in bytes
4428       }
4429       __ eor(rscratch2, tmp1, tmp2);
4430       __ cbnz(rscratch2, DIFF);
4431       __ ldr(rscratch1, Address(str1, cnt2));
4432       __ ldr(cnt1, Address(str2, cnt2));
4433       __ eor(rscratch2, rscratch1, cnt1);
4434       __ cbz(rscratch2, LENGTH_DIFF);
4435       // Find the first different characters in the longwords and
4436       // compute their difference.
4437     __ bind(DIFF2);
4438       __ rev(rscratch2, rscratch2);
4439       __ clz(rscratch2, rscratch2);
4440       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4441       __ lsrv(rscratch1, rscratch1, rscratch2);
4442       if (isLL) {
4443         __ lsrv(cnt1, cnt1, rscratch2);
4444         __ uxtbw(rscratch1, rscratch1);
4445         __ uxtbw(cnt1, cnt1);
4446       } else {
4447         __ lsrv(cnt1, cnt1, rscratch2);
4448         __ uxthw(rscratch1, rscratch1);
4449         __ uxthw(cnt1, cnt1);
4450       }
4451       __ subw(result, rscratch1, cnt1);
4452       __ b(LENGTH_DIFF);
4453     __ bind(DIFF);
4454       __ rev(rscratch2, rscratch2);
4455       __ clz(rscratch2, rscratch2);
4456       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4457       __ lsrv(tmp1, tmp1, rscratch2);
4458       if (isLL) {
4459         __ lsrv(tmp2, tmp2, rscratch2);
4460         __ uxtbw(tmp1, tmp1);
4461         __ uxtbw(tmp2, tmp2);
4462       } else {
4463         __ lsrv(tmp2, tmp2, rscratch2);
4464         __ uxthw(tmp1, tmp1);
4465         __ uxthw(tmp2, tmp2);
4466       }
4467       __ subw(result, tmp1, tmp2);
4468       __ b(LENGTH_DIFF);
4469     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4470       __ eor(rscratch2, tmp1, tmp2);
4471       __ cbnz(rscratch2, DIFF);
4472     __ bind(LENGTH_DIFF);
4473       __ ret(lr);
4474     return entry;
4475   }
4476 
4477   void generate_compare_long_strings() {
4478       StubRoutines::aarch64::_compare_long_string_LL
4479           = generate_compare_long_string_same_encoding(true);
4480       StubRoutines::aarch64::_compare_long_string_UU
4481           = generate_compare_long_string_same_encoding(false);
4482       StubRoutines::aarch64::_compare_long_string_LU
4483           = generate_compare_long_string_different_encoding(true);
4484       StubRoutines::aarch64::_compare_long_string_UL
4485           = generate_compare_long_string_different_encoding(false);
4486   }
4487 
4488   // R0 = result
4489   // R1 = str2
4490   // R2 = cnt1
4491   // R3 = str1
4492   // R4 = cnt2
4493   // This generic linear code uses a few additional ideas that make it faster:
4494   // 1) we can safely keep at least the 1st register of the pattern (since length >= 8),
4495   // in order to skip the initial load (helps on systems with 1 load pipeline)
4496   // 2) we can use the "fast" algorithm of finding a single character to search for
4497   // the first symbol with fewer branches (1 branch per loaded register instead
4498   // of a branch per symbol); this is where constants like
4499   // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
4500   // 3) after loading and analyzing the 1st register of the source string, it can be
4501   // used to search for every occurrence of the 1st character, saving a few loads
4502   // compared with a "simpler-but-slower" implementation
4503   // 4) in order to avoid lots of push/pop operations, the code below heavily
4504   // re-uses/re-initializes/compresses register values, which makes the code
4505   // larger and a bit less readable; however, most of the extra operations are
4506   // issued during loads or branches, so the penalty is minimal
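       //
       // As a sketch in C, the first-character test of idea 2) is approximately
       // (LL case shown; names are illustrative):
       //
       //   uint64_t splat = first_char * 0x0101010101010101ULL;  // first char in every byte
       //   uint64_t x     = chunk ^ splat;                       // zero byte in x <=> match
       //   uint64_t hit   = (x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
       //   // hit != 0  <=>  some byte of 'chunk' equals the first pattern character;
       //   // 0x0001...0001 and 0x7fff...7fff are the 16-bit (UTF-16) analogues.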
4507   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4508     const char* stubName = str1_isL
4509         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4510         : "indexof_linear_uu";
4511     __ align(CodeEntryAlignment);
4512     StubCodeMark mark(this, "StubRoutines", stubName);
4513     address entry = __ pc();
4514 
4515     int str1_chr_size = str1_isL ? 1 : 2;
4516     int str2_chr_size = str2_isL ? 1 : 2;
4517     int str1_chr_shift = str1_isL ? 0 : 1;
4518     int str2_chr_shift = str2_isL ? 0 : 1;
4519     bool isL = str1_isL && str2_isL;
4520     // parameters
4521     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4522     // temporary registers
4523     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4524     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4525     // redefinitions
4526     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4527 
4528     __ push(spilled_regs, sp);
4529     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
4530         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4531         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4532         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4533         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4534         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
4535     // Read a whole register from str1. It is safe because length >= 8 here.
4536     __ ldr(ch1, Address(str1));
4537     // Read a whole register from str2. It is safe because length >= 8 here.
4538     __ ldr(ch2, Address(str2));
4539     __ sub(cnt2, cnt2, cnt1);
4540     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4541     if (str1_isL != str2_isL) {
4542       __ eor(v0, __ T16B, v0, v0);
4543     }
4544     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4545     __ mul(first, first, tmp1);
4546     // check if we have less than 1 register to check
4547     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4548     if (str1_isL != str2_isL) {
4549       __ fmovd(v1, ch1);
4550     }
4551     __ br(__ LE, L_SMALL);
4552     __ eor(ch2, first, ch2);
4553     if (str1_isL != str2_isL) {
4554       __ zip1(v1, __ T16B, v1, v0);
4555     }
4556     __ sub(tmp2, ch2, tmp1);
4557     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4558     __ bics(tmp2, tmp2, ch2);
4559     if (str1_isL != str2_isL) {
4560       __ fmovd(ch1, v1);
4561     }
4562     __ br(__ NE, L_HAS_ZERO);
4563     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4564     __ add(result, result, wordSize/str2_chr_size);
4565     __ add(str2, str2, wordSize);
4566     __ br(__ LT, L_POST_LOOP);
4567     __ BIND(L_LOOP);
4568       __ ldr(ch2, Address(str2));
4569       __ eor(ch2, first, ch2);
4570       __ sub(tmp2, ch2, tmp1);
4571       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4572       __ bics(tmp2, tmp2, ch2);
4573       __ br(__ NE, L_HAS_ZERO);
4574     __ BIND(L_LOOP_PROCEED);
4575       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4576       __ add(str2, str2, wordSize);
4577       __ add(result, result, wordSize/str2_chr_size);
4578       __ br(__ GE, L_LOOP);
4579     __ BIND(L_POST_LOOP);
4580       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
4581       __ br(__ LE, NOMATCH);
4582       __ ldr(ch2, Address(str2));
4583       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4584       __ eor(ch2, first, ch2);
4585       __ sub(tmp2, ch2, tmp1);
4586       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4587       __ mov(tmp4, -1); // all bits set
4588       __ b(L_SMALL_PROCEED);
4589     __ align(OptoLoopAlignment);
4590     __ BIND(L_SMALL);
4591       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4592       __ eor(ch2, first, ch2);
4593       if (str1_isL != str2_isL) {
4594         __ zip1(v1, __ T16B, v1, v0);
4595       }
4596       __ sub(tmp2, ch2, tmp1);
4597       __ mov(tmp4, -1); // all bits set
4598       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4599       if (str1_isL != str2_isL) {
4600         __ fmovd(ch1, v1); // move converted 4 symbols
4601       }
4602     __ BIND(L_SMALL_PROCEED);
4603       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
4604       __ bic(tmp2, tmp2, ch2);
4605       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
4606       __ rbit(tmp2, tmp2);
4607       __ br(__ EQ, NOMATCH);
4608     __ BIND(L_SMALL_HAS_ZERO_LOOP);
4609       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
4610       __ cmp(cnt1, u1(wordSize/str2_chr_size));
4611       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4612       if (str2_isL) { // LL
4613         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4614         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4615         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4616         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4617         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4618       } else {
4619         __ mov(ch2, 0xE); // 0b1110: mask to round the byte offset down to a 2-byte boundary
4620         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4621         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4622         __ lslv(tmp2, tmp2, tmp4);
4623         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4624         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4625         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4626         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4627       }
4628       __ cmp(ch1, ch2);
4629       __ mov(tmp4, wordSize/str2_chr_size);
4630       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4631     __ BIND(L_SMALL_CMP_LOOP);
4632       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4633                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4634       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4635                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4636       __ add(tmp4, tmp4, 1);
4637       __ cmp(tmp4, cnt1);
4638       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4639       __ cmp(first, ch2);
4640       __ br(__ EQ, L_SMALL_CMP_LOOP);
4641     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4642       __ cbz(tmp2, NOMATCH); // no more matches. exit
4643       __ clz(tmp4, tmp2);
4644       __ add(result, result, 1); // advance index
4645       __ add(str2, str2, str2_chr_size); // advance pointer
4646       __ b(L_SMALL_HAS_ZERO_LOOP);
4647     __ align(OptoLoopAlignment);
4648     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4649       __ cmp(first, ch2);
4650       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4651       __ b(DONE);
4652     __ align(OptoLoopAlignment);
4653     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4654       if (str2_isL) { // LL
4655         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4656         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4657         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4658         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4659         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4660       } else {
4661         __ mov(ch2, 0xE); // 0b1110: mask to round the byte offset down to a 2-byte boundary
4662         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4663         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4664         __ lslv(tmp2, tmp2, tmp4);
4665         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4666         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4667         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4668         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4669       }
4670       __ cmp(ch1, ch2);
4671       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4672       __ b(DONE);
4673     __ align(OptoLoopAlignment);
4674     __ BIND(L_HAS_ZERO);
4675       __ rbit(tmp2, tmp2);
4676       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
4677       // Now, compress the counters (cnt2 and cnt1) into one register. This is fine
4678       // because both counters are 32-bit and are not changed in this loop; they are
4679       // just restored on exit. So, cnt1 can be re-used in this loop.
4680       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
4681       __ sub(result, result, 1);
4682     __ BIND(L_HAS_ZERO_LOOP);
4683       __ mov(cnt1, wordSize/str2_chr_size);
4684       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4685       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4686       if (str2_isL) {
4687         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4688         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4689         __ lslv(tmp2, tmp2, tmp4);
4690         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4691         __ add(tmp4, tmp4, 1);
4692         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4693         __ lsl(tmp2, tmp2, 1);
4694         __ mov(tmp4, wordSize/str2_chr_size);
4695       } else {
4696         __ mov(ch2, 0xE);
4697         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4698         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4699         __ lslv(tmp2, tmp2, tmp4);
4700         __ add(tmp4, tmp4, 1);
4701         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4702         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4703         __ lsl(tmp2, tmp2, 1);
4704         __ mov(tmp4, wordSize/str2_chr_size);
4705         __ sub(str2, str2, str2_chr_size);
4706       }
4707       __ cmp(ch1, ch2);
4708       __ mov(tmp4, wordSize/str2_chr_size);
4709       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4710     __ BIND(L_CMP_LOOP);
4711       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4712                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4713       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4714                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4715       __ add(tmp4, tmp4, 1);
4716       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4717       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4718       __ cmp(cnt1, ch2);
4719       __ br(__ EQ, L_CMP_LOOP);
4720     __ BIND(L_CMP_LOOP_NOMATCH);
4721       // no match at this position
4722       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4723       __ clz(tmp4, tmp2);
4724       __ add(str2, str2, str2_chr_size); // advance pointer
4725       __ b(L_HAS_ZERO_LOOP);
4726     __ align(OptoLoopAlignment);
4727     __ BIND(L_CMP_LOOP_LAST_CMP);
4728       __ cmp(cnt1, ch2);
4729       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4730       __ b(DONE);
4731     __ align(OptoLoopAlignment);
4732     __ BIND(L_CMP_LOOP_LAST_CMP2);
4733       if (str2_isL) {
4734         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4735         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4736         __ lslv(tmp2, tmp2, tmp4);
4737         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4738         __ add(tmp4, tmp4, 1);
4739         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4740         __ lsl(tmp2, tmp2, 1);
4741       } else {
4742         __ mov(ch2, 0xE);
4743         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4744         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4745         __ lslv(tmp2, tmp2, tmp4);
4746         __ add(tmp4, tmp4, 1);
4747         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4748         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4749         __ lsl(tmp2, tmp2, 1);
4750         __ sub(str2, str2, str2_chr_size);
4751       }
4752       __ cmp(ch1, ch2);
4753       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4754       __ b(DONE);
4755     __ align(OptoLoopAlignment);
4756     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
4757       // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N until
4758       // the L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP,
4759       // so result was increased by at most wordSize/str2_chr_size - 1 and the
4760       // respective high bits weren't changed. L_LOOP_PROCEED will increase
4761       // result by the number of analyzed characters, so we can just reset the lower
4762       // bits of result here. Clear 2 lower bits for UU/UL and 3 bits for LL.
4763       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
4764       // 3) Advance str2 to the next str2 octet. result & 7/3 is the
4765       // index of the last analyzed substring inside the current octet, so str2 is at
4766       // the respective start address. We need to advance it to the next octet.
4767       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4768       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4769       __ bfm(result, zr, 0, 2 - str2_chr_shift);
4770       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4771       __ movw(cnt2, cnt2);
4772       __ b(L_LOOP_PROCEED);
4773     __ align(OptoLoopAlignment);
4774     __ BIND(NOMATCH);
4775       __ mov(result, -1);
4776     __ BIND(DONE);
4777       __ pop(spilled_regs, sp);
4778       __ ret(lr);
4779     return entry;
4780   }
4781 
4782   void generate_string_indexof_stubs() {
4783     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
4784     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
4785     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
4786   }
4787 
4788   void inflate_and_store_2_fp_registers(bool generatePrfm,
4789       FloatRegister src1, FloatRegister src2) {
4790     Register dst = r1;
4791     __ zip1(v1, __ T16B, src1, v0);
4792     __ zip2(v2, __ T16B, src1, v0);
4793     if (generatePrfm) {
4794       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4795     }
4796     __ zip1(v3, __ T16B, src2, v0);
4797     __ zip2(v4, __ T16B, src2, v0);
4798     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4799   }
4800 
4801   // R0 = src
4802   // R1 = dst
4803   // R2 = len
4804   // R3 = len >> 3
4805   // V0 = 0
4806   // v1 = loaded 8 bytes
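       //
       // In C, the inflation is approximately (a sketch with illustrative types; the
       // stub below additionally unrolls by 64 bytes and prefetches):
       //
       //   const uint8_t* src;  uint16_t* dst;  size_t len;
       //   for (size_t i = 0; i < len; i++)
       //     dst[i] = src[i];   // zero-extend each Latin1 byte to a UTF-16 char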
4807   address generate_large_byte_array_inflate() {
4808     __ align(CodeEntryAlignment);
4809     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4810     address entry = __ pc();
4811     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4812     Register src = r0, dst = r1, len = r2, octetCounter = r3;
4813     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
4814 
4815     // Do one more 8-byte read so that the address is 16-byte aligned in most cases;
4816     // this also lets us use a single store instruction.
4817     __ ldrd(v2, __ post(src, 8));
4818     __ sub(octetCounter, octetCounter, 2);
4819     __ zip1(v1, __ T16B, v1, v0);
4820     __ zip1(v2, __ T16B, v2, v0);
4821     __ st1(v1, v2, __ T16B, __ post(dst, 32));
4822     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4823     __ subs(rscratch1, octetCounter, large_loop_threshold);
4824     __ br(__ LE, LOOP_START);
4825     __ b(LOOP_PRFM_START);
4826     __ bind(LOOP_PRFM);
4827       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4828     __ bind(LOOP_PRFM_START);
4829       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4830       __ sub(octetCounter, octetCounter, 8);
4831       __ subs(rscratch1, octetCounter, large_loop_threshold);
4832       inflate_and_store_2_fp_registers(true, v3, v4);
4833       inflate_and_store_2_fp_registers(true, v5, v6);
4834       __ br(__ GT, LOOP_PRFM);
4835       __ cmp(octetCounter, (u1)8);
4836       __ br(__ LT, DONE);
4837     __ bind(LOOP);
4838       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4839       __ bind(LOOP_START);
4840       __ sub(octetCounter, octetCounter, 8);
4841       __ cmp(octetCounter, (u1)8);
4842       inflate_and_store_2_fp_registers(false, v3, v4);
4843       inflate_and_store_2_fp_registers(false, v5, v6);
4844       __ br(__ GE, LOOP);
4845     __ bind(DONE);
4846       __ ret(lr);
4847     return entry;
4848   }
4849 
4850   /**
4851    *  Arguments:
4852    *
4853    *  Input:
4854    *  c_rarg0   - current state address
4855    *  c_rarg1   - H key address
4856    *  c_rarg2   - data address
4857    *  c_rarg3   - number of blocks
4858    *
4859    *  Output:
4860    *  Updated state at c_rarg0
4861    */
4862   address generate_ghash_processBlocks() {
4863     // Bafflingly, GCM uses little-endian for the byte order, but
4864     // big-endian for the bit order.  For example, the polynomial 1 is
4865     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4866     //
4867     // So, we must either reverse the bytes in each word and do
4868     // everything big-endian or reverse the bits in each byte and do
4869     // it little-endian.  On AArch64 it's more idiomatic to reverse
4870     // the bits in each byte (we have an instruction, RBIT, to do
4871     // that) and keep the data in little-endian bit order throughout the
4872     // calculation, bit-reversing the inputs and outputs.
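         //
         // In pseudocode, each iteration of the loop below computes, in GF(2^128)
         // with reduction polynomial x^128 + x^7 + x^2 + x + 1:
         //
         //   state = (state ^ data_block) * subkeyH  mod  P(x)
         //
         // ghash_multiply performs the carry-less (Karatsuba) multiplication and
         // ghash_reduce folds the 256-bit product back to 128 bits using the 0x87
         // constant loaded into v26 below.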
4873 
4874     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4875     __ align(wordSize * 2);
4876     address p = __ pc();
4877     __ emit_int64(0x87);  // The low-order bits of the field
4878                           // polynomial (i.e. p = z^7+z^2+z+1)
4879                           // repeated in the low and high parts of a
4880                           // 128-bit vector
4881     __ emit_int64(0x87);
4882 
4883     __ align(CodeEntryAlignment);
4884     address start = __ pc();
4885 
4886     Register state   = c_rarg0;
4887     Register subkeyH = c_rarg1;
4888     Register data    = c_rarg2;
4889     Register blocks  = c_rarg3;
4890 
4891     FloatRegister vzr = v30;
4892     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4893 
4894     __ ldrq(v0, Address(state));
4895     __ ldrq(v1, Address(subkeyH));
4896 
4897     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4898     __ rbit(v0, __ T16B, v0);
4899     __ rev64(v1, __ T16B, v1);
4900     __ rbit(v1, __ T16B, v1);
4901 
4902     __ ldrq(v26, p);
4903 
4904     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4905     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4906 
4907     {
4908       Label L_ghash_loop;
4909       __ bind(L_ghash_loop);
4910 
4911       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4912                                                  // reversing each byte
4913       __ rbit(v2, __ T16B, v2);
4914       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4915 
4916       // Multiply state in v2 by subkey in v1
4917       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4918                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4919                      /*temps*/v6, v20, v18, v21);
4920       // Reduce v7:v5 by the field polynomial
4921       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4922 
4923       __ sub(blocks, blocks, 1);
4924       __ cbnz(blocks, L_ghash_loop);
4925     }
4926 
4927     // The bit-reversed result is at this point in v0
4928     __ rev64(v1, __ T16B, v0);
4929     __ rbit(v1, __ T16B, v1);
4930 
4931     __ st1(v1, __ T16B, state);
4932     __ ret(lr);
4933 
4934     return start;
4935   }
4936 
4937   // Continuation point for throwing of implicit exceptions that are
4938   // not handled in the current activation. Fabricates an exception
4939   // oop and initiates normal exception dispatching in this
4940   // frame. Since we need to preserve callee-saved values (currently
4941   // only for C2, but done for C1 as well) we need a callee-saved oop
4942   // map and therefore have to make these stubs into RuntimeStubs
4943   // rather than BufferBlobs.  If the compiler needs all registers to
4944   // be preserved between the fault point and the exception handler
4945   // then it must assume responsibility for that in
4946   // AbstractCompiler::continuation_for_implicit_null_exception or
4947   // continuation_for_implicit_division_by_zero_exception. All other
4948   // implicit exceptions (e.g., NullPointerException or
4949   // AbstractMethodError on entry) are either at call sites or
4950   // otherwise assume that stack unwinding will be initiated, so
4951   // caller saved registers were assumed volatile in the compiler.
4952 
4953 #undef __
4954 #define __ masm->
4955 
4956   address generate_throw_exception(const char* name,
4957                                    address runtime_entry,
4958                                    Register arg1 = noreg,
4959                                    Register arg2 = noreg) {
4960     // Information about frame layout at time of blocking runtime call.
4961     // Note that we only have to preserve callee-saved registers since
4962     // the compilers are responsible for supplying a continuation point
4963     // if they expect all registers to be preserved.
4964     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4965     enum layout {
4966       rfp_off = 0,
4967       rfp_off2,
4968       return_off,
4969       return_off2,
4970       framesize // inclusive of return address
4971     };
4972 
4973     int insts_size = 512;
4974     int locs_size  = 64;
4975 
4976     CodeBuffer code(name, insts_size, locs_size);
4977     OopMapSet* oop_maps  = new OopMapSet();
4978     MacroAssembler* masm = new MacroAssembler(&code);
4979 
4980     address start = __ pc();
4981 
4982     // This is an inlined and slightly modified version of call_VM
4983     // which has the ability to fetch the return PC out of
4984     // thread-local storage and also sets up last_Java_sp slightly
4985     // differently than the real call_VM
4986 
4987     __ enter(); // Save FP and LR before call
4988 
4989     assert(is_even(framesize/2), "sp not 16-byte aligned");
4990 
4991     // lr and fp are already in place
4992     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4993 
4994     int frame_complete = __ pc() - start;
4995 
4996     // Set up last_Java_sp and last_Java_fp
4997     address the_pc = __ pc();
4998     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
4999 
5000     // Call runtime
5001     if (arg1 != noreg) {
5002       assert(arg2 != c_rarg1, "clobbered");
5003       __ mov(c_rarg1, arg1);
5004     }
5005     if (arg2 != noreg) {
5006       __ mov(c_rarg2, arg2);
5007     }
5008     __ mov(c_rarg0, rthread);
5009     BLOCK_COMMENT("call runtime_entry");
5010     __ mov(rscratch1, runtime_entry);
5011     __ blr(rscratch1);
5012 
5013     // Generate oop map
5014     OopMap* map = new OopMap(framesize, 0);
5015 
5016     oop_maps->add_gc_map(the_pc - start, map);
5017 
5018     __ reset_last_Java_frame(true);
5019     __ maybe_isb();
5020 
5021     __ leave();
5022 
5023     // check for pending exceptions
5024 #ifdef ASSERT
5025     Label L;
5026     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
5027     __ cbnz(rscratch1, L);
5028     __ should_not_reach_here();
5029     __ bind(L);
5030 #endif // ASSERT
5031     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
5032 
5033 
5034     // codeBlob framesize is in words (not VMRegImpl::slot_size)
5035     RuntimeStub* stub =
5036       RuntimeStub::new_runtime_stub(name,
5037                                     &code,
5038                                     frame_complete,
5039                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
5040                                     oop_maps, false);
5041     return stub->entry_point();
5042   }
5043 
5044   class MontgomeryMultiplyGenerator : public MacroAssembler {
5045 
5046     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
5047       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
5048 
5049     RegSet _toSave;
5050     bool _squaring;
5051 
5052   public:
5053     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
5054       : MacroAssembler(as->code()), _squaring(squaring) {
5055 
5056       // Register allocation
5057 
5058       Register reg = c_rarg0;
5059       Pa_base = reg;       // Argument registers
5060       if (squaring)
5061         Pb_base = Pa_base;
5062       else
5063         Pb_base = ++reg;
5064       Pn_base = ++reg;
5065       Rlen= ++reg;
5066       inv = ++reg;
5067       Pm_base = ++reg;
5068 
5069                           // Working registers:
5070       Ra =  ++reg;        // The current digit of a, b, n, and m.
5071       Rb =  ++reg;
5072       Rm =  ++reg;
5073       Rn =  ++reg;
5074 
5075       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
5076       Pb =  ++reg;
5077       Pm =  ++reg;
5078       Pn =  ++reg;
5079 
5080       t0 =  ++reg;        // Three registers which form a
5081       t1 =  ++reg;        // triple-precision accumulator.
5082       t2 =  ++reg;
5083 
5084       Ri =  ++reg;        // Inner and outer loop indexes.
5085       Rj =  ++reg;
5086 
5087       Rhi_ab = ++reg;     // Product registers: low and high parts
5088       Rlo_ab = ++reg;     // of a*b and m*n.
5089       Rhi_mn = ++reg;
5090       Rlo_mn = ++reg;
5091 
5092       // r19 and up are callee-saved.
5093       _toSave = RegSet::range(r19, reg) + Pm_base;
5094     }
5095 
5096   private:
5097     void save_regs() {
5098       push(_toSave, sp);
5099     }
5100 
5101     void restore_regs() {
5102       pop(_toSave, sp);
5103     }
5104 
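         // The unroll_2 helpers emit two copies of 'block' and execute it 'count'
         // times in total; an odd count enters at the second copy ('odd') on the
         // first pass. 'count' is clobbered.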
5105     template <typename T>
5106     void unroll_2(Register count, T block) {
5107       Label loop, end, odd;
5108       tbnz(count, 0, odd);
5109       cbz(count, end);
5110       align(16);
5111       bind(loop);
5112       (this->*block)();
5113       bind(odd);
5114       (this->*block)();
5115       subs(count, count, 2);
5116       br(Assembler::GT, loop);
5117       bind(end);
5118     }
5119 
5120     template <typename T>
5121     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
5122       Label loop, end, odd;
5123       tbnz(count, 0, odd);
5124       cbz(count, end);
5125       align(16);
5126       bind(loop);
5127       (this->*block)(d, s, tmp);
5128       bind(odd);
5129       (this->*block)(d, s, tmp);
5130       subs(count, count, 2);
5131       br(Assembler::GT, loop);
5132       bind(end);
5133     }
5134 
5135     void pre1(RegisterOrConstant i) {
5136       block_comment("pre1");
5137       // Pa = Pa_base;
5138       // Pb = Pb_base + i;
5139       // Pm = Pm_base;
5140       // Pn = Pn_base + i;
5141       // Ra = *Pa;
5142       // Rb = *Pb;
5143       // Rm = *Pm;
5144       // Rn = *Pn;
5145       ldr(Ra, Address(Pa_base));
5146       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
5147       ldr(Rm, Address(Pm_base));
5148       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5149       lea(Pa, Address(Pa_base));
5150       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
5151       lea(Pm, Address(Pm_base));
5152       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5153 
5154       // Zero the m*n result.
5155       mov(Rhi_mn, zr);
5156       mov(Rlo_mn, zr);
5157     }
5158 
5159     // The core multiply-accumulate step of a Montgomery
5160     // multiplication.  The idea is to schedule operations as a
5161     // pipeline so that instructions with long latencies (loads and
5162     // multiplies) have time to complete before their results are
5163     // used.  This most benefits in-order implementations of the
5164     // architecture but out-of-order ones also benefit.
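         //
         // MACC(A, B, t0, t1, t2), referred to in the comments, is approximately:
         //   hi:lo = A * B;  t0 += lo;  t1 += hi + carry;  t2 += carry;
         // i.e. the 128-bit product is accumulated into the triple-precision
         // accumulator t2:t1:t0 (umulh + mul followed by acc() below).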
5165     void step() {
5166       block_comment("step");
5167       // MACC(Ra, Rb, t0, t1, t2);
5168       // Ra = *++Pa;
5169       // Rb = *--Pb;
5170       umulh(Rhi_ab, Ra, Rb);
5171       mul(Rlo_ab, Ra, Rb);
5172       ldr(Ra, pre(Pa, wordSize));
5173       ldr(Rb, pre(Pb, -wordSize));
5174       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
5175                                        // previous iteration.
5176       // MACC(Rm, Rn, t0, t1, t2);
5177       // Rm = *++Pm;
5178       // Rn = *--Pn;
5179       umulh(Rhi_mn, Rm, Rn);
5180       mul(Rlo_mn, Rm, Rn);
5181       ldr(Rm, pre(Pm, wordSize));
5182       ldr(Rn, pre(Pn, -wordSize));
5183       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5184     }
5185 
5186     void post1() {
5187       block_comment("post1");
5188 
5189       // MACC(Ra, Rb, t0, t1, t2);
5190       // Ra = *++Pa;
5191       // Rb = *--Pb;
5192       umulh(Rhi_ab, Ra, Rb);
5193       mul(Rlo_ab, Ra, Rb);
5194       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5195       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5196 
5197       // *Pm = Rm = t0 * inv;
5198       mul(Rm, t0, inv);
5199       str(Rm, Address(Pm));
5200 
5201       // MACC(Rm, Rn, t0, t1, t2);
5202       // t0 = t1; t1 = t2; t2 = 0;
5203       umulh(Rhi_mn, Rm, Rn);
5204 
5205 #ifndef PRODUCT
5206       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5207       {
5208         mul(Rlo_mn, Rm, Rn);
5209         add(Rlo_mn, t0, Rlo_mn);
5210         Label ok;
5211         cbz(Rlo_mn, ok); {
5212           stop("broken Montgomery multiply");
5213         } bind(ok);
5214       }
5215 #endif
5216       // We have very carefully set things up so that
5217       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5218       // the lower half of Rm * Rn because we know the result already:
5219       // it must be -t0.  t0 + (-t0) must generate a carry iff
5220       // t0 != 0.  So, rather than do a mul and an adds we just set
5221       // the carry flag iff t0 is nonzero.
5222       //
5223       // mul(Rlo_mn, Rm, Rn);
5224       // adds(zr, t0, Rlo_mn);
5225       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5226       adcs(t0, t1, Rhi_mn);
5227       adc(t1, t2, zr);
5228       mov(t2, zr);
5229     }
5230 
5231     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
5232       block_comment("pre2");
5233       // Pa = Pa_base + i-len;
5234       // Pb = Pb_base + len;
5235       // Pm = Pm_base + i-len;
5236       // Pn = Pn_base + len;
5237 
5238       if (i.is_register()) {
5239         sub(Rj, i.as_register(), len);
5240       } else {
5241         mov(Rj, i.as_constant());
5242         sub(Rj, Rj, len);
5243       }
5244       // Rj == i-len
5245 
5246       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5247       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5248       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5249       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5250 
5251       // Ra = *++Pa;
5252       // Rb = *--Pb;
5253       // Rm = *++Pm;
5254       // Rn = *--Pn;
5255       ldr(Ra, pre(Pa, wordSize));
5256       ldr(Rb, pre(Pb, -wordSize));
5257       ldr(Rm, pre(Pm, wordSize));
5258       ldr(Rn, pre(Pn, -wordSize));
5259 
5260       mov(Rhi_mn, zr);
5261       mov(Rlo_mn, zr);
5262     }
5263 
5264     void post2(RegisterOrConstant i, RegisterOrConstant len) {
5265       block_comment("post2");
5266       if (i.is_constant()) {
5267         mov(Rj, i.as_constant()-len.as_constant());
5268       } else {
5269         sub(Rj, i.as_register(), len);
5270       }
5271 
5272       adds(t0, t0, Rlo_mn); // The pending m*n, low part
5273 
5274       // As soon as we know the least significant digit of our result,
5275       // store it.
5276       // Pm_base[i-len] = t0;
5277       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5278 
5279       // t0 = t1; t1 = t2; t2 = 0;
5280       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5281       adc(t1, t2, zr);
5282       mov(t2, zr);
5283     }
5284 
5285     // A carry in t0 after Montgomery multiplication means that we
5286     // should subtract multiples of n from our result in m.  We'll
5287     // keep doing that until there is no carry.
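         // (sub() in the C comment below denotes an in-place multi-word subtraction
         // Pm_base[0..len) -= Pn_base[0..len); t0 is then decremented by the final borrow.)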
5288     void normalize(RegisterOrConstant len) {
5289       block_comment("normalize");
5290       // while (t0)
5291       //   t0 = sub(Pm_base, Pn_base, t0, len);
5292       Label loop, post, again;
5293       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5294       cbz(t0, post); {
5295         bind(again); {
5296           mov(i, zr);
5297           mov(cnt, len);
5298           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5299           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5300           subs(zr, zr, zr); // set carry flag, i.e. no borrow
5301           align(16);
5302           bind(loop); {
5303             sbcs(Rm, Rm, Rn);
5304             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5305             add(i, i, 1);
5306             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5307             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5308             sub(cnt, cnt, 1);
5309           } cbnz(cnt, loop);
5310           sbc(t0, t0, zr);
5311         } cbnz(t0, again);
5312       } bind(post);
5313     }
5314 
5315     // Move memory at s to d, reversing words.
5316     //    Increments d to end of copied memory
5317     //    Destroys tmp1, tmp2
5318     //    Preserves len
5319     //    Leaves s pointing to the address which was in d at start
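         //    In C, approximately (viewing both buffers as arrays of 32-bit ints;
         //    len is counted in 64-bit words):
         //      for (int i = 0; i < 2*len; i++)
         //        ((juint *)d)[i] = ((juint *)s)[2*len - 1 - i];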
5320     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
5321       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5322 
5323       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
5324       mov(tmp1, len);
5325       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
5326       sub(s, d, len, ext::uxtw, LogBytesPerWord);
5327     }
5328     // where
5329     void reverse1(Register d, Register s, Register tmp) {
5330       ldr(tmp, pre(s, -wordSize));
5331       ror(tmp, tmp, 32);
5332       str(tmp, post(d, wordSize));
5333     }
5334 
5335     void step_squaring() {
5336       // An extra ACC
5337       step();
5338       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5339     }
5340 
5341     void last_squaring(RegisterOrConstant i) {
5342       Label dont;
5343       // if ((i & 1) == 0) {
5344       tbnz(i.as_register(), 0, dont); {
5345         // MACC(Ra, Rb, t0, t1, t2);
5346         // Ra = *++Pa;
5347         // Rb = *--Pb;
5348         umulh(Rhi_ab, Ra, Rb);
5349         mul(Rlo_ab, Ra, Rb);
5350         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5351       } bind(dont);
5352     }
5353 
5354     void extra_step_squaring() {
5355       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5356 
5357       // MACC(Rm, Rn, t0, t1, t2);
5358       // Rm = *++Pm;
5359       // Rn = *--Pn;
5360       umulh(Rhi_mn, Rm, Rn);
5361       mul(Rlo_mn, Rm, Rn);
5362       ldr(Rm, pre(Pm, wordSize));
5363       ldr(Rn, pre(Pn, -wordSize));
5364     }
5365 
5366     void post1_squaring() {
5367       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5368 
5369       // *Pm = Rm = t0 * inv;
5370       mul(Rm, t0, inv);
5371       str(Rm, Address(Pm));
5372 
5373       // MACC(Rm, Rn, t0, t1, t2);
5374       // t0 = t1; t1 = t2; t2 = 0;
5375       umulh(Rhi_mn, Rm, Rn);
5376 
5377 #ifndef PRODUCT
5378       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5379       {
5380         mul(Rlo_mn, Rm, Rn);
5381         add(Rlo_mn, t0, Rlo_mn);
5382         Label ok;
5383         cbz(Rlo_mn, ok); {
5384           stop("broken Montgomery multiply");
5385         } bind(ok);
5386       }
5387 #endif
5388       // We have very carefully set things up so that
5389       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5390       // the lower half of Rm * Rn because we know the result already:
5391       // it must be -t0.  t0 + (-t0) must generate a carry iff
5392       // t0 != 0.  So, rather than do a mul and an adds we just set
5393       // the carry flag iff t0 is nonzero.
5394       //
5395       // mul(Rlo_mn, Rm, Rn);
5396       // adds(zr, t0, Rlo_mn);
5397       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5398       adcs(t0, t1, Rhi_mn);
5399       adc(t1, t2, zr);
5400       mov(t2, zr);
5401     }
5402 
5403     void acc(Register Rhi, Register Rlo,
5404              Register t0, Register t1, Register t2) {
5405       adds(t0, t0, Rlo);
5406       adcs(t1, t1, Rhi);
5407       adc(t2, t2, zr);
5408     }
5409 
5410   public:
5411     /**
5412      * Fast Montgomery multiplication.  The derivation of the
5413      * algorithm is in A Cryptographic Library for the Motorola
5414      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5415      *
5416      * Arguments:
5417      *
5418      * Inputs for multiplication:
5419      *   c_rarg0   - int array elements a
5420      *   c_rarg1   - int array elements b
5421      *   c_rarg2   - int array elements n (the modulus)
5422      *   c_rarg3   - int length
5423      *   c_rarg4   - int inv
5424      *   c_rarg5   - int array elements m (the result)
5425      *
5426      * Inputs for squaring:
5427      *   c_rarg0   - int array elements a
5428      *   c_rarg1   - int array elements n (the modulus)
5429      *   c_rarg2   - int length
5430      *   c_rarg3   - int inv
5431      *   c_rarg4   - int array elements m (the result)
5432      *
5433      */
5434     address generate_multiply() {
5435       Label argh, nothing;
5436       bind(argh);
5437       stop("MontgomeryMultiply total_allocation must be <= 8192");
5438 
5439       align(CodeEntryAlignment);
5440       address entry = pc();
5441 
5442       cbzw(Rlen, nothing);
5443 
5444       enter();
5445 
5446       // Make room.
5447       cmpw(Rlen, 512);
5448       br(Assembler::HI, argh);
5449       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5450       andr(sp, Ra, -2 * wordSize);
5451 
5452       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5453 
5454       {
5455         // Copy input args, reversing as we go.  We use Ra as a
5456         // temporary variable.
5457         reverse(Ra, Pa_base, Rlen, t0, t1);
5458         if (!_squaring)
5459           reverse(Ra, Pb_base, Rlen, t0, t1);
5460         reverse(Ra, Pn_base, Rlen, t0, t1);
5461       }
5462 
5463       // Push all call-saved registers and also Pm_base which we'll need
5464       // at the end.
5465       save_regs();
5466 
5467 #ifndef PRODUCT
5468       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
5469       {
5470         ldr(Rn, Address(Pn_base, 0));
5471         mul(Rlo_mn, Rn, inv);
5472         subs(zr, Rlo_mn, -1);
5473         Label ok;
5474         br(EQ, ok); {
5475           stop("broken inverse in Montgomery multiply");
5476         } bind(ok);
5477       }
5478 #endif
5479 
5480       mov(Pm_base, Ra);
5481 
5482       mov(t0, zr);
5483       mov(t1, zr);
5484       mov(t2, zr);
5485 
5486       block_comment("for (int i = 0; i < len; i++) {");
5487       mov(Ri, zr); {
5488         Label loop, end;
5489         cmpw(Ri, Rlen);
5490         br(Assembler::GE, end);
5491 
5492         bind(loop);
5493         pre1(Ri);
5494 
5495         block_comment("  for (j = i; j; j--) {"); {
5496           movw(Rj, Ri);
5497           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5498         } block_comment("  } // j");
5499 
5500         post1();
5501         addw(Ri, Ri, 1);
5502         cmpw(Ri, Rlen);
5503         br(Assembler::LT, loop);
5504         bind(end);
5505         block_comment("} // i");
5506       }
5507 
5508       block_comment("for (int i = len; i < 2*len; i++) {");
5509       mov(Ri, Rlen); {
5510         Label loop, end;
5511         cmpw(Ri, Rlen, Assembler::LSL, 1);
5512         br(Assembler::GE, end);
5513 
5514         bind(loop);
5515         pre2(Ri, Rlen);
5516 
5517         block_comment("  for (j = len*2-i-1; j; j--) {"); {
5518           lslw(Rj, Rlen, 1);
5519           subw(Rj, Rj, Ri);
5520           subw(Rj, Rj, 1);
5521           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5522         } block_comment("  } // j");
5523 
5524         post2(Ri, Rlen);
5525         addw(Ri, Ri, 1);
5526         cmpw(Ri, Rlen, Assembler::LSL, 1);
5527         br(Assembler::LT, loop);
5528         bind(end);
5529       }
5530       block_comment("} // i");
5531 
5532       normalize(Rlen);
5533 
5534       mov(Ra, Pm_base);  // Save Pm_base in Ra
5535       restore_regs();  // Restore caller's Pm_base
5536 
5537       // Copy our result into caller's Pm_base
5538       reverse(Pm_base, Ra, Rlen, t0, t1);
5539 
5540       leave();
5541       bind(nothing);
5542       ret(lr);
5543 
5544       return entry;
5545     }
5546     // In C, approximately:
5547 
5548     // void
5549     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
5550     //                     julong Pn_base[], julong Pm_base[],
5551     //                     julong inv, int len) {
5552     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5553     //   julong *Pa, *Pb, *Pn, *Pm;
5554     //   julong Ra, Rb, Rn, Rm;
5555 
5556     //   int i;
5557 
5558     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5559 
5560     //   for (i = 0; i < len; i++) {
5561     //     int j;
5562 
5563     //     Pa = Pa_base;
5564     //     Pb = Pb_base + i;
5565     //     Pm = Pm_base;
5566     //     Pn = Pn_base + i;
5567 
5568     //     Ra = *Pa;
5569     //     Rb = *Pb;
5570     //     Rm = *Pm;
5571     //     Rn = *Pn;
5572 
5573     //     int iters = i;
5574     //     for (j = 0; iters--; j++) {
5575     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5576     //       MACC(Ra, Rb, t0, t1, t2);
5577     //       Ra = *++Pa;
5578     //       Rb = *--Pb;
5579     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5580     //       MACC(Rm, Rn, t0, t1, t2);
5581     //       Rm = *++Pm;
5582     //       Rn = *--Pn;
5583     //     }
5584 
5585     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
5586     //     MACC(Ra, Rb, t0, t1, t2);
5587     //     *Pm = Rm = t0 * inv;
5588     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5589     //     MACC(Rm, Rn, t0, t1, t2);
5590 
5591     //     assert(t0 == 0, "broken Montgomery multiply");
5592 
5593     //     t0 = t1; t1 = t2; t2 = 0;
5594     //   }
5595 
5596     //   for (i = len; i < 2*len; i++) {
5597     //     int j;
5598 
5599     //     Pa = Pa_base + i-len;
5600     //     Pb = Pb_base + len;
5601     //     Pm = Pm_base + i-len;
5602     //     Pn = Pn_base + len;
5603 
5604     //     Ra = *++Pa;
5605     //     Rb = *--Pb;
5606     //     Rm = *++Pm;
5607     //     Rn = *--Pn;
5608 
5609     //     int iters = len*2-i-1;
5610     //     for (j = i-len+1; iters--; j++) {
5611     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5612     //       MACC(Ra, Rb, t0, t1, t2);
5613     //       Ra = *++Pa;
5614     //       Rb = *--Pb;
5615     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5616     //       MACC(Rm, Rn, t0, t1, t2);
5617     //       Rm = *++Pm;
5618     //       Rn = *--Pn;
5619     //     }
5620 
5621     //     Pm_base[i-len] = t0;
5622     //     t0 = t1; t1 = t2; t2 = 0;
5623     //   }
5624 
5625     //   while (t0)
5626     //     t0 = sub(Pm_base, Pn_base, t0, len);
5627     // }
5628 
5629     /**
5630      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
5631      * multiplies than Montgomery multiplication so it should be up to
5632      * 25% faster.  However, its loop control is more complex and it
5633      * may actually run slower on some machines.
5634      *
5635      * Arguments:
5636      *
5637      * Inputs:
5638      *   c_rarg0   - int array elements a
5639      *   c_rarg1   - int array elements n (the modulus)
5640      *   c_rarg2   - int length
5641      *   c_rarg3   - int inv
5642      *   c_rarg4   - int array elements m (the result)
5643      *
5644      */
5645     address generate_square() {
5646       Label argh;
5647       bind(argh);
5648       stop("MontgomeryMultiply total_allocation must be <= 8192");
5649 
5650       align(CodeEntryAlignment);
5651       address entry = pc();
5652 
5653       enter();
5654 
5655       // Make room.
5656       cmpw(Rlen, 512);
5657       br(Assembler::HI, argh);
5658       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5659       andr(sp, Ra, -2 * wordSize);
5660 
5661       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5662 
5663       {
5664         // Copy input args, reversing as we go.  We use Ra as a
5665         // temporary variable.
5666         reverse(Ra, Pa_base, Rlen, t0, t1);
5667         reverse(Ra, Pn_base, Rlen, t0, t1);
5668       }
5669 
5670       // Push all call-saved registers and also Pm_base which we'll need
5671       // at the end.
5672       save_regs();
5673 
5674       mov(Pm_base, Ra);
5675 
5676       mov(t0, zr);
5677       mov(t1, zr);
5678       mov(t2, zr);
5679 
5680       block_comment("for (int i = 0; i < len; i++) {");
5681       mov(Ri, zr); {
5682         Label loop, end;
5683         bind(loop);
5684         cmp(Ri, Rlen);
5685         br(Assembler::GE, end);
5686 
5687         pre1(Ri);
5688 
5689         block_comment("for (j = (i+1)/2; j; j--) {"); {
5690           add(Rj, Ri, 1);
5691           lsr(Rj, Rj, 1);
5692           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5693         } block_comment("  } // j");
5694 
5695         last_squaring(Ri);
5696 
5697         block_comment("  for (j = i/2; j; j--) {"); {
5698           lsr(Rj, Ri, 1);
5699           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5700         } block_comment("  } // j");
5701 
5702         post1_squaring();
5703         add(Ri, Ri, 1);
5704         cmp(Ri, Rlen);
5705         br(Assembler::LT, loop);
5706 
5707         bind(end);
5708         block_comment("} // i");
5709       }
5710 
5711       block_comment("for (int i = len; i < 2*len; i++) {");
5712       mov(Ri, Rlen); {
5713         Label loop, end;
5714         bind(loop);
5715         cmp(Ri, Rlen, Assembler::LSL, 1);
5716         br(Assembler::GE, end);
5717 
5718         pre2(Ri, Rlen);
5719 
5720         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
5721           lsl(Rj, Rlen, 1);
5722           sub(Rj, Rj, Ri);
5723           sub(Rj, Rj, 1);
5724           lsr(Rj, Rj, 1);
5725           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5726         } block_comment("  } // j");
5727 
5728         last_squaring(Ri);
5729 
5730         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
5731           lsl(Rj, Rlen, 1);
5732           sub(Rj, Rj, Ri);
5733           lsr(Rj, Rj, 1);
5734           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5735         } block_comment("  } // j");
5736 
5737         post2(Ri, Rlen);
5738         add(Ri, Ri, 1);
5739         cmp(Ri, Rlen, Assembler::LSL, 1);
5740 
5741         br(Assembler::LT, loop);
5742         bind(end);
5743         block_comment("} // i");
5744       }
5745 
5746       normalize(Rlen);
5747 
5748       mov(Ra, Pm_base);  // Save Pm_base in Ra
5749       restore_regs();  // Restore caller's Pm_base
5750 
5751       // Copy our result into caller's Pm_base
5752       reverse(Pm_base, Ra, Rlen, t0, t1);
5753 
5754       leave();
5755       ret(lr);
5756 
5757       return entry;
5758     }
5759     // In C, approximately:
5760 
5761     // void
5762     // montgomery_square(julong Pa_base[], julong Pn_base[],
5763     //                   julong Pm_base[], julong inv, int len) {
5764     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5765     //   julong *Pa, *Pb, *Pn, *Pm;
5766     //   julong Ra, Rb, Rn, Rm;
5767 
5768     //   int i;
5769 
5770     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5771 
5772     //   for (i = 0; i < len; i++) {
5773     //     int j;
5774 
5775     //     Pa = Pa_base;
5776     //     Pb = Pa_base + i;
5777     //     Pm = Pm_base;
5778     //     Pn = Pn_base + i;
5779 
5780     //     Ra = *Pa;
5781     //     Rb = *Pb;
5782     //     Rm = *Pm;
5783     //     Rn = *Pn;
5784 
5785     //     int iters = (i+1)/2;
5786     //     for (j = 0; iters--; j++) {
5787     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5788     //       MACC2(Ra, Rb, t0, t1, t2);
5789     //       Ra = *++Pa;
5790     //       Rb = *--Pb;
5791     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5792     //       MACC(Rm, Rn, t0, t1, t2);
5793     //       Rm = *++Pm;
5794     //       Rn = *--Pn;
5795     //     }
5796     //     if ((i & 1) == 0) {
5797     //       assert(Ra == Pa_base[j], "must be");
5798     //       MACC(Ra, Ra, t0, t1, t2);
5799     //     }
5800     //     iters = i/2;
5801     //     assert(iters == i-j, "must be");
5802     //     for (; iters--; j++) {
5803     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5804     //       MACC(Rm, Rn, t0, t1, t2);
5805     //       Rm = *++Pm;
5806     //       Rn = *--Pn;
5807     //     }
5808 
5809     //     *Pm = Rm = t0 * inv;
5810     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5811     //     MACC(Rm, Rn, t0, t1, t2);
5812 
5813     //     assert(t0 == 0, "broken Montgomery multiply");
5814 
5815     //     t0 = t1; t1 = t2; t2 = 0;
5816     //   }
5817 
5818     //   for (i = len; i < 2*len; i++) {
5819     //     int start = i-len+1;
5820     //     int end = start + (len - start)/2;
5821     //     int j;
5822 
5823     //     Pa = Pa_base + i-len;
5824     //     Pb = Pa_base + len;
5825     //     Pm = Pm_base + i-len;
5826     //     Pn = Pn_base + len;
5827 
5828     //     Ra = *++Pa;
5829     //     Rb = *--Pb;
5830     //     Rm = *++Pm;
5831     //     Rn = *--Pn;
5832 
5833     //     int iters = (2*len-i-1)/2;
5834     //     assert(iters == end-start, "must be");
5835     //     for (j = start; iters--; j++) {
5836     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5837     //       MACC2(Ra, Rb, t0, t1, t2);
5838     //       Ra = *++Pa;
5839     //       Rb = *--Pb;
5840     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5841     //       MACC(Rm, Rn, t0, t1, t2);
5842     //       Rm = *++Pm;
5843     //       Rn = *--Pn;
5844     //     }
5845     //     if ((i & 1) == 0) {
5846     //       assert(Ra == Pa_base[j], "must be");
5847     //       MACC(Ra, Ra, t0, t1, t2);
5848     //     }
5849     //     iters =  (2*len-i)/2;
5850     //     assert(iters == len-j, "must be");
5851     //     for (; iters--; j++) {
5852     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5853     //       MACC(Rm, Rn, t0, t1, t2);
5854     //       Rm = *++Pm;
5855     //       Rn = *--Pn;
5856     //     }
5857     //     Pm_base[i-len] = t0;
5858     //     t0 = t1; t1 = t2; t2 = 0;
5859     //   }
5860 
5861     //   while (t0)
5862     //     t0 = sub(Pm_base, Pn_base, t0, len);
5863     // }
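
    // Both C sketches finish with a sub() helper that is not defined in
    // these comments.  It is assumed to do the usual word-by-word,
    // borrow-propagating subtraction of the modulus from the result and
    // to fold the final borrow into the carry word t0.  A plain C
    // illustration under that assumption (not code from this file):

    // static julong sub(julong Pm_base[], julong Pn_base[],
    //                   julong t0, int len) {
    //   julong borrow = 0;
    //   for (int i = 0; i < len; i++) {
    //     julong a = Pm_base[i], b = Pn_base[i];
    //     julong d = a - b - borrow;
    //     borrow = (a < b) || (a - b < borrow);  // did the subtraction wrap?
    //     Pm_base[i] = d;
    //   }
    //   return t0 - borrow;
    // }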
5864   };
5865 
5866 
5867   // Initialization
5868   void generate_initial() {
5869     // Generates the initial stubs and initializes the entry points
5870 
5871     // Entry points that exist on all platforms.  Note: this is code
5872     // that could be shared among different platforms - however the
5873     // benefit seems to be smaller than the disadvantage of having a
5874     // much more complicated generator structure.  See also the comment
5875     // in stubRoutines.hpp.
5876 
5877     StubRoutines::_forward_exception_entry = generate_forward_exception();
5878 
5879     StubRoutines::_call_stub_entry =
5880       generate_call_stub(StubRoutines::_call_stub_return_address);
5881 
5882     // Referenced by megamorphic calls.
5883     StubRoutines::_catch_exception_entry = generate_catch_exception();
5884 
5885     // Build this early so it's available for the interpreter.
5886     StubRoutines::_throw_StackOverflowError_entry =
5887       generate_throw_exception("StackOverflowError throw_exception",
5888                                CAST_FROM_FN_PTR(address,
5889                                                 SharedRuntime::throw_StackOverflowError));
5890     StubRoutines::_throw_delayed_StackOverflowError_entry =
5891       generate_throw_exception("delayed StackOverflowError throw_exception",
5892                                CAST_FROM_FN_PTR(address,
5893                                                 SharedRuntime::throw_delayed_StackOverflowError));
5894     if (UseCRC32Intrinsics) {
5895       // Set the table address before generating the stub, which uses it.
5896       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5897       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5898     }
5899 
5900     if (UseCRC32CIntrinsics) {
5901       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5902     }
5903 
5904     // Disabled until JDK-8210858 is fixed
5905     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5906     //   StubRoutines::_dlog = generate_dlog();
5907     // }
5908 
5909     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5910       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5911     }
5912 
5913     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5914       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5915     }
5916 
5917     // Safefetch stubs.
5918     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5919                                                        &StubRoutines::_safefetch32_fault_pc,
5920                                                        &StubRoutines::_safefetch32_continuation_pc);
5921     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5922                                                        &StubRoutines::_safefetchN_fault_pc,
5923                                                        &StubRoutines::_safefetchN_continuation_pc);
5924   }
5925 
5926   void generate_all() {
5927     // support for verify_oop (must happen after universe_init)
5928     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
5929     StubRoutines::_throw_AbstractMethodError_entry =
5930       generate_throw_exception("AbstractMethodError throw_exception",
5931                                CAST_FROM_FN_PTR(address,
5932                                                 SharedRuntime::
5933                                                 throw_AbstractMethodError));
5934 
5935     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5936       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5937                                CAST_FROM_FN_PTR(address,
5938                                                 SharedRuntime::
5939                                                 throw_IncompatibleClassChangeError));
5940 
5941     StubRoutines::_throw_NullPointerException_at_call_entry =
5942       generate_throw_exception("NullPointerException at call throw_exception",
5943                                CAST_FROM_FN_PTR(address,
5944                                                 SharedRuntime::
5945                                                 throw_NullPointerException_at_call));
5946 
5947     // arraycopy stubs used by compilers
5948     generate_arraycopy_stubs();
5949 
5950     // has negatives stub for large arrays.
5951     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5952 
5953     // array equals stub for large arrays.
5954     if (!UseSimpleArrayEquals) {
5955       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5956     }
5957 
5958     generate_compare_long_strings();
5959 
5960     generate_string_indexof_stubs();
5961 
5962     // byte_array_inflate stub for large arrays.
5963     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
5964 
5965     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
5966     if (bs_nm != NULL) {
5967       StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
5968     }
5969 #ifdef COMPILER2
5970     if (UseMultiplyToLenIntrinsic) {
5971       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5972     }
5973 
5974     if (UseSquareToLenIntrinsic) {
5975       StubRoutines::_squareToLen = generate_squareToLen();
5976     }
5977 
5978     if (UseMulAddIntrinsic) {
5979       StubRoutines::_mulAdd = generate_mulAdd();
5980     }
5981 
5982     if (UseMontgomeryMultiplyIntrinsic) {
5983       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5984       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5985       StubRoutines::_montgomeryMultiply = g.generate_multiply();
5986     }
5987 
5988     if (UseMontgomerySquareIntrinsic) {
5989       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5990       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5991       // We use generate_multiply() rather than generate_square()
5992       // because it's faster for the sizes of modulus we care about.
5993       StubRoutines::_montgomerySquare = g.generate_multiply();
5994     }
5995 #endif // COMPILER2
5996 
5997     // generate GHASH intrinsics code
5998     if (UseGHASHIntrinsics) {
5999       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
6000     }
6001 
6002     // data cache line writeback
6003     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
6004     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
6005 
6006     if (UseAESIntrinsics) {
6007       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
6008       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
6009       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
6010       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
6011     }
6012 
6013     if (UseSHA1Intrinsics) {
6014       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
6015       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
6016     }
6017     if (UseSHA256Intrinsics) {
6018       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
6019       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
6020     }
6021     if (UseSHA512Intrinsics) {
6022       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
6023       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
6024     }
6025 
6026     // generate Adler32 intrinsics code
6027     if (UseAdler32Intrinsics) {
6028       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
6029     }
6030 
6031     StubRoutines::aarch64::set_completed();
6032   }
6033 
6034  public:
6035   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
6036     if (all) {
6037       generate_all();
6038     } else {
6039       generate_initial();
6040     }
6041   }
6042 }; // end class declaration
6043 
6044 #define UCM_TABLE_MAX_ENTRIES 8
6045 void StubGenerator_generate(CodeBuffer* code, bool all) {
6046   if (UnsafeCopyMemory::_table == NULL) {
6047     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
6048   }
6049   StubGenerator g(code, all);
6050 }