1 /*
   2  * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "memory/universe.hpp"
  33 #include "nativeInst_aarch64.hpp"
  34 #include "oops/instanceOop.hpp"
  35 #include "oops/method.hpp"
  36 #include "oops/objArrayKlass.hpp"
  37 #include "oops/oop.inline.hpp"
  38 #include "prims/methodHandles.hpp"
  39 #include "runtime/frame.inline.hpp"
  40 #include "runtime/handles.inline.hpp"
  41 #include "runtime/sharedRuntime.hpp"
  42 #include "runtime/stubCodeGenerator.hpp"
  43 #include "runtime/stubRoutines.hpp"
  44 #include "runtime/thread.inline.hpp"
  45 #include "utilities/align.hpp"
  46 #include "utilities/powerOfTwo.hpp"
  47 #ifdef COMPILER2
  48 #include "opto/runtime.hpp"
  49 #endif
  50 #if INCLUDE_ZGC
  51 #include "gc/z/zThreadLocalData.hpp"
  52 #endif
  53 
  54 // Declaration and definition of StubGenerator (no .hpp file).
  55 // For a more detailed description of the stub routine structure
  56 // see the comment in stubRoutines.hpp
  57 
  58 #undef __
  59 #define __ _masm->
  60 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  61 
  62 #ifdef PRODUCT
  63 #define BLOCK_COMMENT(str) /* nothing */
  64 #else
  65 #define BLOCK_COMMENT(str) __ block_comment(str)
  66 #endif
  67 
  68 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  69 
  70 // Stub Code definitions
  71 
  72 class StubGenerator: public StubCodeGenerator {
  73  private:
  74 
  75 #ifdef PRODUCT
  76 #define inc_counter_np(counter) ((void)0)
  77 #else
  78   void inc_counter_np_(int& counter) {
  79     __ lea(rscratch2, ExternalAddress((address)&counter));
  80     __ ldrw(rscratch1, Address(rscratch2));
  81     __ addw(rscratch1, rscratch1, 1);
  82     __ strw(rscratch1, Address(rscratch2));
  83   }
  84 #define inc_counter_np(counter) \
  85   BLOCK_COMMENT("inc_counter " #counter); \
  86   inc_counter_np_(counter);
  87 #endif
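
       // Illustrative use of the macro above (a sketch only, not emitted by
       // the stubs in this file): inside any generated stub a debug-build
       // counter can be bumped with something like
       //
       //   inc_counter_np(SharedRuntime::_jshort_array_copy_ctr);
       //
       // In PRODUCT builds inc_counter_np expands to ((void)0), so no code is
       // emitted; the counter named above is only an example of a VM counter.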
  88 
  89   // Call stubs are used to call Java from C
  90   //
  91   // Arguments:
  92   //    c_rarg0:   call wrapper address                   address
  93   //    c_rarg1:   result                                 address
  94   //    c_rarg2:   result type                            BasicType
  95   //    c_rarg3:   method                                 Method*
  96   //    c_rarg4:   (interpreter) entry point              address
  97   //    c_rarg5:   parameters                             intptr_t*
  98   //    c_rarg6:   parameter size (in words)              int
  99   //    c_rarg7:   thread                                 Thread*
 100   //
 101   // There is no return from the stub itself as any Java result
 102   // is written to result
 103   //
 104   // we save r30 (lr) as the return PC at the base of the frame and
 105   // link r29 (fp) below it as the frame pointer installing sp (r31)
 106   // into fp.
 107   //
 108   // we save r0-r7, which accounts for all the c arguments.
 109   //
 110   // TODO: strictly do we need to save them all? they are treated as
 111   // volatile by C so could we omit saving the ones we are going to
 112   // place in global registers (thread? method?) or those we only use
 113   // during setup of the Java call?
 114   //
 115   // we don't need to save r8 which C uses as an indirect result location
 116   // return register.
 117   //
 118   // we don't need to save r9-r15 which both C and Java treat as
 119   // volatile
 120   //
 121   // we don't need to save r16-18 because Java does not use them
 122   //
 123   // we save r19-r28 which Java uses as scratch registers and C
 124   // expects to be callee-save
 125   //
 126   // we save the bottom 64 bits of each value stored in v8-v15; it is
 127   // the responsibility of the caller to preserve larger values.
 128   //
 129   // so the stub frame looks like this when we enter Java code
 130   //
 131   //     [ return_from_Java     ] <--- sp
 132   //     [ argument word n      ]
 133   //      ...
 134   // -27 [ argument word 1      ]
 135   // -26 [ saved v15            ] <--- sp_after_call
 136   // -25 [ saved v14            ]
 137   // -24 [ saved v13            ]
 138   // -23 [ saved v12            ]
 139   // -22 [ saved v11            ]
 140   // -21 [ saved v10            ]
 141   // -20 [ saved v9             ]
 142   // -19 [ saved v8             ]
 143   // -18 [ saved r28            ]
 144   // -17 [ saved r27            ]
 145   // -16 [ saved r26            ]
 146   // -15 [ saved r25            ]
 147   // -14 [ saved r24            ]
 148   // -13 [ saved r23            ]
 149   // -12 [ saved r22            ]
 150   // -11 [ saved r21            ]
 151   // -10 [ saved r20            ]
 152   //  -9 [ saved r19            ]
 153   //  -8 [ call wrapper    (r0) ]
 154   //  -7 [ result          (r1) ]
 155   //  -6 [ result type     (r2) ]
 156   //  -5 [ method          (r3) ]
 157   //  -4 [ entry point     (r4) ]
 158   //  -3 [ parameters      (r5) ]
 159   //  -2 [ parameter size  (r6) ]
 160   //  -1 [ thread (r7)          ]
 161   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 162   //   1 [ saved lr       (r30) ]
 163 
 164   // Call stub stack layout word offsets from fp
 165   enum call_stub_layout {
 166     sp_after_call_off = -26,
 167 
 168     d15_off            = -26,
 169     d13_off            = -24,
 170     d11_off            = -22,
 171     d9_off             = -20,
 172 
 173     r28_off            = -18,
 174     r26_off            = -16,
 175     r24_off            = -14,
 176     r22_off            = -12,
 177     r20_off            = -10,
 178     call_wrapper_off   =  -8,
 179     result_off         =  -7,
 180     result_type_off    =  -6,
 181     method_off         =  -5,
 182     entry_point_off    =  -4,
 183     parameter_size_off =  -2,
 184     thread_off         =  -1,
 185     fp_f               =   0,
 186     retaddr_off        =   1,
 187   };
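
       // Worked example of the layout above: with wordSize == 8 the saved
       // thread pointer lives at [rfp + thread_off * wordSize] == rfp - 8,
       // and sp_after_call is rfp + sp_after_call_off * wordSize == rfp - 208,
       // i.e. the slot holding the saved v15, matching the diagram.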
 188 
 189   address generate_call_stub(address& return_address) {
 190     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 191            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 192            "adjust this code");
 193 
 194     StubCodeMark mark(this, "StubRoutines", "call_stub");
 195     address start = __ pc();
 196 
 197     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 198 
 199     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 200     const Address result        (rfp, result_off         * wordSize);
 201     const Address result_type   (rfp, result_type_off    * wordSize);
 202     const Address method        (rfp, method_off         * wordSize);
 203     const Address entry_point   (rfp, entry_point_off    * wordSize);
 204     const Address parameter_size(rfp, parameter_size_off * wordSize);
 205 
 206     const Address thread        (rfp, thread_off         * wordSize);
 207 
 208     const Address d15_save      (rfp, d15_off * wordSize);
 209     const Address d13_save      (rfp, d13_off * wordSize);
 210     const Address d11_save      (rfp, d11_off * wordSize);
 211     const Address d9_save       (rfp, d9_off * wordSize);
 212 
 213     const Address r28_save      (rfp, r28_off * wordSize);
 214     const Address r26_save      (rfp, r26_off * wordSize);
 215     const Address r24_save      (rfp, r24_off * wordSize);
 216     const Address r22_save      (rfp, r22_off * wordSize);
 217     const Address r20_save      (rfp, r20_off * wordSize);
 218 
 219     // stub code
 220 
 221     address aarch64_entry = __ pc();
 222 
 223     // set up frame and move sp to end of save area
 224     __ enter();
 225     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 226 
 227     // save register parameters and Java scratch/global registers
 228     // n.b. we save thread even though it gets installed in
 229     // rthread because we want to sanity check rthread later
 230     __ str(c_rarg7,  thread);
 231     __ strw(c_rarg6, parameter_size);
 232     __ stp(c_rarg4, c_rarg5,  entry_point);
 233     __ stp(c_rarg2, c_rarg3,  result_type);
 234     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 235 
 236     __ stp(r20, r19,   r20_save);
 237     __ stp(r22, r21,   r22_save);
 238     __ stp(r24, r23,   r24_save);
 239     __ stp(r26, r25,   r26_save);
 240     __ stp(r28, r27,   r28_save);
 241 
 242     __ stpd(v9,  v8,   d9_save);
 243     __ stpd(v11, v10,  d11_save);
 244     __ stpd(v13, v12,  d13_save);
 245     __ stpd(v15, v14,  d15_save);
 246 
 247     // install Java thread in global register now we have saved
 248     // whatever value it held
 249     __ mov(rthread, c_rarg7);
 250     // And method
 251     __ mov(rmethod, c_rarg3);
 252 
 253     // set up the heapbase register
 254     __ reinit_heapbase();
 255 
 256 #ifdef ASSERT
 257     // make sure we have no pending exceptions
 258     {
 259       Label L;
 260       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 261       __ cmp(rscratch1, (u1)NULL_WORD);
 262       __ br(Assembler::EQ, L);
 263       __ stop("StubRoutines::call_stub: entered with pending exception");
 264       __ BIND(L);
 265     }
 266 #endif
 267     // pass parameters if any
 268     __ mov(esp, sp);
 269     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 270     __ andr(sp, rscratch1, -2 * wordSize);
 271 
 272     BLOCK_COMMENT("pass parameters if any");
 273     Label parameters_done;
 274     // parameter count is still in c_rarg6
 275     // and parameter pointer identifying param 1 is in c_rarg5
 276     __ cbzw(c_rarg6, parameters_done);
 277 
 278     address loop = __ pc();
 279     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 280     __ subsw(c_rarg6, c_rarg6, 1);
 281     __ push(rscratch1);
 282     __ br(Assembler::GT, loop);
 283 
 284     __ BIND(parameters_done);
 285 
 286     // call Java entry -- passing Method* and current sp
 287     //      rmethod: Method*
 288     //      r13: sender sp
 289     BLOCK_COMMENT("call Java function");
 290     __ mov(r13, sp);
 291     __ blr(c_rarg4);
 292 
 293     // we do this here because the notify will already have been done
 294     // if we get to the next instruction via an exception
 295     //
 296     // n.b. adding this instruction here affects the calculation of
 297     // whether or not a routine returns to the call stub (used when
 298     // doing stack walks) since the normal test is to check the return
 299     // pc against the address saved below. so we may need to allow for
 300     // this extra instruction in the check.
 301 
 302     // save current address for use by exception handling code
 303 
 304     return_address = __ pc();
 305 
 306     // store result depending on type (everything that is not
 307     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 308     // n.b. this assumes Java returns an integral result in r0
 309     // and a floating result in j_farg0
 310     __ ldr(j_rarg2, result);
 311     Label is_long, is_float, is_double, exit;
 312     __ ldr(j_rarg1, result_type);
 313     __ cmp(j_rarg1, (u1)T_OBJECT);
 314     __ br(Assembler::EQ, is_long);
 315     __ cmp(j_rarg1, (u1)T_LONG);
 316     __ br(Assembler::EQ, is_long);
 317     __ cmp(j_rarg1, (u1)T_FLOAT);
 318     __ br(Assembler::EQ, is_float);
 319     __ cmp(j_rarg1, (u1)T_DOUBLE);
 320     __ br(Assembler::EQ, is_double);
 321 
 322     // handle T_INT case
 323     __ strw(r0, Address(j_rarg2));
 324 
 325     __ BIND(exit);
 326 
 327     // pop parameters
 328     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 329 
 330 #ifdef ASSERT
 331     // verify that threads correspond
 332     {
 333       Label L, S;
 334       __ ldr(rscratch1, thread);
 335       __ cmp(rthread, rscratch1);
 336       __ br(Assembler::NE, S);
 337       __ get_thread(rscratch1);
 338       __ cmp(rthread, rscratch1);
 339       __ br(Assembler::EQ, L);
 340       __ BIND(S);
 341       __ stop("StubRoutines::call_stub: threads must correspond");
 342       __ BIND(L);
 343     }
 344 #endif
 345 
 346     // restore callee-save registers
 347     __ ldpd(v15, v14,  d15_save);
 348     __ ldpd(v13, v12,  d13_save);
 349     __ ldpd(v11, v10,  d11_save);
 350     __ ldpd(v9,  v8,   d9_save);
 351 
 352     __ ldp(r28, r27,   r28_save);
 353     __ ldp(r26, r25,   r26_save);
 354     __ ldp(r24, r23,   r24_save);
 355     __ ldp(r22, r21,   r22_save);
 356     __ ldp(r20, r19,   r20_save);
 357 
 358     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 359     __ ldrw(c_rarg2, result_type);
 360     __ ldr(c_rarg3,  method);
 361     __ ldp(c_rarg4, c_rarg5,  entry_point);
 362     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 363 
 364     // leave frame and return to caller
 365     __ leave();
 366     __ ret(lr);
 367 
 368     // handle return types different from T_INT
 369 
 370     __ BIND(is_long);
 371     __ str(r0, Address(j_rarg2, 0));
 372     __ br(Assembler::AL, exit);
 373 
 374     __ BIND(is_float);
 375     __ strs(j_farg0, Address(j_rarg2, 0));
 376     __ br(Assembler::AL, exit);
 377 
 378     __ BIND(is_double);
 379     __ strd(j_farg0, Address(j_rarg2, 0));
 380     __ br(Assembler::AL, exit);
 381 
 382     return start;
 383   }
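
       // For orientation (a hedged sketch, not part of the generated stub):
       // the VM reaches call_stub through the CallStub function pointer type
       // declared in stubRoutines.hpp, roughly as JavaCalls::call_helper does:
       //
       //   StubRoutines::call_stub()(
       //     (address)&link,              // c_rarg0: call wrapper
       //     result_val_address,          // c_rarg1: result
       //     result_type,                 // c_rarg2: BasicType
       //     method(),                    // c_rarg3: Method*
       //     entry_point,                 // c_rarg4: entry point
       //     args->parameters(),          // c_rarg5: intptr_t* parameters
       //     args->size_of_parameters(),  // c_rarg6: size in words
       //     thread);                     // c_rarg7: current thread
       //
       // The exact call site is in javaCalls.cpp; the argument names above
       // follow it only approximately.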
 384 
 385   // Return point for a Java call if there's an exception thrown in
 386   // Java code.  The exception is caught and transformed into a
 387   // pending exception stored in JavaThread that can be tested from
 388   // within the VM.
 389   //
 390   // Note: Usually the parameters are removed by the callee. In case
 391   // of an exception crossing an activation frame boundary, that is
 392   // not the case if the callee is compiled code => need to set up
 393   // the sp.
 394   //
 395   // r0: exception oop
 396 
 397   address generate_catch_exception() {
 398     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 399     address start = __ pc();
 400 
 401     // same as in generate_call_stub():
 402     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 403     const Address thread        (rfp, thread_off         * wordSize);
 404 
 405 #ifdef ASSERT
 406     // verify that threads correspond
 407     {
 408       Label L, S;
 409       __ ldr(rscratch1, thread);
 410       __ cmp(rthread, rscratch1);
 411       __ br(Assembler::NE, S);
 412       __ get_thread(rscratch1);
 413       __ cmp(rthread, rscratch1);
 414       __ br(Assembler::EQ, L);
 415       __ bind(S);
 416       __ stop("StubRoutines::catch_exception: threads must correspond");
 417       __ bind(L);
 418     }
 419 #endif
 420 
 421     // set pending exception
 422     __ verify_oop(r0);
 423 
 424     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 425     __ mov(rscratch1, (address)__FILE__);
 426     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 427     __ movw(rscratch1, (int)__LINE__);
 428     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 429 
 430     // complete return to VM
 431     assert(StubRoutines::_call_stub_return_address != NULL,
 432            "_call_stub_return_address must have been generated before");
 433     __ b(StubRoutines::_call_stub_return_address);
 434 
 435     return start;
 436   }
 437 
 438   // Continuation point for runtime calls returning with a pending
 439   // exception.  The pending exception check happened in the runtime
 440   // or native call stub.  The pending exception in Thread is
 441   // converted into a Java-level exception.
 442   //
 443   // Contract with Java-level exception handlers:
 444   // r0: exception
 445   // r3: throwing pc
 446   //
 447   // NOTE: At entry of this stub, exception-pc must be in LR !!
 448 
 449   // NOTE: this is always used as a jump target within generated code
 450   // so it just needs to be generated code with no prolog
 451 
 452   address generate_forward_exception() {
 453     StubCodeMark mark(this, "StubRoutines", "forward exception");
 454     address start = __ pc();
 455 
 456     // Upon entry, LR points to the return address returning into
 457     // Java (interpreted or compiled) code; i.e., the return address
 458     // becomes the throwing pc.
 459     //
 460     // Arguments pushed before the runtime call are still on the stack
 461     // but the exception handler will reset the stack pointer ->
 462     // ignore them.  A potential result in registers can be ignored as
 463     // well.
 464 
 465 #ifdef ASSERT
 466     // make sure this code is only executed if there is a pending exception
 467     {
 468       Label L;
 469       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 470       __ cbnz(rscratch1, L);
 471       __ stop("StubRoutines::forward exception: no pending exception (1)");
 472       __ bind(L);
 473     }
 474 #endif
 475 
 476     // compute exception handler into r19
 477 
 478     // call the VM to find the handler address associated with the
 479     // caller address. pass thread in r0 and caller pc (ret address)
 480     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 481     // the stack.
 482     __ mov(c_rarg1, lr);
 483     // lr will be trashed by the VM call so we move it to R19
 484     // (callee-saved) because we also need to pass it to the handler
 485     // returned by this call.
 486     __ mov(r19, lr);
 487     BLOCK_COMMENT("call exception_handler_for_return_address");
 488     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 489                          SharedRuntime::exception_handler_for_return_address),
 490                     rthread, c_rarg1);
 491     // we should not really care that lr is no longer the callee
 492     // address. we saved the value the handler needs in r19 so we can
 493     // just copy it to r3. however, the C2 handler will push its own
 494     // frame and then calls into the VM and the VM code asserts that
 495     // the PC for the frame above the handler belongs to a compiled
 496     // Java method. So, we restore lr here to satisfy that assert.
 497     __ mov(lr, r19);
 498     // setup r0 & r3 & clear pending exception
 499     __ mov(r3, r19);
 500     __ mov(r19, r0);
 501     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 502     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 503 
 504 #ifdef ASSERT
 505     // make sure exception is set
 506     {
 507       Label L;
 508       __ cbnz(r0, L);
 509       __ stop("StubRoutines::forward exception: no pending exception (2)");
 510       __ bind(L);
 511     }
 512 #endif
 513 
 514     // continue at exception handler
 515     // r0: exception
 516     // r3: throwing pc
 517     // r19: exception handler
 518     __ verify_oop(r0);
 519     __ br(r19);
 520 
 521     return start;
 522   }
 523 
 524   // Non-destructive plausibility checks for oops
 525   //
 526   // Arguments:
 527   //    r0: oop to verify
 528   //    rscratch1: error message
 529   //
 530   // Stack after saving c_rarg3:
 531   //    [tos + 0]: saved c_rarg3
 532   //    [tos + 1]: saved c_rarg2
 533   //    [tos + 2]: saved lr
 534   //    [tos + 3]: saved rscratch2
 535   //    [tos + 4]: saved r0
 536   //    [tos + 5]: saved rscratch1
 537   address generate_verify_oop() {
 538 
 539     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 540     address start = __ pc();
 541 
 542     Label exit, error;
 543 
 544     // save c_rarg2 and c_rarg3
 545     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 546 
 547     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 548     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 549     __ ldr(c_rarg3, Address(c_rarg2));
 550     __ add(c_rarg3, c_rarg3, 1);
 551     __ str(c_rarg3, Address(c_rarg2));
 552 
 553     // object is in r0
 554     // make sure object is 'reasonable'
 555     __ cbz(r0, exit); // if obj is NULL it is OK
 556 
 557 #if INCLUDE_ZGC
 558     if (UseZGC) {
 559       // Check if mask is good.
 560       // verifies that ZAddressBadMask & r0 == 0
 561       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 562       __ andr(c_rarg2, r0, c_rarg3);
 563       __ cbnz(c_rarg2, error);
 564     }
 565 #endif
 566 
 567     // Check if the oop is in the right area of memory
 568     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 569     __ andr(c_rarg2, r0, c_rarg3);
 570     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 571 
 572     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 573     // instruction here because the flags register is live.
 574     __ eor(c_rarg2, c_rarg2, c_rarg3);
 575     __ cbnz(c_rarg2, error);
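
         // Worked example of the check above, with hypothetical values: if
         // verify_oop_mask() were 0x7 and verify_oop_bits() were 0x0, we would
         // compute (oop & 0x7) ^ 0x0 and branch to error whenever the result
         // is non-zero, i.e. whenever the oop is not 8-byte aligned.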
 576 
 577     // make sure klass is 'reasonable', i.e. not zero.
 578     __ load_klass(r0, r0);  // get klass
 579     __ cbz(r0, error);      // if klass is NULL it is broken
 580 
 581     // return if everything seems ok
 582     __ bind(exit);
 583 
 584     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 585     __ ret(lr);
 586 
 587     // handle errors
 588     __ bind(error);
 589     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 590 
 591     __ push(RegSet::range(r0, r29), sp);
 592     // debug(char* msg, int64_t pc, int64_t regs[])
 593     __ mov(c_rarg0, rscratch1);      // pass address of error message
 594     __ mov(c_rarg1, lr);             // pass return address
 595     __ mov(c_rarg2, sp);             // pass address of regs on stack
 596 #ifndef PRODUCT
 597     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 598 #endif
 599     BLOCK_COMMENT("call MacroAssembler::debug");
 600     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 601     __ blr(rscratch1);
 602     __ hlt(0);
 603 
 604     return start;
 605   }
 606 
 607   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 608 
 609   // The inner part of zero_words().  This is the bulk operation,
 610   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 611   // caller is responsible for zeroing the last few words.
 612   //
 613   // Inputs:
 614   // r10: the HeapWord-aligned base address of an array to zero.
 615   // r11: the count in HeapWords, r11 > 0.
 616   //
 617   // Returns r10 and r11, adjusted for the caller to clear.
 618   // r10: the base address of the tail of words left to clear.
 619   // r11: the number of words in the tail.
 620   //      r11 < MacroAssembler::zero_words_block_size.
 621 
 622   address generate_zero_blocks() {
 623     Label done;
 624     Label base_aligned;
 625 
 626     Register base = r10, cnt = r11;
 627 
 628     __ align(CodeEntryAlignment);
 629     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 630     address start = __ pc();
 631 
 632     if (UseBlockZeroing) {
 633       int zva_length = VM_Version::zva_length();
 634 
 635       // Ensure ZVA length can be divided by 16. This is required by
 636       // the subsequent operations.
 637       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 638 
 639       __ tbz(base, 3, base_aligned);
 640       __ str(zr, Address(__ post(base, 8)));
 641       __ sub(cnt, cnt, 1);
 642       __ bind(base_aligned);
 643 
 644       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 645       // alignment.
 646       Label small;
 647       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 648       __ subs(rscratch1, cnt, low_limit >> 3);
 649       __ br(Assembler::LT, small);
 650       __ zero_dcache_blocks(base, cnt);
 651       __ bind(small);
 652     }
 653 
 654     {
 655       // Number of stp instructions we'll unroll
 656       const int unroll =
 657         MacroAssembler::zero_words_block_size / 2;
 658       // Clear the remaining blocks.
 659       Label loop;
 660       __ subs(cnt, cnt, unroll * 2);
 661       __ br(Assembler::LT, done);
 662       __ bind(loop);
 663       for (int i = 0; i < unroll; i++)
 664         __ stp(zr, zr, __ post(base, 16));
 665       __ subs(cnt, cnt, unroll * 2);
 666       __ br(Assembler::GE, loop);
 667       __ bind(done);
 668       __ add(cnt, cnt, unroll * 2);
 669     }
 670 
 671     __ ret(lr);
 672 
 673     return start;
 674   }
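
       // Worked example of the contract above (with UseBlockZeroing off, so
       // only the stp loop runs): entering with r11 == 23, the unrolled loop
       // zeroes 16 words and the stub returns with r11 == 7, leaving the tail
       // for the caller (MacroAssembler::zero_words) to clear.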
 675 
 676 
 677   typedef enum {
 678     copy_forwards = 1,
 679     copy_backwards = -1
 680   } copy_direction;
 681 
 682   // Bulk copy of blocks of 8 words.
 683   //
 684   // count is a count of words.
 685   //
 686   // Precondition: count >= 8
 687   //
 688   // Postconditions:
 689   //
 690   // The least significant bit of count contains the remaining count
 691   // of words to copy.  The rest of count is trash.
 692   //
 693   // s and d are adjusted to point to the remaining words to copy
 694   //
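       // Worked example (forward, aligned path): with count == 23 the
       // 8-register fill, one main-loop iteration and the drain move 16
       // words, the 4-word and 2-word tail blocks move 6 more, and bit 0 of
       // count (== 1) tells the caller that one word remains to be copied.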
 695   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 696                            copy_direction direction) {
 697     int unit = wordSize * direction;
 698     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 699 
 700     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 701       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 702     const Register stride = r13;
 703 
 704     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 705     assert_different_registers(s, d, count, rscratch1);
 706 
 707     Label again, drain;
 708     const char *stub_name;
 709     if (direction == copy_forwards)
 710       stub_name = "forward_copy_longs";
 711     else
 712       stub_name = "backward_copy_longs";
 713 
 714     __ align(CodeEntryAlignment);
 715 
 716     StubCodeMark mark(this, "StubRoutines", stub_name);
 717 
 718     __ bind(start);
 719 
 720     Label unaligned_copy_long;
 721     if (AvoidUnalignedAccesses) {
 722       __ tbnz(d, 3, unaligned_copy_long);
 723     }
 724 
 725     if (direction == copy_forwards) {
 726       __ sub(s, s, bias);
 727       __ sub(d, d, bias);
 728     }
 729 
 730 #ifdef ASSERT
 731     // Make sure we are never given < 8 words
 732     {
 733       Label L;
 734       __ cmp(count, (u1)8);
 735       __ br(Assembler::GE, L);
 736       __ stop("generate_copy_longs called with < 8 words");
 737       __ bind(L);
 738     }
 739 #endif
 740 
 741     // Fill 8 registers
 742     if (UseSIMDForMemoryOps) {
 743       __ ldpq(v0, v1, Address(s, 4 * unit));
 744       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 745     } else {
 746       __ ldp(t0, t1, Address(s, 2 * unit));
 747       __ ldp(t2, t3, Address(s, 4 * unit));
 748       __ ldp(t4, t5, Address(s, 6 * unit));
 749       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 750     }
 751 
 752     __ subs(count, count, 16);
 753     __ br(Assembler::LO, drain);
 754 
 755     int prefetch = PrefetchCopyIntervalInBytes;
 756     bool use_stride = false;
 757     if (direction == copy_backwards) {
 758        use_stride = prefetch > 256;
 759        prefetch = -prefetch;
 760        if (use_stride) __ mov(stride, prefetch);
 761     }
 762 
 763     __ bind(again);
 764 
 765     if (PrefetchCopyIntervalInBytes > 0)
 766       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 767 
 768     if (UseSIMDForMemoryOps) {
 769       __ stpq(v0, v1, Address(d, 4 * unit));
 770       __ ldpq(v0, v1, Address(s, 4 * unit));
 771       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 772       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 773     } else {
 774       __ stp(t0, t1, Address(d, 2 * unit));
 775       __ ldp(t0, t1, Address(s, 2 * unit));
 776       __ stp(t2, t3, Address(d, 4 * unit));
 777       __ ldp(t2, t3, Address(s, 4 * unit));
 778       __ stp(t4, t5, Address(d, 6 * unit));
 779       __ ldp(t4, t5, Address(s, 6 * unit));
 780       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 781       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 782     }
 783 
 784     __ subs(count, count, 8);
 785     __ br(Assembler::HS, again);
 786 
 787     // Drain
 788     __ bind(drain);
 789     if (UseSIMDForMemoryOps) {
 790       __ stpq(v0, v1, Address(d, 4 * unit));
 791       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 792     } else {
 793       __ stp(t0, t1, Address(d, 2 * unit));
 794       __ stp(t2, t3, Address(d, 4 * unit));
 795       __ stp(t4, t5, Address(d, 6 * unit));
 796       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 797     }
 798 
 799     {
 800       Label L1, L2;
 801       __ tbz(count, exact_log2(4), L1);
 802       if (UseSIMDForMemoryOps) {
 803         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 804         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 805       } else {
 806         __ ldp(t0, t1, Address(s, 2 * unit));
 807         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 808         __ stp(t0, t1, Address(d, 2 * unit));
 809         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 810       }
 811       __ bind(L1);
 812 
 813       if (direction == copy_forwards) {
 814         __ add(s, s, bias);
 815         __ add(d, d, bias);
 816       }
 817 
 818       __ tbz(count, 1, L2);
 819       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 820       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 821       __ bind(L2);
 822     }
 823 
 824     __ ret(lr);
 825 
 826     if (AvoidUnalignedAccesses) {
 827       Label drain, again;
 828       // Register order for storing. Order is different for backward copy.
 829 
 830       __ bind(unaligned_copy_long);
 831 
 832       // source address is even aligned, target odd aligned
 833       //
 834       // when forward copying word pairs we read long pairs at offsets
 835       // {0, 2, 4, 6} (in long words). when backwards copying we read
 836       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 837       // address by -2 in the forwards case so we can compute the
 838       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 839       // or -1.
 840       //
 841       // when forward copying we need to store 1 word, 3 pairs and
 842       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 843       // zero offset we adjust the destination by -1, which means we
 844       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 845       //
 846       // When backwards copying we need to store 1 word, 3 pairs and
 847       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 848       // offsets {1, 3, 5, 7, 8} * unit.
 849 
 850       if (direction == copy_forwards) {
 851         __ sub(s, s, 16);
 852         __ sub(d, d, 8);
 853       }
 854 
 855       // Fill 8 registers
 856       //
 857       // for forwards copy s was offset by -16 from the original input
 858       // value of s so the register contents are at these offsets
 859       // relative to the 64 byte block addressed by that original input
 860       // and so on for each successive 64 byte block when s is updated
 861       //
 862       // t0 at offset 0,  t1 at offset 8
 863       // t2 at offset 16, t3 at offset 24
 864       // t4 at offset 32, t5 at offset 40
 865       // t6 at offset 48, t7 at offset 56
 866 
 867       // for backwards copy s was not offset so the register contents
 868       // are at these offsets into the preceding 64 byte block
 869       // relative to that original input and so on for each successive
 870       // preceding 64 byte block when s is updated. this explains the
 871       // slightly counter-intuitive looking pattern of register usage
 872       // in the stp instructions for backwards copy.
 873       //
 874       // t0 at offset -16, t1 at offset -8
 875       // t2 at offset -32, t3 at offset -24
 876       // t4 at offset -48, t5 at offset -40
 877       // t6 at offset -64, t7 at offset -56
 878 
 879       __ ldp(t0, t1, Address(s, 2 * unit));
 880       __ ldp(t2, t3, Address(s, 4 * unit));
 881       __ ldp(t4, t5, Address(s, 6 * unit));
 882       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 883 
 884       __ subs(count, count, 16);
 885       __ br(Assembler::LO, drain);
 886 
 887       int prefetch = PrefetchCopyIntervalInBytes;
 888       bool use_stride = false;
 889       if (direction == copy_backwards) {
 890          use_stride = prefetch > 256;
 891          prefetch = -prefetch;
 892          if (use_stride) __ mov(stride, prefetch);
 893       }
 894 
 895       __ bind(again);
 896 
 897       if (PrefetchCopyIntervalInBytes > 0)
 898         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 899 
 900       if (direction == copy_forwards) {
 901        // allowing for the offset of -8 the store instructions place
 902        // registers into the target 64 byte block at the following
 903        // offsets
 904        //
 905        // t0 at offset 0
 906        // t1 at offset 8,  t2 at offset 16
 907        // t3 at offset 24, t4 at offset 32
 908        // t5 at offset 40, t6 at offset 48
 909        // t7 at offset 56
 910 
 911         __ str(t0, Address(d, 1 * unit));
 912         __ stp(t1, t2, Address(d, 2 * unit));
 913         __ ldp(t0, t1, Address(s, 2 * unit));
 914         __ stp(t3, t4, Address(d, 4 * unit));
 915         __ ldp(t2, t3, Address(s, 4 * unit));
 916         __ stp(t5, t6, Address(d, 6 * unit));
 917         __ ldp(t4, t5, Address(s, 6 * unit));
 918         __ str(t7, Address(__ pre(d, 8 * unit)));
 919         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 920       } else {
 921        // d was not offset when we started so the registers are
 922        // written into the 64 byte block preceding d with the following
 923        // offsets
 924        //
 925        // t1 at offset -8
 926        // t3 at offset -24, t0 at offset -16
 927        // t5 at offset -40, t2 at offset -32
 928        // t7 at offset -56, t4 at offset -48
 929        //                   t6 at offset -64
 930        //
 931        // note that this matches the offsets previously noted for the
 932        // loads
 933 
 934         __ str(t1, Address(d, 1 * unit));
 935         __ stp(t3, t0, Address(d, 3 * unit));
 936         __ ldp(t0, t1, Address(s, 2 * unit));
 937         __ stp(t5, t2, Address(d, 5 * unit));
 938         __ ldp(t2, t3, Address(s, 4 * unit));
 939         __ stp(t7, t4, Address(d, 7 * unit));
 940         __ ldp(t4, t5, Address(s, 6 * unit));
 941         __ str(t6, Address(__ pre(d, 8 * unit)));
 942         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 943       }
 944 
 945       __ subs(count, count, 8);
 946       __ br(Assembler::HS, again);
 947 
 948       // Drain
 949       //
 950       // this uses the same pattern of offsets and register arguments
 951       // as above
 952       __ bind(drain);
 953       if (direction == copy_forwards) {
 954         __ str(t0, Address(d, 1 * unit));
 955         __ stp(t1, t2, Address(d, 2 * unit));
 956         __ stp(t3, t4, Address(d, 4 * unit));
 957         __ stp(t5, t6, Address(d, 6 * unit));
 958         __ str(t7, Address(__ pre(d, 8 * unit)));
 959       } else {
 960         __ str(t1, Address(d, 1 * unit));
 961         __ stp(t3, t0, Address(d, 3 * unit));
 962         __ stp(t5, t2, Address(d, 5 * unit));
 963         __ stp(t7, t4, Address(d, 7 * unit));
 964         __ str(t6, Address(__ pre(d, 8 * unit)));
 965       }
 966       // now we need to copy any remaining part block which may
 967       // include a 4 word subblock and/or a 2 word subblock.
 968       // bits 2 and 1 in the count are the tell-tale for whether we
 969       // have each such subblock
 970       {
 971         Label L1, L2;
 972         __ tbz(count, exact_log2(4), L1);
 973        // this is the same as above but copying only 4 longs hence
 974        // with only one intervening stp between the str instructions
 975        // but note that the offsets and registers still follow the
 976        // same pattern
 977         __ ldp(t0, t1, Address(s, 2 * unit));
 978         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 979         if (direction == copy_forwards) {
 980           __ str(t0, Address(d, 1 * unit));
 981           __ stp(t1, t2, Address(d, 2 * unit));
 982           __ str(t3, Address(__ pre(d, 4 * unit)));
 983         } else {
 984           __ str(t1, Address(d, 1 * unit));
 985           __ stp(t3, t0, Address(d, 3 * unit));
 986           __ str(t2, Address(__ pre(d, 4 * unit)));
 987         }
 988         __ bind(L1);
 989 
 990         __ tbz(count, 1, L2);
 991        // this is the same as above but copying only 2 longs hence
 992        // there is no intervening stp between the str instructions
 993        // but note that the offset and register patterns are still
 994        // the same
 995         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
 996         if (direction == copy_forwards) {
 997           __ str(t0, Address(d, 1 * unit));
 998           __ str(t1, Address(__ pre(d, 2 * unit)));
 999         } else {
1000           __ str(t1, Address(d, 1 * unit));
1001           __ str(t0, Address(__ pre(d, 2 * unit)));
1002         }
1003         __ bind(L2);
1004 
1005        // for forwards copy we need to re-adjust the offsets we
1006        // applied so that s and d follow the last words written
1007 
1008        if (direction == copy_forwards) {
1009          __ add(s, s, 16);
1010          __ add(d, d, 8);
1011        }
1012 
1013       }
1014 
1015       __ ret(lr);
1016       }
1017   }
1018 
1019   // Small copy: less than 16 bytes.
1020   //
1021   // NB: Ignores all of the bits of count which represent more than 15
1022   // bytes, so a caller doesn't have to mask them.
1023 
1024   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1025     bool is_backwards = step < 0;
1026     size_t granularity = uabs(step);
1027     int direction = is_backwards ? -1 : 1;
1028     int unit = wordSize * direction;
1029 
1030     Label Lword, Lint, Lshort, Lbyte;
1031 
1032     assert(granularity
1033            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1034 
1035     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1036 
1037     // ??? I don't know if this bit-test-and-branch is the right thing
1038     // to do.  It does a lot of jumping, resulting in several
1039     // mispredicted branches.  It might make more sense to do this
1040     // with something like Duff's device with a single computed branch.
1041 
1042     __ tbz(count, 3 - exact_log2(granularity), Lword);
1043     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1044     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1045     __ bind(Lword);
1046 
1047     if (granularity <= sizeof (jint)) {
1048       __ tbz(count, 2 - exact_log2(granularity), Lint);
1049       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1050       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1051       __ bind(Lint);
1052     }
1053 
1054     if (granularity <= sizeof (jshort)) {
1055       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1056       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1057       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1058       __ bind(Lshort);
1059     }
1060 
1061     if (granularity <= sizeof (jbyte)) {
1062       __ tbz(count, 0, Lbyte);
1063       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1064       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1065       __ bind(Lbyte);
1066     }
1067   }
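
       // Worked example: for a byte copy (granularity == 1) whose count has
       // low four bits 0b1101 (13), the tests above copy 8 + 4 + 1 bytes and
       // skip the 2-byte step because bit 1 is clear.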
1068 
1069   Label copy_f, copy_b;
1070 
1071   // All-singing all-dancing memory copy.
1072   //
1073   // Copy count units of memory from s to d.  The size of a unit is
1074   // step, which can be positive or negative depending on the direction
1075   // of copy.  If is_aligned is false, we align the source address.
1076   //
1077 
1078   void copy_memory(bool is_aligned, Register s, Register d,
1079                    Register count, Register tmp, int step) {
1080     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1081     bool is_backwards = step < 0;
1082     int granularity = uabs(step);
1083     const Register t0 = r3, t1 = r4;
1084 
1085     // Copies of <= 96 bytes are done inline. Direction doesn't matter
1086     // because we always load all the data before writing anything.
1087     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1088     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1089     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1090     const Register send = r17, dend = r16;
1091 
1092     if (PrefetchCopyIntervalInBytes > 0)
1093       __ prfm(Address(s, 0), PLDL1KEEP);
1094     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1095     __ br(Assembler::HI, copy_big);
1096 
1097     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1098     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1099 
1100     __ cmp(count, u1(16/granularity));
1101     __ br(Assembler::LS, copy16);
1102 
1103     __ cmp(count, u1(64/granularity));
1104     __ br(Assembler::HI, copy80);
1105 
1106     __ cmp(count, u1(32/granularity));
1107     __ br(Assembler::LS, copy32);
1108 
1109     // 33..64 bytes
1110     if (UseSIMDForMemoryOps) {
1111       __ ldpq(v0, v1, Address(s, 0));
1112       __ ldpq(v2, v3, Address(send, -32));
1113       __ stpq(v0, v1, Address(d, 0));
1114       __ stpq(v2, v3, Address(dend, -32));
1115     } else {
1116       __ ldp(t0, t1, Address(s, 0));
1117       __ ldp(t2, t3, Address(s, 16));
1118       __ ldp(t4, t5, Address(send, -32));
1119       __ ldp(t6, t7, Address(send, -16));
1120 
1121       __ stp(t0, t1, Address(d, 0));
1122       __ stp(t2, t3, Address(d, 16));
1123       __ stp(t4, t5, Address(dend, -32));
1124       __ stp(t6, t7, Address(dend, -16));
1125     }
1126     __ b(finish);
1127 
1128     // 17..32 bytes
1129     __ bind(copy32);
1130     __ ldp(t0, t1, Address(s, 0));
1131     __ ldp(t2, t3, Address(send, -16));
1132     __ stp(t0, t1, Address(d, 0));
1133     __ stp(t2, t3, Address(dend, -16));
1134     __ b(finish);
1135 
1136     // 65..80/96 bytes
1137     // (96 bytes if SIMD because we do 32 bytes per instruction)
1138     __ bind(copy80);
1139     if (UseSIMDForMemoryOps) {
1140       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1141       __ ldpq(v4, v5, Address(send, -32));
1142       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1143       __ stpq(v4, v5, Address(dend, -32));
1144     } else {
1145       __ ldp(t0, t1, Address(s, 0));
1146       __ ldp(t2, t3, Address(s, 16));
1147       __ ldp(t4, t5, Address(s, 32));
1148       __ ldp(t6, t7, Address(s, 48));
1149       __ ldp(t8, t9, Address(send, -16));
1150 
1151       __ stp(t0, t1, Address(d, 0));
1152       __ stp(t2, t3, Address(d, 16));
1153       __ stp(t4, t5, Address(d, 32));
1154       __ stp(t6, t7, Address(d, 48));
1155       __ stp(t8, t9, Address(dend, -16));
1156     }
1157     __ b(finish);
1158 
1159     // 0..16 bytes
1160     __ bind(copy16);
1161     __ cmp(count, u1(8/granularity));
1162     __ br(Assembler::LO, copy8);
1163 
1164     // 8..16 bytes
1165     __ ldr(t0, Address(s, 0));
1166     __ ldr(t1, Address(send, -8));
1167     __ str(t0, Address(d, 0));
1168     __ str(t1, Address(dend, -8));
1169     __ b(finish);
1170 
1171     if (granularity < 8) {
1172       // 4..7 bytes
1173       __ bind(copy8);
1174       __ tbz(count, 2 - exact_log2(granularity), copy4);
1175       __ ldrw(t0, Address(s, 0));
1176       __ ldrw(t1, Address(send, -4));
1177       __ strw(t0, Address(d, 0));
1178       __ strw(t1, Address(dend, -4));
1179       __ b(finish);
1180       if (granularity < 4) {
1181         // 0..3 bytes
1182         __ bind(copy4);
1183         __ cbz(count, finish); // get rid of 0 case
1184         if (granularity == 2) {
1185           __ ldrh(t0, Address(s, 0));
1186           __ strh(t0, Address(d, 0));
1187         } else { // granularity == 1
1188           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1189           // the first and last byte.
1190           // Handle the 3 byte case by loading and storing base + count/2
1191           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1192           // This does mean in the 1 byte case we load/store the same
1193           // byte 3 times.
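               // Worked example: count == 3 copies s[0]->d[0], s[2]->d[2]
               // (via send/dend - 1) and s[1]->d[1]; count == 1 loads and
               // stores byte 0 three times, as noted above.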
1194           __ lsr(count, count, 1);
1195           __ ldrb(t0, Address(s, 0));
1196           __ ldrb(t1, Address(send, -1));
1197           __ ldrb(t2, Address(s, count));
1198           __ strb(t0, Address(d, 0));
1199           __ strb(t1, Address(dend, -1));
1200           __ strb(t2, Address(d, count));
1201         }
1202         __ b(finish);
1203       }
1204     }
1205 
1206     __ bind(copy_big);
1207     if (is_backwards) {
1208       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1209       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1210     }
1211 
1212     // Now that we've got the small case out of the way, we can align
1213     // the source address on a 2-word boundary.
1214 
1215     Label aligned;
1216 
1217     if (is_aligned) {
1218       // We may have to adjust by 1 word to get s 2-word-aligned.
1219       __ tbz(s, exact_log2(wordSize), aligned);
1220       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1221       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1222       __ sub(count, count, wordSize/granularity);
1223     } else {
1224       if (is_backwards) {
1225         __ andr(rscratch2, s, 2 * wordSize - 1);
1226       } else {
1227         __ neg(rscratch2, s);
1228         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1229       }
1230       // rscratch2 is the byte adjustment needed to align s.
1231       __ cbz(rscratch2, aligned);
1232       int shift = exact_log2(granularity);
1233       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1234       __ sub(count, count, rscratch2);
1235 
1236 #if 0
1237       // ?? This code is only correct for a disjoint copy.  It may or
1238       // may not make sense to use it in that case.
1239 
1240       // Copy the first pair; s and d may not be aligned.
1241       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1242       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1243 
1244       // Align s and d, adjust count
1245       if (is_backwards) {
1246         __ sub(s, s, rscratch2);
1247         __ sub(d, d, rscratch2);
1248       } else {
1249         __ add(s, s, rscratch2);
1250         __ add(d, d, rscratch2);
1251       }
1252 #else
1253       copy_memory_small(s, d, rscratch2, rscratch1, step);
1254 #endif
1255     }
1256 
1257     __ bind(aligned);
1258 
1259     // s is now 2-word-aligned.
1260 
1261     // We have a count of units and some trailing bytes.  Adjust the
1262     // count and do a bulk copy of words.
1263     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1264     if (direction == copy_forwards)
1265       __ bl(copy_f);
1266     else
1267       __ bl(copy_b);
1268 
1269     // And the tail.
1270     copy_memory_small(s, d, count, tmp, step);
1271 
1272     if (granularity >= 8) __ bind(copy8);
1273     if (granularity >= 4) __ bind(copy4);
1274     __ bind(finish);
1275   }
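
       // End-to-end sketch of the code above (illustrative): a 200-byte
       // unaligned jbyte copy fails the 80-byte (96 with SIMD) inline test
       // and takes copy_big; copy_memory_small first moves the 0..15 bytes
       // needed to align s to 16, the bulk is copied eight words at a time
       // via copy_f/copy_b, and the remaining tail is finished by
       // copy_memory_small again.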
1276 
1277 
1278   void clobber_registers() {
1279 #ifdef ASSERT
1280     RegSet clobbered
1281       = MacroAssembler::call_clobbered_registers() - rscratch1;
1282     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1283     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1284     for (RegSetIterator it = clobbered.begin(); *it != noreg; ++it) {
1285       __ mov(*it, rscratch1);
1286     }
1287 #endif
1288 
1289   }
1290 
1291   // Scan over array at a for count oops, verifying each one.
1292   // Preserves a and count, clobbers rscratch1 and rscratch2.
1293   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1294     Label loop, end;
1295     __ mov(rscratch1, a);
1296     __ mov(rscratch2, zr);
1297     __ bind(loop);
1298     __ cmp(rscratch2, count);
1299     __ br(Assembler::HS, end);
1300     if (size == (size_t)wordSize) {
1301       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1302       __ verify_oop(temp);
1303     } else {
1304       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1305       __ decode_heap_oop(temp); // calls verify_oop
1306     }
1307     __ add(rscratch2, rscratch2, size);
1308     __ b(loop);
1309     __ bind(end);
1310   }
1311 
1312   // Arguments:
1313   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1314   //             ignored
1315   //   is_oop  - true => oop array, so generate store check code
1316   //   name    - stub name string
1317   //
1318   // Inputs:
1319   //   c_rarg0   - source array address
1320   //   c_rarg1   - destination array address
1321   //   c_rarg2   - element count, treated as ssize_t, can be zero
1322   //
1323   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1324   // the hardware handle it.  The two dwords within qwords that span
1325   // cache line boundaries will still be loaded and stored atomically.
1326   //
1327   // Side Effects:
1328   //   disjoint_int_copy_entry is set to the no-overlap entry point
1329   //   used by generate_conjoint_int_oop_copy().
1330   //
1331   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1332                                   const char *name, bool dest_uninitialized = false) {
1333     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1334     RegSet saved_reg = RegSet::of(s, d, count);
1335     __ align(CodeEntryAlignment);
1336     StubCodeMark mark(this, "StubRoutines", name);
1337     address start = __ pc();
1338     __ enter();
1339 
1340     if (entry != NULL) {
1341       *entry = __ pc();
1342       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1343       BLOCK_COMMENT("Entry:");
1344     }
1345 
1346     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1347     if (dest_uninitialized) {
1348       decorators |= IS_DEST_UNINITIALIZED;
1349     }
1350     if (aligned) {
1351       decorators |= ARRAYCOPY_ALIGNED;
1352     }
1353 
1354     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1355     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1356 
1357     if (is_oop) {
1358       // save regs before copy_memory
1359       __ push(RegSet::of(d, count), sp);
1360     }
1361     {
1362       // UnsafeCopyMemory page error: continue after ucm
1363       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1364       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1365       copy_memory(aligned, s, d, count, rscratch1, size);
1366     }
1367 
1368     if (is_oop) {
1369       __ pop(RegSet::of(d, count), sp);
1370       if (VerifyOops)
1371         verify_oop_array(size, d, count, r16);
1372     }
1373 
1374     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1375 
1376     __ leave();
1377     __ mov(r0, zr); // return 0
1378     __ ret(lr);
1379     return start;
1380   }
1381 
1382   // Arguments:
1383   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1384   //             ignored
1385   //   is_oop  - true => oop array, so generate store check code
1386   //   name    - stub name string
1387   //
1388   // Inputs:
1389   //   c_rarg0   - source array address
1390   //   c_rarg1   - destination array address
1391   //   c_rarg2   - element count, treated as ssize_t, can be zero
1392   //
1393   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1394   // the hardware handle it.  The two dwords within qwords that span
1395   // cache line boundaries will still be loaded and stored atomically.
1396   //
1397   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1398                                  address *entry, const char *name,
1399                                  bool dest_uninitialized = false) {
1400     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1401     RegSet saved_regs = RegSet::of(s, d, count);
1402     StubCodeMark mark(this, "StubRoutines", name);
1403     address start = __ pc();
1404     __ enter();
1405 
1406     if (entry != NULL) {
1407       *entry = __ pc();
1408       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1409       BLOCK_COMMENT("Entry:");
1410     }
1411 
1412     // use fwd copy when (d-s) above_equal (count*size)
1413     __ sub(rscratch1, d, s);
1414     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1415     __ br(Assembler::HS, nooverlap_target);
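         // Worked example: for a 4-byte element copy with d == s + 40 and
         // count == 10, d - s equals count * 4, the branch above is taken and
         // the disjoint (forward) stub runs; with d == s + 8 the regions
         // overlap, so we fall through to the backward copy generated below.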
1416 
1417     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1418     if (dest_uninitialized) {
1419       decorators |= IS_DEST_UNINITIALIZED;
1420     }
1421     if (aligned) {
1422       decorators |= ARRAYCOPY_ALIGNED;
1423     }
1424 
1425     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1426     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1427 
1428     if (is_oop) {
1429       // save regs before copy_memory
1430       __ push(RegSet::of(d, count), sp);
1431     }
1432     {
1433       // UnsafeCopyMemory page error: continue after ucm
1434       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1435       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1436       copy_memory(aligned, s, d, count, rscratch1, -size);
1437     }
1438     if (is_oop) {
1439       __ pop(RegSet::of(d, count), sp);
1440       if (VerifyOops)
1441         verify_oop_array(size, d, count, r16);
1442     }
1443     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1444     __ leave();
1445     __ mov(r0, zr); // return 0
1446     __ ret(lr);
1447     return start;
1448 }
1449 
1450   // Arguments:
1451   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1452   //             ignored
1453   //   name    - stub name string
1454   //
1455   // Inputs:
1456   //   c_rarg0   - source array address
1457   //   c_rarg1   - destination array address
1458   //   c_rarg2   - element count, treated as ssize_t, can be zero
1459   //
1460   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1461   // we let the hardware handle it.  The one to eight bytes within words,
1462   // dwords or qwords that span cache line boundaries will still be loaded
1463   // and stored atomically.
1464   //
1465   // Side Effects:
1473   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1474   //   used by generate_conjoint_byte_copy().
1475   //
1476   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1477     const bool not_oop = false;
1478     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1479   }
1480 
1481   // Arguments:
1482   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1483   //             ignored
1484   //   name    - stub name string
1485   //
1486   // Inputs:
1487   //   c_rarg0   - source array address
1488   //   c_rarg1   - destination array address
1489   //   c_rarg2   - element count, treated as ssize_t, can be zero
1490   //
1491   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1492   // we let the hardware handle it.  The one to eight bytes within words,
1493   // dwords or qwords that span cache line boundaries will still be loaded
1494   // and stored atomically.
1495   //
1496   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1497                                       address* entry, const char *name) {
1498     const bool not_oop = false;
1499     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1500   }
1501 
1502   // Arguments:
1503   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1504   //             ignored
1505   //   name    - stub name string
1506   //
1507   // Inputs:
1508   //   c_rarg0   - source array address
1509   //   c_rarg1   - destination array address
1510   //   c_rarg2   - element count, treated as ssize_t, can be zero
1511   //
1512   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1513   // let the hardware handle it.  The two or four words within dwords
1514   // or qwords that span cache line boundaries will still be loaded
1515   // and stored atomically.
1516   //
1517   // Side Effects:
1518   //   disjoint_short_copy_entry is set to the no-overlap entry point
1519   //   used by generate_conjoint_short_copy().
1520   //
1521   address generate_disjoint_short_copy(bool aligned,
1522                                        address* entry, const char *name) {
1523     const bool not_oop = false;
1524     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1525   }
1526 
1527   // Arguments:
1528   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1529   //             ignored
1530   //   name    - stub name string
1531   //
1532   // Inputs:
1533   //   c_rarg0   - source array address
1534   //   c_rarg1   - destination array address
1535   //   c_rarg2   - element count, treated as ssize_t, can be zero
1536   //
1537   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1538   // let the hardware handle it.  The two or four words within dwords
1539   // or qwords that span cache line boundaries will still be loaded
1540   // and stored atomically.
1541   //
1542   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1543                                        address *entry, const char *name) {
1544     const bool not_oop = false;
1545     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1546   }
1547 
1548   // Arguments:
1549   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1550   //             ignored
1551   //   name    - stub name string
1552   //
1553   // Inputs:
1554   //   c_rarg0   - source array address
1555   //   c_rarg1   - destination array address
1556   //   c_rarg2   - element count, treated as ssize_t, can be zero
1557   //
1558   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1559   // the hardware handle it.  The two dwords within qwords that span
1560   // cache line boundaries will still be loaded and stored atomically.
1561   //
1562   // Side Effects:
1563   //   disjoint_int_copy_entry is set to the no-overlap entry point
1564   //   used by generate_conjoint_int_oop_copy().
1565   //
1566   address generate_disjoint_int_copy(bool aligned, address *entry,
1567                                          const char *name, bool dest_uninitialized = false) {
1568     const bool not_oop = false;
1569     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1570   }
1571 
1572   // Arguments:
1573   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1574   //             ignored
1575   //   name    - stub name string
1576   //
1577   // Inputs:
1578   //   c_rarg0   - source array address
1579   //   c_rarg1   - destination array address
1580   //   c_rarg2   - element count, treated as ssize_t, can be zero
1581   //
1582   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1583   // the hardware handle it.  The two dwords within qwords that span
1584   // cache line boundaries will still be loaded and stored atomically.
1585   //
1586   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1587                                      address *entry, const char *name,
1588                                      bool dest_uninitialized = false) {
1589     const bool not_oop = false;
1590     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1591   }
1592 
1593 
1594   // Arguments:
1595   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1596   //             ignored
1597   //   name    - stub name string
1598   //
1599   // Inputs:
1600   //   c_rarg0   - source array address
1601   //   c_rarg1   - destination array address
1602   //   c_rarg2   - element count, treated as size_t, can be zero
1603   //
1604   // Side Effects:
1605   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1606   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1607   //
1608   address generate_disjoint_long_copy(bool aligned, address *entry,
1609                                           const char *name, bool dest_uninitialized = false) {
1610     const bool not_oop = false;
1611     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1612   }
1613 
1614   // Arguments:
1615   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1616   //             ignored
1617   //   name    - stub name string
1618   //
1619   // Inputs:
1620   //   c_rarg0   - source array address
1621   //   c_rarg1   - destination array address
1622   //   c_rarg2   - element count, treated as size_t, can be zero
1623   //
1624   address generate_conjoint_long_copy(bool aligned,
1625                                       address nooverlap_target, address *entry,
1626                                       const char *name, bool dest_uninitialized = false) {
1627     const bool not_oop = false;
1628     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1629   }
1630 
1631   // Arguments:
1632   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1633   //             ignored
1634   //   name    - stub name string
1635   //
1636   // Inputs:
1637   //   c_rarg0   - source array address
1638   //   c_rarg1   - destination array address
1639   //   c_rarg2   - element count, treated as size_t, can be zero
1640   //
1641   // Side Effects:
1642   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1643   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1644   //
1645   address generate_disjoint_oop_copy(bool aligned, address *entry,
1646                                      const char *name, bool dest_uninitialized) {
1647     const bool is_oop = true;
1648     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1649     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1650   }
1651 
1652   // Arguments:
1653   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1654   //             ignored
1655   //   name    - stub name string
1656   //
1657   // Inputs:
1658   //   c_rarg0   - source array address
1659   //   c_rarg1   - destination array address
1660   //   c_rarg2   - element count, treated as size_t, can be zero
1661   //
1662   address generate_conjoint_oop_copy(bool aligned,
1663                                      address nooverlap_target, address *entry,
1664                                      const char *name, bool dest_uninitialized) {
1665     const bool is_oop = true;
1666     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1667     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1668                                   name, dest_uninitialized);
1669   }
1670 
1671 
1672   // Helper for generating a dynamic type check.
1673   // Smashes rscratch1, rscratch2.
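       // Roughly (illustrative sketch, not the exact code emitted by the two
       // MacroAssembler calls below):
       //   if (*(sub_klass + super_check_offset) == super_klass) goto L_success;   // fast path
       //   else if (secondary supers of sub_klass contain super_klass) goto L_success;  // slow path
       //   // otherwise fall through to L_miss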
1674   void generate_type_check(Register sub_klass,
1675                            Register super_check_offset,
1676                            Register super_klass,
1677                            Label& L_success) {
1678     assert_different_registers(sub_klass, super_check_offset, super_klass);
1679 
1680     BLOCK_COMMENT("type_check:");
1681 
1682     Label L_miss;
1683 
1684     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1685                                      super_check_offset);
1686     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1687 
1688     // Fall through on failure!
1689     __ BIND(L_miss);
1690   }
1691 
1692   //
1693   //  Generate checkcasting array copy stub
1694   //
1695   //  Input:
1696   //    c_rarg0   - source array address
1697   //    c_rarg1   - destination array address
1698   //    c_rarg2   - element count, treated as ssize_t, can be zero
1699   //    c_rarg3   - size_t ckoff (super_check_offset)
1700   //    c_rarg4   - oop ckval (super_klass)
1701   //
1702   //  Output:
1703   //    r0 ==  0  -  success
1704   //    r0 == -1^K - failure, where K is partial transfer count
1705   //
1706   address generate_checkcast_copy(const char *name, address *entry,
1707                                   bool dest_uninitialized = false) {
1708 
1709     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1710 
1711     // Input registers (after setup_arg_regs)
1712     const Register from        = c_rarg0;   // source array address
1713     const Register to          = c_rarg1;   // destination array address
1714     const Register count       = c_rarg2;   // elements count
1715     const Register ckoff       = c_rarg3;   // super_check_offset
1716     const Register ckval       = c_rarg4;   // super_klass
1717 
1718     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1719     RegSet wb_post_saved_regs = RegSet::of(count);
1720 
1721     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1722     const Register copied_oop  = r22;       // actual oop copied
1723     const Register count_save  = r21;       // original elements count
1724     const Register start_to    = r20;       // destination array start address
1725     const Register r19_klass   = r19;       // oop._klass
1726 
1727     //---------------------------------------------------------------
1728     // Assembler stub will be used for this call to arraycopy
1729     // if the two arrays are subtypes of Object[] but the
1730     // destination array type is not equal to or a supertype
1731     // of the source type.  Each element must be separately
1732     // checked.
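         // Example (illustrative): System.arraycopy from an Object[] that
         // happens to contain only Strings into a String[] must type-check
         // every element, because the static source type gives no guarantee
         // of assignability to the destination element type.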
1733 
1734     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1735                                copied_oop, r19_klass, count_save);
1736 
1737     __ align(CodeEntryAlignment);
1738     StubCodeMark mark(this, "StubRoutines", name);
1739     address start = __ pc();
1740 
1741     __ enter(); // required for proper stackwalking of RuntimeStub frame
1742 
1743 #ifdef ASSERT
1744     // caller guarantees that the arrays really are different
1745     // otherwise, we would have to make conjoint checks
1746     { Label L;
1747       array_overlap_test(L, TIMES_OOP);
1748       __ stop("checkcast_copy within a single array");
1749       __ bind(L);
1750     }
1751 #endif //ASSERT
1752 
1753     // Caller of this entry point must set up the argument registers.
1754     if (entry != NULL) {
1755       *entry = __ pc();
1756       BLOCK_COMMENT("Entry:");
1757     }
1758 
1759     // Empty array:  Nothing to do.
1760     __ cbz(count, L_done);
1761     __ push(RegSet::of(r19, r20, r21, r22), sp);
1762 
1763 #ifdef ASSERT
1764     BLOCK_COMMENT("assert consistent ckoff/ckval");
1765     // The ckoff and ckval must be mutually consistent,
1766     // even though caller generates both.
1767     { Label L;
1768       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1769       __ ldrw(start_to, Address(ckval, sco_offset));
1770       __ cmpw(ckoff, start_to);
1771       __ br(Assembler::EQ, L);
1772       __ stop("super_check_offset inconsistent");
1773       __ bind(L);
1774     }
1775 #endif //ASSERT
1776 
1777     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1778     bool is_oop = true;
1779     if (dest_uninitialized) {
1780       decorators |= IS_DEST_UNINITIALIZED;
1781     }
1782 
1783     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1784     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1785 
1786     // save the original count
1787     __ mov(count_save, count);
1788 
1789     // Copy from low to high addresses
1790     __ mov(start_to, to);              // Save destination array start address
1791     __ b(L_load_element);
1792 
1793     // ======== begin loop ========
1794     // (Loop is rotated; its entry is L_load_element.)
1795     // Loop control:
1796     //   for (; count != 0; count--) {
1797     //     copied_oop = load_heap_oop(from++);
1798     //     ... generate_type_check ...;
1799     //     store_heap_oop(to++, copied_oop);
1800     //   }
1801     __ align(OptoLoopAlignment);
1802 
1803     __ BIND(L_store_element);
1804     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1805     __ sub(count, count, 1);
1806     __ cbz(count, L_do_card_marks);
1807 
1808     // ======== loop entry is here ========
1809     __ BIND(L_load_element);
1810     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1811     __ cbz(copied_oop, L_store_element);
1812 
1813     __ load_klass(r19_klass, copied_oop);// query the object klass
1814     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1815     // ======== end loop ========
1816 
1817     // It was a real error; we must depend on the caller to finish the job.
1818     // Register count = remaining oops, count_orig = total oops.
1819     // Emit GC store barriers for the oops we have copied and report
1820     // their number to the caller.
1821 
1822     __ subs(count, count_save, count);     // K = partially copied oop count
1823     __ eon(count, count, zr);                   // report (-1^K) to caller
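         // Illustrative note: K is the number of oops successfully copied, so
         // r0 ends up as ~K == -1 ^ K and the caller can recover K as ~r0; a
         // plain 0 is only returned on full success, via the branch above that
         // skips this failure bookkeeping.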
1824     __ br(Assembler::EQ, L_done_pop);
1825 
1826     __ BIND(L_do_card_marks);
1827     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1828 
1829     __ bind(L_done_pop);
1830     __ pop(RegSet::of(r19, r20, r21, r22), sp);
1831     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1832 
1833     __ bind(L_done);
1834     __ mov(r0, count);
1835     __ leave();
1836     __ ret(lr);
1837 
1838     return start;
1839   }
1840 
1841   // Perform range checks on the proposed arraycopy.
1842   // Kills temp, but nothing else.
1843   // Also, clean the sign bits of src_pos and dst_pos.
1844   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1845                               Register src_pos, // source position (c_rarg1)
1846                               Register dst,     // destination array oop (c_rarg2)
1847                               Register dst_pos, // destination position (c_rarg3)
1848                               Register length,
1849                               Register temp,
1850                               Label& L_failed) {
1851     BLOCK_COMMENT("arraycopy_range_checks:");
1852 
1853     assert_different_registers(rscratch1, temp);
1854 
1855     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1856     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1857     __ addw(temp, length, src_pos);
1858     __ cmpw(temp, rscratch1);
1859     __ br(Assembler::HI, L_failed);
1860 
1861     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1862     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1863     __ addw(temp, length, dst_pos);
1864     __ cmpw(temp, rscratch1);
1865     __ br(Assembler::HI, L_failed);
1866 
1867     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1868     __ movw(src_pos, src_pos);
1869     __ movw(dst_pos, dst_pos);
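         // (A 32-bit movw to the same register zero-extends into the upper
         // 32 bits, so the positions are safe to use in the 64-bit address
         // arithmetic performed by the copy stubs.)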
1870 
1871     BLOCK_COMMENT("arraycopy_range_checks done");
1872   }
1873 
1874   // This stub is only reached from a simple test routine; it can be
1875   // implemented properly once it is called from code that actually
1876   // needs to copy something.
1877   static void fake_arraycopy_stub(address src, address dst, int count) {
1878     assert(count == 0, "huh?");
1879   }
1880 
1881 
1882   //
1883   //  Generate 'unsafe' array copy stub
1884   //  Though just as safe as the other stubs, it takes an unscaled
1885   //  size_t argument instead of an element count.
1886   //
1887   //  Input:
1888   //    c_rarg0   - source array address
1889   //    c_rarg1   - destination array address
1890   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1891   //
1892   // Examines the alignment of the operands and dispatches
1893   // to a long, int, short, or byte copy loop.
1894   //
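       // Roughly (illustrative sketch, not generated code), dispatch on the
       // largest alignment common to 's', 'd' and the byte count:
       //   switch (alignment of (s | d | count)) {
       //     case 8-byte aligned: count >>= 3; goto long_copy;
       //     case 4-byte aligned: count >>= 2; goto int_copy;
       //     case 2-byte aligned: count >>= 1; goto short_copy;
       //     default:                          goto byte_copy;
       //   }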
1895   address generate_unsafe_copy(const char *name,
1896                                address byte_copy_entry,
1897                                address short_copy_entry,
1898                                address int_copy_entry,
1899                                address long_copy_entry) {
1900     Label L_long_aligned, L_int_aligned, L_short_aligned;
1901     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1902 
1903     __ align(CodeEntryAlignment);
1904     StubCodeMark mark(this, "StubRoutines", name);
1905     address start = __ pc();
1906     __ enter(); // required for proper stackwalking of RuntimeStub frame
1907 
1908     // bump this on entry, not on exit:
1909     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1910 
1911     __ orr(rscratch1, s, d);
1912     __ orr(rscratch1, rscratch1, count);
1913 
1914     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1915     __ cbz(rscratch1, L_long_aligned);
1916     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1917     __ cbz(rscratch1, L_int_aligned);
1918     __ tbz(rscratch1, 0, L_short_aligned);
1919     __ b(RuntimeAddress(byte_copy_entry));
1920 
1921     __ BIND(L_short_aligned);
1922     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1923     __ b(RuntimeAddress(short_copy_entry));
1924     __ BIND(L_int_aligned);
1925     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1926     __ b(RuntimeAddress(int_copy_entry));
1927     __ BIND(L_long_aligned);
1928     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1929     __ b(RuntimeAddress(long_copy_entry));
1930 
1931     return start;
1932   }
1933 
1934   //
1935   //  Generate generic array copy stubs
1936   //
1937   //  Input:
1938   //    c_rarg0    -  src oop
1939   //    c_rarg1    -  src_pos (32-bits)
1940   //    c_rarg2    -  dst oop
1941   //    c_rarg3    -  dst_pos (32-bits)
1942   //    c_rarg4    -  element count (32-bits)
1943   //
1944   //  Output:
1945   //    r0 ==  0  -  success
1946   //    r0 == -1^K - failure, where K is partial transfer count
1947   //
1948   address generate_generic_copy(const char *name,
1949                                 address byte_copy_entry, address short_copy_entry,
1950                                 address int_copy_entry, address oop_copy_entry,
1951                                 address long_copy_entry, address checkcast_copy_entry) {
1952 
1953     Label L_failed, L_objArray;
1954     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1955 
1956     // Input registers
1957     const Register src        = c_rarg0;  // source array oop
1958     const Register src_pos    = c_rarg1;  // source position
1959     const Register dst        = c_rarg2;  // destination array oop
1960     const Register dst_pos    = c_rarg3;  // destination position
1961     const Register length     = c_rarg4;
1962 
1963 
1964     // Registers used as temps
1965     const Register dst_klass  = c_rarg5;
1966 
1967     __ align(CodeEntryAlignment);
1968 
1969     StubCodeMark mark(this, "StubRoutines", name);
1970 
1971     address start = __ pc();
1972 
1973     __ enter(); // required for proper stackwalking of RuntimeStub frame
1974 
1975     // bump this on entry, not on exit:
1976     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1977 
1978     //-----------------------------------------------------------------------
1979     // Assembler stub will be used for this call to arraycopy
1980     // if the following conditions are met:
1981     //
1982     // (1) src and dst must not be null.
1983     // (2) src_pos must not be negative.
1984     // (3) dst_pos must not be negative.
1985     // (4) length  must not be negative.
1986     // (5) src klass and dst klass should be the same and not NULL.
1987     // (6) src and dst should be arrays.
1988     // (7) src_pos + length must not exceed length of src.
1989     // (8) dst_pos + length must not exceed length of dst.
1990     //
1991 
1992     //  if (src == NULL) return -1;
1993     __ cbz(src, L_failed);
1994 
1995     //  if (src_pos < 0) return -1;
1996     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
1997 
1998     //  if (dst == NULL) return -1;
1999     __ cbz(dst, L_failed);
2000 
2001     //  if (dst_pos < 0) return -1;
2002     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2003 
2004     // registers used as temp
2005     const Register scratch_length    = r16; // elements count to copy
2006     const Register scratch_src_klass = r17; // array klass
2007     const Register lh                = r15; // layout helper
2008 
2009     //  if (length < 0) return -1;
2010     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2011     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2012 
2013     __ load_klass(scratch_src_klass, src);
2014 #ifdef ASSERT
2015     //  assert(src->klass() != NULL);
2016     {
2017       BLOCK_COMMENT("assert klasses not null {");
2018       Label L1, L2;
2019       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2020       __ bind(L1);
2021       __ stop("broken null klass");
2022       __ bind(L2);
2023       __ load_klass(rscratch1, dst);
2024       __ cbz(rscratch1, L1);     // this would be broken also
2025       BLOCK_COMMENT("} assert klasses not null done");
2026     }
2027 #endif
2028 
2029     // Load layout helper (32-bits)
2030     //
2031     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2032     // 32        30    24            16              8     2                 0
2033     //
2034     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2035     //
2036 
2037     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2038 
2039     // Handle objArrays completely differently...
2040     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2041     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2042     __ movw(rscratch1, objArray_lh);
2043     __ eorw(rscratch2, lh, rscratch1);
2044     __ cbzw(rscratch2, L_objArray);
2045 
2046     //  if (src->klass() != dst->klass()) return -1;
2047     __ load_klass(rscratch2, dst);
2048     __ eor(rscratch2, rscratch2, scratch_src_klass);
2049     __ cbnz(rscratch2, L_failed);
2050 
2051     //  if (!src->is_Array()) return -1;
2052     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2053 
2054     // At this point, it is known to be a typeArray (array_tag 0x3).
2055 #ifdef ASSERT
2056     {
2057       BLOCK_COMMENT("assert primitive array {");
2058       Label L;
2059       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2060       __ cmpw(lh, rscratch2);
2061       __ br(Assembler::GE, L);
2062       __ stop("must be a primitive array");
2063       __ bind(L);
2064       BLOCK_COMMENT("} assert primitive array done");
2065     }
2066 #endif
2067 
2068     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2069                            rscratch2, L_failed);
2070 
2071     // TypeArrayKlass
2072     //
2073     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2074     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2075     //
2076 
2077     const Register rscratch1_offset = rscratch1;    // array offset
2078     const Register r15_elsize = lh; // element size
2079 
2080     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2081            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2082     __ add(src, src, rscratch1_offset);           // src array offset
2083     __ add(dst, dst, rscratch1_offset);           // dst array offset
2084     BLOCK_COMMENT("choose copy loop based on element size");
2085 
2086     // next registers should be set before the jump to corresponding stub
2087     const Register from     = c_rarg0;  // source array address
2088     const Register to       = c_rarg1;  // destination array address
2089     const Register count    = c_rarg2;  // elements count
2090 
2091     // 'from', 'to' and 'count' must be set in this order: they alias
2092     // 'src', 'src_pos' and 'dst', so each input is read before its register is overwritten.
2093 
2094     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2095 
2096     // The possible values of elsize are 0-3, i.e. exact_log2(element
2097     // size in bytes).  We do a simple bitwise binary search.
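         // Illustrative decision tree: bit 1 of elsize distinguishes
         // {byte, short} (clear) from {int, long} (set); bit 0 then picks
         // within each pair (byte=0, short=1, int=2, long=3).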
2098   __ BIND(L_copy_bytes);
2099     __ tbnz(r15_elsize, 1, L_copy_ints);
2100     __ tbnz(r15_elsize, 0, L_copy_shorts);
2101     __ lea(from, Address(src, src_pos));// src_addr
2102     __ lea(to,   Address(dst, dst_pos));// dst_addr
2103     __ movw(count, scratch_length); // length
2104     __ b(RuntimeAddress(byte_copy_entry));
2105 
2106   __ BIND(L_copy_shorts);
2107     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2108     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2109     __ movw(count, scratch_length); // length
2110     __ b(RuntimeAddress(short_copy_entry));
2111 
2112   __ BIND(L_copy_ints);
2113     __ tbnz(r15_elsize, 0, L_copy_longs);
2114     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2115     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2116     __ movw(count, scratch_length); // length
2117     __ b(RuntimeAddress(int_copy_entry));
2118 
2119   __ BIND(L_copy_longs);
2120 #ifdef ASSERT
2121     {
2122       BLOCK_COMMENT("assert long copy {");
2123       Label L;
2124       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2125       __ cmpw(r15_elsize, LogBytesPerLong);
2126       __ br(Assembler::EQ, L);
2127       __ stop("must be long copy, but elsize is wrong");
2128       __ bind(L);
2129       BLOCK_COMMENT("} assert long copy done");
2130     }
2131 #endif
2132     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2133     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2134     __ movw(count, scratch_length); // length
2135     __ b(RuntimeAddress(long_copy_entry));
2136 
2137     // ObjArrayKlass
2138   __ BIND(L_objArray);
2139     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2140 
2141     Label L_plain_copy, L_checkcast_copy;
2142     //  test array classes for subtyping
2143     __ load_klass(r15, dst);
2144     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2145     __ br(Assembler::NE, L_checkcast_copy);
2146 
2147     // Identically typed arrays can be copied without element-wise checks.
2148     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2149                            rscratch2, L_failed);
2150 
2151     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2152     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2153     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2154     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2155     __ movw(count, scratch_length); // length
2156   __ BIND(L_plain_copy);
2157     __ b(RuntimeAddress(oop_copy_entry));
2158 
2159   __ BIND(L_checkcast_copy);
2160     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2161     {
2162       // Before looking at dst.length, make sure dst is also an objArray.
2163       __ ldrw(rscratch1, Address(r15, lh_offset));
2164       __ movw(rscratch2, objArray_lh);
2165       __ eorw(rscratch1, rscratch1, rscratch2);
2166       __ cbnzw(rscratch1, L_failed);
2167 
2168       // It is safe to examine both src.length and dst.length.
2169       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2170                              r15, L_failed);
2171 
2172       __ load_klass(dst_klass, dst); // reload
2173 
2174       // Marshal the base address arguments now, freeing registers.
2175       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2176       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2177       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2178       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2179       __ movw(count, length);           // length (reloaded)
2180       Register sco_temp = c_rarg3;      // this register is free now
2181       assert_different_registers(from, to, count, sco_temp,
2182                                  dst_klass, scratch_src_klass);
2183       // assert_clean_int(count, sco_temp);
2184 
2185       // Generate the type check.
2186       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2187       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2188 
2189       // Smashes rscratch1, rscratch2
2190       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2191 
2192       // Fetch destination element klass from the ObjArrayKlass header.
2193       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2194       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2195       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2196 
2197       // the checkcast_copy loop needs two extra arguments:
2198       assert(c_rarg3 == sco_temp, "#3 already in place");
2199       // Set up arguments for checkcast_copy_entry.
2200       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2201       __ b(RuntimeAddress(checkcast_copy_entry));
2202     }
2203 
2204   __ BIND(L_failed);
2205     __ mov(r0, -1);
2206     __ leave();   // required for proper stackwalking of RuntimeStub frame
2207     __ ret(lr);
2208 
2209     return start;
2210   }
2211 
2212   //
2213   // Generate stub for array fill. If "aligned" is true, the
2214   // "to" address is assumed to be heapword aligned.
2215   //
2216   // Arguments for generated stub:
2217   //   to:    c_rarg0
2218   //   value: c_rarg1
2219   //   count: c_rarg2 treated as signed
2220   //
2221   address generate_fill(BasicType t, bool aligned, const char *name) {
2222     __ align(CodeEntryAlignment);
2223     StubCodeMark mark(this, "StubRoutines", name);
2224     address start = __ pc();
2225 
2226     BLOCK_COMMENT("Entry:");
2227 
2228     const Register to        = c_rarg0;  // destination array address
2229     const Register value     = c_rarg1;  // value
2230     const Register count     = c_rarg2;  // elements count
2231 
2232     const Register bz_base = r10;        // base for block_zero routine
2233     const Register cnt_words = r11;      // temp register
2234 
2235     __ enter();
2236 
2237     Label L_fill_elements, L_exit1;
2238 
2239     int shift = -1;
2240     switch (t) {
2241       case T_BYTE:
2242         shift = 0;
2243         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2244         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2245         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2246         __ br(Assembler::LO, L_fill_elements);
2247         break;
2248       case T_SHORT:
2249         shift = 1;
2250         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2251         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2252         __ br(Assembler::LO, L_fill_elements);
2253         break;
2254       case T_INT:
2255         shift = 2;
2256         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2257         __ br(Assembler::LO, L_fill_elements);
2258         break;
2259       default: ShouldNotReachHere();
2260     }
2261 
2262     // Align the destination address to an 8-byte boundary.
2263     Label L_skip_align1, L_skip_align2, L_skip_align4;
2264     if (!aligned) {
2265       switch (t) {
2266         case T_BYTE:
2267           // One byte misalignment happens only for byte arrays.
2268           __ tbz(to, 0, L_skip_align1);
2269           __ strb(value, Address(__ post(to, 1)));
2270           __ subw(count, count, 1);
2271           __ bind(L_skip_align1);
2272           // Fallthrough
2273         case T_SHORT:
2274           // Two-byte misalignment happens only for byte and short (char) arrays.
2275           __ tbz(to, 1, L_skip_align2);
2276           __ strh(value, Address(__ post(to, 2)));
2277           __ subw(count, count, 2 >> shift);
2278           __ bind(L_skip_align2);
2279           // Fallthrough
2280         case T_INT:
2281           // Align to 8 bytes, we know we are 4 byte aligned to start.
2282           __ tbz(to, 2, L_skip_align4);
2283           __ strw(value, Address(__ post(to, 4)));
2284           __ subw(count, count, 4 >> shift);
2285           __ bind(L_skip_align4);
2286           break;
2287         default: ShouldNotReachHere();
2288       }
2289     }
2290 
2291     //
2292     //  Fill large chunks
2293     //
2294     __ lsrw(cnt_words, count, 3 - shift); // number of words
2295     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
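         // At this point (illustrative) 'value' holds the fill pattern
         // replicated across all 64 bits, e.g. a byte fill of 0xAB has become
         // 0xABABABABABABABAB, so it can be stored 8 bytes at a time.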
2296     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2297     if (UseBlockZeroing) {
2298       Label non_block_zeroing, rest;
2299       // If the fill value is zero we can use the fast zero_words().
2300       __ cbnz(value, non_block_zeroing);
2301       __ mov(bz_base, to);
2302       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2303       __ zero_words(bz_base, cnt_words);
2304       __ b(rest);
2305       __ bind(non_block_zeroing);
2306       __ fill_words(to, cnt_words, value);
2307       __ bind(rest);
2308     } else {
2309       __ fill_words(to, cnt_words, value);
2310     }
2311 
2312     // Remaining count is less than 8 bytes. Fill it by a single store.
2313     // Note that the total length is no less than 8 bytes.
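         // (Illustrative: since the array is at least 8 bytes long, the single
         // 8-byte store below, ending exactly at the last element, covers the
         // tail; any bytes it re-writes already hold the same replicated pattern.)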
2314     if (t == T_BYTE || t == T_SHORT) {
2315       Label L_exit1;
2316       __ cbzw(count, L_exit1);
2317       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2318       __ str(value, Address(to, -8));    // overwrite some elements
2319       __ bind(L_exit1);
2320       __ leave();
2321       __ ret(lr);
2322     }
2323 
2324     // Handle fills shorter than 8 bytes.
2325     Label L_fill_2, L_fill_4, L_exit2;
2326     __ bind(L_fill_elements);
2327     switch (t) {
2328       case T_BYTE:
2329         __ tbz(count, 0, L_fill_2);
2330         __ strb(value, Address(__ post(to, 1)));
2331         __ bind(L_fill_2);
2332         __ tbz(count, 1, L_fill_4);
2333         __ strh(value, Address(__ post(to, 2)));
2334         __ bind(L_fill_4);
2335         __ tbz(count, 2, L_exit2);
2336         __ strw(value, Address(to));
2337         break;
2338       case T_SHORT:
2339         __ tbz(count, 0, L_fill_4);
2340         __ strh(value, Address(__ post(to, 2)));
2341         __ bind(L_fill_4);
2342         __ tbz(count, 1, L_exit2);
2343         __ strw(value, Address(to));
2344         break;
2345       case T_INT:
2346         __ cbzw(count, L_exit2);
2347         __ strw(value, Address(to));
2348         break;
2349       default: ShouldNotReachHere();
2350     }
2351     __ bind(L_exit2);
2352     __ leave();
2353     __ ret(lr);
2354     return start;
2355   }
2356 
2357   address generate_data_cache_writeback() {
2358     const Register line        = c_rarg0;  // address of line to write back
2359 
2360     __ align(CodeEntryAlignment);
2361 
2362     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2363 
2364     address start = __ pc();
2365     __ enter();
2366     __ cache_wb(Address(line, 0));
2367     __ leave();
2368     __ ret(lr);
2369 
2370     return start;
2371   }
2372 
2373   address generate_data_cache_writeback_sync() {
2374     const Register is_pre     = c_rarg0;  // pre or post sync
2375 
2376     __ align(CodeEntryAlignment);
2377 
2378     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2379 
2380     // pre wbsync is a no-op
2381     // post wbsync emits a store barrier (the AArch64 analogue of an x86 sfence)
2382 
2383     Label skip;
2384     address start = __ pc();
2385     __ enter();
2386     __ cbnz(is_pre, skip);
2387     __ cache_wbsync(false);
2388     __ bind(skip);
2389     __ leave();
2390     __ ret(lr);
2391 
2392     return start;
2393   }
2394 
2395   void generate_arraycopy_stubs() {
2396     address entry;
2397     address entry_jbyte_arraycopy;
2398     address entry_jshort_arraycopy;
2399     address entry_jint_arraycopy;
2400     address entry_oop_arraycopy;
2401     address entry_jlong_arraycopy;
2402     address entry_checkcast_arraycopy;
2403 
2404     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2405     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2406 
2407     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2408 
2409     //*** jbyte
2410     // Always need aligned and unaligned versions
2411     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2412                                                                                   "jbyte_disjoint_arraycopy");
2413     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2414                                                                                   &entry_jbyte_arraycopy,
2415                                                                                   "jbyte_arraycopy");
2416     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2417                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2418     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2419                                                                                   "arrayof_jbyte_arraycopy");
2420 
2421     //*** jshort
2422     // Always need aligned and unaligned versions
2423     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2424                                                                                     "jshort_disjoint_arraycopy");
2425     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2426                                                                                     &entry_jshort_arraycopy,
2427                                                                                     "jshort_arraycopy");
2428     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2429                                                                                     "arrayof_jshort_disjoint_arraycopy");
2430     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2431                                                                                     "arrayof_jshort_arraycopy");
2432 
2433     //*** jint
2434     // Aligned versions
2435     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2436                                                                                 "arrayof_jint_disjoint_arraycopy");
2437     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2438                                                                                 "arrayof_jint_arraycopy");
2439     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2440     // entry_jint_arraycopy always points to the unaligned version
2441     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2442                                                                                 "jint_disjoint_arraycopy");
2443     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2444                                                                                 &entry_jint_arraycopy,
2445                                                                                 "jint_arraycopy");
2446 
2447     //*** jlong
2448     // It is always aligned
2449     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2450                                                                                   "arrayof_jlong_disjoint_arraycopy");
2451     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2452                                                                                   "arrayof_jlong_arraycopy");
2453     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2454     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2455 
2456     //*** oops
2457     {
2458       // With compressed oops we need unaligned versions; notice that
2459       // we overwrite entry_oop_arraycopy.
2460       bool aligned = !UseCompressedOops;
2461 
2462       StubRoutines::_arrayof_oop_disjoint_arraycopy
2463         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2464                                      /*dest_uninitialized*/false);
2465       StubRoutines::_arrayof_oop_arraycopy
2466         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2467                                      /*dest_uninitialized*/false);
2468       // Aligned versions without pre-barriers
2469       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2470         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2471                                      /*dest_uninitialized*/true);
2472       StubRoutines::_arrayof_oop_arraycopy_uninit
2473         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2474                                      /*dest_uninitialized*/true);
2475     }
2476 
2477     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2478     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2479     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2480     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2481 
2482     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2483     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2484                                                                         /*dest_uninitialized*/true);
2485 
2486     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2487                                                               entry_jbyte_arraycopy,
2488                                                               entry_jshort_arraycopy,
2489                                                               entry_jint_arraycopy,
2490                                                               entry_jlong_arraycopy);
2491 
2492     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2493                                                                entry_jbyte_arraycopy,
2494                                                                entry_jshort_arraycopy,
2495                                                                entry_jint_arraycopy,
2496                                                                entry_oop_arraycopy,
2497                                                                entry_jlong_arraycopy,
2498                                                                entry_checkcast_arraycopy);
2499 
2500     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2501     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2502     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2503     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2504     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2505     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2506   }
2507 
2508   void generate_math_stubs() { Unimplemented(); }
2509 
2510   // Arguments:
2511   //
2512   // Inputs:
2513   //   c_rarg0   - source byte array address
2514   //   c_rarg1   - destination byte array address
2515   //   c_rarg2   - K (key) in little endian int array
2516   //
2517   address generate_aescrypt_encryptBlock() {
2518     __ align(CodeEntryAlignment);
2519     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2520 
2521     Label L_doLast;
2522 
2523     const Register from        = c_rarg0;  // source array address
2524     const Register to          = c_rarg1;  // destination array address
2525     const Register key         = c_rarg2;  // key array address
2526     const Register keylen      = rscratch1;
2527 
2528     address start = __ pc();
2529     __ enter();
2530 
2531     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
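         // keylen is the length of the expanded key array in ints: 44, 52 or
         // 60 words for AES-128, AES-192 and AES-256 (10, 12 or 14 rounds).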
2532 
2533     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2534 
2535     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2536     __ rev32(v1, __ T16B, v1);
2537     __ rev32(v2, __ T16B, v2);
2538     __ rev32(v3, __ T16B, v3);
2539     __ rev32(v4, __ T16B, v4);
2540     __ aese(v0, v1);
2541     __ aesmc(v0, v0);
2542     __ aese(v0, v2);
2543     __ aesmc(v0, v0);
2544     __ aese(v0, v3);
2545     __ aesmc(v0, v0);
2546     __ aese(v0, v4);
2547     __ aesmc(v0, v0);
2548 
2549     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2550     __ rev32(v1, __ T16B, v1);
2551     __ rev32(v2, __ T16B, v2);
2552     __ rev32(v3, __ T16B, v3);
2553     __ rev32(v4, __ T16B, v4);
2554     __ aese(v0, v1);
2555     __ aesmc(v0, v0);
2556     __ aese(v0, v2);
2557     __ aesmc(v0, v0);
2558     __ aese(v0, v3);
2559     __ aesmc(v0, v0);
2560     __ aese(v0, v4);
2561     __ aesmc(v0, v0);
2562 
2563     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2564     __ rev32(v1, __ T16B, v1);
2565     __ rev32(v2, __ T16B, v2);
2566 
2567     __ cmpw(keylen, 44);
2568     __ br(Assembler::EQ, L_doLast);
2569 
2570     __ aese(v0, v1);
2571     __ aesmc(v0, v0);
2572     __ aese(v0, v2);
2573     __ aesmc(v0, v0);
2574 
2575     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2576     __ rev32(v1, __ T16B, v1);
2577     __ rev32(v2, __ T16B, v2);
2578 
2579     __ cmpw(keylen, 52);
2580     __ br(Assembler::EQ, L_doLast);
2581 
2582     __ aese(v0, v1);
2583     __ aesmc(v0, v0);
2584     __ aese(v0, v2);
2585     __ aesmc(v0, v0);
2586 
2587     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2588     __ rev32(v1, __ T16B, v1);
2589     __ rev32(v2, __ T16B, v2);
2590 
2591     __ BIND(L_doLast);
2592 
2593     __ aese(v0, v1);
2594     __ aesmc(v0, v0);
2595     __ aese(v0, v2);
2596 
2597     __ ld1(v1, __ T16B, key);
2598     __ rev32(v1, __ T16B, v1);
2599     __ eor(v0, __ T16B, v0, v1);
2600 
2601     __ st1(v0, __ T16B, to);
2602 
2603     __ mov(r0, 0);
2604 
2605     __ leave();
2606     __ ret(lr);
2607 
2608     return start;
2609   }
2610 
2611   // Arguments:
2612   //
2613   // Inputs:
2614   //   c_rarg0   - source byte array address
2615   //   c_rarg1   - destination byte array address
2616   //   c_rarg2   - K (key) in little endian int array
2617   //
2618   address generate_aescrypt_decryptBlock() {
2619     assert(UseAES, "need AES instructions");
2620     __ align(CodeEntryAlignment);
2621     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2622     Label L_doLast;
2623 
2624     const Register from        = c_rarg0;  // source array address
2625     const Register to          = c_rarg1;  // destination array address
2626     const Register key         = c_rarg2;  // key array address
2627     const Register keylen      = rscratch1;
2628 
2629     address start = __ pc();
2630     __ enter(); // required for proper stackwalking of RuntimeStub frame
2631 
2632     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2633 
2634     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2635 
2636     __ ld1(v5, __ T16B, __ post(key, 16));
2637     __ rev32(v5, __ T16B, v5);
2638 
2639     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2640     __ rev32(v1, __ T16B, v1);
2641     __ rev32(v2, __ T16B, v2);
2642     __ rev32(v3, __ T16B, v3);
2643     __ rev32(v4, __ T16B, v4);
2644     __ aesd(v0, v1);
2645     __ aesimc(v0, v0);
2646     __ aesd(v0, v2);
2647     __ aesimc(v0, v0);
2648     __ aesd(v0, v3);
2649     __ aesimc(v0, v0);
2650     __ aesd(v0, v4);
2651     __ aesimc(v0, v0);
2652 
2653     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2654     __ rev32(v1, __ T16B, v1);
2655     __ rev32(v2, __ T16B, v2);
2656     __ rev32(v3, __ T16B, v3);
2657     __ rev32(v4, __ T16B, v4);
2658     __ aesd(v0, v1);
2659     __ aesimc(v0, v0);
2660     __ aesd(v0, v2);
2661     __ aesimc(v0, v0);
2662     __ aesd(v0, v3);
2663     __ aesimc(v0, v0);
2664     __ aesd(v0, v4);
2665     __ aesimc(v0, v0);
2666 
2667     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2668     __ rev32(v1, __ T16B, v1);
2669     __ rev32(v2, __ T16B, v2);
2670 
2671     __ cmpw(keylen, 44);
2672     __ br(Assembler::EQ, L_doLast);
2673 
2674     __ aesd(v0, v1);
2675     __ aesimc(v0, v0);
2676     __ aesd(v0, v2);
2677     __ aesimc(v0, v0);
2678 
2679     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2680     __ rev32(v1, __ T16B, v1);
2681     __ rev32(v2, __ T16B, v2);
2682 
2683     __ cmpw(keylen, 52);
2684     __ br(Assembler::EQ, L_doLast);
2685 
2686     __ aesd(v0, v1);
2687     __ aesimc(v0, v0);
2688     __ aesd(v0, v2);
2689     __ aesimc(v0, v0);
2690 
2691     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2692     __ rev32(v1, __ T16B, v1);
2693     __ rev32(v2, __ T16B, v2);
2694 
2695     __ BIND(L_doLast);
2696 
2697     __ aesd(v0, v1);
2698     __ aesimc(v0, v0);
2699     __ aesd(v0, v2);
2700 
2701     __ eor(v0, __ T16B, v0, v5);
2702 
2703     __ st1(v0, __ T16B, to);
2704 
2705     __ mov(r0, 0);
2706 
2707     __ leave();
2708     __ ret(lr);
2709 
2710     return start;
2711   }
2712 
2713   // Arguments:
2714   //
2715   // Inputs:
2716   //   c_rarg0   - source byte array address
2717   //   c_rarg1   - destination byte array address
2718   //   c_rarg2   - K (key) in little endian int array
2719   //   c_rarg3   - r vector byte array address
2720   //   c_rarg4   - input length
2721   //
2722   // Output:
2723   //   r0        - input length
2724   //
2725   address generate_cipherBlockChaining_encryptAESCrypt() {
2726     assert(UseAES, "need AES instructions");
2727     __ align(CodeEntryAlignment);
2728     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2729 
2730     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2731 
2732     const Register from        = c_rarg0;  // source array address
2733     const Register to          = c_rarg1;  // destination array address
2734     const Register key         = c_rarg2;  // key array address
2735     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector (IV)
2736                                            // and left holding the last encrypted block on exit
2737     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2738     const Register keylen      = rscratch1;
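         // CBC encryption, illustratively: for each 16-byte block,
         //   c[i] = AES_encrypt(p[i] ^ c[i-1]),  with c[-1] = rvec (the IV).
         // The loop below keeps the running c[i-1] in v0 and stores the final
         // block back through rvec on exit.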
2739 
2740     address start = __ pc();
2741 
2742       __ enter();
2743 
2744       __ movw(rscratch2, len_reg);
2745 
2746       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2747 
2748       __ ld1(v0, __ T16B, rvec);
2749 
2750       __ cmpw(keylen, 52);
2751       __ br(Assembler::CC, L_loadkeys_44);
2752       __ br(Assembler::EQ, L_loadkeys_52);
2753 
2754       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2755       __ rev32(v17, __ T16B, v17);
2756       __ rev32(v18, __ T16B, v18);
2757     __ BIND(L_loadkeys_52);
2758       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2759       __ rev32(v19, __ T16B, v19);
2760       __ rev32(v20, __ T16B, v20);
2761     __ BIND(L_loadkeys_44);
2762       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2763       __ rev32(v21, __ T16B, v21);
2764       __ rev32(v22, __ T16B, v22);
2765       __ rev32(v23, __ T16B, v23);
2766       __ rev32(v24, __ T16B, v24);
2767       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2768       __ rev32(v25, __ T16B, v25);
2769       __ rev32(v26, __ T16B, v26);
2770       __ rev32(v27, __ T16B, v27);
2771       __ rev32(v28, __ T16B, v28);
2772       __ ld1(v29, v30, v31, __ T16B, key);
2773       __ rev32(v29, __ T16B, v29);
2774       __ rev32(v30, __ T16B, v30);
2775       __ rev32(v31, __ T16B, v31);
2776 
2777     __ BIND(L_aes_loop);
2778       __ ld1(v1, __ T16B, __ post(from, 16));
2779       __ eor(v0, __ T16B, v0, v1);
2780 
2781       __ br(Assembler::CC, L_rounds_44);
2782       __ br(Assembler::EQ, L_rounds_52);
2783 
2784       __ aese(v0, v17); __ aesmc(v0, v0);
2785       __ aese(v0, v18); __ aesmc(v0, v0);
2786     __ BIND(L_rounds_52);
2787       __ aese(v0, v19); __ aesmc(v0, v0);
2788       __ aese(v0, v20); __ aesmc(v0, v0);
2789     __ BIND(L_rounds_44);
2790       __ aese(v0, v21); __ aesmc(v0, v0);
2791       __ aese(v0, v22); __ aesmc(v0, v0);
2792       __ aese(v0, v23); __ aesmc(v0, v0);
2793       __ aese(v0, v24); __ aesmc(v0, v0);
2794       __ aese(v0, v25); __ aesmc(v0, v0);
2795       __ aese(v0, v26); __ aesmc(v0, v0);
2796       __ aese(v0, v27); __ aesmc(v0, v0);
2797       __ aese(v0, v28); __ aesmc(v0, v0);
2798       __ aese(v0, v29); __ aesmc(v0, v0);
2799       __ aese(v0, v30);
2800       __ eor(v0, __ T16B, v0, v31);
2801 
2802       __ st1(v0, __ T16B, __ post(to, 16));
2803 
2804       __ subw(len_reg, len_reg, 16);
2805       __ cbnzw(len_reg, L_aes_loop);
2806 
2807       __ st1(v0, __ T16B, rvec);
2808 
2809       __ mov(r0, rscratch2);
2810 
2811       __ leave();
2812       __ ret(lr);
2813 
2814       return start;
2815   }
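  // For reference, a minimal C-level sketch of what the stub above does per
  // 16-byte block (illustrative only, not generated code; xor_block and
  // aes_encrypt are hypothetical helpers standing in for the EOR and the
  // AESE/AESMC round sequence driven by the expanded key):
  //
  //   for (size_t i = 0; i < len; i += 16) {
  //     xor_block(block, from + i, cv);     // block = plaintext ^ chaining value
  //     aes_encrypt(block, expanded_key);   // rounds end with a final AESE + EOR
  //     memcpy(to + i, block, 16);          // write ciphertext
  //     memcpy(cv, block, 16);              // ciphertext is the next chaining value
  //   }
  //   // The stub keeps cv in v0 and stores it back to rvec once, at the end.
  //   return len;                           // r0 = original input length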
2816 
2817   // Arguments:
2818   //
2819   // Inputs:
2820   //   c_rarg0   - source byte array address
2821   //   c_rarg1   - destination byte array address
2822   //   c_rarg2   - K (key) in little endian int array
2823   //   c_rarg3   - r vector byte array address
2824   //   c_rarg4   - input length
2825   //
2826   // Output:
2827   //   r0        - input length
2828   //
2829   address generate_cipherBlockChaining_decryptAESCrypt() {
2830     assert(UseAES, "need AES cryptographic extension support");
2831     __ align(CodeEntryAlignment);
2832     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2833 
2834     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2835 
2836     const Register from        = c_rarg0;  // source array address
2837     const Register to          = c_rarg1;  // destination array address
2838     const Register key         = c_rarg2;  // key array address
2839     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2840                                            // and left with the results of the last encryption block
2841     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2842     const Register keylen      = rscratch1;
2843 
2844     address start = __ pc();
2845 
2846       __ enter();
2847 
2848       __ movw(rscratch2, len_reg);
2849 
2850       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2851 
2852       __ ld1(v2, __ T16B, rvec);
2853 
2854       __ ld1(v31, __ T16B, __ post(key, 16));
2855       __ rev32(v31, __ T16B, v31);
2856 
2857       __ cmpw(keylen, 52);
2858       __ br(Assembler::CC, L_loadkeys_44);
2859       __ br(Assembler::EQ, L_loadkeys_52);
2860 
2861       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2862       __ rev32(v17, __ T16B, v17);
2863       __ rev32(v18, __ T16B, v18);
2864     __ BIND(L_loadkeys_52);
2865       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2866       __ rev32(v19, __ T16B, v19);
2867       __ rev32(v20, __ T16B, v20);
2868     __ BIND(L_loadkeys_44);
2869       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2870       __ rev32(v21, __ T16B, v21);
2871       __ rev32(v22, __ T16B, v22);
2872       __ rev32(v23, __ T16B, v23);
2873       __ rev32(v24, __ T16B, v24);
2874       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2875       __ rev32(v25, __ T16B, v25);
2876       __ rev32(v26, __ T16B, v26);
2877       __ rev32(v27, __ T16B, v27);
2878       __ rev32(v28, __ T16B, v28);
2879       __ ld1(v29, v30, __ T16B, key);
2880       __ rev32(v29, __ T16B, v29);
2881       __ rev32(v30, __ T16B, v30);
2882 
2883     __ BIND(L_aes_loop);
2884       __ ld1(v0, __ T16B, __ post(from, 16));
2885       __ orr(v1, __ T16B, v0, v0);
2886 
2887       __ br(Assembler::CC, L_rounds_44);
2888       __ br(Assembler::EQ, L_rounds_52);
2889 
2890       __ aesd(v0, v17); __ aesimc(v0, v0);
2891       __ aesd(v0, v18); __ aesimc(v0, v0);
2892     __ BIND(L_rounds_52);
2893       __ aesd(v0, v19); __ aesimc(v0, v0);
2894       __ aesd(v0, v20); __ aesimc(v0, v0);
2895     __ BIND(L_rounds_44);
2896       __ aesd(v0, v21); __ aesimc(v0, v0);
2897       __ aesd(v0, v22); __ aesimc(v0, v0);
2898       __ aesd(v0, v23); __ aesimc(v0, v0);
2899       __ aesd(v0, v24); __ aesimc(v0, v0);
2900       __ aesd(v0, v25); __ aesimc(v0, v0);
2901       __ aesd(v0, v26); __ aesimc(v0, v0);
2902       __ aesd(v0, v27); __ aesimc(v0, v0);
2903       __ aesd(v0, v28); __ aesimc(v0, v0);
2904       __ aesd(v0, v29); __ aesimc(v0, v0);
2905       __ aesd(v0, v30);
2906       __ eor(v0, __ T16B, v0, v31);
2907       __ eor(v0, __ T16B, v0, v2);
2908 
2909       __ st1(v0, __ T16B, __ post(to, 16));
2910       __ orr(v2, __ T16B, v1, v1);
2911 
2912       __ subw(len_reg, len_reg, 16);
2913       __ cbnzw(len_reg, L_aes_loop);
2914 
2915       __ st1(v2, __ T16B, rvec);
2916 
2917       __ mov(r0, rscratch2);
2918 
2919       __ leave();
2920       __ ret(lr);
2921 
2922     return start;
2923   }
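  // For reference, a minimal C-level sketch of the CBC decryption above
  // (illustrative only; aes_decrypt and xor_block are hypothetical helpers
  // for the AESD/AESIMC round sequence and the EOR). Note the copy of the
  // ciphertext block (v1/v2 in the stub): decryption destroys the working
  // register, so the original ciphertext must be kept as the next chaining
  // value.
  //
  //   for (size_t i = 0; i < len; i += 16) {
  //     memcpy(saved, from + i, 16);        // keep ciphertext for chaining
  //     memcpy(block, from + i, 16);
  //     aes_decrypt(block, expanded_key);
  //     xor_block(block, block, cv);        // plaintext = D(ciphertext) ^ chaining value
  //     memcpy(to + i, block, 16);
  //     memcpy(cv, saved, 16);              // ciphertext is the next chaining value
  //   }
  //   return len;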
2924 
2925   // Arguments:
2926   //
2927   // Inputs:
2928   //   c_rarg0   - byte[]  source+offset
2929   //   c_rarg1   - int[]   SHA.state
2930   //   c_rarg2   - int     offset
2931   //   c_rarg3   - int     limit
2932   //
2933   address generate_sha1_implCompress(bool multi_block, const char *name) {
2934     __ align(CodeEntryAlignment);
2935     StubCodeMark mark(this, "StubRoutines", name);
2936     address start = __ pc();
2937 
2938     Register buf   = c_rarg0;
2939     Register state = c_rarg1;
2940     Register ofs   = c_rarg2;
2941     Register limit = c_rarg3;
2942 
2943     Label keys;
2944     Label sha1_loop;
2945 
2946     // load the keys into v0..v3
2947     __ adr(rscratch1, keys);
2948     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2949     // load the 5-word state into v6, v7
2950     __ ldrq(v6, Address(state, 0));
2951     __ ldrs(v7, Address(state, 16));
2952 
2953 
2954     __ BIND(sha1_loop);
2955     // load 64 bytes of data into v16..v19
2956     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2957     __ rev32(v16, __ T16B, v16);
2958     __ rev32(v17, __ T16B, v17);
2959     __ rev32(v18, __ T16B, v18);
2960     __ rev32(v19, __ T16B, v19);
2961 
2962     // do the sha1
2963     __ addv(v4, __ T4S, v16, v0);
2964     __ orr(v20, __ T16B, v6, v6);
2965 
2966     FloatRegister d0 = v16;
2967     FloatRegister d1 = v17;
2968     FloatRegister d2 = v18;
2969     FloatRegister d3 = v19;
2970 
2971     for (int round = 0; round < 20; round++) {
2972       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2973       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2974       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2975       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2976       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2977 
2978       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2979       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2980       __ sha1h(tmp2, __ T4S, v20);
2981       if (round < 5)
2982         __ sha1c(v20, __ T4S, tmp3, tmp4);
2983       else if (round < 10 || round >= 15)
2984         __ sha1p(v20, __ T4S, tmp3, tmp4);
2985       else
2986         __ sha1m(v20, __ T4S, tmp3, tmp4);
2987       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2988 
2989       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2990     }
2991 
2992     __ addv(v7, __ T2S, v7, v21);
2993     __ addv(v6, __ T4S, v6, v20);
2994 
2995     if (multi_block) {
2996       __ add(ofs, ofs, 64);
2997       __ cmp(ofs, limit);
2998       __ br(Assembler::LE, sha1_loop);
2999       __ mov(c_rarg0, ofs); // return ofs
3000     }
3001 
3002     __ strq(v6, Address(state, 0));
3003     __ strs(v7, Address(state, 16));
3004 
3005     __ ret(lr);
3006 
3007     __ bind(keys);
3008     __ emit_int32(0x5a827999);
3009     __ emit_int32(0x6ed9eba1);
3010     __ emit_int32(0x8f1bbcdc);
3011     __ emit_int32(0xca62c1d6);
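    // These four words are the SHA-1 round constants from FIPS 180-4, used
    // for rounds 0-19, 20-39, 40-59 and 60-79 respectively; the ld4r at the
    // top of the stub broadcasts each one across all four lanes of v0..v3.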
3012 
3013     return start;
3014   }
3015 
3016 
3017   // Arguments:
3018   //
3019   // Inputs:
3020   //   c_rarg0   - byte[]  source+offset
3021   //   c_rarg1   - int[]   SHA.state
3022   //   c_rarg2   - int     offset
3023   //   c_rarg3   - int     limit
3024   //
3025   address generate_sha256_implCompress(bool multi_block, const char *name) {
3026     static const uint32_t round_consts[64] = {
3027       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3028       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3029       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3030       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3031       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3032       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3033       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3034       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3035       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3036       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3037       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3038       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3039       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3040       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3041       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3042       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3043     };
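    // The table above holds the 64 SHA-256 round constants K0..K63 from
    // FIPS 180-4; the stub loads them into v16..v31, four constants per
    // vector register.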
3044     __ align(CodeEntryAlignment);
3045     StubCodeMark mark(this, "StubRoutines", name);
3046     address start = __ pc();
3047 
3048     Register buf   = c_rarg0;
3049     Register state = c_rarg1;
3050     Register ofs   = c_rarg2;
3051     Register limit = c_rarg3;
3052 
3053     Label sha1_loop;
3054 
3055     __ stpd(v8, v9, __ pre(sp, -32));
3056     __ stpd(v10, v11, Address(sp, 16));
3057 
3058 // dga == v0
3059 // dgb == v1
3060 // dg0 == v2
3061 // dg1 == v3
3062 // dg2 == v4
3063 // t0 == v6
3064 // t1 == v7
3065 
3066     // load 16 keys to v16..v31
3067     __ lea(rscratch1, ExternalAddress((address)round_consts));
3068     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3069     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3070     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3071     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3072 
3073     // load the 8-word (256-bit) state
3074     __ ldpq(v0, v1, state);
3075 
3076     __ BIND(sha1_loop);
3077     // load 64 bytes of data into v8..v11
3078     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3079     __ rev32(v8, __ T16B, v8);
3080     __ rev32(v9, __ T16B, v9);
3081     __ rev32(v10, __ T16B, v10);
3082     __ rev32(v11, __ T16B, v11);
3083 
3084     __ addv(v6, __ T4S, v8, v16);
3085     __ orr(v2, __ T16B, v0, v0);
3086     __ orr(v3, __ T16B, v1, v1);
3087 
3088     FloatRegister d0 = v8;
3089     FloatRegister d1 = v9;
3090     FloatRegister d2 = v10;
3091     FloatRegister d3 = v11;
3092 
3093 
3094     for (int round = 0; round < 16; round++) {
3095       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3096       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3097       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3098       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3099 
3100       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3101        __ orr(v4, __ T16B, v2, v2);
3102       if (round < 15)
3103         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3104       __ sha256h(v2, __ T4S, v3, tmp2);
3105       __ sha256h2(v3, __ T4S, v4, tmp2);
3106       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3107 
3108       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3109     }
3110 
3111     __ addv(v0, __ T4S, v0, v2);
3112     __ addv(v1, __ T4S, v1, v3);
3113 
3114     if (multi_block) {
3115       __ add(ofs, ofs, 64);
3116       __ cmp(ofs, limit);
3117       __ br(Assembler::LE, sha1_loop);
3118       __ mov(c_rarg0, ofs); // return ofs
3119     }
3120 
3121     __ ldpd(v10, v11, Address(sp, 16));
3122     __ ldpd(v8, v9, __ post(sp, 32));
3123 
3124     __ stpq(v0, v1, state);
3125 
3126     __ ret(lr);
3127 
3128     return start;
3129   }
3130 
3131   // Arguments:
3132   //
3133   // Inputs:
3134   //   c_rarg0   - byte[]  source+offset
3135   //   c_rarg1   - int[]   SHA.state
3136   //   c_rarg2   - int     offset
3137   //   c_rarg3   - int     limit
3138   //
3139   address generate_sha512_implCompress(bool multi_block, const char *name) {
3140     static const uint64_t round_consts[80] = {
3141       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3142       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3143       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3144       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3145       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3146       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3147       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3148       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3149       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3150       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3151       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3152       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3153       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3154       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3155       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3156       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3157       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3158       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3159       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3160       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3161       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3162       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3163       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3164       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3165       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3166       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3167       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3168     };
3169 
3170     // Double rounds for sha512.
3171     #define sha512_dround(dr, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \
3172       if (dr < 36)                                                                   \
3173         __ ld1(v##rc1, __ T2D, __ post(rscratch2, 16));                              \
3174       __ addv(v5, __ T2D, v##rc0, v##in0);                                           \
3175       __ ext(v6, __ T16B, v##i2, v##i3, 8);                                          \
3176       __ ext(v5, __ T16B, v5, v5, 8);                                                \
3177       __ ext(v7, __ T16B, v##i1, v##i2, 8);                                          \
3178       __ addv(v##i3, __ T2D, v##i3, v5);                                             \
3179       if (dr < 32) {                                                                 \
3180         __ ext(v5, __ T16B, v##in3, v##in4, 8);                                      \
3181         __ sha512su0(v##in0, __ T2D, v##in1);                                        \
3182       }                                                                              \
3183       __ sha512h(v##i3, __ T2D, v6, v7);                                             \
3184       if (dr < 32)                                                                   \
3185         __ sha512su1(v##in0, __ T2D, v##in2, v5);                                    \
3186       __ addv(v##i4, __ T2D, v##i1, v##i3);                                          \
3187       __ sha512h2(v##i3, __ T2D, v##i1, v##i0);                                      \
3188 
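    // Each sha512_dround invocation below performs two of the 80 SHA-512
    // rounds (40 double-rounds in total). rc0 is the register holding the
    // current round-constant pair and, while dr < 36, the next pair is
    // preloaded into rc1; i0..i4 name the rotating working-state vectors
    // v0..v4 and in0..in4 the message-schedule vectors (v12..v19, with dummy
    // 0 arguments once expansion has stopped). The sha512su0/sha512su1 pair
    // extends the message schedule and is skipped for dr >= 32, where the
    // remaining rounds only consume schedule words.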
3189     __ align(CodeEntryAlignment);
3190     StubCodeMark mark(this, "StubRoutines", name);
3191     address start = __ pc();
3192 
3193     Register buf   = c_rarg0;
3194     Register state = c_rarg1;
3195     Register ofs   = c_rarg2;
3196     Register limit = c_rarg3;
3197 
3198     __ stpd(v8, v9, __ pre(sp, -64));
3199     __ stpd(v10, v11, Address(sp, 16));
3200     __ stpd(v12, v13, Address(sp, 32));
3201     __ stpd(v14, v15, Address(sp, 48));
3202 
3203     Label sha512_loop;
3204 
3205     // load state
3206     __ ld1(v8, v9, v10, v11, __ T2D, state);
3207 
3208     // load first 4 round constants
3209     __ lea(rscratch1, ExternalAddress((address)round_consts));
3210     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3211 
3212     __ BIND(sha512_loop);
3213     // load 128B of data into v12..v19
3214     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3215     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3216     __ rev64(v12, __ T16B, v12);
3217     __ rev64(v13, __ T16B, v13);
3218     __ rev64(v14, __ T16B, v14);
3219     __ rev64(v15, __ T16B, v15);
3220     __ rev64(v16, __ T16B, v16);
3221     __ rev64(v17, __ T16B, v17);
3222     __ rev64(v18, __ T16B, v18);
3223     __ rev64(v19, __ T16B, v19);
3224 
3225     __ mov(rscratch2, rscratch1);
3226 
3227     __ mov(v0, __ T16B, v8);
3228     __ mov(v1, __ T16B, v9);
3229     __ mov(v2, __ T16B, v10);
3230     __ mov(v3, __ T16B, v11);
3231 
3232     sha512_dround( 0, 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17);
3233     sha512_dround( 1, 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18);
3234     sha512_dround( 2, 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19);
3235     sha512_dround( 3, 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12);
3236     sha512_dround( 4, 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13);
3237     sha512_dround( 5, 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14);
3238     sha512_dround( 6, 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15);
3239     sha512_dround( 7, 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16);
3240     sha512_dround( 8, 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17);
3241     sha512_dround( 9, 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18);
3242     sha512_dround(10, 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19);
3243     sha512_dround(11, 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12);
3244     sha512_dround(12, 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13);
3245     sha512_dround(13, 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14);
3246     sha512_dround(14, 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15);
3247     sha512_dround(15, 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16);
3248     sha512_dround(16, 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17);
3249     sha512_dround(17, 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18);
3250     sha512_dround(18, 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19);
3251     sha512_dround(19, 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12);
3252     sha512_dround(20, 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13);
3253     sha512_dround(21, 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14);
3254     sha512_dround(22, 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15);
3255     sha512_dround(23, 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16);
3256     sha512_dround(24, 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17);
3257     sha512_dround(25, 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18);
3258     sha512_dround(26, 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19);
3259     sha512_dround(27, 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12);
3260     sha512_dround(28, 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13);
3261     sha512_dround(29, 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14);
3262     sha512_dround(30, 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15);
3263     sha512_dround(31, 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16);
3264     sha512_dround(32, 2, 3, 1, 4, 0, 28, 24, 12,  0,  0,  0,  0);
3265     sha512_dround(33, 4, 2, 0, 1, 3, 29, 25, 13,  0,  0,  0,  0);
3266     sha512_dround(34, 1, 4, 3, 0, 2, 30, 26, 14,  0,  0,  0,  0);
3267     sha512_dround(35, 0, 1, 2, 3, 4, 31, 27, 15,  0,  0,  0,  0);
3268     sha512_dround(36, 3, 0, 4, 2, 1, 24,  0, 16,  0,  0,  0,  0);
3269     sha512_dround(37, 2, 3, 1, 4, 0, 25,  0, 17,  0,  0,  0,  0);
3270     sha512_dround(38, 4, 2, 0, 1, 3, 26,  0, 18,  0,  0,  0,  0);
3271     sha512_dround(39, 1, 4, 3, 0, 2, 27,  0, 19,  0,  0,  0,  0);
3272 
3273     __ addv(v8, __ T2D, v8, v0);
3274     __ addv(v9, __ T2D, v9, v1);
3275     __ addv(v10, __ T2D, v10, v2);
3276     __ addv(v11, __ T2D, v11, v3);
3277 
3278     if (multi_block) {
3279       __ add(ofs, ofs, 128);
3280       __ cmp(ofs, limit);
3281       __ br(Assembler::LE, sha512_loop);
3282       __ mov(c_rarg0, ofs); // return ofs
3283     }
3284 
3285     __ st1(v8, v9, v10, v11, __ T2D, state);
3286 
3287     __ ldpd(v14, v15, Address(sp, 48));
3288     __ ldpd(v12, v13, Address(sp, 32));
3289     __ ldpd(v10, v11, Address(sp, 16));
3290     __ ldpd(v8, v9, __ post(sp, 64));
3291 
3292     __ ret(lr);
3293 
3294     return start;
3295   }
3296 
3297   // Safefetch stubs.
3298   void generate_safefetch(const char* name, int size, address* entry,
3299                           address* fault_pc, address* continuation_pc) {
3300     // safefetch signatures:
3301     //   int      SafeFetch32(int*      adr, int      errValue);
3302     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3303     //
3304     // arguments:
3305     //   c_rarg0 = adr
3306     //   c_rarg1 = errValue
3307     //
3308     // result:
3309     //   r0 = *adr or errValue
3310 
3311     StubCodeMark mark(this, "StubRoutines", name);
3312 
3313     // Entry point, pc or function descriptor.
3314     *entry = __ pc();
3315 
3316     // Load *adr into c_rarg1, may fault.
3317     *fault_pc = __ pc();
3318     switch (size) {
3319       case 4:
3320         // int32_t
3321         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3322         break;
3323       case 8:
3324         // int64_t
3325         __ ldr(c_rarg1, Address(c_rarg0, 0));
3326         break;
3327       default:
3328         ShouldNotReachHere();
3329     }
3330 
3331     // return errValue or *adr
3332     *continuation_pc = __ pc();
3333     __ mov(r0, c_rarg1);
3334     __ ret(lr);
3335   }
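  // The fault_pc/continuation_pc pair published above is what makes the load
  // "safe": if the ldr faults, the signal handler recognizes the faulting pc
  // as a SafeFetch fault and resumes execution at continuation_pc, where
  // c_rarg1 still holds errValue, so the caller simply receives errValue
  // instead of the VM crashing.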
3336 
3337   /**
3338    *  Arguments:
3339    *
3340    * Inputs:
3341    *   c_rarg0   - int crc
3342    *   c_rarg1   - byte* buf
3343    *   c_rarg2   - int length
3344    *
3345    * Output:
3346    *       r0   - int crc result
3347    */
3348   address generate_updateBytesCRC32() {
3349     assert(UseCRC32Intrinsics, "what are we doing here?");
3350 
3351     __ align(CodeEntryAlignment);
3352     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3353 
3354     address start = __ pc();
3355 
3356     const Register crc   = c_rarg0;  // crc
3357     const Register buf   = c_rarg1;  // source java byte array address
3358     const Register len   = c_rarg2;  // length
3359     const Register table0 = c_rarg3; // crc_table address
3360     const Register table1 = c_rarg4;
3361     const Register table2 = c_rarg5;
3362     const Register table3 = c_rarg6;
3363     const Register tmp3 = c_rarg7;
3364 
3365     BLOCK_COMMENT("Entry:");
3366     __ enter(); // required for proper stackwalking of RuntimeStub frame
3367 
3368     __ kernel_crc32(crc, buf, len,
3369               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3370 
3371     __ leave(); // required for proper stackwalking of RuntimeStub frame
3372     __ ret(lr);
3373 
3374     return start;
3375   }
3376 
3377   /**
3378    *  Arguments:
3379    *
3380    * Inputs:
3381    *   c_rarg0   - int crc
3382    *   c_rarg1   - byte* buf
3383    *   c_rarg2   - int length
3384    *   c_rarg3   - int* table
3385    *
3386    * Output:
3387    *       r0   - int crc result
3388    */
3389   address generate_updateBytesCRC32C() {
3390     assert(UseCRC32CIntrinsics, "what are we doing here?");
3391 
3392     __ align(CodeEntryAlignment);
3393     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3394 
3395     address start = __ pc();
3396 
3397     const Register crc   = c_rarg0;  // crc
3398     const Register buf   = c_rarg1;  // source java byte array address
3399     const Register len   = c_rarg2;  // length
3400     const Register table0 = c_rarg3; // crc_table address
3401     const Register table1 = c_rarg4;
3402     const Register table2 = c_rarg5;
3403     const Register table3 = c_rarg6;
3404     const Register tmp3 = c_rarg7;
3405 
3406     BLOCK_COMMENT("Entry:");
3407     __ enter(); // required for proper stackwalking of RuntimeStub frame
3408 
3409     __ kernel_crc32c(crc, buf, len,
3410               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3411 
3412     __ leave(); // required for proper stackwalking of RuntimeStub frame
3413     __ ret(lr);
3414 
3415     return start;
3416   }
3417 
3418   /***
3419    *  Arguments:
3420    *
3421    *  Inputs:
3422    *   c_rarg0   - int   adler
3423    *   c_rarg1   - byte* buff
3424    *   c_rarg2   - int   len
3425    *
3426    * Output:
3427    *   c_rarg0   - int adler result
3428    */
3429   address generate_updateBytesAdler32() {
3430     __ align(CodeEntryAlignment);
3431     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3432     address start = __ pc();
3433 
3434     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3435 
3436     // Aliases
3437     Register adler  = c_rarg0;
3438     Register s1     = c_rarg0;
3439     Register s2     = c_rarg3;
3440     Register buff   = c_rarg1;
3441     Register len    = c_rarg2;
3442     Register nmax  = r4;
3443     Register base  = r5;
3444     Register count = r6;
3445     Register temp0 = rscratch1;
3446     Register temp1 = rscratch2;
3447     FloatRegister vbytes = v0;
3448     FloatRegister vs1acc = v1;
3449     FloatRegister vs2acc = v2;
3450     FloatRegister vtable = v3;
3451 
3452     // Max number of bytes we can process before having to take the mod
3453     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3454     uint64_t BASE = 0xfff1;
3455     uint64_t NMAX = 0x15B0;
3456 
3457     __ mov(base, BASE);
3458     __ mov(nmax, NMAX);
3459 
3460     // Load accumulation coefficients for the upper 16 bits
3461     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3462     __ ld1(vtable, __ T16B, Address(temp0));
3463 
3464     // s1 is initialized to the lower 16 bits of adler
3465     // s2 is initialized to the upper 16 bits of adler
3466     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3467     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3468 
3469     // The pipelined loop needs at least 16 elements for one iteration.
3470     // It does check this, but it is more effective to skip straight to the cleanup loop.
3471     __ cmp(len, (u1)16);
3472     __ br(Assembler::HS, L_nmax);
3473     __ cbz(len, L_combine);
3474 
3475     __ bind(L_simple_by1_loop);
3476     __ ldrb(temp0, Address(__ post(buff, 1)));
3477     __ add(s1, s1, temp0);
3478     __ add(s2, s2, s1);
3479     __ subs(len, len, 1);
3480     __ br(Assembler::HI, L_simple_by1_loop);
3481 
3482     // s1 = s1 % BASE
3483     __ subs(temp0, s1, base);
3484     __ csel(s1, temp0, s1, Assembler::HS);
3485 
3486     // s2 = s2 % BASE
3487     __ lsr(temp0, s2, 16);
3488     __ lsl(temp1, temp0, 4);
3489     __ sub(temp1, temp1, temp0);
3490     __ add(s2, temp1, s2, ext::uxth);
3491 
3492     __ subs(temp0, s2, base);
3493     __ csel(s2, temp0, s2, Assembler::HS);
3494 
3495     __ b(L_combine);
3496 
3497     __ bind(L_nmax);
3498     __ subs(len, len, nmax);
3499     __ sub(count, nmax, 16);
3500     __ br(Assembler::LO, L_by16);
3501 
3502     __ bind(L_nmax_loop);
3503 
3504     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3505                                       vbytes, vs1acc, vs2acc, vtable);
3506 
3507     __ subs(count, count, 16);
3508     __ br(Assembler::HS, L_nmax_loop);
3509 
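    // The reductions below compute s1 % BASE and s2 % BASE without a divide:
    // since 2^16 mod 65521 == 15, x mod BASE folds as
    //   x = (x >> 16) * 15 + (x & 0xffff)
    // (the lsr/lsl/sub/add group below; the *15 is done as a shift by 4 and
    // a subtract). Folding twice brings any 32-bit value below 2 * BASE, so
    // a final conditional subtract (subs/csel) completes the reduction.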
3510     // s1 = s1 % BASE
3511     __ lsr(temp0, s1, 16);
3512     __ lsl(temp1, temp0, 4);
3513     __ sub(temp1, temp1, temp0);
3514     __ add(temp1, temp1, s1, ext::uxth);
3515 
3516     __ lsr(temp0, temp1, 16);
3517     __ lsl(s1, temp0, 4);
3518     __ sub(s1, s1, temp0);
3519     __ add(s1, s1, temp1, ext::uxth);
3520 
3521     __ subs(temp0, s1, base);
3522     __ csel(s1, temp0, s1, Assembler::HS);
3523 
3524     // s2 = s2 % BASE
3525     __ lsr(temp0, s2, 16);
3526     __ lsl(temp1, temp0, 4);
3527     __ sub(temp1, temp1, temp0);
3528     __ add(temp1, temp1, s2, ext::uxth);
3529 
3530     __ lsr(temp0, temp1, 16);
3531     __ lsl(s2, temp0, 4);
3532     __ sub(s2, s2, temp0);
3533     __ add(s2, s2, temp1, ext::uxth);
3534 
3535     __ subs(temp0, s2, base);
3536     __ csel(s2, temp0, s2, Assembler::HS);
3537 
3538     __ subs(len, len, nmax);
3539     __ sub(count, nmax, 16);
3540     __ br(Assembler::HS, L_nmax_loop);
3541 
3542     __ bind(L_by16);
3543     __ adds(len, len, count);
3544     __ br(Assembler::LO, L_by1);
3545 
3546     __ bind(L_by16_loop);
3547 
3548     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3549                                       vbytes, vs1acc, vs2acc, vtable);
3550 
3551     __ subs(len, len, 16);
3552     __ br(Assembler::HS, L_by16_loop);
3553 
3554     __ bind(L_by1);
3555     __ adds(len, len, 15);
3556     __ br(Assembler::LO, L_do_mod);
3557 
3558     __ bind(L_by1_loop);
3559     __ ldrb(temp0, Address(__ post(buff, 1)));
3560     __ add(s1, temp0, s1);
3561     __ add(s2, s2, s1);
3562     __ subs(len, len, 1);
3563     __ br(Assembler::HS, L_by1_loop);
3564 
3565     __ bind(L_do_mod);
3566     // s1 = s1 % BASE
3567     __ lsr(temp0, s1, 16);
3568     __ lsl(temp1, temp0, 4);
3569     __ sub(temp1, temp1, temp0);
3570     __ add(temp1, temp1, s1, ext::uxth);
3571 
3572     __ lsr(temp0, temp1, 16);
3573     __ lsl(s1, temp0, 4);
3574     __ sub(s1, s1, temp0);
3575     __ add(s1, s1, temp1, ext::uxth);
3576 
3577     __ subs(temp0, s1, base);
3578     __ csel(s1, temp0, s1, Assembler::HS);
3579 
3580     // s2 = s2 % BASE
3581     __ lsr(temp0, s2, 16);
3582     __ lsl(temp1, temp0, 4);
3583     __ sub(temp1, temp1, temp0);
3584     __ add(temp1, temp1, s2, ext::uxth);
3585 
3586     __ lsr(temp0, temp1, 16);
3587     __ lsl(s2, temp0, 4);
3588     __ sub(s2, s2, temp0);
3589     __ add(s2, s2, temp1, ext::uxth);
3590 
3591     __ subs(temp0, s2, base);
3592     __ csel(s2, temp0, s2, Assembler::HS);
3593 
3594     // Combine lower bits and higher bits
3595     __ bind(L_combine);
3596     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3597 
3598     __ ret(lr);
3599 
3600     return start;
3601   }
3602 
3603   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
3604           Register temp0, Register temp1, FloatRegister vbytes,
3605           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
3606     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
3607     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
3608     // In non-vectorized code, we update s1 and s2 as:
3609     //   s1 <- s1 + b1
3610     //   s2 <- s2 + s1
3611     //   s1 <- s1 + b2
3612     //   s2 <- s2 + s1
3613     //   ...
3614     //   s1 <- s1 + b16
3615     //   s2 <- s2 + s1
3616     // Putting above assignments together, we have:
3617     //   s1_new = s1 + b1 + b2 + ... + b16
3618     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
3619     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
3620     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
3621     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
3622 
3623     // s2 = s2 + s1 * 16
3624     __ add(s2, s2, s1, Assembler::LSL, 4);
3625 
3626     // vs1acc = b1 + b2 + b3 + ... + b16
3627     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
3628     __ umullv(vs2acc, __ T8B, vtable, vbytes);
3629     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
3630     __ uaddlv(vs1acc, __ T16B, vbytes);
3631     __ uaddlv(vs2acc, __ T8H, vs2acc);
3632 
3633     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
3634     __ fmovd(temp0, vs1acc);
3635     __ fmovd(temp1, vs2acc);
3636     __ add(s1, s1, temp0);
3637     __ add(s2, s2, temp1);
3638   }
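  // A scalar reference for the vectorized step above (a sketch only; the stub
  // keeps everything in NEON registers and never materializes these arrays).
  // vtable holds the weights 16, 15, ..., 1:
  //
  //   uint8_t  b[16];            // the 16 bytes just loaded
  //   uint32_t sum = 0, dot = 0;
  //   for (int i = 0; i < 16; i++) {
  //     sum += b[i];             // vs1acc: plain byte sum
  //     dot += b[i] * (16 - i);  // vs2acc: dot product with (16, 15, ..., 1)
  //   }
  //   s2 += s1 * 16;             // done first, via add(s2, s2, s1, LSL, 4)
  //   s1 += sum;
  //   s2 += dot;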
3639 
3640   /**
3641    *  Arguments:
3642    *
3643    *  Input:
3644    *    c_rarg0   - x address
3645    *    c_rarg1   - x length
3646    *    c_rarg2   - y address
3647    *    c_rarg3   - y length
3648    *    c_rarg4   - z address
3649    *    c_rarg5   - z length
3650    */
3651   address generate_multiplyToLen() {
3652     __ align(CodeEntryAlignment);
3653     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3654 
3655     address start = __ pc();
3656     const Register x     = r0;
3657     const Register xlen  = r1;
3658     const Register y     = r2;
3659     const Register ylen  = r3;
3660     const Register z     = r4;
3661     const Register zlen  = r5;
3662 
3663     const Register tmp1  = r10;
3664     const Register tmp2  = r11;
3665     const Register tmp3  = r12;
3666     const Register tmp4  = r13;
3667     const Register tmp5  = r14;
3668     const Register tmp6  = r15;
3669     const Register tmp7  = r16;
3670 
3671     BLOCK_COMMENT("Entry:");
3672     __ enter(); // required for proper stackwalking of RuntimeStub frame
3673     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3674     __ leave(); // required for proper stackwalking of RuntimeStub frame
3675     __ ret(lr);
3676 
3677     return start;
3678   }
3679 
3680   address generate_squareToLen() {
3681     // The squareToLen algorithm for sizes 1..127 described in the Java code
3682     // works faster than multiply_to_len on some CPUs and slower on others,
3683     // but multiply_to_len shows slightly better overall results.
3684     __ align(CodeEntryAlignment);
3685     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3686     address start = __ pc();
3687 
3688     const Register x     = r0;
3689     const Register xlen  = r1;
3690     const Register z     = r2;
3691     const Register zlen  = r3;
3692     const Register y     = r4; // == x
3693     const Register ylen  = r5; // == xlen
3694 
3695     const Register tmp1  = r10;
3696     const Register tmp2  = r11;
3697     const Register tmp3  = r12;
3698     const Register tmp4  = r13;
3699     const Register tmp5  = r14;
3700     const Register tmp6  = r15;
3701     const Register tmp7  = r16;
3702 
3703     RegSet spilled_regs = RegSet::of(y, ylen);
3704     BLOCK_COMMENT("Entry:");
3705     __ enter();
3706     __ push(spilled_regs, sp);
3707     __ mov(y, x);
3708     __ mov(ylen, xlen);
3709     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3710     __ pop(spilled_regs, sp);
3711     __ leave();
3712     __ ret(lr);
3713     return start;
3714   }
3715 
3716   address generate_mulAdd() {
3717     __ align(CodeEntryAlignment);
3718     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3719 
3720     address start = __ pc();
3721 
3722     const Register out     = r0;
3723     const Register in      = r1;
3724     const Register offset  = r2;
3725     const Register len     = r3;
3726     const Register k       = r4;
3727 
3728     BLOCK_COMMENT("Entry:");
3729     __ enter();
3730     __ mul_add(out, in, offset, len, k);
3731     __ leave();
3732     __ ret(lr);
3733 
3734     return start;
3735   }
3736 
3737   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3738                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3739                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3740     // Karatsuba multiplication performs a 128*128 -> 256-bit
3741     // multiplication in three 128-bit multiplications and a few
3742     // additions.
3743     //
3744     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3745     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3746     //
3747     // Inputs:
3748     //
3749     // A0 in a.d[0]     (subkey)
3750     // A1 in a.d[1]
3751     // (A1+A0) in a1_xor_a0.d[0]
3752     //
3753     // B0 in b.d[0]     (state)
3754     // B1 in b.d[1]
3755 
3756     __ ext(tmp1, __ T16B, b, b, 0x08);
3757     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3758     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3759     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3760     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3761 
3762     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3763     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3764     __ eor(tmp2, __ T16B, tmp2, tmp4);
3765     __ eor(tmp2, __ T16B, tmp2, tmp3);
3766 
3767     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3768     __ ins(result_hi, __ D, tmp2, 0, 1);
3769     __ ins(result_lo, __ D, tmp2, 1, 0);
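    // At this point tmp2 held the two middle 64-bit words of the product,
    // (C0+C1+D1+E1) and (D1+C0+D0+E0); the two ins instructions splice them
    // in next to C1 and D0, giving exactly the 256-bit layout
    // C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 from the comment above.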
3770   }
3771 
3772   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3773                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3774     const FloatRegister t0 = result;
3775 
3776     // The GCM field polynomial f is z^128 + p(z), where p =
3777     // z^7+z^2+z+1.
3778     //
3779     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3780     //
3781     // so, given that the product we're reducing is
3782     //    a == lo + hi * z^128
3783     // substituting,
3784     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3785     //
3786     // we reduce by multiplying hi by p(z) and subtracting the result
3787     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3788     // bits we can do this with two 64-bit multiplications, lo*p and
3789     // hi*p.
3790 
3791     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3792     __ ext(t1, __ T16B, t0, z, 8);
3793     __ eor(hi, __ T16B, hi, t1);
3794     __ ext(t1, __ T16B, z, t0, 8);
3795     __ eor(lo, __ T16B, lo, t1);
3796     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3797     __ eor(result, __ T16B, lo, t0);
3798   }
3799 
3800   address generate_has_negatives(address &has_negatives_long) {
3801     const u1 large_loop_size = 64;
3802     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3803     int dcache_line = VM_Version::dcache_line_size();
3804 
3805     Register ary1 = r1, len = r2, result = r0;
3806 
3807     __ align(CodeEntryAlignment);
3808 
3809     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3810 
3811     address entry = __ pc();
3812 
3813     __ enter();
3814 
3815   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3816         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3817 
3818   __ cmp(len, (u1)15);
3819   __ br(Assembler::GT, LEN_OVER_15);
3820   // The only case when execution falls into this code is when the pointer is
3821   // near the end of a memory page and we have to avoid reading the next page.
3822   __ add(ary1, ary1, len);
3823   __ subs(len, len, 8);
3824   __ br(Assembler::GT, LEN_OVER_8);
3825   __ ldr(rscratch2, Address(ary1, -8));
3826   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3827   __ lsrv(rscratch2, rscratch2, rscratch1);
3828   __ tst(rscratch2, UPPER_BIT_MASK);
3829   __ cset(result, Assembler::NE);
3830   __ leave();
3831   __ ret(lr);
3832   __ bind(LEN_OVER_8);
3833   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3834   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3835   __ tst(rscratch2, UPPER_BIT_MASK);
3836   __ br(Assembler::NE, RET_TRUE_NO_POP);
3837   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3838   __ lsrv(rscratch1, rscratch1, rscratch2);
3839   __ tst(rscratch1, UPPER_BIT_MASK);
3840   __ cset(result, Assembler::NE);
3841   __ leave();
3842   __ ret(lr);
3843 
3844   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3845   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3846 
3847   has_negatives_long = __ pc(); // 2nd entry point
3848 
3849   __ enter();
3850 
3851   __ bind(LEN_OVER_15);
3852     __ push(spilled_regs, sp);
3853     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3854     __ cbz(rscratch2, ALIGNED);
3855     __ ldp(tmp6, tmp1, Address(ary1));
3856     __ mov(tmp5, 16);
3857     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3858     __ add(ary1, ary1, rscratch1);
3859     __ sub(len, len, rscratch1);
3860     __ orr(tmp6, tmp6, tmp1);
3861     __ tst(tmp6, UPPER_BIT_MASK);
3862     __ br(Assembler::NE, RET_TRUE);
3863 
3864   __ bind(ALIGNED);
3865     __ cmp(len, large_loop_size);
3866     __ br(Assembler::LT, CHECK_16);
3867     // Perform a 16-byte load as an early return in the pre-loop to handle the
3868     // situation when an initially aligned large array has negative values in
3869     // its starting bytes, where LARGE_LOOP would do 4 reads instead of 1 (in
3870     // the worst case), which is slower. Cases with negative bytes further
3871     // ahead are not affected much; in fact they are faster due to early loads,
3872     // fewer instructions and fewer branches in LARGE_LOOP.
3873     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3874     __ sub(len, len, 16);
3875     __ orr(tmp6, tmp6, tmp1);
3876     __ tst(tmp6, UPPER_BIT_MASK);
3877     __ br(Assembler::NE, RET_TRUE);
3878     __ cmp(len, large_loop_size);
3879     __ br(Assembler::LT, CHECK_16);
3880 
3881     if (SoftwarePrefetchHintDistance >= 0
3882         && SoftwarePrefetchHintDistance >= dcache_line) {
3883       // initial prefetch
3884       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3885     }
3886   __ bind(LARGE_LOOP);
3887     if (SoftwarePrefetchHintDistance >= 0) {
3888       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3889     }
3890     // Issue the load instructions first, since that can save a few CPU/MEM
3891     // cycles. Also, instead of 4 "orr(...); tst(...); br(...)" triples (one
3892     // per ldp), it is better to generate 7 * orr(...) + 1 tst(...) + 1 br(...),
3893     // which saves 3 instructions per iteration and has fewer branches, but
3894     // disables the early return, so all 64 bytes are loaded and checked every time.
3895     __ ldp(tmp2, tmp3, Address(ary1));
3896     __ ldp(tmp4, tmp5, Address(ary1, 16));
3897     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3898     __ ldp(tmp6, tmp1, Address(ary1, 48));
3899     __ add(ary1, ary1, large_loop_size);
3900     __ sub(len, len, large_loop_size);
3901     __ orr(tmp2, tmp2, tmp3);
3902     __ orr(tmp4, tmp4, tmp5);
3903     __ orr(rscratch1, rscratch1, rscratch2);
3904     __ orr(tmp6, tmp6, tmp1);
3905     __ orr(tmp2, tmp2, tmp4);
3906     __ orr(rscratch1, rscratch1, tmp6);
3907     __ orr(tmp2, tmp2, rscratch1);
3908     __ tst(tmp2, UPPER_BIT_MASK);
3909     __ br(Assembler::NE, RET_TRUE);
3910     __ cmp(len, large_loop_size);
3911     __ br(Assembler::GE, LARGE_LOOP);
3912 
3913   __ bind(CHECK_16); // small 16-byte load pre-loop
3914     __ cmp(len, (u1)16);
3915     __ br(Assembler::LT, POST_LOOP16);
3916 
3917   __ bind(LOOP16); // small 16-byte load loop
3918     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3919     __ sub(len, len, 16);
3920     __ orr(tmp2, tmp2, tmp3);
3921     __ tst(tmp2, UPPER_BIT_MASK);
3922     __ br(Assembler::NE, RET_TRUE);
3923     __ cmp(len, (u1)16);
3924     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3925 
3926   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3927     __ cmp(len, (u1)8);
3928     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3929     __ ldr(tmp3, Address(__ post(ary1, 8)));
3930     __ sub(len, len, 8);
3931     __ tst(tmp3, UPPER_BIT_MASK);
3932     __ br(Assembler::NE, RET_TRUE);
3933 
3934   __ bind(POST_LOOP16_LOAD_TAIL);
3935     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3936     __ ldr(tmp1, Address(ary1));
3937     __ mov(tmp2, 64);
3938     __ sub(tmp4, tmp2, len, __ LSL, 3);
3939     __ lslv(tmp1, tmp1, tmp4);
3940     __ tst(tmp1, UPPER_BIT_MASK);
3941     __ br(Assembler::NE, RET_TRUE);
3942     // Fallthrough
3943 
3944   __ bind(RET_FALSE);
3945     __ pop(spilled_regs, sp);
3946     __ leave();
3947     __ mov(result, zr);
3948     __ ret(lr);
3949 
3950   __ bind(RET_TRUE);
3951     __ pop(spilled_regs, sp);
3952   __ bind(RET_TRUE_NO_POP);
3953     __ leave();
3954     __ mov(result, 1);
3955     __ ret(lr);
3956 
3957   __ bind(DONE);
3958     __ pop(spilled_regs, sp);
3959     __ leave();
3960     __ ret(lr);
3961     return entry;
3962   }
3963 
3964   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3965         bool usePrefetch, Label &NOT_EQUAL) {
3966     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3967         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3968         tmp7 = r12, tmp8 = r13;
3969     Label LOOP;
3970 
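    // The loop below is software-pipelined: the first two ldp loads are
    // issued before the loop entry, and inside the loop the loads for the
    // next 64-byte group are interleaved ahead of the eor/orr/cbnz checks
    // of the data loaded previously.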
3971     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3972     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3973     __ bind(LOOP);
3974     if (usePrefetch) {
3975       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3976       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3977     }
3978     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3979     __ eor(tmp1, tmp1, tmp2);
3980     __ eor(tmp3, tmp3, tmp4);
3981     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3982     __ orr(tmp1, tmp1, tmp3);
3983     __ cbnz(tmp1, NOT_EQUAL);
3984     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3985     __ eor(tmp5, tmp5, tmp6);
3986     __ eor(tmp7, tmp7, tmp8);
3987     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3988     __ orr(tmp5, tmp5, tmp7);
3989     __ cbnz(tmp5, NOT_EQUAL);
3990     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3991     __ eor(tmp1, tmp1, tmp2);
3992     __ eor(tmp3, tmp3, tmp4);
3993     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3994     __ orr(tmp1, tmp1, tmp3);
3995     __ cbnz(tmp1, NOT_EQUAL);
3996     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3997     __ eor(tmp5, tmp5, tmp6);
3998     __ sub(cnt1, cnt1, 8 * wordSize);
3999     __ eor(tmp7, tmp7, tmp8);
4000     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4001     // tmp6 is not used. MacroAssembler::subs is used here (rather than
4002     // cmp) because subs allows an unlimited range for the immediate operand.
4003     __ subs(tmp6, cnt1, loopThreshold);
4004     __ orr(tmp5, tmp5, tmp7);
4005     __ cbnz(tmp5, NOT_EQUAL);
4006     __ br(__ GE, LOOP);
4007     // post-loop
4008     __ eor(tmp1, tmp1, tmp2);
4009     __ eor(tmp3, tmp3, tmp4);
4010     __ orr(tmp1, tmp1, tmp3);
4011     __ sub(cnt1, cnt1, 2 * wordSize);
4012     __ cbnz(tmp1, NOT_EQUAL);
4013   }
4014 
4015   void generate_large_array_equals_loop_simd(int loopThreshold,
4016         bool usePrefetch, Label &NOT_EQUAL) {
4017     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4018         tmp2 = rscratch2;
4019     Label LOOP;
4020 
4021     __ bind(LOOP);
4022     if (usePrefetch) {
4023       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4024       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4025     }
4026     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
4027     __ sub(cnt1, cnt1, 8 * wordSize);
4028     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
4029     __ subs(tmp1, cnt1, loopThreshold);
4030     __ eor(v0, __ T16B, v0, v4);
4031     __ eor(v1, __ T16B, v1, v5);
4032     __ eor(v2, __ T16B, v2, v6);
4033     __ eor(v3, __ T16B, v3, v7);
4034     __ orr(v0, __ T16B, v0, v1);
4035     __ orr(v1, __ T16B, v2, v3);
4036     __ orr(v0, __ T16B, v0, v1);
4037     __ umov(tmp1, v0, __ D, 0);
4038     __ umov(tmp2, v0, __ D, 1);
4039     __ orr(tmp1, tmp1, tmp2);
4040     __ cbnz(tmp1, NOT_EQUAL);
4041     __ br(__ GE, LOOP);
4042   }
4043 
4044   // a1 = r1 - array1 address
4045   // a2 = r2 - array2 address
4046   // result = r0 - return value. Already contains "false"
4047   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
4048   // r3-r5 are reserved temporary registers
4049   address generate_large_array_equals() {
4050     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4051         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4052         tmp7 = r12, tmp8 = r13;
4053     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
4054         SMALL_LOOP, POST_LOOP;
4055     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
4056     // calculate if at least 32 prefetched bytes are used
4057     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
4058     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
4059     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
4060     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
4061         tmp5, tmp6, tmp7, tmp8);
4062 
4063     __ align(CodeEntryAlignment);
4064 
4065     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
4066 
4067     address entry = __ pc();
4068     __ enter();
4069     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
4070     // also advance pointers to use post-increment instead of pre-increment
4071     __ add(a1, a1, wordSize);
4072     __ add(a2, a2, wordSize);
4073     if (AvoidUnalignedAccesses) {
4074       // Both implementations (SIMD/non-SIMD) use relatively large load
4075       // instructions (ld1/ldp), which carry a huge penalty (up to 2x exec time)
4076       // on some CPUs when the address is not at least 16-byte aligned.
4077       // Arrays are currently 8-byte aligned, so if needed we do one extra 8-byte
4078       // load for the first address to make it 16-byte aligned.
4079       Label ALIGNED16;
4080       __ tbz(a1, 3, ALIGNED16);
4081       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4082       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4083       __ sub(cnt1, cnt1, wordSize);
4084       __ eor(tmp1, tmp1, tmp2);
4085       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
4086       __ bind(ALIGNED16);
4087     }
4088     if (UseSIMDForArrayEquals) {
4089       if (SoftwarePrefetchHintDistance >= 0) {
4090         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4091         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4092         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
4093             /* prfm = */ true, NOT_EQUAL);
4094         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4095         __ br(__ LT, TAIL);
4096       }
4097       __ bind(NO_PREFETCH_LARGE_LOOP);
4098       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
4099           /* prfm = */ false, NOT_EQUAL);
4100     } else {
4101       __ push(spilled_regs, sp);
4102       if (SoftwarePrefetchHintDistance >= 0) {
4103         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4104         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4105         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
4106             /* prfm = */ true, NOT_EQUAL);
4107         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4108         __ br(__ LT, TAIL);
4109       }
4110       __ bind(NO_PREFETCH_LARGE_LOOP);
4111       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
4112           /* prfm = */ false, NOT_EQUAL);
4113     }
4114     __ bind(TAIL);
4115       __ cbz(cnt1, EQUAL);
4116       __ subs(cnt1, cnt1, wordSize);
4117       __ br(__ LE, POST_LOOP);
4118     __ bind(SMALL_LOOP);
4119       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4120       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4121       __ subs(cnt1, cnt1, wordSize);
4122       __ eor(tmp1, tmp1, tmp2);
4123       __ cbnz(tmp1, NOT_EQUAL);
4124       __ br(__ GT, SMALL_LOOP);
4125     __ bind(POST_LOOP);
4126       __ ldr(tmp1, Address(a1, cnt1));
4127       __ ldr(tmp2, Address(a2, cnt1));
4128       __ eor(tmp1, tmp1, tmp2);
4129       __ cbnz(tmp1, NOT_EQUAL);
4130     __ bind(EQUAL);
4131       __ mov(result, true);
4132     __ bind(NOT_EQUAL);
4133       if (!UseSIMDForArrayEquals) {
4134         __ pop(spilled_regs, sp);
4135       }
4136     __ bind(NOT_EQUAL_NO_POP);
4137     __ leave();
4138     __ ret(lr);
4139     return entry;
4140   }
4141 
4142   address generate_dsin_dcos(bool isCos) {
4143     __ align(CodeEntryAlignment);
4144     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
4145     address start = __ pc();
4146     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
4147         (address)StubRoutines::aarch64::_two_over_pi,
4148         (address)StubRoutines::aarch64::_pio2,
4149         (address)StubRoutines::aarch64::_dsin_coef,
4150         (address)StubRoutines::aarch64::_dcos_coef);
4151     return start;
4152   }
4153 
4154   address generate_dlog() {
4155     __ align(CodeEntryAlignment);
4156     StubCodeMark mark(this, "StubRoutines", "dlog");
4157     address entry = __ pc();
4158     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
4159         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
4160     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
4161     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
4162         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
4163     return entry;
4164   }
4165 
4166   // code for comparing 16 bytes of strings with same encoding
4167   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
4168     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
4169     __ ldr(rscratch1, Address(__ post(str1, 8)));
4170     __ eor(rscratch2, tmp1, tmp2);
4171     __ ldr(cnt1, Address(__ post(str2, 8)));
4172     __ cbnz(rscratch2, DIFF1);
4173     __ ldr(tmp1, Address(__ post(str1, 8)));
4174     __ eor(rscratch2, rscratch1, cnt1);
4175     __ ldr(tmp2, Address(__ post(str2, 8)));
4176     __ cbnz(rscratch2, DIFF2);
4177   }
4178 
4179   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
4180   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4181       Label &DIFF2) {
4182     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
4183     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
4184 
4185     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4186     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4187     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4188     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4189 
4190     __ fmovd(tmpL, vtmp3);
4191     __ eor(rscratch2, tmp3, tmpL);
4192     __ cbnz(rscratch2, DIFF2);
4193 
4194     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4195     __ umov(tmpL, vtmp3, __ D, 1);
4196     __ eor(rscratch2, tmpU, tmpL);
4197     __ cbnz(rscratch2, DIFF1);
4198 
4199     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4200     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4201     __ fmovd(tmpL, vtmp);
4202     __ eor(rscratch2, tmp3, tmpL);
4203     __ cbnz(rscratch2, DIFF2);
4204 
4205     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4206     __ umov(tmpL, vtmp, __ D, 1);
4207     __ eor(rscratch2, tmpU, tmpL);
4208     __ cbnz(rscratch2, DIFF1);
4209   }
4210 
4211   // r0  = result
4212   // r1  = str1
4213   // r2  = cnt1
4214   // r3  = str2
4215   // r4  = cnt2
4216   // r10 = tmp1
4217   // r11 = tmp2
4218   address generate_compare_long_string_different_encoding(bool isLU) {
4219     __ align(CodeEntryAlignment);
4220     StubCodeMark mark(this, "StubRoutines", isLU
4221         ? "compare_long_string_different_encoding LU"
4222         : "compare_long_string_different_encoding UL");
4223     address entry = __ pc();
4224     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4225         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
4226         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4227     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4228         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4229     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4230     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4231 
4232     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
4233 
4234     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4235     // cnt2 == number of characters left to compare
4236     // Check the already loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
4237     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4238     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4239     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4240     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4241     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4242     __ eor(rscratch2, tmp1, tmp2);
4243     __ mov(rscratch1, tmp2);
4244     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4245     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4246              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4247     __ push(spilled_regs, sp);
4248     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
4249     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
4250 
4251     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4252 
4253     if (SoftwarePrefetchHintDistance >= 0) {
4254       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4255       __ br(__ LT, NO_PREFETCH);
4256       __ bind(LARGE_LOOP_PREFETCH);
4257         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4258         __ mov(tmp4, 2);
4259         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4260         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4261           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4262           __ subs(tmp4, tmp4, 1);
4263           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4264           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4265           __ mov(tmp4, 2);
4266         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4267           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4268           __ subs(tmp4, tmp4, 1);
4269           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4270           __ sub(cnt2, cnt2, 64);
4271           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4272           __ br(__ GE, LARGE_LOOP_PREFETCH);
4273     }
4274     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4275     __ bind(NO_PREFETCH);
4276     __ subs(cnt2, cnt2, 16);
4277     __ br(__ LT, TAIL);
4278     __ align(OptoLoopAlignment);
4279     __ bind(SMALL_LOOP); // smaller loop
4280       __ subs(cnt2, cnt2, 16);
4281       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4282       __ br(__ GE, SMALL_LOOP);
4283       __ cmn(cnt2, (u1)16);
4284       __ br(__ EQ, LOAD_LAST);
4285     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
4286       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
4287       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
4288       __ ldr(tmp3, Address(cnt1, -8));
4289       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
4290       __ b(LOAD_LAST);
4291     __ bind(DIFF2);
4292       __ mov(tmpU, tmp3);
4293     __ bind(DIFF1);
4294       __ pop(spilled_regs, sp);
4295       __ b(CALCULATE_DIFFERENCE);
4296     __ bind(LOAD_LAST);
4297       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
4298       // No need to load them again
4299       __ mov(tmpU, tmp3);
4300       __ pop(spilled_regs, sp);
4301 
4302       // tmp2 points to the address of the last 4 Latin1 characters right now
4303       __ ldrs(vtmp, Address(tmp2));
4304       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4305       __ fmovd(tmpL, vtmp);
4306 
4307       __ eor(rscratch2, tmpU, tmpL);
4308       __ cbz(rscratch2, DONE);
4309 
4310     // Find the first different characters in the longwords and
4311     // compute their difference.
4312     __ bind(CALCULATE_DIFFERENCE);
4313       __ rev(rscratch2, rscratch2);
4314       __ clz(rscratch2, rscratch2);
4315       __ andr(rscratch2, rscratch2, -16);
4316       __ lsrv(tmp1, tmp1, rscratch2);
4317       __ uxthw(tmp1, tmp1);
4318       __ lsrv(rscratch1, rscratch1, rscratch2);
4319       __ uxthw(rscratch1, rscratch1);
4320       __ subw(result, tmp1, rscratch1);
4321     __ bind(DONE);
4322       __ ret(lr);
4323     return entry;
4324   }
4325 
4326   address generate_method_entry_barrier() {
4327     __ align(CodeEntryAlignment);
4328     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
4329 
4330     Label deoptimize_label;
4331 
4332     address start = __ pc();
4333 
4334     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
4335 
4336     __ enter();
4337     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
4338 
4339     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
4340 
4341     __ push_call_clobbered_registers();
4342 
4343     __ mov(c_rarg0, rscratch2);
4344     __ call_VM_leaf
4345          (CAST_FROM_FN_PTR
4346           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
4347 
4348     __ reset_last_Java_frame(true);
4349 
4350     __ mov(rscratch1, r0);
4351 
4352     __ pop_call_clobbered_registers();
4353 
4354     __ cbnz(rscratch1, deoptimize_label);
4355 
4356     __ leave();
4357     __ ret(lr);
4358 
4359     __ BIND(deoptimize_label);
4360 
4361     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
4362     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
4363 
4364     __ mov(sp, rscratch1);
4365     __ br(rscratch2);
4366 
4367     return start;
4368   }
4369 
4370   // r0  = result
4371   // r1  = str1
4372   // r2  = cnt1
4373   // r3  = str2
4374   // r4  = cnt2
4375   // r10 = tmp1
4376   // r11 = tmp2
4377   address generate_compare_long_string_same_encoding(bool isLL) {
4378     __ align(CodeEntryAlignment);
4379     StubCodeMark mark(this, "StubRoutines", isLL
4380         ? "compare_long_string_same_encoding LL"
4381         : "compare_long_string_same_encoding UU");
4382     address entry = __ pc();
4383     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4384         tmp1 = r10, tmp2 = r11;
4385     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4386         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4387         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4388     // Exit the large loop when fewer than 64 bytes are left to read or when we are
4389     // about to prefetch memory past the array bound.
4390     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4391     // cnt1/cnt2 contain the number of characters to compare; cnt1 can be re-used.
4392     // Update the cnt2 counter to account for the 8 bytes already loaded.
4393     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4394     // update pointers, because of previous read
4395     __ add(str1, str1, wordSize);
4396     __ add(str2, str2, wordSize);
4397     if (SoftwarePrefetchHintDistance >= 0) {
4398       __ bind(LARGE_LOOP_PREFETCH);
4399         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4400         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4401         compare_string_16_bytes_same(DIFF, DIFF2);
4402         compare_string_16_bytes_same(DIFF, DIFF2);
4403         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4404         compare_string_16_bytes_same(DIFF, DIFF2);
4405         __ subs(rscratch2, cnt2, largeLoopExitCondition);
4406         compare_string_16_bytes_same(DIFF, DIFF2);
4407         __ br(__ GT, LARGE_LOOP_PREFETCH);
4408         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4409     }
4410     // less than 16 bytes left?
4411     __ subs(cnt2, cnt2, isLL ? 16 : 8);
4412     __ br(__ LT, TAIL);
4413     __ align(OptoLoopAlignment);
4414     __ bind(SMALL_LOOP);
4415       compare_string_16_bytes_same(DIFF, DIFF2);
4416       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4417       __ br(__ GE, SMALL_LOOP);
4418     __ bind(TAIL);
4419       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4420       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4421       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4422       __ br(__ LE, CHECK_LAST);
4423       __ eor(rscratch2, tmp1, tmp2);
4424       __ cbnz(rscratch2, DIFF);
4425       __ ldr(tmp1, Address(__ post(str1, 8)));
4426       __ ldr(tmp2, Address(__ post(str2, 8)));
4427       __ sub(cnt2, cnt2, isLL ? 8 : 4);
4428     __ bind(CHECK_LAST);
4429       if (!isLL) {
4430         __ add(cnt2, cnt2, cnt2); // now in bytes
4431       }
4432       __ eor(rscratch2, tmp1, tmp2);
4433       __ cbnz(rscratch2, DIFF);
4434       __ ldr(rscratch1, Address(str1, cnt2));
4435       __ ldr(cnt1, Address(str2, cnt2));
4436       __ eor(rscratch2, rscratch1, cnt1);
4437       __ cbz(rscratch2, LENGTH_DIFF);
4438       // Find the first different characters in the longwords and
4439       // compute their difference.
4440     __ bind(DIFF2);
4441       __ rev(rscratch2, rscratch2);
4442       __ clz(rscratch2, rscratch2);
4443       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4444       __ lsrv(rscratch1, rscratch1, rscratch2);
4445       if (isLL) {
4446         __ lsrv(cnt1, cnt1, rscratch2);
4447         __ uxtbw(rscratch1, rscratch1);
4448         __ uxtbw(cnt1, cnt1);
4449       } else {
4450         __ lsrv(cnt1, cnt1, rscratch2);
4451         __ uxthw(rscratch1, rscratch1);
4452         __ uxthw(cnt1, cnt1);
4453       }
4454       __ subw(result, rscratch1, cnt1);
4455       __ b(LENGTH_DIFF);
4456     __ bind(DIFF);
4457       __ rev(rscratch2, rscratch2);
4458       __ clz(rscratch2, rscratch2);
4459       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4460       __ lsrv(tmp1, tmp1, rscratch2);
4461       if (isLL) {
4462         __ lsrv(tmp2, tmp2, rscratch2);
4463         __ uxtbw(tmp1, tmp1);
4464         __ uxtbw(tmp2, tmp2);
4465       } else {
4466         __ lsrv(tmp2, tmp2, rscratch2);
4467         __ uxthw(tmp1, tmp1);
4468         __ uxthw(tmp2, tmp2);
4469       }
4470       __ subw(result, tmp1, tmp2);
4471       __ b(LENGTH_DIFF);
4472     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4473       __ eor(rscratch2, tmp1, tmp2);
4474       __ cbnz(rscratch2, DIFF);
4475     __ bind(LENGTH_DIFF);
4476       __ ret(lr);
4477     return entry;
4478   }
4479 
4480   void generate_compare_long_strings() {
4481       StubRoutines::aarch64::_compare_long_string_LL
4482           = generate_compare_long_string_same_encoding(true);
4483       StubRoutines::aarch64::_compare_long_string_UU
4484           = generate_compare_long_string_same_encoding(false);
4485       StubRoutines::aarch64::_compare_long_string_LU
4486           = generate_compare_long_string_different_encoding(true);
4487       StubRoutines::aarch64::_compare_long_string_UL
4488           = generate_compare_long_string_different_encoding(false);
4489   }
4490 
4491   // R0 = result
4492   // R1 = str2
4493   // R2 = cnt1
4494   // R3 = str1
4495   // R4 = cnt2
4496   // This generic linear code uses a few additional ideas that make it faster:
4497   // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
4498   //    in order to skip the initial load (helps on systems with a single load pipeline)
4499   // 2) we use a "fast" algorithm for finding the first character: one branch per
4500   //    loaded register instead of a branch per symbol, which is where constants like
4501   //    0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
4502   //    (see the sketch after this comment)
4503   // 3) after loading and analyzing the 1st register of the source string, it can be
4504   //    used to search for every occurrence of the 1st character, saving a few loads
4505   //    compared to a simpler-but-slower implementation
4506   // 4) in order to avoid lots of push/pop operations, the code below heavily
4507   //    re-uses/re-initializes/compresses register values, which makes the code
4508   //    larger and a bit less readable; however, most of the extra operations are
4509   //    issued during loads or branches, so the penalty is minimal
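  //
  // A minimal C sketch of idea 2), assuming 8-bit Latin1 characters (the UTF-16
  // case uses 0x0001... and 0x7fff... instead). match_mask is an illustrative
  // helper name, not something defined in this file:
  //
  //   // Returns a word with 0x80 set in (at least) the lowest byte of 'chunk'
  //   // that equals 'c'; zero if no byte matches.
  //   uint64_t match_mask(uint64_t chunk, uint8_t c) {
  //     uint64_t pattern = 0x0101010101010101ULL * c;  // broadcast c to all bytes
  //     uint64_t x = chunk ^ pattern;                  // a zero byte <=> a match
  //     // SWAR zero-byte test; the stub computes the equivalent form
  //     // (x - 0x01...01) & ~(x | 0x7f...7f) using sub/orr/bics.
  //     return (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
  //   }
  //
  // A non-zero result gives candidate match positions (rbit + clz locate the
  // lowest one); each candidate is then verified against the full pattern below.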
4510   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4511     const char* stubName = str1_isL
4512         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4513         : "indexof_linear_uu";
4514     __ align(CodeEntryAlignment);
4515     StubCodeMark mark(this, "StubRoutines", stubName);
4516     address entry = __ pc();
4517 
4518     int str1_chr_size = str1_isL ? 1 : 2;
4519     int str2_chr_size = str2_isL ? 1 : 2;
4520     int str1_chr_shift = str1_isL ? 0 : 1;
4521     int str2_chr_shift = str2_isL ? 0 : 1;
4522     bool isL = str1_isL && str2_isL;
4523     // parameters
4524     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4525     // temporary registers
4526     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4527     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4528     // redefinitions
4529     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4530 
4531     __ push(spilled_regs, sp);
4532     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
4533         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4534         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4535         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4536         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4537         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
4538     // Read whole register from str1. It is safe, because length >=8 here
4539     __ ldr(ch1, Address(str1));
4540     // Read whole register from str2. It is safe, because length >=8 here
4541     __ ldr(ch2, Address(str2));
4542     __ sub(cnt2, cnt2, cnt1);
4543     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4544     if (str1_isL != str2_isL) {
4545       __ eor(v0, __ T16B, v0, v0);
4546     }
4547     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4548     __ mul(first, first, tmp1);
4549     // check if we have less than one full register of characters left to check
4550     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4551     if (str1_isL != str2_isL) {
4552       __ fmovd(v1, ch1);
4553     }
4554     __ br(__ LE, L_SMALL);
4555     __ eor(ch2, first, ch2);
4556     if (str1_isL != str2_isL) {
4557       __ zip1(v1, __ T16B, v1, v0);
4558     }
4559     __ sub(tmp2, ch2, tmp1);
4560     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4561     __ bics(tmp2, tmp2, ch2);
4562     if (str1_isL != str2_isL) {
4563       __ fmovd(ch1, v1);
4564     }
4565     __ br(__ NE, L_HAS_ZERO);
4566     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4567     __ add(result, result, wordSize/str2_chr_size);
4568     __ add(str2, str2, wordSize);
4569     __ br(__ LT, L_POST_LOOP);
4570     __ BIND(L_LOOP);
4571       __ ldr(ch2, Address(str2));
4572       __ eor(ch2, first, ch2);
4573       __ sub(tmp2, ch2, tmp1);
4574       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4575       __ bics(tmp2, tmp2, ch2);
4576       __ br(__ NE, L_HAS_ZERO);
4577     __ BIND(L_LOOP_PROCEED);
4578       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4579       __ add(str2, str2, wordSize);
4580       __ add(result, result, wordSize/str2_chr_size);
4581       __ br(__ GE, L_LOOP);
4582     __ BIND(L_POST_LOOP);
4583       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
4584       __ br(__ LE, NOMATCH);
4585       __ ldr(ch2, Address(str2));
4586       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4587       __ eor(ch2, first, ch2);
4588       __ sub(tmp2, ch2, tmp1);
4589       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4590       __ mov(tmp4, -1); // all bits set
4591       __ b(L_SMALL_PROCEED);
4592     __ align(OptoLoopAlignment);
4593     __ BIND(L_SMALL);
4594       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4595       __ eor(ch2, first, ch2);
4596       if (str1_isL != str2_isL) {
4597         __ zip1(v1, __ T16B, v1, v0);
4598       }
4599       __ sub(tmp2, ch2, tmp1);
4600       __ mov(tmp4, -1); // all bits set
4601       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4602       if (str1_isL != str2_isL) {
4603         __ fmovd(ch1, v1); // move converted 4 symbols
4604       }
4605     __ BIND(L_SMALL_PROCEED);
4606       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
4607       __ bic(tmp2, tmp2, ch2);
4608       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
4609       __ rbit(tmp2, tmp2);
4610       __ br(__ EQ, NOMATCH);
4611     __ BIND(L_SMALL_HAS_ZERO_LOOP);
4612       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
4613       __ cmp(cnt1, u1(wordSize/str2_chr_size));
4614       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4615       if (str2_isL) { // LL
4616         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4617         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4618         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4619         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4620         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4621       } else {
4622         __ mov(ch2, 0xE); // all bits in byte set except last one
4623         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4624         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4625         __ lslv(tmp2, tmp2, tmp4);
4626         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4627         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4628         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4629         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4630       }
4631       __ cmp(ch1, ch2);
4632       __ mov(tmp4, wordSize/str2_chr_size);
4633       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4634     __ BIND(L_SMALL_CMP_LOOP);
4635       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4636                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4637       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4638                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4639       __ add(tmp4, tmp4, 1);
4640       __ cmp(tmp4, cnt1);
4641       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4642       __ cmp(first, ch2);
4643       __ br(__ EQ, L_SMALL_CMP_LOOP);
4644     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4645       __ cbz(tmp2, NOMATCH); // no more matches. exit
4646       __ clz(tmp4, tmp2);
4647       __ add(result, result, 1); // advance index
4648       __ add(str2, str2, str2_chr_size); // advance pointer
4649       __ b(L_SMALL_HAS_ZERO_LOOP);
4650     __ align(OptoLoopAlignment);
4651     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4652       __ cmp(first, ch2);
4653       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4654       __ b(DONE);
4655     __ align(OptoLoopAlignment);
4656     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4657       if (str2_isL) { // LL
4658         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4659         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4660         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4661         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4662         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4663       } else {
4664         __ mov(ch2, 0xE); // all bits in byte set except last one
4665         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4666         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4667         __ lslv(tmp2, tmp2, tmp4);
4668         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4669         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4670         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4671         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4672       }
4673       __ cmp(ch1, ch2);
4674       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4675       __ b(DONE);
4676     __ align(OptoLoopAlignment);
4677     __ BIND(L_HAS_ZERO);
4678       __ rbit(tmp2, tmp2);
4679       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
4680       // Now, compress the counters (cnt2 and cnt1) into one register:
4681       // cnt2 := (cnt1 << 32) | cnt2. This is fine because both counters are 32-bit
4682       // and unchanged in this loop; they are restored on exit, so cnt1 can be re-used here.
4683       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
4684       __ sub(result, result, 1);
4685     __ BIND(L_HAS_ZERO_LOOP);
4686       __ mov(cnt1, wordSize/str2_chr_size);
4687       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4688       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4689       if (str2_isL) {
4690         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4691         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4692         __ lslv(tmp2, tmp2, tmp4);
4693         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4694         __ add(tmp4, tmp4, 1);
4695         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4696         __ lsl(tmp2, tmp2, 1);
4697         __ mov(tmp4, wordSize/str2_chr_size);
4698       } else {
4699         __ mov(ch2, 0xE);
4700         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4701         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4702         __ lslv(tmp2, tmp2, tmp4);
4703         __ add(tmp4, tmp4, 1);
4704         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4705         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4706         __ lsl(tmp2, tmp2, 1);
4707         __ mov(tmp4, wordSize/str2_chr_size);
4708         __ sub(str2, str2, str2_chr_size);
4709       }
4710       __ cmp(ch1, ch2);
4711       __ mov(tmp4, wordSize/str2_chr_size);
4712       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4713     __ BIND(L_CMP_LOOP);
4714       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4715                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4716       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4717                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4718       __ add(tmp4, tmp4, 1);
4719       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4720       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4721       __ cmp(cnt1, ch2);
4722       __ br(__ EQ, L_CMP_LOOP);
4723     __ BIND(L_CMP_LOOP_NOMATCH);
4724       // here we have no match
4725       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4726       __ clz(tmp4, tmp2);
4727       __ add(str2, str2, str2_chr_size); // advance pointer
4728       __ b(L_HAS_ZERO_LOOP);
4729     __ align(OptoLoopAlignment);
4730     __ BIND(L_CMP_LOOP_LAST_CMP);
4731       __ cmp(cnt1, ch2);
4732       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4733       __ b(DONE);
4734     __ align(OptoLoopAlignment);
4735     __ BIND(L_CMP_LOOP_LAST_CMP2);
4736       if (str2_isL) {
4737         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4738         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4739         __ lslv(tmp2, tmp2, tmp4);
4740         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4741         __ add(tmp4, tmp4, 1);
4742         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4743         __ lsl(tmp2, tmp2, 1);
4744       } else {
4745         __ mov(ch2, 0xE);
4746         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4747         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4748         __ lslv(tmp2, tmp2, tmp4);
4749         __ add(tmp4, tmp4, 1);
4750         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4751         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4752         __ lsl(tmp2, tmp2, 1);
4753         __ sub(str2, str2, str2_chr_size);
4754       }
4755       __ cmp(ch1, ch2);
4756       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4757       __ b(DONE);
4758     __ align(OptoLoopAlignment);
4759     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
4760       // 1) Restore the "result" index. The index was a multiple of wordSize/str2_chr_size
4761       // until the L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP,
4762       // so result was increased by at most wordSize/str2_chr_size - 1 and the
4763       // respective higher bits were not changed. L_LOOP_PROCEED will increase
4764       // result by the number of analyzed characters, so we can just reset the lower
4765       // bits of result here: the 2 lower bits for UU/UL and the 3 lower bits for LL.
4766       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
4767       // 3) Advance str2 to the next str2 octet. result & 7 (LL) or result & 3 (UU/UL)
4768       // is the index of the last analyzed position inside the current octet, so str2
4769       // is at the respective start address; we need to advance it to the next octet.
4770       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4771       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4772       __ bfm(result, zr, 0, 2 - str2_chr_shift);
4773       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4774       __ movw(cnt2, cnt2);
4775       __ b(L_LOOP_PROCEED);
4776     __ align(OptoLoopAlignment);
4777     __ BIND(NOMATCH);
4778       __ mov(result, -1);
4779     __ BIND(DONE);
4780       __ pop(spilled_regs, sp);
4781       __ ret(lr);
4782     return entry;
4783   }
4784 
4785   void generate_string_indexof_stubs() {
4786     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
4787     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
4788     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
4789   }
4790 
4791   void inflate_and_store_2_fp_registers(bool generatePrfm,
4792       FloatRegister src1, FloatRegister src2) {
4793     Register dst = r1;
4794     __ zip1(v1, __ T16B, src1, v0);
4795     __ zip2(v2, __ T16B, src1, v0);
4796     if (generatePrfm) {
4797       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4798     }
4799     __ zip1(v3, __ T16B, src2, v0);
4800     __ zip2(v4, __ T16B, src2, v0);
4801     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4802   }
4803 
4804   // R0 = src
4805   // R1 = dst
4806   // R2 = len
4807   // R3 = len >> 3
4808   // V0 = 0
4809   // v1 = loaded 8 bytes
4810   address generate_large_byte_array_inflate() {
4811     __ align(CodeEntryAlignment);
4812     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4813     address entry = __ pc();
4814     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4815     Register src = r0, dst = r1, len = r2, octetCounter = r3;
4816     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
4817 
4818     // Do one more 8-byte read so that the address is 16-byte aligned in most cases,
4819     // which also lets a single store instruction be used.
4820     __ ldrd(v2, __ post(src, 8));
4821     __ sub(octetCounter, octetCounter, 2);
4822     __ zip1(v1, __ T16B, v1, v0);
4823     __ zip1(v2, __ T16B, v2, v0);
4824     __ st1(v1, v2, __ T16B, __ post(dst, 32));
4825     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4826     __ subs(rscratch1, octetCounter, large_loop_threshold);
4827     __ br(__ LE, LOOP_START);
4828     __ b(LOOP_PRFM_START);
4829     __ bind(LOOP_PRFM);
4830       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4831     __ bind(LOOP_PRFM_START);
4832       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4833       __ sub(octetCounter, octetCounter, 8);
4834       __ subs(rscratch1, octetCounter, large_loop_threshold);
4835       inflate_and_store_2_fp_registers(true, v3, v4);
4836       inflate_and_store_2_fp_registers(true, v5, v6);
4837       __ br(__ GT, LOOP_PRFM);
4838       __ cmp(octetCounter, (u1)8);
4839       __ br(__ LT, DONE);
4840     __ bind(LOOP);
4841       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4842       __ bind(LOOP_START);
4843       __ sub(octetCounter, octetCounter, 8);
4844       __ cmp(octetCounter, (u1)8);
4845       inflate_and_store_2_fp_registers(false, v3, v4);
4846       inflate_and_store_2_fp_registers(false, v5, v6);
4847       __ br(__ GE, LOOP);
4848     __ bind(DONE);
4849       __ ret(lr);
4850     return entry;
4851   }
4852 
4853   /**
4854    *  Arguments:
4855    *
4856    *  Input:
4857    *  c_rarg0   - current state address
4858    *  c_rarg1   - H key address
4859    *  c_rarg2   - data address
4860    *  c_rarg3   - number of blocks
4861    *
4862    *  Output:
4863    *  Updated state at c_rarg0
4864    */
4865   address generate_ghash_processBlocks() {
4866     // Bafflingly, GCM uses little-endian for the byte order, but
4867     // big-endian for the bit order.  For example, the polynomial 1 is
4868     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4869     //
4870     // So, we must either reverse the bytes in each word and do
4871     // everything big-endian or reverse the bits in each byte and do
4872     // it little-endian.  On AArch64 it's more idiomatic to reverse
4873     // the bits in each byte (we have an instruction, RBIT, to do
4874     // that) and keep the data in little-endian bit order throughout the
4875     // calculation, bit-reversing the inputs and outputs.
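    //
    // In C-like pseudocode the whole stub is, roughly (mul_gf128 here stands for
    // carry-less multiplication followed by reduction modulo the GHASH field
    // polynomial, done below with ghash_multiply/ghash_reduce on bit-reversed data):
    //
    //   for (int i = 0; i < blocks; i++) {
    //     state = mul_gf128(state ^ data[i], H);   // H is the hash subkey
    //   }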
4876 
4877     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4878     __ align(wordSize * 2);
4879     address p = __ pc();
4880     __ emit_int64(0x87);  // The low-order bits of the field
4881                           // polynomial (i.e. p = z^7+z^2+z+1)
4882                           // repeated in the low and high parts of a
4883                           // 128-bit vector
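                          // (0x87 == 0b1000'0111: bits 7, 2, 1 and 0 set,
                          // i.e. exactly z^7 + z^2 + z + 1)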
4884     __ emit_int64(0x87);
4885 
4886     __ align(CodeEntryAlignment);
4887     address start = __ pc();
4888 
4889     Register state   = c_rarg0;
4890     Register subkeyH = c_rarg1;
4891     Register data    = c_rarg2;
4892     Register blocks  = c_rarg3;
4893 
4894     FloatRegister vzr = v30;
4895     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4896 
4897     __ ldrq(v0, Address(state));
4898     __ ldrq(v1, Address(subkeyH));
4899 
4900     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4901     __ rbit(v0, __ T16B, v0);
4902     __ rev64(v1, __ T16B, v1);
4903     __ rbit(v1, __ T16B, v1);
4904 
4905     __ ldrq(v26, p);
4906 
4907     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4908     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4909 
4910     {
4911       Label L_ghash_loop;
4912       __ bind(L_ghash_loop);
4913 
4914       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4915                                                  // reversing each byte
4916       __ rbit(v2, __ T16B, v2);
4917       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4918 
4919       // Multiply state in v2 by subkey in v1
4920       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4921                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4922                      /*temps*/v6, v20, v18, v21);
4923       // Reduce v7:v5 by the field polynomial
4924       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4925 
4926       __ sub(blocks, blocks, 1);
4927       __ cbnz(blocks, L_ghash_loop);
4928     }
4929 
4930     // The bit-reversed result is at this point in v0
4931     __ rev64(v1, __ T16B, v0);
4932     __ rbit(v1, __ T16B, v1);
4933 
4934     __ st1(v1, __ T16B, state);
4935     __ ret(lr);
4936 
4937     return start;
4938   }
4939 
4940   // Continuation point for throwing of implicit exceptions that are
4941   // not handled in the current activation. Fabricates an exception
4942   // oop and initiates normal exception dispatching in this
4943   // frame. Since we need to preserve callee-saved values (currently
4944   // only for C2, but done for C1 as well) we need a callee-saved oop
4945   // map and therefore have to make these stubs into RuntimeStubs
4946   // rather than BufferBlobs.  If the compiler needs all registers to
4947   // be preserved between the fault point and the exception handler
4948   // then it must assume responsibility for that in
4949   // AbstractCompiler::continuation_for_implicit_null_exception or
4950   // continuation_for_implicit_division_by_zero_exception. All other
4951   // implicit exceptions (e.g., NullPointerException or
4952   // AbstractMethodError on entry) are either at call sites or
4953   // otherwise assume that stack unwinding will be initiated, so
4954   // caller saved registers were assumed volatile in the compiler.
4955 
4956 #undef __
4957 #define __ masm->
4958 
4959   address generate_throw_exception(const char* name,
4960                                    address runtime_entry,
4961                                    Register arg1 = noreg,
4962                                    Register arg2 = noreg) {
4963     // Information about frame layout at time of blocking runtime call.
4964     // Note that we only have to preserve callee-saved registers since
4965     // the compilers are responsible for supplying a continuation point
4966     // if they expect all registers to be preserved.
4967     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4968     enum layout {
4969       rfp_off = 0,
4970       rfp_off2,
4971       return_off,
4972       return_off2,
4973       framesize // inclusive of return address
4974     };
4975 
4976     int insts_size = 512;
4977     int locs_size  = 64;
4978 
4979     CodeBuffer code(name, insts_size, locs_size);
4980     OopMapSet* oop_maps  = new OopMapSet();
4981     MacroAssembler* masm = new MacroAssembler(&code);
4982 
4983     address start = __ pc();
4984 
4985     // This is an inlined and slightly modified version of call_VM
4986     // which has the ability to fetch the return PC out of
4987     // thread-local storage and also sets up last_Java_sp slightly
4988     // differently than the real call_VM
4989 
4990     __ enter(); // Save FP and LR before call
4991 
4992     assert(is_even(framesize/2), "sp not 16-byte aligned");
4993 
4994     // lr and fp are already in place
4995     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4996 
4997     int frame_complete = __ pc() - start;
4998 
4999     // Set up last_Java_sp and last_Java_fp
5000     address the_pc = __ pc();
5001     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
5002 
5003     // Call runtime
5004     if (arg1 != noreg) {
5005       assert(arg2 != c_rarg1, "clobbered");
5006       __ mov(c_rarg1, arg1);
5007     }
5008     if (arg2 != noreg) {
5009       __ mov(c_rarg2, arg2);
5010     }
5011     __ mov(c_rarg0, rthread);
5012     BLOCK_COMMENT("call runtime_entry");
5013     __ mov(rscratch1, runtime_entry);
5014     __ blr(rscratch1);
5015 
5016     // Generate oop map
5017     OopMap* map = new OopMap(framesize, 0);
5018 
5019     oop_maps->add_gc_map(the_pc - start, map);
5020 
5021     __ reset_last_Java_frame(true);
5022     __ maybe_isb();
5023 
5024     __ leave();
5025 
5026     // check for pending exceptions
5027 #ifdef ASSERT
5028     Label L;
5029     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
5030     __ cbnz(rscratch1, L);
5031     __ should_not_reach_here();
5032     __ bind(L);
5033 #endif // ASSERT
5034     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
5035 
5036 
5037     // codeBlob framesize is in words (not VMRegImpl::slot_size)
5038     RuntimeStub* stub =
5039       RuntimeStub::new_runtime_stub(name,
5040                                     &code,
5041                                     frame_complete,
5042                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
5043                                     oop_maps, false);
5044     return stub->entry_point();
5045   }
5046 
5047   class MontgomeryMultiplyGenerator : public MacroAssembler {
5048 
5049     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
5050       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
5051 
5052     RegSet _toSave;
5053     bool _squaring;
5054 
5055   public:
5056     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
5057       : MacroAssembler(as->code()), _squaring(squaring) {
5058 
5059       // Register allocation
5060 
5061       RegSetIterator regs = (RegSet::range(r0, r26) - r18_tls).begin();
5062       Pa_base = *regs;       // Argument registers
5063       if (squaring)
5064         Pb_base = Pa_base;
5065       else
5066         Pb_base = *++regs;
5067       Pn_base = *++regs;
5068       Rlen = *++regs;
5069       inv = *++regs;
5070       Pm_base = *++regs;
5071 
5072                           // Working registers:
5073       Ra =  *++regs;        // The current digit of a, b, n, and m.
5074       Rb =  *++regs;
5075       Rm =  *++regs;
5076       Rn =  *++regs;
5077 
5078       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
5079       Pb =  *++regs;
5080       Pm =  *++regs;
5081       Pn =  *++regs;
5082 
5083       t0 =  *++regs;        // Three registers which form a
5084       t1 =  *++regs;        // triple-precision accumulator.
5085       t2 =  *++regs;
5086 
5087       Ri =  *++regs;        // Inner and outer loop indexes.
5088       Rj =  *++regs;
5089 
5090       Rhi_ab = *++regs;     // Product registers: low and high parts
5091       Rlo_ab = *++regs;     // of a*b and m*n.
5092       Rhi_mn = *++regs;
5093       Rlo_mn = *++regs;
5094 
5095       // r19 and up are callee-saved.
5096       _toSave = RegSet::range(r19, *regs) + Pm_base;
5097     }
5098 
5099   private:
5100     void save_regs() {
5101       push(_toSave, sp);
5102     }
5103 
5104     void restore_regs() {
5105       pop(_toSave, sp);
5106     }
5107 
5108     template <typename T>
5109     void unroll_2(Register count, T block) {
5110       Label loop, end, odd;
5111       tbnz(count, 0, odd);
5112       cbz(count, end);
5113       align(16);
5114       bind(loop);
5115       (this->*block)();
5116       bind(odd);
5117       (this->*block)();
5118       subs(count, count, 2);
5119       br(Assembler::GT, loop);
5120       bind(end);
5121     }
5122 
5123     template <typename T>
5124     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
5125       Label loop, end, odd;
5126       tbnz(count, 0, odd);
5127       cbz(count, end);
5128       align(16);
5129       bind(loop);
5130       (this->*block)(d, s, tmp);
5131       bind(odd);
5132       (this->*block)(d, s, tmp);
5133       subs(count, count, 2);
5134       br(Assembler::GT, loop);
5135       bind(end);
5136     }
5137 
5138     void pre1(RegisterOrConstant i) {
5139       block_comment("pre1");
5140       // Pa = Pa_base;
5141       // Pb = Pb_base + i;
5142       // Pm = Pm_base;
5143       // Pn = Pn_base + i;
5144       // Ra = *Pa;
5145       // Rb = *Pb;
5146       // Rm = *Pm;
5147       // Rn = *Pn;
5148       ldr(Ra, Address(Pa_base));
5149       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
5150       ldr(Rm, Address(Pm_base));
5151       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5152       lea(Pa, Address(Pa_base));
5153       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
5154       lea(Pm, Address(Pm_base));
5155       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5156 
5157       // Zero the m*n result.
5158       mov(Rhi_mn, zr);
5159       mov(Rlo_mn, zr);
5160     }
5161 
5162     // The core multiply-accumulate step of a Montgomery
5163     // multiplication.  The idea is to schedule operations as a
5164     // pipeline so that instructions with long latencies (loads and
5165     // multiplies) have time to complete before their results are
5166     // used.  This most benefits in-order implementations of the
5167     // architecture but out-of-order ones also benefit.
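    //
    // For reference, the MACC(A, B, t0, t1, t2) step used in the comments and in
    // the C sketches further down accumulates the full 128-bit product A*B into
    // the triple-precision accumulator t2:t1:t0. A rough sketch, using a
    // compiler-provided 128-bit type for brevity:
    //
    //   unsigned __int128 p  = (unsigned __int128)A * B;          // umulh + mul
    //   unsigned __int128 lo = ((unsigned __int128)t1 << 64) | t0;
    //   lo += p;
    //   t0  = (julong)lo;  t1 = (julong)(lo >> 64);
    //   t2 += (lo < p);                                            // carry out
    //
    // which is what umulh/mul followed by acc() (adds/adcs/adc) computes.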
5168     void step() {
5169       block_comment("step");
5170       // MACC(Ra, Rb, t0, t1, t2);
5171       // Ra = *++Pa;
5172       // Rb = *--Pb;
5173       umulh(Rhi_ab, Ra, Rb);
5174       mul(Rlo_ab, Ra, Rb);
5175       ldr(Ra, pre(Pa, wordSize));
5176       ldr(Rb, pre(Pb, -wordSize));
5177       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
5178                                        // previous iteration.
5179       // MACC(Rm, Rn, t0, t1, t2);
5180       // Rm = *++Pm;
5181       // Rn = *--Pn;
5182       umulh(Rhi_mn, Rm, Rn);
5183       mul(Rlo_mn, Rm, Rn);
5184       ldr(Rm, pre(Pm, wordSize));
5185       ldr(Rn, pre(Pn, -wordSize));
5186       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5187     }
5188 
5189     void post1() {
5190       block_comment("post1");
5191 
5192       // MACC(Ra, Rb, t0, t1, t2);
5193       // Ra = *++Pa;
5194       // Rb = *--Pb;
5195       umulh(Rhi_ab, Ra, Rb);
5196       mul(Rlo_ab, Ra, Rb);
5197       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5198       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5199 
5200       // *Pm = Rm = t0 * inv;
5201       mul(Rm, t0, inv);
5202       str(Rm, Address(Pm));
5203 
5204       // MACC(Rm, Rn, t0, t1, t2);
5205       // t0 = t1; t1 = t2; t2 = 0;
5206       umulh(Rhi_mn, Rm, Rn);
5207 
5208 #ifndef PRODUCT
5209       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5210       {
5211         mul(Rlo_mn, Rm, Rn);
5212         add(Rlo_mn, t0, Rlo_mn);
5213         Label ok;
5214         cbz(Rlo_mn, ok); {
5215           stop("broken Montgomery multiply");
5216         } bind(ok);
5217       }
5218 #endif
5219       // We have very carefully set things up so that
5220       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5221       // the lower half of Rm * Rn because we know the result already:
5222       // it must be -t0.  t0 + (-t0) must generate a carry iff
5223       // t0 != 0.  So, rather than do a mul and an adds we just set
5224       // the carry flag iff t0 is nonzero.
5225       //
5226       // mul(Rlo_mn, Rm, Rn);
5227       // adds(zr, t0, Rlo_mn);
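      //
      // A quick check of that equivalence (with b == 2^64):
      //   t0 == 0: t0 + (-t0) == 0, no carry; and subs zr, t0, #1 borrows, so C == 0.
      //   t0 != 0: t0 + (2^64 - t0) == 2^64, carry; and subs zr, t0, #1 does not
      //            borrow (t0 >= 1), so C == 1.
      // (On AArch64 the carry flag after a subtract is the inverted borrow.)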
5228       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5229       adcs(t0, t1, Rhi_mn);
5230       adc(t1, t2, zr);
5231       mov(t2, zr);
5232     }
5233 
5234     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
5235       block_comment("pre2");
5236       // Pa = Pa_base + i-len;
5237       // Pb = Pb_base + len;
5238       // Pm = Pm_base + i-len;
5239       // Pn = Pn_base + len;
5240 
5241       if (i.is_register()) {
5242         sub(Rj, i.as_register(), len);
5243       } else {
5244         mov(Rj, i.as_constant());
5245         sub(Rj, Rj, len);
5246       }
5247       // Rj == i-len
5248 
5249       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5250       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5251       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5252       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5253 
5254       // Ra = *++Pa;
5255       // Rb = *--Pb;
5256       // Rm = *++Pm;
5257       // Rn = *--Pn;
5258       ldr(Ra, pre(Pa, wordSize));
5259       ldr(Rb, pre(Pb, -wordSize));
5260       ldr(Rm, pre(Pm, wordSize));
5261       ldr(Rn, pre(Pn, -wordSize));
5262 
5263       mov(Rhi_mn, zr);
5264       mov(Rlo_mn, zr);
5265     }
5266 
5267     void post2(RegisterOrConstant i, RegisterOrConstant len) {
5268       block_comment("post2");
5269       if (i.is_constant()) {
5270         mov(Rj, i.as_constant()-len.as_constant());
5271       } else {
5272         sub(Rj, i.as_register(), len);
5273       }
5274 
5275       adds(t0, t0, Rlo_mn); // The pending m*n, low part
5276 
5277       // As soon as we know the least significant digit of our result,
5278       // store it.
5279       // Pm_base[i-len] = t0;
5280       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5281 
5282       // t0 = t1; t1 = t2; t2 = 0;
5283       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5284       adc(t1, t2, zr);
5285       mov(t2, zr);
5286     }
5287 
5288     // A carry in t0 after Montgomery multiplication means that we
5289     // should subtract multiples of n from our result in m.  We'll
5290     // keep doing that until there is no carry.
5291     void normalize(RegisterOrConstant len) {
5292       block_comment("normalize");
5293       // while (t0)
5294       //   t0 = sub(Pm_base, Pn_base, t0, len);
5295       Label loop, post, again;
5296       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5297       cbz(t0, post); {
5298         bind(again); {
5299           mov(i, zr);
5300           mov(cnt, len);
5301           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5302           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5303           subs(zr, zr, zr); // set carry flag, i.e. no borrow
5304           align(16);
5305           bind(loop); {
5306             sbcs(Rm, Rm, Rn);
5307             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5308             add(i, i, 1);
5309             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5310             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5311             sub(cnt, cnt, 1);
5312           } cbnz(cnt, loop);
5313           sbc(t0, t0, zr);
5314         } cbnz(t0, again);
5315       } bind(post);
5316     }
5317 
5318     // Move memory at s to d, reversing words.
5319     //    Increments d to end of copied memory
5320     //    Destroys tmp1, tmp2
5321     //    Preserves len
5322     //    Leaves s pointing to the address which was in d at start
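    //
    //    A sketch of the net effect (len is in 64-bit words, little-endian memory):
    //    the sequence of 32-bit ints is fully reversed, e.g. ints {i0, i1, i2, i3}
    //    at s end up as {i3, i2, i1, i0} at d. This converts between the
    //    most-significant-first int order of the Java arrays and the
    //    least-significant-first longword order used by the multiply loops; the
    //    per-word ror(32) in reverse1 swaps the two ints inside each 64-bit word.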
5323     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
5324       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5325 
5326       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
5327       mov(tmp1, len);
5328       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
5329       sub(s, d, len, ext::uxtw, LogBytesPerWord);
5330     }
5331     // where
5332     void reverse1(Register d, Register s, Register tmp) {
5333       ldr(tmp, pre(s, -wordSize));
5334       ror(tmp, tmp, 32);
5335       str(tmp, post(d, wordSize));
5336     }
5337 
5338     void step_squaring() {
5339       // An extra ACC
5340       step();
5341       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5342     }
5343 
5344     void last_squaring(RegisterOrConstant i) {
5345       Label dont;
5346       // if ((i & 1) == 0) {
5347       tbnz(i.as_register(), 0, dont); {
5348         // MACC(Ra, Rb, t0, t1, t2);
5349         // Ra = *++Pa;
5350         // Rb = *--Pb;
5351         umulh(Rhi_ab, Ra, Rb);
5352         mul(Rlo_ab, Ra, Rb);
5353         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5354       } bind(dont);
5355     }
5356 
5357     void extra_step_squaring() {
5358       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5359 
5360       // MACC(Rm, Rn, t0, t1, t2);
5361       // Rm = *++Pm;
5362       // Rn = *--Pn;
5363       umulh(Rhi_mn, Rm, Rn);
5364       mul(Rlo_mn, Rm, Rn);
5365       ldr(Rm, pre(Pm, wordSize));
5366       ldr(Rn, pre(Pn, -wordSize));
5367     }
5368 
5369     void post1_squaring() {
5370       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5371 
5372       // *Pm = Rm = t0 * inv;
5373       mul(Rm, t0, inv);
5374       str(Rm, Address(Pm));
5375 
5376       // MACC(Rm, Rn, t0, t1, t2);
5377       // t0 = t1; t1 = t2; t2 = 0;
5378       umulh(Rhi_mn, Rm, Rn);
5379 
5380 #ifndef PRODUCT
5381       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5382       {
5383         mul(Rlo_mn, Rm, Rn);
5384         add(Rlo_mn, t0, Rlo_mn);
5385         Label ok;
5386         cbz(Rlo_mn, ok); {
5387           stop("broken Montgomery multiply");
5388         } bind(ok);
5389       }
5390 #endif
5391       // We have very carefully set things up so that
5392       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5393       // the lower half of Rm * Rn because we know the result already:
5394       // it must be -t0.  t0 + (-t0) must generate a carry iff
5395       // t0 != 0.  So, rather than do a mul and an adds we just set
5396       // the carry flag iff t0 is nonzero.
5397       //
5398       // mul(Rlo_mn, Rm, Rn);
5399       // adds(zr, t0, Rlo_mn);
5400       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5401       adcs(t0, t1, Rhi_mn);
5402       adc(t1, t2, zr);
5403       mov(t2, zr);
5404     }
5405 
5406     void acc(Register Rhi, Register Rlo,
5407              Register t0, Register t1, Register t2) {
5408       adds(t0, t0, Rlo);
5409       adcs(t1, t1, Rhi);
5410       adc(t2, t2, zr);
5411     }
5412 
5413   public:
5414     /**
5415      * Fast Montgomery multiplication.  The derivation of the
5416      * algorithm is in A Cryptographic Library for the Motorola
5417      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5418      *
5419      * Arguments:
5420      *
5421      * Inputs for multiplication:
5422      *   c_rarg0   - int array elements a
5423      *   c_rarg1   - int array elements b
5424      *   c_rarg2   - int array elements n (the modulus)
5425      *   c_rarg3   - int length
5426      *   c_rarg4   - int inv
5427      *   c_rarg5   - int array elements m (the result)
5428      *
5429      * Inputs for squaring:
5430      *   c_rarg0   - int array elements a
5431      *   c_rarg1   - int array elements n (the modulus)
5432      *   c_rarg2   - int length
5433      *   c_rarg3   - int inv
5434      *   c_rarg4   - int array elements m (the result)
5435      *
5436      */
5437     address generate_multiply() {
5438       Label argh, nothing;
5439       bind(argh);
5440       stop("MontgomeryMultiply total_allocation must be <= 8192");
5441 
5442       align(CodeEntryAlignment);
5443       address entry = pc();
5444 
5445       cbzw(Rlen, nothing);
5446 
5447       enter();
5448 
5449       // Make room.
5450       cmpw(Rlen, 512);
5451       br(Assembler::HI, argh);
5452       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5453       andr(sp, Ra, -2 * wordSize);
5454 
5455       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5456 
5457       {
5458         // Copy input args, reversing as we go.  We use Ra as a
5459         // temporary variable.
5460         reverse(Ra, Pa_base, Rlen, t0, t1);
5461         if (!_squaring)
5462           reverse(Ra, Pb_base, Rlen, t0, t1);
5463         reverse(Ra, Pn_base, Rlen, t0, t1);
5464       }
5465 
5466       // Push all call-saved registers and also Pm_base which we'll need
5467       // at the end.
5468       save_regs();
5469 
5470 #ifndef PRODUCT
5471       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
5472       {
5473         ldr(Rn, Address(Pn_base, 0));
5474         mul(Rlo_mn, Rn, inv);
5475         subs(zr, Rlo_mn, -1);
5476         Label ok;
5477         br(EQ, ok); {
5478           stop("broken inverse in Montgomery multiply");
5479         } bind(ok);
5480       }
5481 #endif
5482 
5483       mov(Pm_base, Ra);
5484 
5485       mov(t0, zr);
5486       mov(t1, zr);
5487       mov(t2, zr);
5488 
5489       block_comment("for (int i = 0; i < len; i++) {");
5490       mov(Ri, zr); {
5491         Label loop, end;
5492         cmpw(Ri, Rlen);
5493         br(Assembler::GE, end);
5494 
5495         bind(loop);
5496         pre1(Ri);
5497 
5498         block_comment("  for (j = i; j; j--) {"); {
5499           movw(Rj, Ri);
5500           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5501         } block_comment("  } // j");
5502 
5503         post1();
5504         addw(Ri, Ri, 1);
5505         cmpw(Ri, Rlen);
5506         br(Assembler::LT, loop);
5507         bind(end);
5508         block_comment("} // i");
5509       }
5510 
5511       block_comment("for (int i = len; i < 2*len; i++) {");
5512       mov(Ri, Rlen); {
5513         Label loop, end;
5514         cmpw(Ri, Rlen, Assembler::LSL, 1);
5515         br(Assembler::GE, end);
5516 
5517         bind(loop);
5518         pre2(Ri, Rlen);
5519 
5520         block_comment("  for (j = len*2-i-1; j; j--) {"); {
5521           lslw(Rj, Rlen, 1);
5522           subw(Rj, Rj, Ri);
5523           subw(Rj, Rj, 1);
5524           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5525         } block_comment("  } // j");
5526 
5527         post2(Ri, Rlen);
5528         addw(Ri, Ri, 1);
5529         cmpw(Ri, Rlen, Assembler::LSL, 1);
5530         br(Assembler::LT, loop);
5531         bind(end);
5532       }
5533       block_comment("} // i");
5534 
5535       normalize(Rlen);
5536 
5537       mov(Ra, Pm_base);  // Save Pm_base in Ra
5538       restore_regs();  // Restore caller's Pm_base
5539 
5540       // Copy our result into caller's Pm_base
5541       reverse(Pm_base, Ra, Rlen, t0, t1);
5542 
5543       leave();
5544       bind(nothing);
5545       ret(lr);
5546 
5547       return entry;
5548     }
5549     // In C, approximately:
5550 
5551     // void
5552     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
5553     //                     julong Pn_base[], julong Pm_base[],
5554     //                     julong inv, int len) {
5555     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5556     //   julong *Pa, *Pb, *Pn, *Pm;
5557     //   julong Ra, Rb, Rn, Rm;
5558 
5559     //   int i;
5560 
5561     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5562 
5563     //   for (i = 0; i < len; i++) {
5564     //     int j;
5565 
5566     //     Pa = Pa_base;
5567     //     Pb = Pb_base + i;
5568     //     Pm = Pm_base;
5569     //     Pn = Pn_base + i;
5570 
5571     //     Ra = *Pa;
5572     //     Rb = *Pb;
5573     //     Rm = *Pm;
5574     //     Rn = *Pn;
5575 
5576     //     int iters = i;
5577     //     for (j = 0; iters--; j++) {
5578     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5579     //       MACC(Ra, Rb, t0, t1, t2);
5580     //       Ra = *++Pa;
5581     //       Rb = *--Pb;
5582     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5583     //       MACC(Rm, Rn, t0, t1, t2);
5584     //       Rm = *++Pm;
5585     //       Rn = *--Pn;
5586     //     }
5587 
5588     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
5589     //     MACC(Ra, Rb, t0, t1, t2);
5590     //     *Pm = Rm = t0 * inv;
5591     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5592     //     MACC(Rm, Rn, t0, t1, t2);
5593 
5594     //     assert(t0 == 0, "broken Montgomery multiply");
5595 
5596     //     t0 = t1; t1 = t2; t2 = 0;
5597     //   }
5598 
5599     //   for (i = len; i < 2*len; i++) {
5600     //     int j;
5601 
5602     //     Pa = Pa_base + i-len;
5603     //     Pb = Pb_base + len;
5604     //     Pm = Pm_base + i-len;
5605     //     Pn = Pn_base + len;
5606 
5607     //     Ra = *++Pa;
5608     //     Rb = *--Pb;
5609     //     Rm = *++Pm;
5610     //     Rn = *--Pn;
5611 
5612     //     int iters = len*2-i-1;
5613     //     for (j = i-len+1; iters--; j++) {
5614     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5615     //       MACC(Ra, Rb, t0, t1, t2);
5616     //       Ra = *++Pa;
5617     //       Rb = *--Pb;
5618     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5619     //       MACC(Rm, Rn, t0, t1, t2);
5620     //       Rm = *++Pm;
5621     //       Rn = *--Pn;
5622     //     }
5623 
5624     //     Pm_base[i-len] = t0;
5625     //     t0 = t1; t1 = t2; t2 = 0;
5626     //   }
5627 
5628     //   while (t0)
5629     //     t0 = sub(Pm_base, Pn_base, t0, len);
5630     // }
5631 
5632     /**
5633      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
5634      * multiplies than Montgomery multiplication so it should be up to
5635      * 25% faster.  However, its loop control is more complex and it
5636      * may actually run slower on some machines.
5637      *
5638      * Arguments:
5639      *
5640      * Inputs:
5641      *   c_rarg0   - int array elements a
5642      *   c_rarg1   - int array elements n (the modulus)
5643      *   c_rarg2   - int length
5644      *   c_rarg3   - int inv
5645      *   c_rarg4   - int array elements m (the result)
5646      *
5647      */
5648     address generate_square() {
5649       Label argh;
5650       bind(argh);
5651       stop("MontgomeryMultiply total_allocation must be <= 8192");
5652 
5653       align(CodeEntryAlignment);
5654       address entry = pc();
5655 
5656       enter();
5657 
5658       // Make room.
5659       cmpw(Rlen, 512);
5660       br(Assembler::HI, argh);
5661       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5662       andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1);  // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen);
        br(Assembler::GE, end);

        pre1(Ri);

        block_comment("for (j = (i+1)/2; j; j--) {"); {
          add(Rj, Ri, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = i/2; j; j--) {"); {
          lsr(Rj, Ri, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post1_squaring();
        add(Ri, Ri, 1);
        cmp(Ri, Rlen);
        br(Assembler::LT, loop);

        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        pre2(Ri, Rlen);

        block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          sub(Rj, Rj, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        add(Ri, Ri, 1);
        cmp(Ri, Rlen, Assembler::LSL, 1);

        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      normalize(Rlen);  // Final reduction: subtract the modulus while the carry word t0 is non-zero

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();  // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_square(julong Pa_base[], julong Pn_base[],
    //                   julong Pm_base[], julong inv, int len) {
    //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   julong *Pa, *Pb, *Pn, *Pm;
    //   julong Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pa_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = (i+1)/2;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = i/2;
    //     assert(iters == i-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int start = i-len+1;
    //     int end = start + (len - start)/2;
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pa_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = (2*len-i-1)/2;
    //     assert(iters == end-start, "must be");
    //     for (j = start; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters =  (2*len-i)/2;
    //     assert(iters == len-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }
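    // MACC and MACC2 in the pseudocode above denote a multiply-accumulate
    // into the triple-word accumulator (t0, t1, t2); MACC2 adds the product
    // twice, which is how the doubled cross terms of the square are handled.
    // As an illustrative sketch only (an assumption about the notation, not
    // the macros HotSpot's C fallback defines), on a compiler that provides
    // unsigned __int128 they could be written:
    //
    //   #define MACC(A, B, T0, T1, T2)                                 \
    //     do {                                                         \
    //       unsigned __int128 p = (unsigned __int128)(A) * (B);        \
    //       julong lo = (julong)p, hi = (julong)(p >> 64);             \
    //       T0 += lo; hi += (T0 < lo);     /* carry into high word  */ \
    //       T1 += hi; T2 += (T1 < hi);     /* carry into third word */ \
    //     } while (0)
    //
    //   #define MACC2(A, B, T0, T1, T2)                                \
    //     do { MACC(A, B, T0, T1, T2); MACC(A, B, T0, T1, T2); } while (0)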
  };


  // Initialization
  void generate_initial() {
    // Generates the initial stubs and initializes the entry points.

    // Entry points that exist on all platforms.  Note: this is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure.  See also the comment
    // in stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // The catch_exception entry is referenced from megamorphic call sites.
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_StackOverflowError));
    StubRoutines::_throw_delayed_StackOverflowError_entry =
      generate_throw_exception("delayed StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_delayed_StackOverflowError));
    if (UseCRC32Intrinsics) {
      // Set the CRC table address before generating the stub, which uses it.
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    // Disabled until JDK-8210858 is fixed
    // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
    //   StubRoutines::_dlog = generate_dlog();
    // }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
    }

    // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
                                                       &StubRoutines::_safefetch32_fault_pc,
                                                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                                                       &StubRoutines::_safefetchN_fault_pc,
                                                       &StubRoutines::_safefetchN_continuation_pc);
  }

  void generate_all() {
    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    // has negatives stub for large arrays.
    StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    generate_compare_long_strings();

    generate_string_indexof_stubs();

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
    if (bs_nm != NULL) {
      StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
    }
#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

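    // The two Montgomery stubs below back the java.math.BigInteger
    // intrinsics (implMontgomeryMultiply / implMontgomerySquare).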
    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }
#endif // COMPILER2

    // generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

    StubRoutines::aarch64::set_completed();
  }

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

#define UCM_TABLE_MAX_ENTRIES 8
void StubGenerator_generate(CodeBuffer* code, bool all) {
  if (UnsafeCopyMemory::_table == NULL) {
    UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
  }
  StubGenerator g(code, all);
}