1 /*
   2  * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "memory/universe.hpp"
  33 #include "nativeInst_aarch64.hpp"
  34 #include "oops/instanceOop.hpp"
  35 #include "oops/method.hpp"
  36 #include "oops/objArrayKlass.hpp"
  37 #include "oops/oop.inline.hpp"
  38 #include "prims/methodHandles.hpp"
  39 #include "runtime/frame.inline.hpp"
  40 #include "runtime/handles.inline.hpp"
  41 #include "runtime/sharedRuntime.hpp"
  42 #include "runtime/stubCodeGenerator.hpp"
  43 #include "runtime/stubRoutines.hpp"
  44 #include "runtime/thread.inline.hpp"
  45 #include "utilities/align.hpp"
  46 #ifdef COMPILER2
  47 #include "opto/runtime.hpp"
  48 #endif
  49 #if INCLUDE_ZGC
  50 #include "gc/z/zThreadLocalData.hpp"
  51 #endif
  52 
  53 // Declaration and definition of StubGenerator (no .hpp file).
  54 // For a more detailed description of the stub routine structure
  55 // see the comment in stubRoutines.hpp
  56 
  57 #undef __
  58 #define __ _masm->
  59 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  60 
  61 #ifdef PRODUCT
  62 #define BLOCK_COMMENT(str) /* nothing */
  63 #else
  64 #define BLOCK_COMMENT(str) __ block_comment(str)
  65 #endif
  66 
  67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  68 
  69 // Stub Code definitions
  70 
  71 class StubGenerator: public StubCodeGenerator {
  72  private:
  73 
  74 #ifdef PRODUCT
  75 #define inc_counter_np(counter) ((void)0)
  76 #else
  77   void inc_counter_np_(int& counter) {
  78     __ lea(rscratch2, ExternalAddress((address)&counter));
  79     __ ldrw(rscratch1, Address(rscratch2));
  80     __ addw(rscratch1, rscratch1, 1);
  81     __ strw(rscratch1, Address(rscratch2));
  82   }
  83 #define inc_counter_np(counter) \
  84   BLOCK_COMMENT("inc_counter " #counter); \
  85   inc_counter_np_(counter);
  86 #endif
  87 
  88   // Call stubs are used to call Java from C
  89   //
  90   // Arguments:
  91   //    c_rarg0:   call wrapper address                   address
  92   //    c_rarg1:   result                                 address
  93   //    c_rarg2:   result type                            BasicType
  94   //    c_rarg3:   method                                 Method*
  95   //    c_rarg4:   (interpreter) entry point              address
  96   //    c_rarg5:   parameters                             intptr_t*
  97   //    c_rarg6:   parameter size (in words)              int
  98   //    c_rarg7:   thread                                 Thread*
  99   //
 100   // There is no return from the stub itself as any Java result
 101   // is written to result
 102   //
 103   // we save r30 (lr) as the return PC at the base of the frame and
 104   // link r29 (fp) below it as the frame pointer, installing sp (r31)
 105   // into fp so that fp addresses the saved fp/lr pair.
 106   //
 107   // we save r0-r7, which accounts for all the c arguments.
 108   //
 109   // TODO: strictly do we need to save them all? they are treated as
 110   // volatile by C so could we omit saving the ones we are going to
 111   // place in global registers (thread? method?) or those we only use
 112   // during setup of the Java call?
 113   //
 114   // we don't need to save r8 which C uses as an indirect result location
 115   // return register.
 116   //
 117   // we don't need to save r9-r15 which both C and Java treat as
 118   // volatile
 119   //
 120   // we don't need to save r16-r18 because Java does not use them
 121   //
 122   // we save r19-r28 which Java uses as scratch registers and C
 123   // expects to be callee-save
 124   //
 125   // we save the bottom 64 bits of each value stored in v8-v15; it is
 126   // the responsibility of the caller to preserve larger values.
 127   //
 128   // so the stub frame looks like this when we enter Java code
 129   //
 130   //     [ return_from_Java     ] <--- sp
 131   //     [ argument word n      ]
 132   //      ...
 133   // -27 [ argument word 1      ]
 134   // -26 [ saved v15            ] <--- sp_after_call
 135   // -25 [ saved v14            ]
 136   // -24 [ saved v13            ]
 137   // -23 [ saved v12            ]
 138   // -22 [ saved v11            ]
 139   // -21 [ saved v10            ]
 140   // -20 [ saved v9             ]
 141   // -19 [ saved v8             ]
 142   // -18 [ saved r28            ]
 143   // -17 [ saved r27            ]
 144   // -16 [ saved r26            ]
 145   // -15 [ saved r25            ]
 146   // -14 [ saved r24            ]
 147   // -13 [ saved r23            ]
 148   // -12 [ saved r22            ]
 149   // -11 [ saved r21            ]
 150   // -10 [ saved r20            ]
 151   //  -9 [ saved r19            ]
 152   //  -8 [ call wrapper    (r0) ]
 153   //  -7 [ result          (r1) ]
 154   //  -6 [ result type     (r2) ]
 155   //  -5 [ method          (r3) ]
 156   //  -4 [ entry point     (r4) ]
 157   //  -3 [ parameters      (r5) ]
 158   //  -2 [ parameter size  (r6) ]
 159   //  -1 [ thread (r7)          ]
 160   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 161   //   1 [ saved lr       (r30) ]
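       //
       // For orientation (a hedged sketch, not part of the stub itself): the
       // C++ runtime reaches this code through the CallStub function pointer
       // type declared in stubRoutines.hpp, whose parameters correspond
       // one-to-one with c_rarg0..c_rarg7 above, roughly as in
       // JavaCalls::call_helper (exact argument expressions may differ):
       //
       //   StubRoutines::call_stub()(
       //       (address)&link,              // c_rarg0: call wrapper
       //       result_val_address,          // c_rarg1: result slot
       //       result_type,                 // c_rarg2: BasicType of result
       //       method(),                    // c_rarg3: Method*
       //       entry_point,                 // c_rarg4: interpreter entry
       //       args->parameters(),          // c_rarg5: parameter words
       //       args->size_of_parameters(),  // c_rarg6: count in words
       //       thread);                     // c_rarg7: current thread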
 162 
 163   // Call stub stack layout word offsets from fp
 164   enum call_stub_layout {
 165     sp_after_call_off = -26,
 166 
 167     d15_off            = -26,
 168     d13_off            = -24,
 169     d11_off            = -22,
 170     d9_off             = -20,
 171 
 172     r28_off            = -18,
 173     r26_off            = -16,
 174     r24_off            = -14,
 175     r22_off            = -12,
 176     r20_off            = -10,
 177     call_wrapper_off   =  -8,
 178     result_off         =  -7,
 179     result_type_off    =  -6,
 180     method_off         =  -5,
 181     entry_point_off    =  -4,
 182     parameter_size_off =  -2,
 183     thread_off         =  -1,
 184     fp_f               =   0,
 185     retaddr_off        =   1,
 186   };
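       // Each *_off above names only the lower slot of a stored pair; the
       // partner register sits one word higher.  For example the
       // stpd(v15, v14, d15_save) below writes v15 at d15_off (-26) and
       // v14 at -25, matching the frame layout drawn above.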
 187 
 188   address generate_call_stub(address& return_address) {
 189     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 190            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 191            "adjust this code");
 192 
 193     StubCodeMark mark(this, "StubRoutines", "call_stub");
 194     address start = __ pc();
 195 
 196     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 197 
 198     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 199     const Address result        (rfp, result_off         * wordSize);
 200     const Address result_type   (rfp, result_type_off    * wordSize);
 201     const Address method        (rfp, method_off         * wordSize);
 202     const Address entry_point   (rfp, entry_point_off    * wordSize);
 203     const Address parameter_size(rfp, parameter_size_off * wordSize);
 204 
 205     const Address thread        (rfp, thread_off         * wordSize);
 206 
 207     const Address d15_save      (rfp, d15_off * wordSize);
 208     const Address d13_save      (rfp, d13_off * wordSize);
 209     const Address d11_save      (rfp, d11_off * wordSize);
 210     const Address d9_save       (rfp, d9_off * wordSize);
 211 
 212     const Address r28_save      (rfp, r28_off * wordSize);
 213     const Address r26_save      (rfp, r26_off * wordSize);
 214     const Address r24_save      (rfp, r24_off * wordSize);
 215     const Address r22_save      (rfp, r22_off * wordSize);
 216     const Address r20_save      (rfp, r20_off * wordSize);
 217 
 218     // stub code
 219 
 220     address aarch64_entry = __ pc();
 221 
 222     // set up frame and move sp to end of save area
 223     __ enter();
 224     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 225 
 226     // save register parameters and Java scratch/global registers
 227     // n.b. we save thread even though it gets installed in
 228     // rthread because we want to sanity check rthread later
 229     __ str(c_rarg7,  thread);
 230     __ strw(c_rarg6, parameter_size);
 231     __ stp(c_rarg4, c_rarg5,  entry_point);
 232     __ stp(c_rarg2, c_rarg3,  result_type);
 233     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 234 
 235     __ stp(r20, r19,   r20_save);
 236     __ stp(r22, r21,   r22_save);
 237     __ stp(r24, r23,   r24_save);
 238     __ stp(r26, r25,   r26_save);
 239     __ stp(r28, r27,   r28_save);
 240 
 241     __ stpd(v9,  v8,   d9_save);
 242     __ stpd(v11, v10,  d11_save);
 243     __ stpd(v13, v12,  d13_save);
 244     __ stpd(v15, v14,  d15_save);
 245 
 246     // install Java thread in global register now that we have saved
 247     // whatever value it held
 248     __ mov(rthread, c_rarg7);
 249     // And method
 250     __ mov(rmethod, c_rarg3);
 251 
 252     // set up the heapbase register
 253     __ reinit_heapbase();
 254 
 255 #ifdef ASSERT
 256     // make sure we have no pending exceptions
 257     {
 258       Label L;
 259       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 260       __ cmp(rscratch1, (u1)NULL_WORD);
 261       __ br(Assembler::EQ, L);
 262       __ stop("StubRoutines::call_stub: entered with pending exception");
 263       __ BIND(L);
 264     }
 265 #endif
 266     // pass parameters if any
 267     __ mov(esp, sp);
 268     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 269     __ andr(sp, rscratch1, -2 * wordSize);
 270 
 271     BLOCK_COMMENT("pass parameters if any");
 272     Label parameters_done;
 273     // parameter count is still in c_rarg6
 274     // and parameter pointer identifying param 1 is in c_rarg5
 275     __ cbzw(c_rarg6, parameters_done);
 276 
 277     address loop = __ pc();
 278     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 279     __ subsw(c_rarg6, c_rarg6, 1);
 280     __ push(rscratch1);
 281     __ br(Assembler::GT, loop);
 282 
 283     __ BIND(parameters_done);
 284 
 285     // call Java entry -- passing Method* and current sp
 286     //      rmethod: Method*
 287     //      r13: sender sp
 288     BLOCK_COMMENT("call Java function");
 289     __ mov(r13, sp);
 290     __ blr(c_rarg4);
 291 
 292     // we do this here because the notify will already have been done
 293     // if we get to the next instruction via an exception
 294     //
 295     // n.b. adding this instruction here affects the calculation of
 296     // whether or not a routine returns to the call stub (used when
 297     // doing stack walks) since the normal test is to check the return
 298     // pc against the address saved below. so we may need to allow for
 299     // this extra instruction in the check.
 300 
 301     // save current address for use by exception handling code
 302 
 303     return_address = __ pc();
 304 
 305     // store result depending on type (everything that is not
 306     // T_OBJECT, T_VALUETYPE, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 307     // n.b. this assumes Java returns an integral result in r0
 308     // and a floating result in j_farg0
 309     __ ldr(j_rarg2, result);
 310     Label is_long, is_float, is_double, is_value, exit;
 311     __ ldr(j_rarg1, result_type);
 312     __ cmp(j_rarg1, (u1)T_OBJECT);
 313     __ br(Assembler::EQ, is_long);
 314     __ cmp(j_rarg1, (u1)T_VALUETYPE);
 315     __ br(Assembler::EQ, is_value);
 316     __ cmp(j_rarg1, (u1)T_LONG);
 317     __ br(Assembler::EQ, is_long);
 318     __ cmp(j_rarg1, (u1)T_FLOAT);
 319     __ br(Assembler::EQ, is_float);
 320     __ cmp(j_rarg1, (u1)T_DOUBLE);
 321     __ br(Assembler::EQ, is_double);
 322 
 323     // handle T_INT case
 324     __ strw(r0, Address(j_rarg2));
 325 
 326     __ BIND(exit);
 327 
 328     // pop parameters
 329     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 330 
 331 #ifdef ASSERT
 332     // verify that threads correspond
 333     {
 334       Label L, S;
 335       __ ldr(rscratch1, thread);
 336       __ cmp(rthread, rscratch1);
 337       __ br(Assembler::NE, S);
 338       __ get_thread(rscratch1);
 339       __ cmp(rthread, rscratch1);
 340       __ br(Assembler::EQ, L);
 341       __ BIND(S);
 342       __ stop("StubRoutines::call_stub: threads must correspond");
 343       __ BIND(L);
 344     }
 345 #endif
 346 
 347     // restore callee-save registers
 348     __ ldpd(v15, v14,  d15_save);
 349     __ ldpd(v13, v12,  d13_save);
 350     __ ldpd(v11, v10,  d11_save);
 351     __ ldpd(v9,  v8,   d9_save);
 352 
 353     __ ldp(r28, r27,   r28_save);
 354     __ ldp(r26, r25,   r26_save);
 355     __ ldp(r24, r23,   r24_save);
 356     __ ldp(r22, r21,   r22_save);
 357     __ ldp(r20, r19,   r20_save);
 358 
 359     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 360     __ ldrw(c_rarg2, result_type);
 361     __ ldr(c_rarg3,  method);
 362     __ ldp(c_rarg4, c_rarg5,  entry_point);
 363     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 364 
 365     // leave frame and return to caller
 366     __ leave();
 367     __ ret(lr);
 368 
 369     // handle return types different from T_INT
 370     __ BIND(is_value);
 371     if (ValueTypeReturnedAsFields) {
 372       // Check for flattened return value
 373       __ cbz(r0, is_long);
 374       // Initialize pre-allocated buffer
 375       __ mov(r1, r0);
 376       __ andr(r1, r1, -2);
 377       __ ldr(r1, Address(r1, InstanceKlass::adr_valueklass_fixed_block_offset()));
 378       __ ldr(r1, Address(r1, ValueKlass::pack_handler_offset()));
 379       __ ldr(r0, Address(j_rarg2, 0));
 380       __ blr(r1);
 381       __ b(exit);
 382     }
 383 
 384     __ BIND(is_long);
 385     __ str(r0, Address(j_rarg2, 0));
 386     __ br(Assembler::AL, exit);
 387 
 388     __ BIND(is_float);
 389     __ strs(j_farg0, Address(j_rarg2, 0));
 390     __ br(Assembler::AL, exit);
 391 
 392     __ BIND(is_double);
 393     __ strd(j_farg0, Address(j_rarg2, 0));
 394     __ br(Assembler::AL, exit);
 395 
 396     return start;
 397   }
 398 
 399   // Return point for a Java call if there's an exception thrown in
 400   // Java code.  The exception is caught and transformed into a
 401   // pending exception stored in JavaThread that can be tested from
 402   // within the VM.
 403   //
 404   // Note: Usually the parameters are removed by the callee. In case
 405   // of an exception crossing an activation frame boundary, that is
 406   // not the case if the callee is compiled code => need to set up the
 407   // sp.
 408   //
 409   // r0: exception oop
 410 
 411   address generate_catch_exception() {
 412     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 413     address start = __ pc();
 414 
 415     // same as in generate_call_stub():
 416     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 417     const Address thread        (rfp, thread_off         * wordSize);
 418 
 419 #ifdef ASSERT
 420     // verify that threads correspond
 421     {
 422       Label L, S;
 423       __ ldr(rscratch1, thread);
 424       __ cmp(rthread, rscratch1);
 425       __ br(Assembler::NE, S);
 426       __ get_thread(rscratch1);
 427       __ cmp(rthread, rscratch1);
 428       __ br(Assembler::EQ, L);
 429       __ bind(S);
 430       __ stop("StubRoutines::catch_exception: threads must correspond");
 431       __ bind(L);
 432     }
 433 #endif
 434 
 435     // set pending exception
 436     __ verify_oop(r0);
 437 
 438     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 439     __ mov(rscratch1, (address)__FILE__);
 440     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 441     __ movw(rscratch1, (int)__LINE__);
 442     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 443 
 444     // complete return to VM
 445     assert(StubRoutines::_call_stub_return_address != NULL,
 446            "_call_stub_return_address must have been generated before");
 447     __ b(StubRoutines::_call_stub_return_address);
 448 
 449     return start;
 450   }
 451 
 452   // Continuation point for runtime calls returning with a pending
 453   // exception.  The pending exception check happened in the runtime
 454   // or native call stub.  The pending exception in Thread is
 455   // converted into a Java-level exception.
 456   //
 457   // Contract with Java-level exception handlers:
 458   // r0: exception
 459   // r3: throwing pc
 460   //
 461   // NOTE: At entry of this stub, exception-pc must be in LR !!
 462 
 463   // NOTE: this is always used as a jump target within generated code
 464   // so it just needs to be generated code with no prolog
 465 
 466   address generate_forward_exception() {
 467     StubCodeMark mark(this, "StubRoutines", "forward exception");
 468     address start = __ pc();
 469 
 470     // Upon entry, LR points to the return address returning into
 471     // Java (interpreted or compiled) code; i.e., the return address
 472     // becomes the throwing pc.
 473     //
 474     // Arguments pushed before the runtime call are still on the stack
 475     // but the exception handler will reset the stack pointer ->
 476     // ignore them.  A potential result in registers can be ignored as
 477     // well.
 478 
 479 #ifdef ASSERT
 480     // make sure this code is only executed if there is a pending exception
 481     {
 482       Label L;
 483       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 484       __ cbnz(rscratch1, L);
 485       __ stop("StubRoutines::forward exception: no pending exception (1)");
 486       __ bind(L);
 487     }
 488 #endif
 489 
 490     // compute exception handler into r19
 491 
 492     // call the VM to find the handler address associated with the
 493     // caller address. pass thread in r0 and caller pc (ret address)
 494     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 495     // the stack.
 496     __ mov(c_rarg1, lr);
 497     // lr will be trashed by the VM call so we move it to R19
 498     // (callee-saved) because we also need to pass it to the handler
 499     // returned by this call.
 500     __ mov(r19, lr);
 501     BLOCK_COMMENT("call exception_handler_for_return_address");
 502     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 503                          SharedRuntime::exception_handler_for_return_address),
 504                     rthread, c_rarg1);
 505     // we should not really care that lr is no longer the callee
 506     // address. we saved the value the handler needs in r19 so we can
 507     // just copy it to r3. however, the C2 handler will push its own
 508   // frame and then call into the VM, and the VM code asserts that
 509     // the PC for the frame above the handler belongs to a compiled
 510     // Java method. So, we restore lr here to satisfy that assert.
 511     __ mov(lr, r19);
 512     // setup r0 & r3 & clear pending exception
 513     __ mov(r3, r19);
 514     __ mov(r19, r0);
 515     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 516     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 517 
 518 #ifdef ASSERT
 519     // make sure exception is set
 520     {
 521       Label L;
 522       __ cbnz(r0, L);
 523       __ stop("StubRoutines::forward exception: no pending exception (2)");
 524       __ bind(L);
 525     }
 526 #endif
 527 
 528     // continue at exception handler
 529     // r0: exception
 530     // r3: throwing pc
 531     // r19: exception handler
 532     __ verify_oop(r0);
 533     __ br(r19);
 534 
 535     return start;
 536   }
 537 
 538   // Non-destructive plausibility checks for oops
 539   //
 540   // Arguments:
 541   //    r0: oop to verify
 542   //    rscratch1: error message
 543   //
 544   // Stack after saving c_rarg3:
 545   //    [tos + 0]: saved c_rarg3
 546   //    [tos + 1]: saved c_rarg2
 547   //    [tos + 2]: saved lr
 548   //    [tos + 3]: saved rscratch2
 549   //    [tos + 4]: saved r0
 550   //    [tos + 5]: saved rscratch1
 551   address generate_verify_oop() {
 552 
 553     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 554     address start = __ pc();
 555 
 556     Label exit, error;
 557 
 558     // save c_rarg2 and c_rarg3
 559     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 560 
 561     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 562     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 563     __ ldr(c_rarg3, Address(c_rarg2));
 564     __ add(c_rarg3, c_rarg3, 1);
 565     __ str(c_rarg3, Address(c_rarg2));
 566 
 567     // object is in r0
 568     // make sure object is 'reasonable'
 569     __ cbz(r0, exit); // if obj is NULL it is OK
 570 
 571 #if INCLUDE_ZGC
 572     if (UseZGC) {
 573       // Check if mask is good.
 574       // verifies that ZAddressBadMask & r0 == 0
 575       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 576       __ andr(c_rarg2, r0, c_rarg3);
 577       __ cbnz(c_rarg2, error);
 578     }
 579 #endif
 580 
 581     // Check if the oop is in the right area of memory
 582     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 583     __ andr(c_rarg2, r0, c_rarg3);
 584     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 585 
 586     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 587     // instruction here because the flags register is live.
 588     __ eor(c_rarg2, c_rarg2, c_rarg3);
 589     __ cbnz(c_rarg2, error);
 590 
 591     // make sure klass is 'reasonable', i.e. non-NULL.
 592     __ load_klass(r0, r0);  // get klass
 593     __ cbz(r0, error);      // if klass is NULL it is broken
 594 
 595     // return if everything seems ok
 596     __ bind(exit);
 597 
 598     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 599     __ ret(lr);
 600 
 601     // handle errors
 602     __ bind(error);
 603     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 604 
 605     __ push(RegSet::range(r0, r29), sp);
 606     // debug(char* msg, int64_t pc, int64_t regs[])
 607     __ mov(c_rarg0, rscratch1);      // pass address of error message
 608     __ mov(c_rarg1, lr);             // pass return address
 609     __ mov(c_rarg2, sp);             // pass address of regs on stack
 610 #ifndef PRODUCT
 611     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 612 #endif
 613     BLOCK_COMMENT("call MacroAssembler::debug");
 614     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 615     __ blr(rscratch1);
 616     __ hlt(0);
 617 
 618     return start;
 619   }
 620 
 621   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 622 
 623   // The inner part of zero_words().  This is the bulk operation,
 624   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 625   // caller is responsible for zeroing the last few words.
 626   //
 627   // Inputs:
 628   // r10: the HeapWord-aligned base address of an array to zero.
 629   // r11: the count in HeapWords, r11 > 0.
 630   //
 631   // Returns r10 and r11, adjusted for the caller to clear.
 632   // r10: the base address of the tail of words left to clear.
 633   // r11: the number of words in the tail.
 634   //      r11 < MacroAssembler::zero_words_block_size.
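       //
       // A caller-side sketch (hypothetical, for illustration only; the real
       // caller is MacroAssembler::zero_words), assuming the stub address is
       // published as StubRoutines::aarch64::zero_blocks():
       //
       //   __ mov(r10, base);   // HeapWord-aligned base address
       //   __ mov(r11, cnt);    // count in HeapWords, cnt > 0
       //   __ far_call(RuntimeAddress(StubRoutines::aarch64::zero_blocks()));
       //   // on return r10/r11 describe the tail (fewer than
       //   // zero_words_block_size words) still to be cleared by the caller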
 635 
 636   address generate_zero_blocks() {
 637     Label done;
 638     Label base_aligned;
 639 
 640     Register base = r10, cnt = r11;
 641 
 642     __ align(CodeEntryAlignment);
 643     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 644     address start = __ pc();
 645 
 646     if (UseBlockZeroing) {
 647       int zva_length = VM_Version::zva_length();
 648 
 649       // Ensure ZVA length can be divided by 16. This is required by
 650       // the subsequent operations.
 651       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 652 
 653       __ tbz(base, 3, base_aligned);
 654       __ str(zr, Address(__ post(base, 8)));
 655       __ sub(cnt, cnt, 1);
 656       __ bind(base_aligned);
 657 
 658       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 659       // alignment.
 660       Label small;
 661       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 662       __ subs(rscratch1, cnt, low_limit >> 3);
 663       __ br(Assembler::LT, small);
 664       __ zero_dcache_blocks(base, cnt);
 665       __ bind(small);
 666     }
 667 
 668     {
 669       // Number of stp instructions we'll unroll
 670       const int unroll =
 671         MacroAssembler::zero_words_block_size / 2;
 672       // Clear the remaining blocks.
 673       Label loop;
 674       __ subs(cnt, cnt, unroll * 2);
 675       __ br(Assembler::LT, done);
 676       __ bind(loop);
 677       for (int i = 0; i < unroll; i++)
 678         __ stp(zr, zr, __ post(base, 16));
 679       __ subs(cnt, cnt, unroll * 2);
 680       __ br(Assembler::GE, loop);
 681       __ bind(done);
 682       __ add(cnt, cnt, unroll * 2);
 683     }
 684 
 685     __ ret(lr);
 686 
 687     return start;
 688   }
 689 
 690 
 691   typedef enum {
 692     copy_forwards = 1,
 693     copy_backwards = -1
 694   } copy_direction;
 695 
 696   // Bulk copy of blocks of 8 words.
 697   //
 698   // count is a count of words.
 699   //
 700   // Precondition: count >= 8
 701   //
 702   // Postconditions:
 703   //
 704   // The least significant bit of count contains the remaining count
 705   // of words to copy.  The rest of count is trash.
 706   //
 707   // s and d are adjusted to point to the remaining words to copy
 708   //
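       // A worked example of the postcondition (illustrative only): if count
       // enters as 23, the 8-word block phase copies 16 words, the 4-word and
       // 2-word tail blocks selected by bits 2 and 1 of count copy 6 more,
       // and bit 0 of count (here 1) is the one word left for the caller.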
 709   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 710                            copy_direction direction) {
 711     int unit = wordSize * direction;
 712     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 713 
 714     int offset;
 715     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 716       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 717     const Register stride = r13;
 718 
 719     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 720     assert_different_registers(s, d, count, rscratch1);
 721 
 722     Label again, drain;
 723     const char *stub_name;
 724     if (direction == copy_forwards)
 725       stub_name = "forward_copy_longs";
 726     else
 727       stub_name = "backward_copy_longs";
 728 
 729     __ align(CodeEntryAlignment);
 730 
 731     StubCodeMark mark(this, "StubRoutines", stub_name);
 732 
 733     __ bind(start);
 734 
 735     Label unaligned_copy_long;
 736     if (AvoidUnalignedAccesses) {
 737       __ tbnz(d, 3, unaligned_copy_long);
 738     }
 739 
 740     if (direction == copy_forwards) {
 741       __ sub(s, s, bias);
 742       __ sub(d, d, bias);
 743     }
 744 
 745 #ifdef ASSERT
 746     // Make sure we are never given < 8 words
 747     {
 748       Label L;
 749       __ cmp(count, (u1)8);
 750       __ br(Assembler::GE, L);
 751       __ stop("generate_copy_longs called with < 8 words");
 752       __ bind(L);
 753     }
 754 #endif
 755 
 756     // Fill 8 registers
 757     if (UseSIMDForMemoryOps) {
 758       __ ldpq(v0, v1, Address(s, 4 * unit));
 759       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 760     } else {
 761       __ ldp(t0, t1, Address(s, 2 * unit));
 762       __ ldp(t2, t3, Address(s, 4 * unit));
 763       __ ldp(t4, t5, Address(s, 6 * unit));
 764       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 765     }
 766 
 767     __ subs(count, count, 16);
 768     __ br(Assembler::LO, drain);
 769 
 770     int prefetch = PrefetchCopyIntervalInBytes;
 771     bool use_stride = false;
 772     if (direction == copy_backwards) {
 773        use_stride = prefetch > 256;
 774        prefetch = -prefetch;
 775        if (use_stride) __ mov(stride, prefetch);
 776     }
 777 
 778     __ bind(again);
 779 
 780     if (PrefetchCopyIntervalInBytes > 0)
 781       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 782 
 783     if (UseSIMDForMemoryOps) {
 784       __ stpq(v0, v1, Address(d, 4 * unit));
 785       __ ldpq(v0, v1, Address(s, 4 * unit));
 786       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 787       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 788     } else {
 789       __ stp(t0, t1, Address(d, 2 * unit));
 790       __ ldp(t0, t1, Address(s, 2 * unit));
 791       __ stp(t2, t3, Address(d, 4 * unit));
 792       __ ldp(t2, t3, Address(s, 4 * unit));
 793       __ stp(t4, t5, Address(d, 6 * unit));
 794       __ ldp(t4, t5, Address(s, 6 * unit));
 795       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 796       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 797     }
 798 
 799     __ subs(count, count, 8);
 800     __ br(Assembler::HS, again);
 801 
 802     // Drain
 803     __ bind(drain);
 804     if (UseSIMDForMemoryOps) {
 805       __ stpq(v0, v1, Address(d, 4 * unit));
 806       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 807     } else {
 808       __ stp(t0, t1, Address(d, 2 * unit));
 809       __ stp(t2, t3, Address(d, 4 * unit));
 810       __ stp(t4, t5, Address(d, 6 * unit));
 811       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 812     }
 813 
 814     {
 815       Label L1, L2;
 816       __ tbz(count, exact_log2(4), L1);
 817       if (UseSIMDForMemoryOps) {
 818         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 819         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 820       } else {
 821         __ ldp(t0, t1, Address(s, 2 * unit));
 822         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 823         __ stp(t0, t1, Address(d, 2 * unit));
 824         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 825       }
 826       __ bind(L1);
 827 
 828       if (direction == copy_forwards) {
 829         __ add(s, s, bias);
 830         __ add(d, d, bias);
 831       }
 832 
 833       __ tbz(count, 1, L2);
 834       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 835       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 836       __ bind(L2);
 837     }
 838 
 839     __ ret(lr);
 840 
 841     if (AvoidUnalignedAccesses) {
 842       Label drain, again;
 843       // Register order for storing. Order is different for backward copy.
 844 
 845       __ bind(unaligned_copy_long);
 846 
 847       // source address is even aligned, target odd aligned
 848       //
 849       // when forward copying word pairs we read long pairs at offsets
 850       // {0, 2, 4, 6} (in long words). when backwards copying we read
 851       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 852       // address by -2 in the forwards case so we can compute the
 853       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 854       // or -1.
 855       //
 856       // when forward copying we need to store 1 word, 3 pairs and
 857       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 858       // zero offset we adjust the destination by -1 (one word), which
 859       // means we have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 860       //
 861       // When backwards copying we need to store 1 word, 3 pairs and
 862       // then 1 word at offsets {-1, -3, -5, -7, -8}, i.e. we use
 863       // offsets {1, 3, 5, 7, 8} * unit.
 864 
 865       if (direction == copy_forwards) {
 866         __ sub(s, s, 16);
 867         __ sub(d, d, 8);
 868       }
 869 
 870       // Fill 8 registers
 871       //
 872       // for forwards copy s was offset by -16 from the original input
 873       // value of s so the register contents are at these offsets
 874       // relative to the 64 byte block addressed by that original input
 875       // and so on for each successive 64 byte block when s is updated
 876       //
 877       // t0 at offset 0,  t1 at offset 8
 878       // t2 at offset 16, t3 at offset 24
 879       // t4 at offset 32, t5 at offset 40
 880       // t6 at offset 48, t7 at offset 56
 881 
 882       // for backwards copy s was not offset so the register contents
 883       // are at these offsets into the preceding 64 byte block
 884       // relative to that original input and so on for each successive
 885       // preceding 64 byte block when s is updated. this explains the
 886       // slightly counter-intuitive looking pattern of register usage
 887       // in the stp instructions for backwards copy.
 888       //
 889       // t0 at offset -16, t1 at offset -8
 890       // t2 at offset -32, t3 at offset -24
 891       // t4 at offset -48, t5 at offset -40
 892       // t6 at offset -64, t7 at offset -56
 893 
 894       __ ldp(t0, t1, Address(s, 2 * unit));
 895       __ ldp(t2, t3, Address(s, 4 * unit));
 896       __ ldp(t4, t5, Address(s, 6 * unit));
 897       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 898 
 899       __ subs(count, count, 16);
 900       __ br(Assembler::LO, drain);
 901 
 902       int prefetch = PrefetchCopyIntervalInBytes;
 903       bool use_stride = false;
 904       if (direction == copy_backwards) {
 905          use_stride = prefetch > 256;
 906          prefetch = -prefetch;
 907          if (use_stride) __ mov(stride, prefetch);
 908       }
 909 
 910       __ bind(again);
 911 
 912       if (PrefetchCopyIntervalInBytes > 0)
 913         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 914 
 915       if (direction == copy_forwards) {
 916        // allowing for the offset of -8 the store instructions place
 917        // registers into the target 64 byte block at the following
 918        // offsets
 919        //
 920        // t0 at offset 0
 921        // t1 at offset 8,  t2 at offset 16
 922        // t3 at offset 24, t4 at offset 32
 923        // t5 at offset 40, t6 at offset 48
 924        // t7 at offset 56
 925 
 926         __ str(t0, Address(d, 1 * unit));
 927         __ stp(t1, t2, Address(d, 2 * unit));
 928         __ ldp(t0, t1, Address(s, 2 * unit));
 929         __ stp(t3, t4, Address(d, 4 * unit));
 930         __ ldp(t2, t3, Address(s, 4 * unit));
 931         __ stp(t5, t6, Address(d, 6 * unit));
 932         __ ldp(t4, t5, Address(s, 6 * unit));
 933         __ str(t7, Address(__ pre(d, 8 * unit)));
 934         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 935       } else {
 936        // d was not offset when we started so the registers are
 937        // written into the 64 byte block preceding d with the following
 938        // offsets
 939        //
 940        // t1 at offset -8
 941        // t3 at offset -24, t0 at offset -16
 942        // t5 at offset -40, t2 at offset -32
 943        // t7 at offset -56, t4 at offset -48
 944        //                   t6 at offset -64
 945        //
 946        // note that this matches the offsets previously noted for the
 947        // loads
 948 
 949         __ str(t1, Address(d, 1 * unit));
 950         __ stp(t3, t0, Address(d, 3 * unit));
 951         __ ldp(t0, t1, Address(s, 2 * unit));
 952         __ stp(t5, t2, Address(d, 5 * unit));
 953         __ ldp(t2, t3, Address(s, 4 * unit));
 954         __ stp(t7, t4, Address(d, 7 * unit));
 955         __ ldp(t4, t5, Address(s, 6 * unit));
 956         __ str(t6, Address(__ pre(d, 8 * unit)));
 957         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 958       }
 959 
 960       __ subs(count, count, 8);
 961       __ br(Assembler::HS, again);
 962 
 963       // Drain
 964       //
 965       // this uses the same pattern of offsets and register arguments
 966       // as above
 967       __ bind(drain);
 968       if (direction == copy_forwards) {
 969         __ str(t0, Address(d, 1 * unit));
 970         __ stp(t1, t2, Address(d, 2 * unit));
 971         __ stp(t3, t4, Address(d, 4 * unit));
 972         __ stp(t5, t6, Address(d, 6 * unit));
 973         __ str(t7, Address(__ pre(d, 8 * unit)));
 974       } else {
 975         __ str(t1, Address(d, 1 * unit));
 976         __ stp(t3, t0, Address(d, 3 * unit));
 977         __ stp(t5, t2, Address(d, 5 * unit));
 978         __ stp(t7, t4, Address(d, 7 * unit));
 979         __ str(t6, Address(__ pre(d, 8 * unit)));
 980       }
 981       // now we need to copy any remaining part block which may
 982       // include a 4 word subblock and/or a 2 word subblock.
 983       // bits 2 and 1 in the count are the tell-tale for whether we
 984       // have each such subblock
 985       {
 986         Label L1, L2;
 987         __ tbz(count, exact_log2(4), L1);
 988        // this is the same as above but copying only 4 longs hence
 989        // with only one intervening stp between the str instructions
 990        // but note that the offsets and registers still follow the
 991        // same pattern
 992         __ ldp(t0, t1, Address(s, 2 * unit));
 993         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 994         if (direction == copy_forwards) {
 995           __ str(t0, Address(d, 1 * unit));
 996           __ stp(t1, t2, Address(d, 2 * unit));
 997           __ str(t3, Address(__ pre(d, 4 * unit)));
 998         } else {
 999           __ str(t1, Address(d, 1 * unit));
1000           __ stp(t3, t0, Address(d, 3 * unit));
1001           __ str(t2, Address(__ pre(d, 4 * unit)));
1002         }
1003         __ bind(L1);
1004 
1005         __ tbz(count, 1, L2);
1006        // this is the same as above but copying only 2 longs hence
1007        // there is no intervening stp between the str instructions
1008        // but note that the offset and register patterns are still
1009        // the same
1010         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1011         if (direction == copy_forwards) {
1012           __ str(t0, Address(d, 1 * unit));
1013           __ str(t1, Address(__ pre(d, 2 * unit)));
1014         } else {
1015           __ str(t1, Address(d, 1 * unit));
1016           __ str(t0, Address(__ pre(d, 2 * unit)));
1017         }
1018         __ bind(L2);
1019 
1020        // for forwards copy we need to re-adjust the offsets we
1021        // applied so that s and d follow the last words written
1022 
1023        if (direction == copy_forwards) {
1024          __ add(s, s, 16);
1025          __ add(d, d, 8);
1026        }
1027 
1028       }
1029 
1030       __ ret(lr);
1031       }
1032   }
1033 
1034   // Small copy: less than 16 bytes.
1035   //
1036   // NB: Ignores all of the bits of count which represent more than 15
1037   // bytes, so a caller doesn't have to mask them.
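       //
       // For example (byte copy, step == 1): with count == 13 == 0b1101 the
       // routine moves 8 bytes (bit 3), then 4 bytes (bit 2), skips the
       // 2-byte step (bit 1 is clear) and finishes with 1 byte (bit 0),
       // 13 bytes in all.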
1038 
1039   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1040     bool is_backwards = step < 0;
1041     size_t granularity = uabs(step);
1042     int direction = is_backwards ? -1 : 1;
1043     int unit = wordSize * direction;
1044 
1045     Label Lword, Lint, Lshort, Lbyte;
1046 
1047     assert(granularity
1048            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1049 
1050     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1051 
1052     // ??? I don't know if this bit-test-and-branch is the right thing
1053     // to do.  It does a lot of jumping, resulting in several
1054     // mispredicted branches.  It might make more sense to do this
1055     // with something like Duff's device with a single computed branch.
1056 
1057     __ tbz(count, 3 - exact_log2(granularity), Lword);
1058     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1059     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1060     __ bind(Lword);
1061 
1062     if (granularity <= sizeof (jint)) {
1063       __ tbz(count, 2 - exact_log2(granularity), Lint);
1064       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1065       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1066       __ bind(Lint);
1067     }
1068 
1069     if (granularity <= sizeof (jshort)) {
1070       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1071       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1072       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1073       __ bind(Lshort);
1074     }
1075 
1076     if (granularity <= sizeof (jbyte)) {
1077       __ tbz(count, 0, Lbyte);
1078       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1079       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1080       __ bind(Lbyte);
1081     }
1082   }
1083 
1084   Label copy_f, copy_b;
1085 
1086   // All-singing all-dancing memory copy.
1087   //
1088   // Copy count units of memory from s to d.  The size of a unit is
1089   // step, which can be positive or negative depending on the direction
1090   // of copy.  If is_aligned is false, we align the source address.
1091   //
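       // In outline (derived from the code below): counts of 0..16, 17..32,
       // 33..64 and 65..80 bytes (96 with SIMD) are copied inline with
       // straight-line loads and stores working from both ends; anything
       // larger takes copy_big, which aligns s, calls the bulk copy_f/copy_b
       // stubs, and finishes the tail with copy_memory_small().  For example,
       // an 11-element int copy (44 bytes) takes the 33..64-byte branch.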
1092 
1093   void copy_memory(bool is_aligned, Register s, Register d,
1094                    Register count, Register tmp, int step) {
1095     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1096     bool is_backwards = step < 0;
1097     int granularity = uabs(step);
1098     const Register t0 = r3, t1 = r4;
1099 
1100     // Copies of <= 96 bytes are done inline. Direction doesn't matter
1101     // because we always load all the data before writing anything.
1102     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1103     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1104     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1105     const Register send = r17, dend = r18;
1106 
1107     if (PrefetchCopyIntervalInBytes > 0)
1108       __ prfm(Address(s, 0), PLDL1KEEP);
1109     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1110     __ br(Assembler::HI, copy_big);
1111 
1112     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1113     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1114 
1115     __ cmp(count, u1(16/granularity));
1116     __ br(Assembler::LS, copy16);
1117 
1118     __ cmp(count, u1(64/granularity));
1119     __ br(Assembler::HI, copy80);
1120 
1121     __ cmp(count, u1(32/granularity));
1122     __ br(Assembler::LS, copy32);
1123 
1124     // 33..64 bytes
1125     if (UseSIMDForMemoryOps) {
1126       __ ldpq(v0, v1, Address(s, 0));
1127       __ ldpq(v2, v3, Address(send, -32));
1128       __ stpq(v0, v1, Address(d, 0));
1129       __ stpq(v2, v3, Address(dend, -32));
1130     } else {
1131       __ ldp(t0, t1, Address(s, 0));
1132       __ ldp(t2, t3, Address(s, 16));
1133       __ ldp(t4, t5, Address(send, -32));
1134       __ ldp(t6, t7, Address(send, -16));
1135 
1136       __ stp(t0, t1, Address(d, 0));
1137       __ stp(t2, t3, Address(d, 16));
1138       __ stp(t4, t5, Address(dend, -32));
1139       __ stp(t6, t7, Address(dend, -16));
1140     }
1141     __ b(finish);
1142 
1143     // 17..32 bytes
1144     __ bind(copy32);
1145     __ ldp(t0, t1, Address(s, 0));
1146     __ ldp(t2, t3, Address(send, -16));
1147     __ stp(t0, t1, Address(d, 0));
1148     __ stp(t2, t3, Address(dend, -16));
1149     __ b(finish);
1150 
1151     // 65..80/96 bytes
1152     // (96 bytes if SIMD because we do 32 bytes per instruction)
1153     __ bind(copy80);
1154     if (UseSIMDForMemoryOps) {
1155       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1156       __ ldpq(v4, v5, Address(send, -32));
1157       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1158       __ stpq(v4, v5, Address(dend, -32));
1159     } else {
1160       __ ldp(t0, t1, Address(s, 0));
1161       __ ldp(t2, t3, Address(s, 16));
1162       __ ldp(t4, t5, Address(s, 32));
1163       __ ldp(t6, t7, Address(s, 48));
1164       __ ldp(t8, t9, Address(send, -16));
1165 
1166       __ stp(t0, t1, Address(d, 0));
1167       __ stp(t2, t3, Address(d, 16));
1168       __ stp(t4, t5, Address(d, 32));
1169       __ stp(t6, t7, Address(d, 48));
1170       __ stp(t8, t9, Address(dend, -16));
1171     }
1172     __ b(finish);
1173 
1174     // 0..16 bytes
1175     __ bind(copy16);
1176     __ cmp(count, u1(8/granularity));
1177     __ br(Assembler::LO, copy8);
1178 
1179     // 8..16 bytes
1180     __ ldr(t0, Address(s, 0));
1181     __ ldr(t1, Address(send, -8));
1182     __ str(t0, Address(d, 0));
1183     __ str(t1, Address(dend, -8));
1184     __ b(finish);
1185 
1186     if (granularity < 8) {
1187       // 4..7 bytes
1188       __ bind(copy8);
1189       __ tbz(count, 2 - exact_log2(granularity), copy4);
1190       __ ldrw(t0, Address(s, 0));
1191       __ ldrw(t1, Address(send, -4));
1192       __ strw(t0, Address(d, 0));
1193       __ strw(t1, Address(dend, -4));
1194       __ b(finish);
1195       if (granularity < 4) {
1196         // 0..3 bytes
1197         __ bind(copy4);
1198         __ cbz(count, finish); // get rid of 0 case
1199         if (granularity == 2) {
1200           __ ldrh(t0, Address(s, 0));
1201           __ strh(t0, Address(d, 0));
1202         } else { // granularity == 1
1203           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1204           // the first and last byte.
1205           // Handle the 3 byte case by loading and storing base + count/2
1206           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1207           // This means that in the 1 byte case we load/store the same
1208           // byte 3 times.
1209           __ lsr(count, count, 1);
1210           __ ldrb(t0, Address(s, 0));
1211           __ ldrb(t1, Address(send, -1));
1212           __ ldrb(t2, Address(s, count));
1213           __ strb(t0, Address(d, 0));
1214           __ strb(t1, Address(dend, -1));
1215           __ strb(t2, Address(d, count));
1216         }
1217         __ b(finish);
1218       }
1219     }
1220 
1221     __ bind(copy_big);
1222     if (is_backwards) {
1223       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1224       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1225     }
1226 
1227     // Now that we've got the small case out of the way, we can align the
1228     // source address on a 2-word boundary.
1229 
1230     Label aligned;
1231 
1232     if (is_aligned) {
1233       // We may have to adjust by 1 word to get s 2-word-aligned.
1234       __ tbz(s, exact_log2(wordSize), aligned);
1235       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1236       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1237       __ sub(count, count, wordSize/granularity);
1238     } else {
1239       if (is_backwards) {
1240         __ andr(rscratch2, s, 2 * wordSize - 1);
1241       } else {
1242         __ neg(rscratch2, s);
1243         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1244       }
1245       // rscratch2 is the byte adjustment needed to align s.
1246       __ cbz(rscratch2, aligned);
1247       int shift = exact_log2(granularity);
1248       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1249       __ sub(count, count, rscratch2);
1250 
1251 #if 0
1252       // ?? This code is only correct for a disjoint copy.  It may or
1253       // may not make sense to use it in that case.
1254 
1255       // Copy the first pair; s and d may not be aligned.
1256       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1257       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1258 
1259       // Align s and d, adjust count
1260       if (is_backwards) {
1261         __ sub(s, s, rscratch2);
1262         __ sub(d, d, rscratch2);
1263       } else {
1264         __ add(s, s, rscratch2);
1265         __ add(d, d, rscratch2);
1266       }
1267 #else
1268       copy_memory_small(s, d, rscratch2, rscratch1, step);
1269 #endif
1270     }
1271 
1272     __ bind(aligned);
1273 
1274     // s is now 2-word-aligned.
1275 
1276     // We have a count of units and some trailing bytes.  Adjust the
1277     // count and do a bulk copy of words.
1278     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1279     if (direction == copy_forwards)
1280       __ bl(copy_f);
1281     else
1282       __ bl(copy_b);
1283 
1284     // And the tail.
1285     copy_memory_small(s, d, count, tmp, step);
1286 
1287     if (granularity >= 8) __ bind(copy8);
1288     if (granularity >= 4) __ bind(copy4);
1289     __ bind(finish);
1290   }
1291 
1292 
1293   void clobber_registers() {
1294 #ifdef ASSERT
1295     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1296     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1297     for (Register r = r3; r <= r18; r++)
1298       if (r != rscratch1) __ mov(r, rscratch1);
1299 #endif
1300   }
1301 
1302   // Scan over array at a for count oops, verifying each one.
1303   // Preserves a and count, clobbers rscratch1 and rscratch2.
1304   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1305     Label loop, end;
1306     __ mov(rscratch1, a);
1307     __ mov(rscratch2, zr);
1308     __ bind(loop);
1309     __ cmp(rscratch2, count);
1310     __ br(Assembler::HS, end);
1311     if (size == (size_t)wordSize) {
1312       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1313       __ verify_oop(temp);
1314     } else {
1315       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1316       __ decode_heap_oop(temp); // calls verify_oop
1317     }
1318     __ add(rscratch2, rscratch2, size);
1319     __ b(loop);
1320     __ bind(end);
1321   }
1322 
1323   // Arguments:
1324   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1325   //             ignored
1326   //   is_oop  - true => oop array, so generate store check code
1327   //   name    - stub name string
1328   //
1329   // Inputs:
1330   //   c_rarg0   - source array address
1331   //   c_rarg1   - destination array address
1332   //   c_rarg2   - element count, treated as ssize_t, can be zero
1333   //
1334   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1335   // the hardware handle it.  The two dwords within qwords that span
1336   // cache line boundaries will still be loaded and stored atomically.
1337   //
1338   // Side Effects:
1339   //   disjoint_int_copy_entry is set to the no-overlap entry point
1340   //   used by generate_conjoint_int_oop_copy().
1341   //
1342   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1343                                   const char *name, bool dest_uninitialized = false) {
1344     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1345     RegSet saved_reg = RegSet::of(s, d, count);
1346     __ align(CodeEntryAlignment);
1347     StubCodeMark mark(this, "StubRoutines", name);
1348     address start = __ pc();
1349     __ enter();
1350 
1351     if (entry != NULL) {
1352       *entry = __ pc();
1353       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1354       BLOCK_COMMENT("Entry:");
1355     }
1356 
1357     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1358     if (dest_uninitialized) {
1359       decorators |= IS_DEST_UNINITIALIZED;
1360     }
1361     if (aligned) {
1362       decorators |= ARRAYCOPY_ALIGNED;
1363     }
1364 
1365     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1366     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1367 
1368     if (is_oop) {
1369       // save regs before copy_memory
1370       __ push(RegSet::of(d, count), sp);
1371     }
1372     {
1373       // UnsafeCopyMemory page error: continue after ucm
1374       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1375       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1376       copy_memory(aligned, s, d, count, rscratch1, size);
1377     }
1378 
1379     if (is_oop) {
1380       __ pop(RegSet::of(d, count), sp);
1381       if (VerifyOops)
1382         verify_oop_array(size, d, count, r16);
1383     }
1384 
1385     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1386 
1387     __ leave();
1388     __ mov(r0, zr); // return 0
1389     __ ret(lr);
1390     return start;
1391   }
1392 
1393   // Arguments:
1394   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1395   //             ignored
1396   //   is_oop  - true => oop array, so generate store check code
1397   //   name    - stub name string
1398   //
1399   // Inputs:
1400   //   c_rarg0   - source array address
1401   //   c_rarg1   - destination array address
1402   //   c_rarg2   - element count, treated as ssize_t, can be zero
1403   //
1404   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1405   // the hardware handle it.  The two dwords within qwords that span
1406   // cache line boundaries will still be loaded and stored atomically.
1407   //
1408   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1409                                  address *entry, const char *name,
1410                                  bool dest_uninitialized = false) {
1411     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1412     RegSet saved_regs = RegSet::of(s, d, count);
1413     StubCodeMark mark(this, "StubRoutines", name);
1414     address start = __ pc();
1415     __ enter();
1416 
1417     if (entry != NULL) {
1418       *entry = __ pc();
1419       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1420       BLOCK_COMMENT("Entry:");
1421     }
1422 
1423     // use fwd copy when (d-s) above_equal (count*size)
1424     __ sub(rscratch1, d, s);
1425     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1426     __ br(Assembler::HS, nooverlap_target);
1427 
1428     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1429     if (dest_uninitialized) {
1430       decorators |= IS_DEST_UNINITIALIZED;
1431     }
1432     if (aligned) {
1433       decorators |= ARRAYCOPY_ALIGNED;
1434     }
1435 
1436     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1437     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1438 
1439     if (is_oop) {
1440       // save regs before copy_memory
1441       __ push(RegSet::of(d, count), sp);
1442     }
1443     {
1444       // UnsafeCopyMemory page error: continue after ucm
1445       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1446       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1447       copy_memory(aligned, s, d, count, rscratch1, -size);
1448     }
1449     if (is_oop) {
1450       __ pop(RegSet::of(d, count), sp);
1451       if (VerifyOops)
1452         verify_oop_array(size, d, count, r16);
1453     }
1454     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1455     __ leave();
1456     __ mov(r0, zr); // return 0
1457     __ ret(lr);
1458     return start;
1459   }
1460 
1461   // Arguments:
1462   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1463   //             ignored
1464   //   name    - stub name string
1465   //
1466   // Inputs:
1467   //   c_rarg0   - source array address
1468   //   c_rarg1   - destination array address
1469   //   c_rarg2   - element count, treated as ssize_t, can be zero
1470   //
1471   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1472   // we let the hardware handle it.  The one to eight bytes within words,
1473   // dwords or qwords that span cache line boundaries will still be loaded
1474   // and stored atomically.
1475   //
1476   // Side Effects:
1484   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1485   //   used by generate_conjoint_byte_copy().
1486   //
1487   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1488     const bool not_oop = false;
1489     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1490   }
1491 
1492   // Arguments:
1493   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1494   //             ignored
1495   //   name    - stub name string
1496   //
1497   // Inputs:
1498   //   c_rarg0   - source array address
1499   //   c_rarg1   - destination array address
1500   //   c_rarg2   - element count, treated as ssize_t, can be zero
1501   //
1502   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1503   // we let the hardware handle it.  The one to eight bytes within words,
1504   // dwords or qwords that span cache line boundaries will still be loaded
1505   // and stored atomically.
1506   //
1507   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1508                                       address* entry, const char *name) {
1509     const bool not_oop = false;
1510     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1511   }
1512 
1513   // Arguments:
1514   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1515   //             ignored
1516   //   name    - stub name string
1517   //
1518   // Inputs:
1519   //   c_rarg0   - source array address
1520   //   c_rarg1   - destination array address
1521   //   c_rarg2   - element count, treated as ssize_t, can be zero
1522   //
1523   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1524   // let the hardware handle it.  The two or four words within dwords
1525   // or qwords that span cache line boundaries will still be loaded
1526   // and stored atomically.
1527   //
1528   // Side Effects:
1529   //   disjoint_short_copy_entry is set to the no-overlap entry point
1530   //   used by generate_conjoint_short_copy().
1531   //
1532   address generate_disjoint_short_copy(bool aligned,
1533                                        address* entry, const char *name) {
1534     const bool not_oop = false;
1535     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1536   }
1537 
1538   // Arguments:
1539   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1540   //             ignored
1541   //   name    - stub name string
1542   //
1543   // Inputs:
1544   //   c_rarg0   - source array address
1545   //   c_rarg1   - destination array address
1546   //   c_rarg2   - element count, treated as ssize_t, can be zero
1547   //
1548   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1549   // let the hardware handle it.  The two or four words within dwords
1550   // or qwords that span cache line boundaries will still be loaded
1551   // and stored atomically.
1552   //
1553   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1554                                        address *entry, const char *name) {
1555     const bool not_oop = false;
1556     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
1560   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1561   //             ignored
1562   //   name    - stub name string
1563   //
1564   // Inputs:
1565   //   c_rarg0   - source array address
1566   //   c_rarg1   - destination array address
1567   //   c_rarg2   - element count, treated as ssize_t, can be zero
1568   //
1569   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1570   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1572   //
1573   // Side Effects:
1574   //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_copy().
1576   //
1577   address generate_disjoint_int_copy(bool aligned, address *entry,
1578                                          const char *name, bool dest_uninitialized = false) {
1579     const bool not_oop = false;
1580     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1581   }
1582 
1583   // Arguments:
1584   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1585   //             ignored
1586   //   name    - stub name string
1587   //
1588   // Inputs:
1589   //   c_rarg0   - source array address
1590   //   c_rarg1   - destination array address
1591   //   c_rarg2   - element count, treated as ssize_t, can be zero
1592   //
1593   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1594   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1596   //
1597   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1598                                      address *entry, const char *name,
1599                                      bool dest_uninitialized = false) {
1600     const bool not_oop = false;
1601     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1602   }
1603 
1604 
1605   // Arguments:
1606   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1607   //             ignored
1608   //   name    - stub name string
1609   //
1610   // Inputs:
1611   //   c_rarg0   - source array address
1612   //   c_rarg1   - destination array address
1613   //   c_rarg2   - element count, treated as size_t, can be zero
1614   //
1615   // Side Effects:
  //   disjoint_long_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_long_copy().
1618   //
1619   address generate_disjoint_long_copy(bool aligned, address *entry,
1620                                           const char *name, bool dest_uninitialized = false) {
1621     const bool not_oop = false;
1622     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1623   }
1624 
1625   // Arguments:
1626   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1627   //             ignored
1628   //   name    - stub name string
1629   //
1630   // Inputs:
1631   //   c_rarg0   - source array address
1632   //   c_rarg1   - destination array address
1633   //   c_rarg2   - element count, treated as size_t, can be zero
1634   //
1635   address generate_conjoint_long_copy(bool aligned,
1636                                       address nooverlap_target, address *entry,
1637                                       const char *name, bool dest_uninitialized = false) {
1638     const bool not_oop = false;
1639     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1640   }
1641 
1642   // Arguments:
1643   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1644   //             ignored
1645   //   name    - stub name string
1646   //
1647   // Inputs:
1648   //   c_rarg0   - source array address
1649   //   c_rarg1   - destination array address
1650   //   c_rarg2   - element count, treated as size_t, can be zero
1651   //
1652   // Side Effects:
  //   disjoint_oop_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_oop_copy().
1655   //
1656   address generate_disjoint_oop_copy(bool aligned, address *entry,
1657                                      const char *name, bool dest_uninitialized) {
1658     const bool is_oop = true;
1659     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1660     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1661   }
1662 
1663   // Arguments:
1664   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1665   //             ignored
1666   //   name    - stub name string
1667   //
1668   // Inputs:
1669   //   c_rarg0   - source array address
1670   //   c_rarg1   - destination array address
1671   //   c_rarg2   - element count, treated as size_t, can be zero
1672   //
1673   address generate_conjoint_oop_copy(bool aligned,
1674                                      address nooverlap_target, address *entry,
1675                                      const char *name, bool dest_uninitialized) {
1676     const bool is_oop = true;
1677     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1678     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1679                                   name, dest_uninitialized);
1680   }
1681 
1682 
1683   // Helper for generating a dynamic type check.
1684   // Smashes rscratch1, rscratch2.
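  // Sketch of the control flow (this mirrors the generic HotSpot subtype
  // check): the fast path handles the cheap cases such as an exact klass
  // match or a hit in the cached super_check_offset slot, and only a miss
  // there falls into the slow path, which scans the secondary supers.
  // On success control transfers to L_success; otherwise execution falls
  // through to L_miss.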
1685   void generate_type_check(Register sub_klass,
1686                            Register super_check_offset,
1687                            Register super_klass,
1688                            Label& L_success) {
1689     assert_different_registers(sub_klass, super_check_offset, super_klass);
1690 
1691     BLOCK_COMMENT("type_check:");
1692 
1693     Label L_miss;
1694 
1695     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1696                                      super_check_offset);
1697     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1698 
1699     // Fall through on failure!
1700     __ BIND(L_miss);
1701   }
1702 
1703   //
1704   //  Generate checkcasting array copy stub
1705   //
1706   //  Input:
1707   //    c_rarg0   - source array address
1708   //    c_rarg1   - destination array address
1709   //    c_rarg2   - element count, treated as ssize_t, can be zero
1710   //    c_rarg3   - size_t ckoff (super_check_offset)
1711   //    c_rarg4   - oop ckval (super_klass)
1712   //
1713   //  Output:
1714   //    r0 ==  0  -  success
1715   //    r0 == -1^K - failure, where K is partial transfer count
1716   //
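  //  Worked example (illustrative only): if 3 elements are stored before
  //  an element of the wrong type is hit, K == 3 and r0 is set to
  //  -1 ^ 3 == -4, so the caller can recover the partial count as ~r0.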
1717   address generate_checkcast_copy(const char *name, address *entry,
1718                                   bool dest_uninitialized = false) {
1719 
1720     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1721 
1722     // Input registers (after setup_arg_regs)
1723     const Register from        = c_rarg0;   // source array address
1724     const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
1726     const Register ckoff       = c_rarg3;   // super_check_offset
1727     const Register ckval       = c_rarg4;   // super_klass
1728 
1729     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1730     RegSet wb_post_saved_regs = RegSet::of(count);
1731 
1732     // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // original elements count
1734     const Register start_to    = r20;       // destination array start address
1735     const Register copied_oop  = r18;       // actual oop copied
1736     const Register r19_klass   = r19;       // oop._klass
1737 
1738     //---------------------------------------------------------------
1739     // Assembler stub will be used for this call to arraycopy
1740     // if the two arrays are subtypes of Object[] but the
1741     // destination array type is not equal to or a supertype
1742     // of the source type.  Each element must be separately
1743     // checked.
1744 
1745     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1746                                copied_oop, r19_klass, count_save);
1747 
1748     __ align(CodeEntryAlignment);
1749     StubCodeMark mark(this, "StubRoutines", name);
1750     address start = __ pc();
1751 
1752     __ enter(); // required for proper stackwalking of RuntimeStub frame
1753 
1754 #ifdef ASSERT
1755     // caller guarantees that the arrays really are different
1756     // otherwise, we would have to make conjoint checks
1757     { Label L;
1758       array_overlap_test(L, TIMES_OOP);
1759       __ stop("checkcast_copy within a single array");
1760       __ bind(L);
1761     }
1762 #endif //ASSERT
1763 
1764     // Caller of this entry point must set up the argument registers.
1765     if (entry != NULL) {
1766       *entry = __ pc();
1767       BLOCK_COMMENT("Entry:");
1768     }
1769 
1770      // Empty array:  Nothing to do.
1771     __ cbz(count, L_done);
1772 
1773     __ push(RegSet::of(r18, r19, r20, r21), sp);
1774 
1775 #ifdef ASSERT
1776     BLOCK_COMMENT("assert consistent ckoff/ckval");
1777     // The ckoff and ckval must be mutually consistent,
1778     // even though caller generates both.
1779     { Label L;
1780       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1781       __ ldrw(start_to, Address(ckval, sco_offset));
1782       __ cmpw(ckoff, start_to);
1783       __ br(Assembler::EQ, L);
1784       __ stop("super_check_offset inconsistent");
1785       __ bind(L);
1786     }
1787 #endif //ASSERT
1788 
1789     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1790     bool is_oop = true;
1791     if (dest_uninitialized) {
1792       decorators |= IS_DEST_UNINITIALIZED;
1793     }
1794 
1795     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1796     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1797 
1798     // save the original count
1799     __ mov(count_save, count);
1800 
1801     // Copy from low to high addresses
1802     __ mov(start_to, to);              // Save destination array start address
1803     __ b(L_load_element);
1804 
1805     // ======== begin loop ========
1806     // (Loop is rotated; its entry is L_load_element.)
1807     // Loop control:
1808     //   for (; count != 0; count--) {
1809     //     copied_oop = load_heap_oop(from++);
1810     //     ... generate_type_check ...;
1811     //     store_heap_oop(to++, copied_oop);
1812     //   }
1813     __ align(OptoLoopAlignment);
1814 
1815     __ BIND(L_store_element);
1816     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, noreg, AS_RAW);  // store the oop 
1817     __ sub(count, count, 1);
1818     __ cbz(count, L_do_card_marks);
1819 
1820     // ======== loop entry is here ========
1821     __ BIND(L_load_element);
1822     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1823     __ cbz(copied_oop, L_store_element);
1824 
    __ load_klass(r19_klass, copied_oop); // query the object klass
1826     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1827     // ======== end loop ========
1828 
1829     // It was a real error; we must depend on the caller to finish the job.
1830     // Register count = remaining oops, count_orig = total oops.
1831     // Emit GC store barriers for the oops we have copied and report
1832     // their number to the caller.
1833 
1834     __ subs(count, count_save, count);     // K = partially copied oop count
1835     __ eon(count, count, zr);                   // report (-1^K) to caller
1836     __ br(Assembler::EQ, L_done_pop);
1837 
1838     __ BIND(L_do_card_marks);
1839     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1840 
1841     __ bind(L_done_pop);
1842     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1843     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1844 
1845     __ bind(L_done);
1846     __ mov(r0, count);
1847     __ leave();
1848     __ ret(lr);
1849 
1850     return start;
1851   }
1852 
1853   // Perform range checks on the proposed arraycopy.
1854   // Kills temp, but nothing else.
1855   // Also, clean the sign bits of src_pos and dst_pos.
1856   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1857                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
1859                               Register dst_pos, // destination position (c_rarg3)
1860                               Register length,
1861                               Register temp,
1862                               Label& L_failed) {
1863     BLOCK_COMMENT("arraycopy_range_checks:");
1864 
1865     assert_different_registers(rscratch1, temp);
1866 
1867     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1868     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1869     __ addw(temp, length, src_pos);
1870     __ cmpw(temp, rscratch1);
1871     __ br(Assembler::HI, L_failed);
1872 
1873     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1874     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1875     __ addw(temp, length, dst_pos);
1876     __ cmpw(temp, rscratch1);
1877     __ br(Assembler::HI, L_failed);
1878 
1879     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1880     __ movw(src_pos, src_pos);
1881     __ movw(dst_pos, dst_pos);
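    // A 32-bit mov with the same source and destination register
    // zero-extends into bits 63:32, so these two instructions are a
    // cheap way to clear the upper halves of src_pos and dst_pos.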
1882 
1883     BLOCK_COMMENT("arraycopy_range_checks done");
1884   }
1885 
1886   // These stubs get called from some dumb test routine.
1887   // I'll write them properly when they're called from
1888   // something that's actually doing something.
1889   static void fake_arraycopy_stub(address src, address dst, int count) {
1890     assert(count == 0, "huh?");
1891   }
1892 
1893 
1894   //
1895   //  Generate 'unsafe' array copy stub
1896   //  Though just as safe as the other stubs, it takes an unscaled
1897   //  size_t argument instead of an element count.
1898   //
1899   //  Input:
1900   //    c_rarg0   - source array address
1901   //    c_rarg1   - destination array address
1902   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1903   //
1904   // Examines the alignment of the operands and dispatches
1905   // to a long, int, short, or byte copy loop.
1906   //
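  //  The dispatch works on (s | d | count): if the OR of all three values
  //  is a multiple of 8, every operand is 8-byte aligned, and likewise for
  //  the 4- and 2-byte tests, so one test per granularity suffices.
  //  For example (illustrative values): s = 0x1000, d = 0x2004 and
  //  count = 16 give an OR of 0x3014, which passes the 4-byte test but
  //  not the 8-byte one, so the int copy loop is chosen.
  //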
1907   address generate_unsafe_copy(const char *name,
1908                                address byte_copy_entry,
1909                                address short_copy_entry,
1910                                address int_copy_entry,
1911                                address long_copy_entry) {
1912     Label L_long_aligned, L_int_aligned, L_short_aligned;
1913     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1914 
1915     __ align(CodeEntryAlignment);
1916     StubCodeMark mark(this, "StubRoutines", name);
1917     address start = __ pc();
1918     __ enter(); // required for proper stackwalking of RuntimeStub frame
1919 
1920     // bump this on entry, not on exit:
1921     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1922 
1923     __ orr(rscratch1, s, d);
1924     __ orr(rscratch1, rscratch1, count);
1925 
1926     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1927     __ cbz(rscratch1, L_long_aligned);
1928     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1929     __ cbz(rscratch1, L_int_aligned);
1930     __ tbz(rscratch1, 0, L_short_aligned);
1931     __ b(RuntimeAddress(byte_copy_entry));
1932 
1933     __ BIND(L_short_aligned);
1934     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1935     __ b(RuntimeAddress(short_copy_entry));
1936     __ BIND(L_int_aligned);
1937     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1938     __ b(RuntimeAddress(int_copy_entry));
1939     __ BIND(L_long_aligned);
1940     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1941     __ b(RuntimeAddress(long_copy_entry));
1942 
1943     return start;
1944   }
1945 
1946   //
1947   //  Generate generic array copy stubs
1948   //
1949   //  Input:
1950   //    c_rarg0    -  src oop
1951   //    c_rarg1    -  src_pos (32-bits)
1952   //    c_rarg2    -  dst oop
1953   //    c_rarg3    -  dst_pos (32-bits)
1954   //    c_rarg4    -  element count (32-bits)
1955   //
1956   //  Output:
1957   //    r0 ==  0  -  success
1958   //    r0 == -1^K - failure, where K is partial transfer count
1959   //
1960   address generate_generic_copy(const char *name,
1961                                 address byte_copy_entry, address short_copy_entry,
1962                                 address int_copy_entry, address oop_copy_entry,
1963                                 address long_copy_entry, address checkcast_copy_entry) {
1964 
1965     Label L_failed, L_objArray;
1966     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1967 
1968     // Input registers
1969     const Register src        = c_rarg0;  // source array oop
1970     const Register src_pos    = c_rarg1;  // source position
1971     const Register dst        = c_rarg2;  // destination array oop
1972     const Register dst_pos    = c_rarg3;  // destination position
1973     const Register length     = c_rarg4;
1974 
1975 
1976     // Registers used as temps
1977     const Register dst_klass  = c_rarg5;
1978 
1979     __ align(CodeEntryAlignment);
1980 
1981     StubCodeMark mark(this, "StubRoutines", name);
1982 
1983     address start = __ pc();
1984 
1985     __ enter(); // required for proper stackwalking of RuntimeStub frame
1986 
1987     // bump this on entry, not on exit:
1988     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1989 
1990     //-----------------------------------------------------------------------
1991     // Assembler stub will be used for this call to arraycopy
1992     // if the following conditions are met:
1993     //
1994     // (1) src and dst must not be null.
1995     // (2) src_pos must not be negative.
1996     // (3) dst_pos must not be negative.
1997     // (4) length  must not be negative.
1998     // (5) src klass and dst klass should be the same and not NULL.
1999     // (6) src and dst should be arrays.
2000     // (7) src_pos + length must not exceed length of src.
2001     // (8) dst_pos + length must not exceed length of dst.
2002     //
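    // Each numbered check below branches to L_failed when violated;
    // L_failed returns -1, which tells the caller to fall back to the
    // slower runtime arraycopy path.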
2003 
2004     //  if (src == NULL) return -1;
2005     __ cbz(src, L_failed);
2006 
2007     //  if (src_pos < 0) return -1;
2008     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2009 
2010     //  if (dst == NULL) return -1;
2011     __ cbz(dst, L_failed);
2012 
2013     //  if (dst_pos < 0) return -1;
2014     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2015 
2016     // registers used as temp
2017     const Register scratch_length    = r16; // elements count to copy
2018     const Register scratch_src_klass = r17; // array klass
2019     const Register lh                = r18; // layout helper
2020 
2021     //  if (length < 0) return -1;
2022     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2023     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2024 
2025     __ load_klass(scratch_src_klass, src);
2026 #ifdef ASSERT
2027     //  assert(src->klass() != NULL);
2028     {
2029       BLOCK_COMMENT("assert klasses not null {");
2030       Label L1, L2;
2031       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2032       __ bind(L1);
2033       __ stop("broken null klass");
2034       __ bind(L2);
2035       __ load_klass(rscratch1, dst);
2036       __ cbz(rscratch1, L1);     // this would be broken also
2037       BLOCK_COMMENT("} assert klasses not null done");
2038     }
2039 #endif
2040 
2041     // Load layout helper (32-bits)
2042     //
2043     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2044     // 32        30    24            16              8     2                 0
2045     //
2046     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2047     //
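    //   Worked example (symbolic, not exact bit values): for a jint[]
    //   array the array_tag field holds the typeArray tag, header_size is
    //   arrayOopDesc::base_offset_in_bytes(T_INT), element_type encodes
    //   T_INT and log2_element_size is 2, so the field extractions below
    //   recover the header offset and element shift from this one word.
    //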
2048 
2049     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2050 
2051     // Handle objArrays completely differently...
2052     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2053     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2054     __ movw(rscratch1, objArray_lh);
2055     __ eorw(rscratch2, lh, rscratch1);
2056     __ cbzw(rscratch2, L_objArray);
2057 
2058     //  if (src->klass() != dst->klass()) return -1;
2059     __ load_klass(rscratch2, dst);
2060     __ eor(rscratch2, rscratch2, scratch_src_klass);
2061     __ cbnz(rscratch2, L_failed);
2062 
2063     //  if (!src->is_Array()) return -1;
2064     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2065 
2066     // At this point, it is known to be a typeArray (array_tag 0x3).
2067 #ifdef ASSERT
2068     {
2069       BLOCK_COMMENT("assert primitive array {");
2070       Label L;
2071       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2072       __ cmpw(lh, rscratch2);
2073       __ br(Assembler::GE, L);
2074       __ stop("must be a primitive array");
2075       __ bind(L);
2076       BLOCK_COMMENT("} assert primitive array done");
2077     }
2078 #endif
2079 
2080     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2081                            rscratch2, L_failed);
2082 
2083     // TypeArrayKlass
2084     //
2085     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2086     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2087     //
2088 
2089     const Register rscratch1_offset = rscratch1;    // array offset
2090     const Register r18_elsize = lh; // element size
2091 
2092     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2093            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2094     __ add(src, src, rscratch1_offset);           // src array offset
2095     __ add(dst, dst, rscratch1_offset);           // dst array offset
2096     BLOCK_COMMENT("choose copy loop based on element size");
2097 
2098     // next registers should be set before the jump to corresponding stub
2099     const Register from     = c_rarg0;  // source array address
2100     const Register to       = c_rarg1;  // destination array address
2101     const Register count    = c_rarg2;  // elements count
2102 
2103     // 'from', 'to', 'count' registers should be set in such order
2104     // since they are the same as 'src', 'src_pos', 'dst'.
2105 
2106     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2107 
2108     // The possible values of elsize are 0-3, i.e. exact_log2(element
2109     // size in bytes).  We do a simple bitwise binary search.
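    // Sketch of the dispatch: bit 1 of elsize splits {byte,short} from
    // {int,long}, then bit 0 picks within each pair:
    //   elsize 0b00 -> byte copy,  0b01 -> short copy,
    //   elsize 0b10 -> int copy,   0b11 -> long copy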
2110   __ BIND(L_copy_bytes);
2111     __ tbnz(r18_elsize, 1, L_copy_ints);
2112     __ tbnz(r18_elsize, 0, L_copy_shorts);
2113     __ lea(from, Address(src, src_pos));// src_addr
2114     __ lea(to,   Address(dst, dst_pos));// dst_addr
2115     __ movw(count, scratch_length); // length
2116     __ b(RuntimeAddress(byte_copy_entry));
2117 
2118   __ BIND(L_copy_shorts);
2119     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2120     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2121     __ movw(count, scratch_length); // length
2122     __ b(RuntimeAddress(short_copy_entry));
2123 
2124   __ BIND(L_copy_ints);
2125     __ tbnz(r18_elsize, 0, L_copy_longs);
2126     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2127     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2128     __ movw(count, scratch_length); // length
2129     __ b(RuntimeAddress(int_copy_entry));
2130 
2131   __ BIND(L_copy_longs);
2132 #ifdef ASSERT
2133     {
2134       BLOCK_COMMENT("assert long copy {");
2135       Label L;
2136       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2137       __ cmpw(r18_elsize, LogBytesPerLong);
2138       __ br(Assembler::EQ, L);
2139       __ stop("must be long copy, but elsize is wrong");
2140       __ bind(L);
2141       BLOCK_COMMENT("} assert long copy done");
2142     }
2143 #endif
2144     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2145     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2146     __ movw(count, scratch_length); // length
2147     __ b(RuntimeAddress(long_copy_entry));
2148 
2149     // ObjArrayKlass
2150   __ BIND(L_objArray);
2151     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2152 
2153     Label L_plain_copy, L_checkcast_copy;
2154     //  test array classes for subtyping
2155     __ load_klass(r18, dst);
2156     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2157     __ br(Assembler::NE, L_checkcast_copy);
2158 
2159     // Identically typed arrays can be copied without element-wise checks.
2160     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2161                            rscratch2, L_failed);
2162 
2163     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2164     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2165     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2166     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2167     __ movw(count, scratch_length); // length
2168   __ BIND(L_plain_copy);
2169     __ b(RuntimeAddress(oop_copy_entry));
2170 
2171   __ BIND(L_checkcast_copy);
2172     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2173     {
2174       // Before looking at dst.length, make sure dst is also an objArray.
2175       __ ldrw(rscratch1, Address(r18, lh_offset));
2176       __ movw(rscratch2, objArray_lh);
2177       __ eorw(rscratch1, rscratch1, rscratch2);
2178       __ cbnzw(rscratch1, L_failed);
2179 
2180       // It is safe to examine both src.length and dst.length.
2181       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2182                              r18, L_failed);
2183 
2184       __ load_klass(dst_klass, dst); // reload
2185 
2186       // Marshal the base address arguments now, freeing registers.
2187       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2188       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2189       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2190       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2191       __ movw(count, length);           // length (reloaded)
2192       Register sco_temp = c_rarg3;      // this register is free now
2193       assert_different_registers(from, to, count, sco_temp,
2194                                  dst_klass, scratch_src_klass);
2195       // assert_clean_int(count, sco_temp);
2196 
2197       // Generate the type check.
2198       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2199       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2200 
2201       // Smashes rscratch1, rscratch2
2202       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2203 
2204       // Fetch destination element klass from the ObjArrayKlass header.
2205       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2206       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2207       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2208 
2209       // the checkcast_copy loop needs two extra arguments:
2210       assert(c_rarg3 == sco_temp, "#3 already in place");
2211       // Set up arguments for checkcast_copy_entry.
2212       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2213       __ b(RuntimeAddress(checkcast_copy_entry));
2214     }
2215 
2216   __ BIND(L_failed);
2217     __ mov(r0, -1);
2218     __ leave();   // required for proper stackwalking of RuntimeStub frame
2219     __ ret(lr);
2220 
2221     return start;
2222   }
2223 
2224   //
2225   // Generate stub for array fill. If "aligned" is true, the
2226   // "to" address is assumed to be heapword aligned.
2227   //
2228   // Arguments for generated stub:
2229   //   to:    c_rarg0
2230   //   value: c_rarg1
2231   //   count: c_rarg2 treated as signed
2232   //
2233   address generate_fill(BasicType t, bool aligned, const char *name) {
2234     __ align(CodeEntryAlignment);
2235     StubCodeMark mark(this, "StubRoutines", name);
2236     address start = __ pc();
2237 
2238     BLOCK_COMMENT("Entry:");
2239 
    const Register to        = c_rarg0;  // destination array address
2241     const Register value     = c_rarg1;  // value
2242     const Register count     = c_rarg2;  // elements count
2243 
2244     const Register bz_base = r10;        // base for block_zero routine
2245     const Register cnt_words = r11;      // temp register
2246 
2247     __ enter();
2248 
2249     Label L_fill_elements, L_exit1;
2250 
2251     int shift = -1;
2252     switch (t) {
2253       case T_BYTE:
2254         shift = 0;
2255         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2256         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2257         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2258         __ br(Assembler::LO, L_fill_elements);
2259         break;
2260       case T_SHORT:
2261         shift = 1;
2262         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2263         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2264         __ br(Assembler::LO, L_fill_elements);
2265         break;
2266       case T_INT:
2267         shift = 2;
2268         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2269         __ br(Assembler::LO, L_fill_elements);
2270         break;
2271       default: ShouldNotReachHere();
2272     }
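    // At this point 'value' holds the fill pattern replicated to 32 bits;
    // a T_BYTE fill of 0xAB, for example, has been widened to 0xABABABAB
    // by the bfi steps above. It is widened again to 64 bits just before
    // the bulk word fill below.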
2273 
2274     // Align source address at 8 bytes address boundary.
2275     Label L_skip_align1, L_skip_align2, L_skip_align4;
2276     if (!aligned) {
2277       switch (t) {
2278         case T_BYTE:
2279           // One byte misalignment happens only for byte arrays.
2280           __ tbz(to, 0, L_skip_align1);
2281           __ strb(value, Address(__ post(to, 1)));
2282           __ subw(count, count, 1);
2283           __ bind(L_skip_align1);
2284           // Fallthrough
2285         case T_SHORT:
2286           // Two bytes misalignment happens only for byte and short (char) arrays.
2287           __ tbz(to, 1, L_skip_align2);
2288           __ strh(value, Address(__ post(to, 2)));
2289           __ subw(count, count, 2 >> shift);
2290           __ bind(L_skip_align2);
2291           // Fallthrough
2292         case T_INT:
2293           // Align to 8 bytes, we know we are 4 byte aligned to start.
2294           __ tbz(to, 2, L_skip_align4);
2295           __ strw(value, Address(__ post(to, 4)));
2296           __ subw(count, count, 4 >> shift);
2297           __ bind(L_skip_align4);
2298           break;
2299         default: ShouldNotReachHere();
2300       }
2301     }
2302 
2303     //
2304     //  Fill large chunks
2305     //
2306     __ lsrw(cnt_words, count, 3 - shift); // number of words
2307     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2308     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
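    // cnt_words = count >> (3 - shift) is the number of whole 8-byte
    // words to fill; the subw above leaves 'count' holding the tail of
    // fewer than 8 bytes, which is finished off after the bulk fill.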
2309     if (UseBlockZeroing) {
2310       Label non_block_zeroing, rest;
2311       // If the fill value is zero we can use the fast zero_words().
2312       __ cbnz(value, non_block_zeroing);
2313       __ mov(bz_base, to);
2314       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2315       __ zero_words(bz_base, cnt_words);
2316       __ b(rest);
2317       __ bind(non_block_zeroing);
2318       __ fill_words(to, cnt_words, value);
2319       __ bind(rest);
2320     } else {
2321       __ fill_words(to, cnt_words, value);
2322     }
2323 
2324     // Remaining count is less than 8 bytes. Fill it by a single store.
2325     // Note that the total length is no less than 8 bytes.
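    // For example (illustrative, assuming an 8-byte-aligned 'to'): a
    // 13-byte T_BYTE fill leaves a 5-byte tail here; the 8-byte store
    // below covers bytes 5..12, rewriting bytes 5..7 with the same
    // value, which is harmless.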
2326     if (t == T_BYTE || t == T_SHORT) {
2327       Label L_exit1;
2328       __ cbzw(count, L_exit1);
2329       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2330       __ str(value, Address(to, -8));    // overwrite some elements
2331       __ bind(L_exit1);
2332       __ leave();
2333       __ ret(lr);
2334     }
2335 
2336     // Handle copies less than 8 bytes.
2337     Label L_fill_2, L_fill_4, L_exit2;
2338     __ bind(L_fill_elements);
2339     switch (t) {
2340       case T_BYTE:
2341         __ tbz(count, 0, L_fill_2);
2342         __ strb(value, Address(__ post(to, 1)));
2343         __ bind(L_fill_2);
2344         __ tbz(count, 1, L_fill_4);
2345         __ strh(value, Address(__ post(to, 2)));
2346         __ bind(L_fill_4);
2347         __ tbz(count, 2, L_exit2);
2348         __ strw(value, Address(to));
2349         break;
2350       case T_SHORT:
2351         __ tbz(count, 0, L_fill_4);
2352         __ strh(value, Address(__ post(to, 2)));
2353         __ bind(L_fill_4);
2354         __ tbz(count, 1, L_exit2);
2355         __ strw(value, Address(to));
2356         break;
2357       case T_INT:
2358         __ cbzw(count, L_exit2);
2359         __ strw(value, Address(to));
2360         break;
2361       default: ShouldNotReachHere();
2362     }
2363     __ bind(L_exit2);
2364     __ leave();
2365     __ ret(lr);
2366     return start;
2367   }
2368 
2369   address generate_data_cache_writeback() {
2370     const Register line        = c_rarg0;  // address of line to write back
2371 
2372     __ align(CodeEntryAlignment);
2373 
2374     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2375 
2376     address start = __ pc();
2377     __ enter();
2378     __ cache_wb(Address(line, 0));
2379     __ leave();
2380     __ ret(lr);
2381 
2382     return start;
2383   }
2384 
2385   address generate_data_cache_writeback_sync() {
2386     const Register is_pre     = c_rarg0;  // pre or post sync
2387 
2388     __ align(CodeEntryAlignment);
2389 
2390     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2391 
    // pre wbsync is a no-op
    // post wbsync emits a memory barrier to order the preceding cache write-backs
2394 
2395     Label skip;
2396     address start = __ pc();
2397     __ enter();
2398     __ cbnz(is_pre, skip);
2399     __ cache_wbsync(false);
2400     __ bind(skip);
2401     __ leave();
2402     __ ret(lr);
2403 
2404     return start;
2405   }
2406 
2407   void generate_arraycopy_stubs() {
2408     address entry;
2409     address entry_jbyte_arraycopy;
2410     address entry_jshort_arraycopy;
2411     address entry_jint_arraycopy;
2412     address entry_oop_arraycopy;
2413     address entry_jlong_arraycopy;
2414     address entry_checkcast_arraycopy;
2415 
2416     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2417     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2418 
2419     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2420 
2421     //*** jbyte
2422     // Always need aligned and unaligned versions
2423     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2424                                                                                   "jbyte_disjoint_arraycopy");
2425     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2426                                                                                   &entry_jbyte_arraycopy,
2427                                                                                   "jbyte_arraycopy");
2428     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2429                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2430     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2431                                                                                   "arrayof_jbyte_arraycopy");
2432 
2433     //*** jshort
2434     // Always need aligned and unaligned versions
2435     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2436                                                                                     "jshort_disjoint_arraycopy");
2437     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2438                                                                                     &entry_jshort_arraycopy,
2439                                                                                     "jshort_arraycopy");
2440     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2441                                                                                     "arrayof_jshort_disjoint_arraycopy");
2442     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2443                                                                                     "arrayof_jshort_arraycopy");
2444 
2445     //*** jint
2446     // Aligned versions
2447     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2448                                                                                 "arrayof_jint_disjoint_arraycopy");
2449     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2450                                                                                 "arrayof_jint_arraycopy");
2451     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2452     // entry_jint_arraycopy always points to the unaligned version
2453     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2454                                                                                 "jint_disjoint_arraycopy");
2455     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2456                                                                                 &entry_jint_arraycopy,
2457                                                                                 "jint_arraycopy");
2458 
2459     //*** jlong
2460     // It is always aligned
2461     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2462                                                                                   "arrayof_jlong_disjoint_arraycopy");
2463     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2464                                                                                   "arrayof_jlong_arraycopy");
2465     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2466     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2467 
2468     //*** oops
2469     {
2470       // With compressed oops we need unaligned versions; notice that
2471       // we overwrite entry_oop_arraycopy.
2472       bool aligned = !UseCompressedOops;
2473 
2474       StubRoutines::_arrayof_oop_disjoint_arraycopy
2475         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2476                                      /*dest_uninitialized*/false);
2477       StubRoutines::_arrayof_oop_arraycopy
2478         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2479                                      /*dest_uninitialized*/false);
2480       // Aligned versions without pre-barriers
2481       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2482         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2483                                      /*dest_uninitialized*/true);
2484       StubRoutines::_arrayof_oop_arraycopy_uninit
2485         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2486                                      /*dest_uninitialized*/true);
2487     }
2488 
2489     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2490     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2491     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2492     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2493 
2494     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2495     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2496                                                                         /*dest_uninitialized*/true);
2497 
2498     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2499                                                               entry_jbyte_arraycopy,
2500                                                               entry_jshort_arraycopy,
2501                                                               entry_jint_arraycopy,
2502                                                               entry_jlong_arraycopy);
2503 
2504     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2505                                                                entry_jbyte_arraycopy,
2506                                                                entry_jshort_arraycopy,
2507                                                                entry_jint_arraycopy,
2508                                                                entry_oop_arraycopy,
2509                                                                entry_jlong_arraycopy,
2510                                                                entry_checkcast_arraycopy);
2511 
2512     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2513     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2514     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2515     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2516     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2517     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2518   }
2519 
2520   void generate_math_stubs() { Unimplemented(); }
2521 
2522   // Arguments:
2523   //
2524   // Inputs:
2525   //   c_rarg0   - source byte array address
2526   //   c_rarg1   - destination byte array address
2527   //   c_rarg2   - K (key) in little endian int array
2528   //
2529   address generate_aescrypt_encryptBlock() {
2530     __ align(CodeEntryAlignment);
2531     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2532 
2533     Label L_doLast;
2534 
2535     const Register from        = c_rarg0;  // source array address
2536     const Register to          = c_rarg1;  // destination array address
2537     const Register key         = c_rarg2;  // key array address
2538     const Register keylen      = rscratch1;
2539 
2540     address start = __ pc();
2541     __ enter();
2542 
2543     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
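    // keylen is the expanded key length in ints: 44, 52 or 60 for
    // AES-128, AES-192 and AES-256 respectively, so the compares against
    // 44 and 52 below select how many rounds to run.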
2544 
2545     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2546 
2547     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2548     __ rev32(v1, __ T16B, v1);
2549     __ rev32(v2, __ T16B, v2);
2550     __ rev32(v3, __ T16B, v3);
2551     __ rev32(v4, __ T16B, v4);
2552     __ aese(v0, v1);
2553     __ aesmc(v0, v0);
2554     __ aese(v0, v2);
2555     __ aesmc(v0, v0);
2556     __ aese(v0, v3);
2557     __ aesmc(v0, v0);
2558     __ aese(v0, v4);
2559     __ aesmc(v0, v0);
2560 
2561     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2562     __ rev32(v1, __ T16B, v1);
2563     __ rev32(v2, __ T16B, v2);
2564     __ rev32(v3, __ T16B, v3);
2565     __ rev32(v4, __ T16B, v4);
2566     __ aese(v0, v1);
2567     __ aesmc(v0, v0);
2568     __ aese(v0, v2);
2569     __ aesmc(v0, v0);
2570     __ aese(v0, v3);
2571     __ aesmc(v0, v0);
2572     __ aese(v0, v4);
2573     __ aesmc(v0, v0);
2574 
2575     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2576     __ rev32(v1, __ T16B, v1);
2577     __ rev32(v2, __ T16B, v2);
2578 
2579     __ cmpw(keylen, 44);
2580     __ br(Assembler::EQ, L_doLast);
2581 
2582     __ aese(v0, v1);
2583     __ aesmc(v0, v0);
2584     __ aese(v0, v2);
2585     __ aesmc(v0, v0);
2586 
2587     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2588     __ rev32(v1, __ T16B, v1);
2589     __ rev32(v2, __ T16B, v2);
2590 
2591     __ cmpw(keylen, 52);
2592     __ br(Assembler::EQ, L_doLast);
2593 
2594     __ aese(v0, v1);
2595     __ aesmc(v0, v0);
2596     __ aese(v0, v2);
2597     __ aesmc(v0, v0);
2598 
2599     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2600     __ rev32(v1, __ T16B, v1);
2601     __ rev32(v2, __ T16B, v2);
2602 
2603     __ BIND(L_doLast);
2604 
2605     __ aese(v0, v1);
2606     __ aesmc(v0, v0);
2607     __ aese(v0, v2);
2608 
2609     __ ld1(v1, __ T16B, key);
2610     __ rev32(v1, __ T16B, v1);
2611     __ eor(v0, __ T16B, v0, v1);
2612 
2613     __ st1(v0, __ T16B, to);
2614 
2615     __ mov(r0, 0);
2616 
2617     __ leave();
2618     __ ret(lr);
2619 
2620     return start;
2621   }
2622 
2623   // Arguments:
2624   //
2625   // Inputs:
2626   //   c_rarg0   - source byte array address
2627   //   c_rarg1   - destination byte array address
2628   //   c_rarg2   - K (key) in little endian int array
2629   //
2630   address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES instructions");
2632     __ align(CodeEntryAlignment);
2633     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2634     Label L_doLast;
2635 
2636     const Register from        = c_rarg0;  // source array address
2637     const Register to          = c_rarg1;  // destination array address
2638     const Register key         = c_rarg2;  // key array address
2639     const Register keylen      = rscratch1;
2640 
2641     address start = __ pc();
2642     __ enter(); // required for proper stackwalking of RuntimeStub frame
2643 
2644     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2645 
2646     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2647 
2648     __ ld1(v5, __ T16B, __ post(key, 16));
2649     __ rev32(v5, __ T16B, v5);
2650 
2651     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2652     __ rev32(v1, __ T16B, v1);
2653     __ rev32(v2, __ T16B, v2);
2654     __ rev32(v3, __ T16B, v3);
2655     __ rev32(v4, __ T16B, v4);
2656     __ aesd(v0, v1);
2657     __ aesimc(v0, v0);
2658     __ aesd(v0, v2);
2659     __ aesimc(v0, v0);
2660     __ aesd(v0, v3);
2661     __ aesimc(v0, v0);
2662     __ aesd(v0, v4);
2663     __ aesimc(v0, v0);
2664 
2665     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2666     __ rev32(v1, __ T16B, v1);
2667     __ rev32(v2, __ T16B, v2);
2668     __ rev32(v3, __ T16B, v3);
2669     __ rev32(v4, __ T16B, v4);
2670     __ aesd(v0, v1);
2671     __ aesimc(v0, v0);
2672     __ aesd(v0, v2);
2673     __ aesimc(v0, v0);
2674     __ aesd(v0, v3);
2675     __ aesimc(v0, v0);
2676     __ aesd(v0, v4);
2677     __ aesimc(v0, v0);
2678 
2679     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2680     __ rev32(v1, __ T16B, v1);
2681     __ rev32(v2, __ T16B, v2);
2682 
2683     __ cmpw(keylen, 44);
2684     __ br(Assembler::EQ, L_doLast);
2685 
2686     __ aesd(v0, v1);
2687     __ aesimc(v0, v0);
2688     __ aesd(v0, v2);
2689     __ aesimc(v0, v0);
2690 
2691     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2692     __ rev32(v1, __ T16B, v1);
2693     __ rev32(v2, __ T16B, v2);
2694 
2695     __ cmpw(keylen, 52);
2696     __ br(Assembler::EQ, L_doLast);
2697 
2698     __ aesd(v0, v1);
2699     __ aesimc(v0, v0);
2700     __ aesd(v0, v2);
2701     __ aesimc(v0, v0);
2702 
2703     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2704     __ rev32(v1, __ T16B, v1);
2705     __ rev32(v2, __ T16B, v2);
2706 
2707     __ BIND(L_doLast);
2708 
2709     __ aesd(v0, v1);
2710     __ aesimc(v0, v0);
2711     __ aesd(v0, v2);
2712 
2713     __ eor(v0, __ T16B, v0, v5);
2714 
2715     __ st1(v0, __ T16B, to);
2716 
2717     __ mov(r0, 0);
2718 
2719     __ leave();
2720     __ ret(lr);
2721 
2722     return start;
2723   }
2724 
2725   // Arguments:
2726   //
2727   // Inputs:
2728   //   c_rarg0   - source byte array address
2729   //   c_rarg1   - destination byte array address
2730   //   c_rarg2   - K (key) in little endian int array
2731   //   c_rarg3   - r vector byte array address
2732   //   c_rarg4   - input length
2733   //
2734   // Output:
  //   r0        - input length
2736   //
2737   address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES instructions");
2739     __ align(CodeEntryAlignment);
2740     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2741 
2742     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2743 
2744     const Register from        = c_rarg0;  // source array address
2745     const Register to          = c_rarg1;  // destination array address
2746     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address
                                           // and left with the result of the last encryption block
2749     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2750     const Register keylen      = rscratch1;
2751 
2752     address start = __ pc();
2753 
2754       __ enter();
2755 
2756       __ movw(rscratch2, len_reg);
2757 
2758       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2759 
2760       __ ld1(v0, __ T16B, rvec);
2761 
2762       __ cmpw(keylen, 52);
2763       __ br(Assembler::CC, L_loadkeys_44);
2764       __ br(Assembler::EQ, L_loadkeys_52);
2765 
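      // Round keys are loaded according to key size: v17/v18 only for
      // AES-256 (keylen > 52), v19/v20 from L_loadkeys_52 for AES-192 and
      // AES-256, and the common tail from L_loadkeys_44 for every size.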
2766       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2767       __ rev32(v17, __ T16B, v17);
2768       __ rev32(v18, __ T16B, v18);
2769     __ BIND(L_loadkeys_52);
2770       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2771       __ rev32(v19, __ T16B, v19);
2772       __ rev32(v20, __ T16B, v20);
2773     __ BIND(L_loadkeys_44);
2774       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2775       __ rev32(v21, __ T16B, v21);
2776       __ rev32(v22, __ T16B, v22);
2777       __ rev32(v23, __ T16B, v23);
2778       __ rev32(v24, __ T16B, v24);
2779       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2780       __ rev32(v25, __ T16B, v25);
2781       __ rev32(v26, __ T16B, v26);
2782       __ rev32(v27, __ T16B, v27);
2783       __ rev32(v28, __ T16B, v28);
2784       __ ld1(v29, v30, v31, __ T16B, key);
2785       __ rev32(v29, __ T16B, v29);
2786       __ rev32(v30, __ T16B, v30);
2787       __ rev32(v31, __ T16B, v31);
2788 
2789     __ BIND(L_aes_loop);
2790       __ ld1(v1, __ T16B, __ post(from, 16));
2791       __ eor(v0, __ T16B, v0, v1);
2792 
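      // The condition flags still hold the result of the cmpw(keylen, 52)
      // done before the loop (nothing in the loop body sets flags), so the
      // per-key-size dispatch below needs no further compare.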
2793       __ br(Assembler::CC, L_rounds_44);
2794       __ br(Assembler::EQ, L_rounds_52);
2795 
2796       __ aese(v0, v17); __ aesmc(v0, v0);
2797       __ aese(v0, v18); __ aesmc(v0, v0);
2798     __ BIND(L_rounds_52);
2799       __ aese(v0, v19); __ aesmc(v0, v0);
2800       __ aese(v0, v20); __ aesmc(v0, v0);
2801     __ BIND(L_rounds_44);
2802       __ aese(v0, v21); __ aesmc(v0, v0);
2803       __ aese(v0, v22); __ aesmc(v0, v0);
2804       __ aese(v0, v23); __ aesmc(v0, v0);
2805       __ aese(v0, v24); __ aesmc(v0, v0);
2806       __ aese(v0, v25); __ aesmc(v0, v0);
2807       __ aese(v0, v26); __ aesmc(v0, v0);
2808       __ aese(v0, v27); __ aesmc(v0, v0);
2809       __ aese(v0, v28); __ aesmc(v0, v0);
2810       __ aese(v0, v29); __ aesmc(v0, v0);
2811       __ aese(v0, v30);
2812       __ eor(v0, __ T16B, v0, v31);
2813 
2814       __ st1(v0, __ T16B, __ post(to, 16));
2815 
2816       __ subw(len_reg, len_reg, 16);
2817       __ cbnzw(len_reg, L_aes_loop);
2818 
2819       __ st1(v0, __ T16B, rvec);
2820 
2821       __ mov(r0, rscratch2);
2822 
2823       __ leave();
2824       __ ret(lr);
2825 
2826       return start;
2827   }
2828 
2829   // Arguments:
2830   //
2831   // Inputs:
2832   //   c_rarg0   - source byte array address
2833   //   c_rarg1   - destination byte array address
2834   //   c_rarg2   - K (key) in little endian int array
2835   //   c_rarg3   - r vector byte array address
2836   //   c_rarg4   - input length
2837   //
2838   // Output:
2839   //   r0        - input length
2840   //
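  // A scalar model of the stub, for reference only (decrypt_block() is a
  // stand-in for one AES block decryption with the expanded key):
  //
  //   for (int i = 0; i < len; i += 16) {
  //     memcpy(c, from + i, 16);                           // save the ciphertext
  //     memcpy(p, c, 16);
  //     decrypt_block(p, key);
  //     for (int j = 0; j < 16; j++) to[i + j] = p[j] ^ r[j];
  //     memcpy(r, c, 16);                                  // ciphertext becomes the
  //   }                                                    // next chaining value
  //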
2841   address generate_cipherBlockChaining_decryptAESCrypt() {
    assert(UseAES, "need AES cryptographic extension support");
2843     __ align(CodeEntryAlignment);
2844     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2845 
2846     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2847 
2848     const Register from        = c_rarg0;  // source array address
2849     const Register to          = c_rarg1;  // destination array address
2850     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector address;
                                           // holds the last input (ciphertext) block on exit
2853     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2854     const Register keylen      = rscratch1;
2855 
2856     address start = __ pc();
2857 
2858       __ enter();
2859 
2860       __ movw(rscratch2, len_reg);
2861 
2862       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2863 
2864       __ ld1(v2, __ T16B, rvec);
2865 
2866       __ ld1(v31, __ T16B, __ post(key, 16));
2867       __ rev32(v31, __ T16B, v31);
2868 
2869       __ cmpw(keylen, 52);
2870       __ br(Assembler::CC, L_loadkeys_44);
2871       __ br(Assembler::EQ, L_loadkeys_52);
2872 
2873       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2874       __ rev32(v17, __ T16B, v17);
2875       __ rev32(v18, __ T16B, v18);
2876     __ BIND(L_loadkeys_52);
2877       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2878       __ rev32(v19, __ T16B, v19);
2879       __ rev32(v20, __ T16B, v20);
2880     __ BIND(L_loadkeys_44);
2881       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2882       __ rev32(v21, __ T16B, v21);
2883       __ rev32(v22, __ T16B, v22);
2884       __ rev32(v23, __ T16B, v23);
2885       __ rev32(v24, __ T16B, v24);
2886       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2887       __ rev32(v25, __ T16B, v25);
2888       __ rev32(v26, __ T16B, v26);
2889       __ rev32(v27, __ T16B, v27);
2890       __ rev32(v28, __ T16B, v28);
2891       __ ld1(v29, v30, __ T16B, key);
2892       __ rev32(v29, __ T16B, v29);
2893       __ rev32(v30, __ T16B, v30);
2894 
2895     __ BIND(L_aes_loop);
2896       __ ld1(v0, __ T16B, __ post(from, 16));
2897       __ orr(v1, __ T16B, v0, v0);
2898 
2899       __ br(Assembler::CC, L_rounds_44);
2900       __ br(Assembler::EQ, L_rounds_52);
2901 
2902       __ aesd(v0, v17); __ aesimc(v0, v0);
2903       __ aesd(v0, v18); __ aesimc(v0, v0);
2904     __ BIND(L_rounds_52);
2905       __ aesd(v0, v19); __ aesimc(v0, v0);
2906       __ aesd(v0, v20); __ aesimc(v0, v0);
2907     __ BIND(L_rounds_44);
2908       __ aesd(v0, v21); __ aesimc(v0, v0);
2909       __ aesd(v0, v22); __ aesimc(v0, v0);
2910       __ aesd(v0, v23); __ aesimc(v0, v0);
2911       __ aesd(v0, v24); __ aesimc(v0, v0);
2912       __ aesd(v0, v25); __ aesimc(v0, v0);
2913       __ aesd(v0, v26); __ aesimc(v0, v0);
2914       __ aesd(v0, v27); __ aesimc(v0, v0);
2915       __ aesd(v0, v28); __ aesimc(v0, v0);
2916       __ aesd(v0, v29); __ aesimc(v0, v0);
2917       __ aesd(v0, v30);
2918       __ eor(v0, __ T16B, v0, v31);
2919       __ eor(v0, __ T16B, v0, v2);
2920 
2921       __ st1(v0, __ T16B, __ post(to, 16));
2922       __ orr(v2, __ T16B, v1, v1);
2923 
2924       __ subw(len_reg, len_reg, 16);
2925       __ cbnzw(len_reg, L_aes_loop);
2926 
2927       __ st1(v2, __ T16B, rvec);
2928 
2929       __ mov(r0, rscratch2);
2930 
2931       __ leave();
2932       __ ret(lr);
2933 
2934     return start;
2935   }
2936 
2937   // Arguments:
2938   //
2939   // Inputs:
2940   //   c_rarg0   - byte[]  source+offset
2941   //   c_rarg1   - int[]   SHA.state
2942   //   c_rarg2   - int     offset
2943   //   c_rarg3   - int     limit
2944   //
2945   address generate_sha1_implCompress(bool multi_block, const char *name) {
2946     __ align(CodeEntryAlignment);
2947     StubCodeMark mark(this, "StubRoutines", name);
2948     address start = __ pc();
2949 
2950     Register buf   = c_rarg0;
2951     Register state = c_rarg1;
2952     Register ofs   = c_rarg2;
2953     Register limit = c_rarg3;
2954 
2955     Label keys;
2956     Label sha1_loop;
2957 
2958     // load the keys into v0..v3
2959     __ adr(rscratch1, keys);
2960     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load the 5-word state into v6, v7
2962     __ ldrq(v6, Address(state, 0));
2963     __ ldrs(v7, Address(state, 16));
2964 
2965 
2966     __ BIND(sha1_loop);
2967     // load 64 bytes of data into v16..v19
2968     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2969     __ rev32(v16, __ T16B, v16);
2970     __ rev32(v17, __ T16B, v17);
2971     __ rev32(v18, __ T16B, v18);
2972     __ rev32(v19, __ T16B, v19);
2973 
2974     // do the sha1
2975     __ addv(v4, __ T4S, v16, v0);
2976     __ orr(v20, __ T16B, v6, v6);
2977 
2978     FloatRegister d0 = v16;
2979     FloatRegister d1 = v17;
2980     FloatRegister d2 = v18;
2981     FloatRegister d3 = v19;
2982 
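    // Each iteration below retires four SHA-1 rounds, so 20 iterations cover
    // all 80 rounds: sha1c applies the Ch function (rounds 0..19), sha1m
    // applies Maj (rounds 40..59) and sha1p applies Parity (rounds 20..39 and
    // 60..79), while sha1su0/sha1su1 extend the message schedule in flight.
    // The constant-add for a later round group is issued one iteration ahead
    // of its use, which is why the key-selection thresholds (4, 9, 14) are
    // offset by one from the f-function thresholds (5, 10, 15).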
2983     for (int round = 0; round < 20; round++) {
2984       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2985       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2986       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2987       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2988       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2989 
2990       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2991       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2992       __ sha1h(tmp2, __ T4S, v20);
2993       if (round < 5)
2994         __ sha1c(v20, __ T4S, tmp3, tmp4);
2995       else if (round < 10 || round >= 15)
2996         __ sha1p(v20, __ T4S, tmp3, tmp4);
2997       else
2998         __ sha1m(v20, __ T4S, tmp3, tmp4);
2999       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3000 
3001       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3002     }
3003 
3004     __ addv(v7, __ T2S, v7, v21);
3005     __ addv(v6, __ T4S, v6, v20);
3006 
3007     if (multi_block) {
3008       __ add(ofs, ofs, 64);
3009       __ cmp(ofs, limit);
3010       __ br(Assembler::LE, sha1_loop);
3011       __ mov(c_rarg0, ofs); // return ofs
3012     }
3013 
3014     __ strq(v6, Address(state, 0));
3015     __ strs(v7, Address(state, 16));
3016 
3017     __ ret(lr);
3018 
3019     __ bind(keys);
3020     __ emit_int32(0x5a827999);
3021     __ emit_int32(0x6ed9eba1);
3022     __ emit_int32(0x8f1bbcdc);
3023     __ emit_int32(0xca62c1d6);
3024 
3025     return start;
3026   }
3027 
3028 
3029   // Arguments:
3030   //
3031   // Inputs:
3032   //   c_rarg0   - byte[]  source+offset
3033   //   c_rarg1   - int[]   SHA.state
3034   //   c_rarg2   - int     offset
3035   //   c_rarg3   - int     limit
3036   //
3037   address generate_sha256_implCompress(bool multi_block, const char *name) {
3038     static const uint32_t round_consts[64] = {
3039       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3040       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3041       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3042       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3043       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3044       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3045       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3046       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3047       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3048       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3049       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3050       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3051       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3052       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3053       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3054       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3055     };
3056     __ align(CodeEntryAlignment);
3057     StubCodeMark mark(this, "StubRoutines", name);
3058     address start = __ pc();
3059 
3060     Register buf   = c_rarg0;
3061     Register state = c_rarg1;
3062     Register ofs   = c_rarg2;
3063     Register limit = c_rarg3;
3064 
3065     Label sha1_loop;
3066 
3067     __ stpd(v8, v9, __ pre(sp, -32));
3068     __ stpd(v10, v11, Address(sp, 16));
3069 
3070 // dga == v0
3071 // dgb == v1
3072 // dg0 == v2
3073 // dg1 == v3
3074 // dg2 == v4
3075 // t0 == v6
3076 // t1 == v7
3077 
    // load the 64 round constants into v16..v31
3079     __ lea(rscratch1, ExternalAddress((address)round_consts));
3080     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3081     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3082     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3083     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3084 
    // load the 8-word (256-bit) state
3086     __ ldpq(v0, v1, state);
3087 
3088     __ BIND(sha1_loop);
3089     // load 64 bytes of data into v8..v11
3090     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3091     __ rev32(v8, __ T16B, v8);
3092     __ rev32(v9, __ T16B, v9);
3093     __ rev32(v10, __ T16B, v10);
3094     __ rev32(v11, __ T16B, v11);
3095 
3096     __ addv(v6, __ T4S, v8, v16);
3097     __ orr(v2, __ T16B, v0, v0);
3098     __ orr(v3, __ T16B, v1, v1);
3099 
3100     FloatRegister d0 = v8;
3101     FloatRegister d1 = v9;
3102     FloatRegister d2 = v10;
3103     FloatRegister d3 = v11;
3104 
3105 
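    // Each iteration below retires four SHA-256 rounds, so 16 iterations
    // cover all 64 rounds: sha256h/sha256h2 update the two halves of the
    // working state using the schedule word plus round constant accumulated
    // in t0/t1, while sha256su0/sha256su1 extend the message schedule for the
    // later rounds.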
3106     for (int round = 0; round < 16; round++) {
3107       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3108       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3109       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3110       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3111 
3112       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3113        __ orr(v4, __ T16B, v2, v2);
3114       if (round < 15)
3115         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3116       __ sha256h(v2, __ T4S, v3, tmp2);
3117       __ sha256h2(v3, __ T4S, v4, tmp2);
3118       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3119 
3120       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3121     }
3122 
3123     __ addv(v0, __ T4S, v0, v2);
3124     __ addv(v1, __ T4S, v1, v3);
3125 
3126     if (multi_block) {
3127       __ add(ofs, ofs, 64);
3128       __ cmp(ofs, limit);
3129       __ br(Assembler::LE, sha1_loop);
3130       __ mov(c_rarg0, ofs); // return ofs
3131     }
3132 
3133     __ ldpd(v10, v11, Address(sp, 16));
3134     __ ldpd(v8, v9, __ post(sp, 32));
3135 
3136     __ stpq(v0, v1, state);
3137 
3138     __ ret(lr);
3139 
3140     return start;
3141   }
3142 
3143   // Safefetch stubs.
3144   void generate_safefetch(const char* name, int size, address* entry,
3145                           address* fault_pc, address* continuation_pc) {
3146     // safefetch signatures:
3147     //   int      SafeFetch32(int*      adr, int      errValue);
3148     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3149     //
3150     // arguments:
3151     //   c_rarg0 = adr
3152     //   c_rarg1 = errValue
3153     //
3154     // result:
    //   r0       = *adr or errValue
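    //
    // If the load at *fault_pc faults, the signal handler resumes execution
    // at *continuation_pc with c_rarg1 still holding errValue, so the caller
    // observes errValue instead of a crash; on the non-faulting path c_rarg1
    // has already been overwritten with *adr by the time the continuation
    // code copies it into r0.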
3156 
3157     StubCodeMark mark(this, "StubRoutines", name);
3158 
3159     // Entry point, pc or function descriptor.
3160     *entry = __ pc();
3161 
3162     // Load *adr into c_rarg1, may fault.
3163     *fault_pc = __ pc();
3164     switch (size) {
3165       case 4:
3166         // int32_t
3167         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3168         break;
3169       case 8:
3170         // int64_t
3171         __ ldr(c_rarg1, Address(c_rarg0, 0));
3172         break;
3173       default:
3174         ShouldNotReachHere();
3175     }
3176 
3177     // return errValue or *adr
3178     *continuation_pc = __ pc();
3179     __ mov(r0, c_rarg1);
3180     __ ret(lr);
3181   }
3182 
3183   /**
3184    *  Arguments:
3185    *
3186    * Inputs:
3187    *   c_rarg0   - int crc
3188    *   c_rarg1   - byte* buf
3189    *   c_rarg2   - int length
3190    *
   * Output:
   *       r0   - int crc result
3193    */
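  //
  // Computes the same CRC-32 as java.util.zip.CRC32 and zlib, i.e. the
  // reflected polynomial 0xEDB88320.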
3194   address generate_updateBytesCRC32() {
3195     assert(UseCRC32Intrinsics, "what are we doing here?");
3196 
3197     __ align(CodeEntryAlignment);
3198     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3199 
3200     address start = __ pc();
3201 
3202     const Register crc   = c_rarg0;  // crc
3203     const Register buf   = c_rarg1;  // source java byte array address
3204     const Register len   = c_rarg2;  // length
3205     const Register table0 = c_rarg3; // crc_table address
3206     const Register table1 = c_rarg4;
3207     const Register table2 = c_rarg5;
3208     const Register table3 = c_rarg6;
3209     const Register tmp3 = c_rarg7;
3210 
3211     BLOCK_COMMENT("Entry:");
3212     __ enter(); // required for proper stackwalking of RuntimeStub frame
3213 
3214     __ kernel_crc32(crc, buf, len,
3215               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3216 
3217     __ leave(); // required for proper stackwalking of RuntimeStub frame
3218     __ ret(lr);
3219 
3220     return start;
3221   }
3222 
3223   /**
3224    *  Arguments:
3225    *
3226    * Inputs:
3227    *   c_rarg0   - int crc
3228    *   c_rarg1   - byte* buf
3229    *   c_rarg2   - int length
3230    *   c_rarg3   - int* table
3231    *
   * Output:
3233    *       r0   - int crc result
3234    */
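  //
  // Computes CRC-32C (Castagnoli, reflected polynomial 0x82F63B78), as used
  // by java.util.zip.CRC32C.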
3235   address generate_updateBytesCRC32C() {
3236     assert(UseCRC32CIntrinsics, "what are we doing here?");
3237 
3238     __ align(CodeEntryAlignment);
3239     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3240 
3241     address start = __ pc();
3242 
3243     const Register crc   = c_rarg0;  // crc
3244     const Register buf   = c_rarg1;  // source java byte array address
3245     const Register len   = c_rarg2;  // length
3246     const Register table0 = c_rarg3; // crc_table address
3247     const Register table1 = c_rarg4;
3248     const Register table2 = c_rarg5;
3249     const Register table3 = c_rarg6;
3250     const Register tmp3 = c_rarg7;
3251 
3252     BLOCK_COMMENT("Entry:");
3253     __ enter(); // required for proper stackwalking of RuntimeStub frame
3254 
3255     __ kernel_crc32c(crc, buf, len,
3256               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3257 
3258     __ leave(); // required for proper stackwalking of RuntimeStub frame
3259     __ ret(lr);
3260 
3261     return start;
3262   }
3263 
  /**
3265    *  Arguments:
3266    *
3267    *  Inputs:
3268    *   c_rarg0   - int   adler
3269    *   c_rarg1   - byte* buff
3270    *   c_rarg2   - int   len
3271    *
3272    * Output:
3273    *   c_rarg0   - int adler result
3274    */
3275   address generate_updateBytesAdler32() {
3276     __ align(CodeEntryAlignment);
3277     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3278     address start = __ pc();
3279 
3280     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3281 
3282     // Aliases
3283     Register adler  = c_rarg0;
3284     Register s1     = c_rarg0;
3285     Register s2     = c_rarg3;
3286     Register buff   = c_rarg1;
3287     Register len    = c_rarg2;
3288     Register nmax  = r4;
3289     Register base  = r5;
3290     Register count = r6;
3291     Register temp0 = rscratch1;
3292     Register temp1 = rscratch2;
3293     FloatRegister vbytes = v0;
3294     FloatRegister vs1acc = v1;
3295     FloatRegister vs2acc = v2;
3296     FloatRegister vtable = v3;
3297 
3298     // Max number of bytes we can process before having to take the mod
3299     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3300     unsigned long BASE = 0xfff1;
3301     unsigned long NMAX = 0x15B0;
3302 
3303     __ mov(base, BASE);
3304     __ mov(nmax, NMAX);
3305 
3306     // Load accumulation coefficients for the upper 16 bits
3307     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3308     __ ld1(vtable, __ T16B, Address(temp0));
3309 
3310     // s1 is initialized to the lower 16 bits of adler
3311     // s2 is initialized to the upper 16 bits of adler
3312     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3313     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3314 
    // The pipelined loop needs at least 16 elements per iteration.
    // It checks this itself, but it is cheaper to branch straight to the
    // cleanup loop for short inputs.
3317     __ cmp(len, (u1)16);
3318     __ br(Assembler::HS, L_nmax);
3319     __ cbz(len, L_combine);
3320 
3321     __ bind(L_simple_by1_loop);
3322     __ ldrb(temp0, Address(__ post(buff, 1)));
3323     __ add(s1, s1, temp0);
3324     __ add(s2, s2, s1);
3325     __ subs(len, len, 1);
3326     __ br(Assembler::HI, L_simple_by1_loop);
3327 
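    // The modular reductions below avoid division by exploiting
    // 2^16 == 15 (mod BASE): the high halfword is folded into the low one as
    // often as the magnitude requires, then BASE is subtracted conditionally.
    // A scalar sketch of the full reduction of a 32-bit sum (illustration
    // only):
    //
    //   s = (s & 0xffff) + 15 * (s >> 16);   // first fold
    //   s = (s & 0xffff) + 15 * (s >> 16);   // second fold, now s < 2 * BASE
    //   if (s >= BASE) s -= BASE;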
3328     // s1 = s1 % BASE
3329     __ subs(temp0, s1, base);
3330     __ csel(s1, temp0, s1, Assembler::HS);
3331 
3332     // s2 = s2 % BASE
3333     __ lsr(temp0, s2, 16);
3334     __ lsl(temp1, temp0, 4);
3335     __ sub(temp1, temp1, temp0);
3336     __ add(s2, temp1, s2, ext::uxth);
3337 
3338     __ subs(temp0, s2, base);
3339     __ csel(s2, temp0, s2, Assembler::HS);
3340 
3341     __ b(L_combine);
3342 
3343     __ bind(L_nmax);
3344     __ subs(len, len, nmax);
3345     __ sub(count, nmax, 16);
3346     __ br(Assembler::LO, L_by16);
3347 
3348     __ bind(L_nmax_loop);
3349 
3350     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3351                                       vbytes, vs1acc, vs2acc, vtable);
3352 
3353     __ subs(count, count, 16);
3354     __ br(Assembler::HS, L_nmax_loop);
3355 
3356     // s1 = s1 % BASE
3357     __ lsr(temp0, s1, 16);
3358     __ lsl(temp1, temp0, 4);
3359     __ sub(temp1, temp1, temp0);
3360     __ add(temp1, temp1, s1, ext::uxth);
3361 
3362     __ lsr(temp0, temp1, 16);
3363     __ lsl(s1, temp0, 4);
3364     __ sub(s1, s1, temp0);
3365     __ add(s1, s1, temp1, ext:: uxth);
3366 
3367     __ subs(temp0, s1, base);
3368     __ csel(s1, temp0, s1, Assembler::HS);
3369 
3370     // s2 = s2 % BASE
3371     __ lsr(temp0, s2, 16);
3372     __ lsl(temp1, temp0, 4);
3373     __ sub(temp1, temp1, temp0);
3374     __ add(temp1, temp1, s2, ext::uxth);
3375 
3376     __ lsr(temp0, temp1, 16);
3377     __ lsl(s2, temp0, 4);
3378     __ sub(s2, s2, temp0);
3379     __ add(s2, s2, temp1, ext:: uxth);
3380 
3381     __ subs(temp0, s2, base);
3382     __ csel(s2, temp0, s2, Assembler::HS);
3383 
3384     __ subs(len, len, nmax);
3385     __ sub(count, nmax, 16);
3386     __ br(Assembler::HS, L_nmax_loop);
3387 
3388     __ bind(L_by16);
3389     __ adds(len, len, count);
3390     __ br(Assembler::LO, L_by1);
3391 
3392     __ bind(L_by16_loop);
3393 
3394     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3395                                       vbytes, vs1acc, vs2acc, vtable);
3396 
3397     __ subs(len, len, 16);
3398     __ br(Assembler::HS, L_by16_loop);
3399 
3400     __ bind(L_by1);
3401     __ adds(len, len, 15);
3402     __ br(Assembler::LO, L_do_mod);
3403 
3404     __ bind(L_by1_loop);
3405     __ ldrb(temp0, Address(__ post(buff, 1)));
3406     __ add(s1, temp0, s1);
3407     __ add(s2, s2, s1);
3408     __ subs(len, len, 1);
3409     __ br(Assembler::HS, L_by1_loop);
3410 
3411     __ bind(L_do_mod);
3412     // s1 = s1 % BASE
3413     __ lsr(temp0, s1, 16);
3414     __ lsl(temp1, temp0, 4);
3415     __ sub(temp1, temp1, temp0);
3416     __ add(temp1, temp1, s1, ext::uxth);
3417 
3418     __ lsr(temp0, temp1, 16);
3419     __ lsl(s1, temp0, 4);
3420     __ sub(s1, s1, temp0);
3421     __ add(s1, s1, temp1, ext:: uxth);
3422 
3423     __ subs(temp0, s1, base);
3424     __ csel(s1, temp0, s1, Assembler::HS);
3425 
3426     // s2 = s2 % BASE
3427     __ lsr(temp0, s2, 16);
3428     __ lsl(temp1, temp0, 4);
3429     __ sub(temp1, temp1, temp0);
3430     __ add(temp1, temp1, s2, ext::uxth);
3431 
3432     __ lsr(temp0, temp1, 16);
3433     __ lsl(s2, temp0, 4);
3434     __ sub(s2, s2, temp0);
3435     __ add(s2, s2, temp1, ext:: uxth);
3436 
3437     __ subs(temp0, s2, base);
3438     __ csel(s2, temp0, s2, Assembler::HS);
3439 
3440     // Combine lower bits and higher bits
3441     __ bind(L_combine);
3442     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3443 
3444     __ ret(lr);
3445 
3446     return start;
3447   }
3448 
3449   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
3450           Register temp0, Register temp1, FloatRegister vbytes,
3451           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
3452     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
3453     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
3454     // In non-vectorized code, we update s1 and s2 as:
3455     //   s1 <- s1 + b1
3456     //   s2 <- s2 + s1
3457     //   s1 <- s1 + b2
    //   s2 <- s2 + s1
3459     //   ...
3460     //   s1 <- s1 + b16
3461     //   s2 <- s2 + s1
3462     // Putting above assignments together, we have:
3463     //   s1_new = s1 + b1 + b2 + ... + b16
3464     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
3465     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
3466     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
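    //
    // A scalar reference of one 16-byte step (illustration only):
    //
    //   uint32_t sum = 0, dot = 0;
    //   for (int i = 0; i < 16; i++) { sum += b[i]; dot += (16 - i) * b[i]; }
    //   s2 += 16 * s1 + dot;
    //   s1 += sum;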
3467     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
3468 
3469     // s2 = s2 + s1 * 16
3470     __ add(s2, s2, s1, Assembler::LSL, 4);
3471 
3472     // vs1acc = b1 + b2 + b3 + ... + b16
3473     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
3474     __ umullv(vs2acc, __ T8B, vtable, vbytes);
3475     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
3476     __ uaddlv(vs1acc, __ T16B, vbytes);
3477     __ uaddlv(vs2acc, __ T8H, vs2acc);
3478 
3479     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
3480     __ fmovd(temp0, vs1acc);
3481     __ fmovd(temp1, vs2acc);
3482     __ add(s1, s1, temp0);
3483     __ add(s2, s2, temp1);
3484   }
3485 
3486   /**
3487    *  Arguments:
3488    *
3489    *  Input:
3490    *    c_rarg0   - x address
3491    *    c_rarg1   - x length
3492    *    c_rarg2   - y address
   *    c_rarg3   - y length
3494    *    c_rarg4   - z address
3495    *    c_rarg5   - z length
3496    */
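  //
  // Backs the BigInteger.multiplyToLen intrinsic: z, of length xlen + ylen,
  // receives the product of the int arrays x and y.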
3497   address generate_multiplyToLen() {
3498     __ align(CodeEntryAlignment);
3499     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3500 
3501     address start = __ pc();
3502     const Register x     = r0;
3503     const Register xlen  = r1;
3504     const Register y     = r2;
3505     const Register ylen  = r3;
3506     const Register z     = r4;
3507     const Register zlen  = r5;
3508 
3509     const Register tmp1  = r10;
3510     const Register tmp2  = r11;
3511     const Register tmp3  = r12;
3512     const Register tmp4  = r13;
3513     const Register tmp5  = r14;
3514     const Register tmp6  = r15;
3515     const Register tmp7  = r16;
3516 
3517     BLOCK_COMMENT("Entry:");
3518     __ enter(); // required for proper stackwalking of RuntimeStub frame
3519     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3520     __ leave(); // required for proper stackwalking of RuntimeStub frame
3521     __ ret(lr);
3522 
3523     return start;
3524   }
3525 
3526   address generate_squareToLen() {
    // The squareToLen algorithm for sizes 1..127, as described in the Java
    // code, is faster than multiply_to_len on some CPUs and slower on others,
    // but multiply_to_len gives slightly better results overall.
3530     __ align(CodeEntryAlignment);
3531     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3532     address start = __ pc();
3533 
3534     const Register x     = r0;
3535     const Register xlen  = r1;
3536     const Register z     = r2;
3537     const Register zlen  = r3;
3538     const Register y     = r4; // == x
3539     const Register ylen  = r5; // == xlen
3540 
3541     const Register tmp1  = r10;
3542     const Register tmp2  = r11;
3543     const Register tmp3  = r12;
3544     const Register tmp4  = r13;
3545     const Register tmp5  = r14;
3546     const Register tmp6  = r15;
3547     const Register tmp7  = r16;
3548 
3549     RegSet spilled_regs = RegSet::of(y, ylen);
3550     BLOCK_COMMENT("Entry:");
3551     __ enter();
3552     __ push(spilled_regs, sp);
3553     __ mov(y, x);
3554     __ mov(ylen, xlen);
3555     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3556     __ pop(spilled_regs, sp);
3557     __ leave();
3558     __ ret(lr);
3559     return start;
3560   }
3561 
3562   address generate_mulAdd() {
3563     __ align(CodeEntryAlignment);
3564     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3565 
3566     address start = __ pc();
3567 
3568     const Register out     = r0;
3569     const Register in      = r1;
3570     const Register offset  = r2;
3571     const Register len     = r3;
3572     const Register k       = r4;
3573 
3574     BLOCK_COMMENT("Entry:");
3575     __ enter();
3576     __ mul_add(out, in, offset, len, k);
3577     __ leave();
3578     __ ret(lr);
3579 
3580     return start;
3581   }
3582 
3583   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3584                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3585                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3586     // Karatsuba multiplication performs a 128*128 -> 256-bit
3587     // multiplication in three 128-bit multiplications and a few
3588     // additions.
3589     //
3590     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3591     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3592     //
3593     // Inputs:
3594     //
3595     // A0 in a.d[0]     (subkey)
3596     // A1 in a.d[1]
3597     // (A1+A0) in a1_xor_a0.d[0]
3598     //
3599     // B0 in b.d[0]     (state)
3600     // B1 in b.d[1]
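    //
    // A compact model of the recombination below, assuming a hypothetical
    // 64x64 -> 128-bit carry-less multiply clmul() (illustration only):
    //
    //   uint128 C = clmul(A1, B1), D = clmul(A0, B0);
    //   uint128 E = clmul(A1 ^ A0, B1 ^ B0);
    //   uint128 M = C ^ D ^ E;              // middle Karatsuba term
    //   result_hi = C ^ (M >> 64);          // bits 255..128 of the product
    //   result_lo = D ^ (M << 64);          // bits 127..0  of the product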
3601 
3602     __ ext(tmp1, __ T16B, b, b, 0x08);
3603     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3604     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3605     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3606     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3607 
3608     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3609     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3610     __ eor(tmp2, __ T16B, tmp2, tmp4);
3611     __ eor(tmp2, __ T16B, tmp2, tmp3);
3612 
3613     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3614     __ ins(result_hi, __ D, tmp2, 0, 1);
3615     __ ins(result_lo, __ D, tmp2, 1, 0);
3616   }
3617 
3618   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3619                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3620     const FloatRegister t0 = result;
3621 
3622     // The GCM field polynomial f is z^128 + p(z), where p =
3623     // z^7+z^2+z+1.
3624     //
3625     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3626     //
3627     // so, given that the product we're reducing is
3628     //    a == lo + hi * z^128
3629     // substituting,
3630     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3631     //
3632     // we reduce by multiplying hi by p(z) and subtracting the result
3633     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3634     // bits we can do this with two 64-bit multiplications, lo*p and
3635     // hi*p.
3636 
3637     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3638     __ ext(t1, __ T16B, t0, z, 8);
3639     __ eor(hi, __ T16B, hi, t1);
3640     __ ext(t1, __ T16B, z, t0, 8);
3641     __ eor(lo, __ T16B, lo, t1);
3642     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3643     __ eor(result, __ T16B, lo, t0);
3644   }
3645 
3646   address generate_has_negatives(address &has_negatives_long) {
3647     const u1 large_loop_size = 64;
3648     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
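    // The stub returns 1 iff some byte in the array has its top bit set
    // (i.e. is negative as a Java byte); UPPER_BIT_MASK selects the sign bit
    // of each of the eight bytes in a 64-bit word.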
3649     int dcache_line = VM_Version::dcache_line_size();
3650 
3651     Register ary1 = r1, len = r2, result = r0;
3652 
3653     __ align(CodeEntryAlignment);
3654 
3655     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3656 
3657     address entry = __ pc();
3658 
3659     __ enter();
3660 
3661   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3662         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3663 
3664   __ cmp(len, (u1)15);
3665   __ br(Assembler::GT, LEN_OVER_15);
  // Execution only falls into this code when the pointer is near the end of a
  // memory page and we must avoid reading into the next page.
3668   __ add(ary1, ary1, len);
3669   __ subs(len, len, 8);
3670   __ br(Assembler::GT, LEN_OVER_8);
3671   __ ldr(rscratch2, Address(ary1, -8));
3672   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3673   __ lsrv(rscratch2, rscratch2, rscratch1);
3674   __ tst(rscratch2, UPPER_BIT_MASK);
3675   __ cset(result, Assembler::NE);
3676   __ leave();
3677   __ ret(lr);
3678   __ bind(LEN_OVER_8);
3679   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3680   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3681   __ tst(rscratch2, UPPER_BIT_MASK);
3682   __ br(Assembler::NE, RET_TRUE_NO_POP);
3683   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3684   __ lsrv(rscratch1, rscratch1, rscratch2);
3685   __ tst(rscratch1, UPPER_BIT_MASK);
3686   __ cset(result, Assembler::NE);
3687   __ leave();
3688   __ ret(lr);
3689 
3690   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3691   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3692 
3693   has_negatives_long = __ pc(); // 2nd entry point
3694 
3695   __ enter();
3696 
3697   __ bind(LEN_OVER_15);
3698     __ push(spilled_regs, sp);
3699     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3700     __ cbz(rscratch2, ALIGNED);
3701     __ ldp(tmp6, tmp1, Address(ary1));
3702     __ mov(tmp5, 16);
3703     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3704     __ add(ary1, ary1, rscratch1);
3705     __ sub(len, len, rscratch1);
3706     __ orr(tmp6, tmp6, tmp1);
3707     __ tst(tmp6, UPPER_BIT_MASK);
3708     __ br(Assembler::NE, RET_TRUE);
3709 
3710   __ bind(ALIGNED);
3711     __ cmp(len, large_loop_size);
3712     __ br(Assembler::LT, CHECK_16);
    // Perform a 16-byte load in the pre-loop as an early return, to handle the
    // case where an already-aligned large array has negative values in its
    // first bytes; otherwise LARGE_LOOP would do up to 4 reads instead of 1 in
    // the worst case, which is slower. Cases with negative bytes further in
    // are barely affected; in fact they get faster thanks to the early loads
    // and the fewer instructions and branches in LARGE_LOOP.
3719     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3720     __ sub(len, len, 16);
3721     __ orr(tmp6, tmp6, tmp1);
3722     __ tst(tmp6, UPPER_BIT_MASK);
3723     __ br(Assembler::NE, RET_TRUE);
3724     __ cmp(len, large_loop_size);
3725     __ br(Assembler::LT, CHECK_16);
3726 
3727     if (SoftwarePrefetchHintDistance >= 0
3728         && SoftwarePrefetchHintDistance >= dcache_line) {
3729       // initial prefetch
3730       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3731     }
3732   __ bind(LARGE_LOOP);
3733     if (SoftwarePrefetchHintDistance >= 0) {
3734       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3735     }
    // Issue the load instructions first, since that can save a few CPU/memory
    // cycles. Also, instead of four "orr(...); andr(...); cbnz(...)" triples
    // (one per ldp), generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which
    // saves three instructions per iteration and uses fewer branches. The
    // trade-off is that early return is no longer possible, so all 64 bytes
    // are loaded and checked every time.
3741     __ ldp(tmp2, tmp3, Address(ary1));
3742     __ ldp(tmp4, tmp5, Address(ary1, 16));
3743     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3744     __ ldp(tmp6, tmp1, Address(ary1, 48));
3745     __ add(ary1, ary1, large_loop_size);
3746     __ sub(len, len, large_loop_size);
3747     __ orr(tmp2, tmp2, tmp3);
3748     __ orr(tmp4, tmp4, tmp5);
3749     __ orr(rscratch1, rscratch1, rscratch2);
3750     __ orr(tmp6, tmp6, tmp1);
3751     __ orr(tmp2, tmp2, tmp4);
3752     __ orr(rscratch1, rscratch1, tmp6);
3753     __ orr(tmp2, tmp2, rscratch1);
3754     __ tst(tmp2, UPPER_BIT_MASK);
3755     __ br(Assembler::NE, RET_TRUE);
3756     __ cmp(len, large_loop_size);
3757     __ br(Assembler::GE, LARGE_LOOP);
3758 
3759   __ bind(CHECK_16); // small 16-byte load pre-loop
3760     __ cmp(len, (u1)16);
3761     __ br(Assembler::LT, POST_LOOP16);
3762 
3763   __ bind(LOOP16); // small 16-byte load loop
3764     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3765     __ sub(len, len, 16);
3766     __ orr(tmp2, tmp2, tmp3);
3767     __ tst(tmp2, UPPER_BIT_MASK);
3768     __ br(Assembler::NE, RET_TRUE);
3769     __ cmp(len, (u1)16);
3770     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3771 
3772   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3773     __ cmp(len, (u1)8);
3774     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3775     __ ldr(tmp3, Address(__ post(ary1, 8)));
3776     __ sub(len, len, 8);
3777     __ tst(tmp3, UPPER_BIT_MASK);
3778     __ br(Assembler::NE, RET_TRUE);
3779 
3780   __ bind(POST_LOOP16_LOAD_TAIL);
3781     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3782     __ ldr(tmp1, Address(ary1));
3783     __ mov(tmp2, 64);
3784     __ sub(tmp4, tmp2, len, __ LSL, 3);
3785     __ lslv(tmp1, tmp1, tmp4);
3786     __ tst(tmp1, UPPER_BIT_MASK);
3787     __ br(Assembler::NE, RET_TRUE);
3788     // Fallthrough
3789 
3790   __ bind(RET_FALSE);
3791     __ pop(spilled_regs, sp);
3792     __ leave();
3793     __ mov(result, zr);
3794     __ ret(lr);
3795 
3796   __ bind(RET_TRUE);
3797     __ pop(spilled_regs, sp);
3798   __ bind(RET_TRUE_NO_POP);
3799     __ leave();
3800     __ mov(result, 1);
3801     __ ret(lr);
3802 
3803   __ bind(DONE);
3804     __ pop(spilled_regs, sp);
3805     __ leave();
3806     __ ret(lr);
3807     return entry;
3808   }
3809 
3810   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3811         bool usePrefetch, Label &NOT_EQUAL) {
3812     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3813         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3814         tmp7 = r12, tmp8 = r13;
3815     Label LOOP;
3816 
3817     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3818     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3819     __ bind(LOOP);
3820     if (usePrefetch) {
3821       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3822       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3823     }
3824     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3825     __ eor(tmp1, tmp1, tmp2);
3826     __ eor(tmp3, tmp3, tmp4);
3827     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3828     __ orr(tmp1, tmp1, tmp3);
3829     __ cbnz(tmp1, NOT_EQUAL);
3830     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3831     __ eor(tmp5, tmp5, tmp6);
3832     __ eor(tmp7, tmp7, tmp8);
3833     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3834     __ orr(tmp5, tmp5, tmp7);
3835     __ cbnz(tmp5, NOT_EQUAL);
3836     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3837     __ eor(tmp1, tmp1, tmp2);
3838     __ eor(tmp3, tmp3, tmp4);
3839     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3840     __ orr(tmp1, tmp1, tmp3);
3841     __ cbnz(tmp1, NOT_EQUAL);
3842     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3843     __ eor(tmp5, tmp5, tmp6);
3844     __ sub(cnt1, cnt1, 8 * wordSize);
3845     __ eor(tmp7, tmp7, tmp8);
3846     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3847     // tmp6 is not used. MacroAssembler::subs is used here (rather than
3848     // cmp) because subs allows an unlimited range of immediate operand.
3849     __ subs(tmp6, cnt1, loopThreshold);
3850     __ orr(tmp5, tmp5, tmp7);
3851     __ cbnz(tmp5, NOT_EQUAL);
3852     __ br(__ GE, LOOP);
3853     // post-loop
3854     __ eor(tmp1, tmp1, tmp2);
3855     __ eor(tmp3, tmp3, tmp4);
3856     __ orr(tmp1, tmp1, tmp3);
3857     __ sub(cnt1, cnt1, 2 * wordSize);
3858     __ cbnz(tmp1, NOT_EQUAL);
3859   }
3860 
3861   void generate_large_array_equals_loop_simd(int loopThreshold,
3862         bool usePrefetch, Label &NOT_EQUAL) {
3863     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3864         tmp2 = rscratch2;
3865     Label LOOP;
3866 
3867     __ bind(LOOP);
3868     if (usePrefetch) {
3869       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3870       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3871     }
3872     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3873     __ sub(cnt1, cnt1, 8 * wordSize);
3874     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3875     __ subs(tmp1, cnt1, loopThreshold);
3876     __ eor(v0, __ T16B, v0, v4);
3877     __ eor(v1, __ T16B, v1, v5);
3878     __ eor(v2, __ T16B, v2, v6);
3879     __ eor(v3, __ T16B, v3, v7);
3880     __ orr(v0, __ T16B, v0, v1);
3881     __ orr(v1, __ T16B, v2, v3);
3882     __ orr(v0, __ T16B, v0, v1);
3883     __ umov(tmp1, v0, __ D, 0);
3884     __ umov(tmp2, v0, __ D, 1);
3885     __ orr(tmp1, tmp1, tmp2);
3886     __ cbnz(tmp1, NOT_EQUAL);
3887     __ br(__ GE, LOOP);
3888   }
3889 
3890   // a1 = r1 - array1 address
3891   // a2 = r2 - array2 address
3892   // result = r0 - return value. Already contains "false"
3893   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
3894   // r3-r5 are reserved temporary registers
3895   address generate_large_array_equals() {
3896     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3897         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3898         tmp7 = r12, tmp8 = r13;
3899     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3900         SMALL_LOOP, POST_LOOP;
3901     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3902     // calculate if at least 32 prefetched bytes are used
3903     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3904     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3905     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3906     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3907         tmp5, tmp6, tmp7, tmp8);
3908 
3909     __ align(CodeEntryAlignment);
3910 
3911     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3912 
3913     address entry = __ pc();
3914     __ enter();
3915     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
3916     // also advance pointers to use post-increment instead of pre-increment
3917     __ add(a1, a1, wordSize);
3918     __ add(a2, a2, wordSize);
3919     if (AvoidUnalignedAccesses) {
      // Both implementations (SIMD and non-SIMD) use relatively wide load
      // instructions (ld1/ldp), which on some CPUs carry a large penalty (up
      // to 2x execution time) when the address is not at least 16-byte
      // aligned. Arrays are currently 8-byte aligned, so if necessary we do an
      // extra 8-byte load to bring at least the first address to 16-byte
      // alignment.
3925       Label ALIGNED16;
3926       __ tbz(a1, 3, ALIGNED16);
3927       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3928       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3929       __ sub(cnt1, cnt1, wordSize);
3930       __ eor(tmp1, tmp1, tmp2);
3931       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3932       __ bind(ALIGNED16);
3933     }
3934     if (UseSIMDForArrayEquals) {
3935       if (SoftwarePrefetchHintDistance >= 0) {
3936         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3937         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3938         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3939             /* prfm = */ true, NOT_EQUAL);
3940         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3941         __ br(__ LT, TAIL);
3942       }
3943       __ bind(NO_PREFETCH_LARGE_LOOP);
3944       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3945           /* prfm = */ false, NOT_EQUAL);
3946     } else {
3947       __ push(spilled_regs, sp);
3948       if (SoftwarePrefetchHintDistance >= 0) {
3949         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3950         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3951         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3952             /* prfm = */ true, NOT_EQUAL);
3953         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3954         __ br(__ LT, TAIL);
3955       }
3956       __ bind(NO_PREFETCH_LARGE_LOOP);
3957       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3958           /* prfm = */ false, NOT_EQUAL);
3959     }
3960     __ bind(TAIL);
3961       __ cbz(cnt1, EQUAL);
3962       __ subs(cnt1, cnt1, wordSize);
3963       __ br(__ LE, POST_LOOP);
3964     __ bind(SMALL_LOOP);
3965       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3966       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3967       __ subs(cnt1, cnt1, wordSize);
3968       __ eor(tmp1, tmp1, tmp2);
3969       __ cbnz(tmp1, NOT_EQUAL);
3970       __ br(__ GT, SMALL_LOOP);
3971     __ bind(POST_LOOP);
3972       __ ldr(tmp1, Address(a1, cnt1));
3973       __ ldr(tmp2, Address(a2, cnt1));
3974       __ eor(tmp1, tmp1, tmp2);
3975       __ cbnz(tmp1, NOT_EQUAL);
3976     __ bind(EQUAL);
3977       __ mov(result, true);
3978     __ bind(NOT_EQUAL);
3979       if (!UseSIMDForArrayEquals) {
3980         __ pop(spilled_regs, sp);
3981       }
3982     __ bind(NOT_EQUAL_NO_POP);
3983     __ leave();
3984     __ ret(lr);
3985     return entry;
3986   }
3987 
3988   address generate_dsin_dcos(bool isCos) {
3989     __ align(CodeEntryAlignment);
3990     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
3991     address start = __ pc();
3992     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
3993         (address)StubRoutines::aarch64::_two_over_pi,
3994         (address)StubRoutines::aarch64::_pio2,
3995         (address)StubRoutines::aarch64::_dsin_coef,
3996         (address)StubRoutines::aarch64::_dcos_coef);
3997     return start;
3998   }
3999 
4000   address generate_dlog() {
4001     __ align(CodeEntryAlignment);
4002     StubCodeMark mark(this, "StubRoutines", "dlog");
4003     address entry = __ pc();
4004     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
4005         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
4006     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
4007     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
4008         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
4009     return entry;
4010   }
4011 
4012   // code for comparing 16 bytes of strings with same encoding
4013   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
4014     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
4015     __ ldr(rscratch1, Address(__ post(str1, 8)));
4016     __ eor(rscratch2, tmp1, tmp2);
4017     __ ldr(cnt1, Address(__ post(str2, 8)));
4018     __ cbnz(rscratch2, DIFF1);
4019     __ ldr(tmp1, Address(__ post(str1, 8)));
4020     __ eor(rscratch2, rscratch1, cnt1);
4021     __ ldr(tmp2, Address(__ post(str2, 8)));
4022     __ cbnz(rscratch2, DIFF2);
4023   }
4024 
4025   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
4026   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4027       Label &DIFF2) {
4028     Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12;
4029     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
4030 
4031     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4032     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4033     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4034     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4035 
4036     __ fmovd(tmpL, vtmp3);
4037     __ eor(rscratch2, tmp3, tmpL);
4038     __ cbnz(rscratch2, DIFF2);
4039 
4040     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4041     __ umov(tmpL, vtmp3, __ D, 1);
4042     __ eor(rscratch2, tmpU, tmpL);
4043     __ cbnz(rscratch2, DIFF1);
4044 
4045     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4046     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4047     __ fmovd(tmpL, vtmp);
4048     __ eor(rscratch2, tmp3, tmpL);
4049     __ cbnz(rscratch2, DIFF2);
4050 
4051     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4052     __ umov(tmpL, vtmp, __ D, 1);
4053     __ eor(rscratch2, tmpU, tmpL);
4054     __ cbnz(rscratch2, DIFF1);
4055   }
4056 
4057   // r0  = result
4058   // r1  = str1
4059   // r2  = cnt1
4060   // r3  = str2
4061   // r4  = cnt2
4062   // r10 = tmp1
4063   // r11 = tmp2
4064   address generate_compare_long_string_different_encoding(bool isLU) {
4065     __ align(CodeEntryAlignment);
4066     StubCodeMark mark(this, "StubRoutines", isLU
4067         ? "compare_long_string_different_encoding LU"
4068         : "compare_long_string_different_encoding UL");
4069     address entry = __ pc();
4070     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4071         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
4072         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4073     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4074         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4075     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4076     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4077 
4078     int prefetchLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance/2);
4079 
4080     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
    // cnt2 == number of characters left to compare
    // Check the first 4 characters, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
4083     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4084     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4085     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4086     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4087     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4088     __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1);
4089     __ eor(rscratch2, tmp1, tmp2);
4090     __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0);
4091     __ mov(rscratch1, tmp2);
4092     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4093     Register strU = isLU ? str2 : str1,
4094              strL = isLU ? str1 : str2,
4095              tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4096              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4097     __ push(spilled_regs, sp);
4098     __ sub(tmp2, strL, cnt2); // strL pointer to load from
4099     __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from
4100 
4101     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4102 
4103     if (SoftwarePrefetchHintDistance >= 0) {
4104       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4105       __ br(__ LT, NO_PREFETCH);
4106       __ bind(LARGE_LOOP_PREFETCH);
4107         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4108         __ mov(tmp4, 2);
4109         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4110         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4111           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4112           __ subs(tmp4, tmp4, 1);
4113           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4114           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4115           __ mov(tmp4, 2);
4116         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4117           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4118           __ subs(tmp4, tmp4, 1);
4119           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4120           __ sub(cnt2, cnt2, 64);
4121           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4122           __ br(__ GE, LARGE_LOOP_PREFETCH);
4123     }
4124     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4125     __ bind(NO_PREFETCH);
4126     __ subs(cnt2, cnt2, 16);
4127     __ br(__ LT, TAIL);
4128     __ bind(SMALL_LOOP); // smaller loop
4129       __ subs(cnt2, cnt2, 16);
4130       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4131       __ br(__ GE, SMALL_LOOP);
4132       __ cmn(cnt2, (u1)16);
4133       __ br(__ EQ, LOAD_LAST);
4134     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
4135       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 8 bytes before last 4 characters in UTF-16 string
4136       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
4137       __ ldr(tmp3, Address(cnt1, -8));
4138       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
4139       __ b(LOAD_LAST);
4140     __ bind(DIFF2);
4141       __ mov(tmpU, tmp3);
4142     __ bind(DIFF1);
4143       __ pop(spilled_regs, sp);
4144       __ b(CALCULATE_DIFFERENCE);
4145     __ bind(LOAD_LAST);
      // The last 4 UTF-16 characters are already pre-loaded into tmp3 by
      // compare_string_16_x_LU, so there is no need to load them again.
4148       __ mov(tmpU, tmp3);
4149       __ pop(spilled_regs, sp);
4150 
4151       __ ldrs(vtmp, Address(strL));
4152       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4153       __ fmovd(tmpL, vtmp);
4154 
4155       __ eor(rscratch2, tmpU, tmpL);
4156       __ cbz(rscratch2, DONE);
4157 
4158     // Find the first different characters in the longwords and
4159     // compute their difference.
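    // rscratch2 holds the XOR of the two longwords: rev + clz locate the
    // lowest-addressed (least significant, since loads are little-endian)
    // differing bit, andr(-16) rounds that down to a 16-bit character
    // boundary, and the shifts below bring the differing characters to the
    // bottom so they can be zero-extended and subtracted.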
4160     __ bind(CALCULATE_DIFFERENCE);
4161       __ rev(rscratch2, rscratch2);
4162       __ clz(rscratch2, rscratch2);
4163       __ andr(rscratch2, rscratch2, -16);
4164       __ lsrv(tmp1, tmp1, rscratch2);
4165       __ uxthw(tmp1, tmp1);
4166       __ lsrv(rscratch1, rscratch1, rscratch2);
4167       __ uxthw(rscratch1, rscratch1);
4168       __ subw(result, tmp1, rscratch1);
4169     __ bind(DONE);
4170       __ ret(lr);
4171     return entry;
4172   }
4173 
4174   // r0  = result
4175   // r1  = str1
4176   // r2  = cnt1
4177   // r3  = str2
4178   // r4  = cnt2
4179   // r10 = tmp1
4180   // r11 = tmp2
4181   address generate_compare_long_string_same_encoding(bool isLL) {
4182     __ align(CodeEntryAlignment);
4183     StubCodeMark mark(this, "StubRoutines", isLL
4184         ? "compare_long_string_same_encoding LL"
4185         : "compare_long_string_same_encoding UU");
4186     address entry = __ pc();
4187     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4188         tmp1 = r10, tmp2 = r11;
4189     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4190         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4191         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
    // exit the large loop when fewer than 64 bytes are left to read or when
    // the next prefetch would reach past the end of the array
4194     int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
    // cnt1/cnt2 hold the number of characters to compare; cnt1 can be re-used.
    // Account for the 8 bytes that were already loaded by updating cnt2.
4197     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4198     // update pointers, because of previous read
4199     __ add(str1, str1, wordSize);
4200     __ add(str2, str2, wordSize);
4201     if (SoftwarePrefetchHintDistance >= 0) {
4202       __ bind(LARGE_LOOP_PREFETCH);
4203         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4204         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4205         compare_string_16_bytes_same(DIFF, DIFF2);
4206         compare_string_16_bytes_same(DIFF, DIFF2);
4207         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4208         compare_string_16_bytes_same(DIFF, DIFF2);
4209         __ subs(rscratch2, cnt2, largeLoopExitCondition);
4210         compare_string_16_bytes_same(DIFF, DIFF2);
4211         __ br(__ GT, LARGE_LOOP_PREFETCH);
4212         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4213     }
4214     // less than 16 bytes left?
4215     __ subs(cnt2, cnt2, isLL ? 16 : 8);
4216     __ br(__ LT, TAIL);
4217     __ bind(SMALL_LOOP);
4218       compare_string_16_bytes_same(DIFF, DIFF2);
4219       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4220       __ br(__ GE, SMALL_LOOP);
4221     __ bind(TAIL);
4222       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4223       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4224       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4225       __ br(__ LE, CHECK_LAST);
4226       __ eor(rscratch2, tmp1, tmp2);
4227       __ cbnz(rscratch2, DIFF);
4228       __ ldr(tmp1, Address(__ post(str1, 8)));
4229       __ ldr(tmp2, Address(__ post(str2, 8)));
4230       __ sub(cnt2, cnt2, isLL ? 8 : 4);
4231     __ bind(CHECK_LAST);
4232       if (!isLL) {
4233         __ add(cnt2, cnt2, cnt2); // now in bytes
4234       }
4235       __ eor(rscratch2, tmp1, tmp2);
4236       __ cbnz(rscratch2, DIFF);
4237       __ ldr(rscratch1, Address(str1, cnt2));
4238       __ ldr(cnt1, Address(str2, cnt2));
4239       __ eor(rscratch2, rscratch1, cnt1);
4240       __ cbz(rscratch2, LENGTH_DIFF);
4241       // Find the first different characters in the longwords and
4242       // compute their difference.
4243     __ bind(DIFF2);
4244       __ rev(rscratch2, rscratch2);
4245       __ clz(rscratch2, rscratch2);
4246       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4247       __ lsrv(rscratch1, rscratch1, rscratch2);
4248       if (isLL) {
4249         __ lsrv(cnt1, cnt1, rscratch2);
4250         __ uxtbw(rscratch1, rscratch1);
4251         __ uxtbw(cnt1, cnt1);
4252       } else {
4253         __ lsrv(cnt1, cnt1, rscratch2);
4254         __ uxthw(rscratch1, rscratch1);
4255         __ uxthw(cnt1, cnt1);
4256       }
4257       __ subw(result, rscratch1, cnt1);
4258       __ b(LENGTH_DIFF);
4259     __ bind(DIFF);
4260       __ rev(rscratch2, rscratch2);
4261       __ clz(rscratch2, rscratch2);
4262       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4263       __ lsrv(tmp1, tmp1, rscratch2);
4264       if (isLL) {
4265         __ lsrv(tmp2, tmp2, rscratch2);
4266         __ uxtbw(tmp1, tmp1);
4267         __ uxtbw(tmp2, tmp2);
4268       } else {
4269         __ lsrv(tmp2, tmp2, rscratch2);
4270         __ uxthw(tmp1, tmp1);
4271         __ uxthw(tmp2, tmp2);
4272       }
4273       __ subw(result, tmp1, tmp2);
4274       __ b(LENGTH_DIFF);
4275     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4276       __ eor(rscratch2, tmp1, tmp2);
4277       __ cbnz(rscratch2, DIFF);
4278     __ bind(LENGTH_DIFF);
4279       __ ret(lr);
4280     return entry;
4281   }
4282 
4283   void generate_compare_long_strings() {
4284       StubRoutines::aarch64::_compare_long_string_LL
4285           = generate_compare_long_string_same_encoding(true);
4286       StubRoutines::aarch64::_compare_long_string_UU
4287           = generate_compare_long_string_same_encoding(false);
4288       StubRoutines::aarch64::_compare_long_string_LU
4289           = generate_compare_long_string_different_encoding(true);
4290       StubRoutines::aarch64::_compare_long_string_UL
4291           = generate_compare_long_string_different_encoding(false);
4292   }
4293 
4294   // R0 = result
4295   // R1 = str2
4296   // R2 = cnt1
4297   // R3 = str1
4298   // R4 = cnt2
4299   // This generic linear code uses a few additional ideas that make it faster:
4300   // 1) since length >= 8, we can safely keep at least the 1st register of the
4301   // pattern, which lets us skip the initial load (helps on systems with a
4302   // single load pipeline)
4303   // 2) we use a "fast" algorithm for locating the first pattern character with
4304   // fewer branches (one branch per loaded register instead of one per symbol);
4305   // this is where constants like 0x0101...01, 0x00010001...0001, 0x7f7f...7f
4306   // and 0x7fff7fff...7fff come from (see the C sketch below)
4307   // 3) after the 1st register of the source string is loaded and analyzed, it
4308   // can be reused to search for every occurrence of the 1st character, saving
4309   // a few loads compared with a simpler-but-slower implementation
4310   // 4) to avoid lots of push/pop operations the code below heavily re-uses,
4311   // re-initializes and compresses register values; this makes the code larger
4312   // and a bit less readable, but most of the extra work hides under loads or branches, so the penalty is minimal
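  // A minimal C sketch (illustrative only) of the zero-byte trick from item 2
  // above, Latin-1 case; the stub performs the equivalent eor/sub/orr/bics
  // sequence so the answer lands in the flags instead of a register:
  //
  //   // Nonzero iff some byte of x equals the byte c.
  //   static inline unsigned long first_char_mask(unsigned long x, unsigned char c) {
  //     unsigned long pattern = 0x0101010101010101UL * c;  // c repeated in every byte
  //     unsigned long t = x ^ pattern;                      // matching bytes become 0x00
  //     return (t - 0x0101010101010101UL) & ~t & 0x8080808080808080UL;
  //   }
  //
  // For UTF-16 the same trick is applied per 16-bit lane using the
  // 0x0001...0001 and 0x7fff...7fff constants.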
4313   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4314     const char* stubName = str1_isL
4315         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4316         : "indexof_linear_uu";
4317     __ align(CodeEntryAlignment);
4318     StubCodeMark mark(this, "StubRoutines", stubName);
4319     address entry = __ pc();
4320 
4321     int str1_chr_size = str1_isL ? 1 : 2;
4322     int str2_chr_size = str2_isL ? 1 : 2;
4323     int str1_chr_shift = str1_isL ? 0 : 1;
4324     int str2_chr_shift = str2_isL ? 0 : 1;
4325     bool isL = str1_isL && str2_isL;
4326     // parameters
4327     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4328     // temporary registers
4329     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4330     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4331     // redefinitions
4332     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4333 
4334     __ push(spilled_regs, sp);
4335     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
4336         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4337         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4338         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4339         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4340         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
4341     // Read whole register from str1. It is safe, because length >=8 here
4342     __ ldr(ch1, Address(str1));
4343     // Read whole register from str2. It is safe, because length >=8 here
4344     __ ldr(ch2, Address(str2));
4345     __ sub(cnt2, cnt2, cnt1);
4346     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4347     if (str1_isL != str2_isL) {
4348       __ eor(v0, __ T16B, v0, v0);
4349     }
4350     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4351     __ mul(first, first, tmp1);
4352     // check if we have less than 1 register to check
4353     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4354     if (str1_isL != str2_isL) {
4355       __ fmovd(v1, ch1);
4356     }
4357     __ br(__ LE, L_SMALL);
4358     __ eor(ch2, first, ch2);
4359     if (str1_isL != str2_isL) {
4360       __ zip1(v1, __ T16B, v1, v0);
4361     }
4362     __ sub(tmp2, ch2, tmp1);
4363     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4364     __ bics(tmp2, tmp2, ch2);
4365     if (str1_isL != str2_isL) {
4366       __ fmovd(ch1, v1);
4367     }
4368     __ br(__ NE, L_HAS_ZERO);
4369     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4370     __ add(result, result, wordSize/str2_chr_size);
4371     __ add(str2, str2, wordSize);
4372     __ br(__ LT, L_POST_LOOP);
4373     __ BIND(L_LOOP);
4374       __ ldr(ch2, Address(str2));
4375       __ eor(ch2, first, ch2);
4376       __ sub(tmp2, ch2, tmp1);
4377       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4378       __ bics(tmp2, tmp2, ch2);
4379       __ br(__ NE, L_HAS_ZERO);
4380     __ BIND(L_LOOP_PROCEED);
4381       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4382       __ add(str2, str2, wordSize);
4383       __ add(result, result, wordSize/str2_chr_size);
4384       __ br(__ GE, L_LOOP);
4385     __ BIND(L_POST_LOOP);
4386       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
4387       __ br(__ LE, NOMATCH);
4388       __ ldr(ch2, Address(str2));
4389       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4390       __ eor(ch2, first, ch2);
4391       __ sub(tmp2, ch2, tmp1);
4392       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4393       __ mov(tmp4, -1); // all bits set
4394       __ b(L_SMALL_PROCEED);
4395     __ align(OptoLoopAlignment);
4396     __ BIND(L_SMALL);
4397       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4398       __ eor(ch2, first, ch2);
4399       if (str1_isL != str2_isL) {
4400         __ zip1(v1, __ T16B, v1, v0);
4401       }
4402       __ sub(tmp2, ch2, tmp1);
4403       __ mov(tmp4, -1); // all bits set
4404       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4405       if (str1_isL != str2_isL) {
4406         __ fmovd(ch1, v1); // move converted 4 symbols
4407       }
4408     __ BIND(L_SMALL_PROCEED);
4409       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
4410       __ bic(tmp2, tmp2, ch2);
4411       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
4412       __ rbit(tmp2, tmp2);
4413       __ br(__ EQ, NOMATCH);
4414     __ BIND(L_SMALL_HAS_ZERO_LOOP);
4415       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
4416       __ cmp(cnt1, u1(wordSize/str2_chr_size));
4417       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4418       if (str2_isL) { // LL
4419         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4420         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4421         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4422         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4423         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4424       } else {
4425         __ mov(ch2, 0xE); // all bits in byte set except last one
4426         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4427         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4428         __ lslv(tmp2, tmp2, tmp4);
4429         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4430         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4431         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4432         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4433       }
4434       __ cmp(ch1, ch2);
4435       __ mov(tmp4, wordSize/str2_chr_size);
4436       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4437     __ BIND(L_SMALL_CMP_LOOP);
4438       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4439                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4440       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4441                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4442       __ add(tmp4, tmp4, 1);
4443       __ cmp(tmp4, cnt1);
4444       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4445       __ cmp(first, ch2);
4446       __ br(__ EQ, L_SMALL_CMP_LOOP);
4447     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4448       __ cbz(tmp2, NOMATCH); // no more matches. exit
4449       __ clz(tmp4, tmp2);
4450       __ add(result, result, 1); // advance index
4451       __ add(str2, str2, str2_chr_size); // advance pointer
4452       __ b(L_SMALL_HAS_ZERO_LOOP);
4453     __ align(OptoLoopAlignment);
4454     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4455       __ cmp(first, ch2);
4456       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4457       __ b(DONE);
4458     __ align(OptoLoopAlignment);
4459     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4460       if (str2_isL) { // LL
4461         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4462         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4463         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4464         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4465         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4466       } else {
4467         __ mov(ch2, 0xE); // all bits in byte set except last one
4468         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4469         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4470         __ lslv(tmp2, tmp2, tmp4);
4471         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4472         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4473         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4474         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4475       }
4476       __ cmp(ch1, ch2);
4477       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4478       __ b(DONE);
4479     __ align(OptoLoopAlignment);
4480     __ BIND(L_HAS_ZERO);
4481       __ rbit(tmp2, tmp2);
4482       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
4483       // Now compress the two counters (cnt2 and cnt1) into one register. This
4484       // is fine because both counters are 32-bit and are not changed in this
4485       // loop; they are restored on exit, so cnt1 can be re-used in this loop.
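      // (i.e. cnt2 becomes (cnt1 << 32) | cnt2; cnt1 is recovered with a 32-bit
      //  shift and cnt2 with a 32-bit move on exit)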
4486       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
4487       __ sub(result, result, 1);
4488     __ BIND(L_HAS_ZERO_LOOP);
4489       __ mov(cnt1, wordSize/str2_chr_size);
4490       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4491       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4492       if (str2_isL) {
4493         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4494         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4495         __ lslv(tmp2, tmp2, tmp4);
4496         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4497         __ add(tmp4, tmp4, 1);
4498         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4499         __ lsl(tmp2, tmp2, 1);
4500         __ mov(tmp4, wordSize/str2_chr_size);
4501       } else {
4502         __ mov(ch2, 0xE);
4503         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4504         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4505         __ lslv(tmp2, tmp2, tmp4);
4506         __ add(tmp4, tmp4, 1);
4507         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4508         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4509         __ lsl(tmp2, tmp2, 1);
4510         __ mov(tmp4, wordSize/str2_chr_size);
4511         __ sub(str2, str2, str2_chr_size);
4512       }
4513       __ cmp(ch1, ch2);
4514       __ mov(tmp4, wordSize/str2_chr_size);
4515       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4516     __ BIND(L_CMP_LOOP);
4517       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4518                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4519       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4520                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4521       __ add(tmp4, tmp4, 1);
4522       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4523       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4524       __ cmp(cnt1, ch2);
4525       __ br(__ EQ, L_CMP_LOOP);
4526     __ BIND(L_CMP_LOOP_NOMATCH);
4527       // here we're not matched
4528       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4529       __ clz(tmp4, tmp2);
4530       __ add(str2, str2, str2_chr_size); // advance pointer
4531       __ b(L_HAS_ZERO_LOOP);
4532     __ align(OptoLoopAlignment);
4533     __ BIND(L_CMP_LOOP_LAST_CMP);
4534       __ cmp(cnt1, ch2);
4535       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4536       __ b(DONE);
4537     __ align(OptoLoopAlignment);
4538     __ BIND(L_CMP_LOOP_LAST_CMP2);
4539       if (str2_isL) {
4540         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4541         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4542         __ lslv(tmp2, tmp2, tmp4);
4543         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4544         __ add(tmp4, tmp4, 1);
4545         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4546         __ lsl(tmp2, tmp2, 1);
4547       } else {
4548         __ mov(ch2, 0xE);
4549         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4550         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4551         __ lslv(tmp2, tmp2, tmp4);
4552         __ add(tmp4, tmp4, 1);
4553         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4554         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4555         __ lsl(tmp2, tmp2, 1);
4556         __ sub(str2, str2, str2_chr_size);
4557       }
4558       __ cmp(ch1, ch2);
4559       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4560       __ b(DONE);
4561     __ align(OptoLoopAlignment);
4562     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
4563       // 1) Restore the "result" index. It was a multiple of wordSize/str2_chr_size
4564       // until the L_HAS_ZERO block; the byte octet analyzed in L_HAS_ZERO_LOOP
4565       // increased result by at most wordSize/str2_chr_size - 1, so its higher
4566       // bits were not changed. L_LOOP_PROCEED will increase result by the number
4567       // of analyzed characters, so we can just reset the lower bits of result
4568       // here (clear 2 lower bits for UU/UL and 3 bits for LL).
4569       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
4570       // 3) Advance str2 to the next str2 octet: result & 7 (or & 3) is the index
4571       // of the last analyzed substring inside the current octet, so str2 is at
4572       // the corresponding start address and must be advanced to the next octet.
4573       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4574       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4575       __ bfm(result, zr, 0, 2 - str2_chr_shift);
4576       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4577       __ movw(cnt2, cnt2);
4578       __ b(L_LOOP_PROCEED);
4579     __ align(OptoLoopAlignment);
4580     __ BIND(NOMATCH);
4581       __ mov(result, -1);
4582     __ BIND(DONE);
4583       __ pop(spilled_regs, sp);
4584       __ ret(lr);
4585     return entry;
4586   }
4587 
4588   void generate_string_indexof_stubs() {
4589     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
4590     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
4591     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
4592   }
4593 
4594   void inflate_and_store_2_fp_registers(bool generatePrfm,
4595       FloatRegister src1, FloatRegister src2) {
4596     Register dst = r1;
4597     __ zip1(v1, __ T16B, src1, v0);
4598     __ zip2(v2, __ T16B, src1, v0);
4599     if (generatePrfm) {
4600       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4601     }
4602     __ zip1(v3, __ T16B, src2, v0);
4603     __ zip2(v4, __ T16B, src2, v0);
4604     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4605   }
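  // A minimal C sketch (illustrative only) of the inflation performed by the
  // zip1/zip2 interleaving with the zero register v0, above and in the stub
  // below: each Latin-1 byte becomes a little-endian 16-bit char.
  //
  //   static void inflate(const unsigned char* src, unsigned short* dst, int len) {
  //     for (int i = 0; i < len; i++)
  //       dst[i] = src[i];   // zero-extend byte to char
  //   }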
4606 
4607   // R0 = src
4608   // R1 = dst
4609   // R2 = len
4610   // R3 = len >> 3
4611   // V0 = 0
4612   // v1 = loaded 8 bytes
4613   address generate_large_byte_array_inflate() {
4614     __ align(CodeEntryAlignment);
4615     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4616     address entry = __ pc();
4617     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4618     Register src = r0, dst = r1, len = r2, octetCounter = r3;
4619     const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
4620 
4621     // do one more 8-byte read to have address 16-byte aligned in most cases
4622     // also use single store instruction
4623     __ ldrd(v2, __ post(src, 8));
4624     __ sub(octetCounter, octetCounter, 2);
4625     __ zip1(v1, __ T16B, v1, v0);
4626     __ zip1(v2, __ T16B, v2, v0);
4627     __ st1(v1, v2, __ T16B, __ post(dst, 32));
4628     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4629     __ subs(rscratch1, octetCounter, large_loop_threshold);
4630     __ br(__ LE, LOOP_START);
4631     __ b(LOOP_PRFM_START);
4632     __ bind(LOOP_PRFM);
4633       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4634     __ bind(LOOP_PRFM_START);
4635       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4636       __ sub(octetCounter, octetCounter, 8);
4637       __ subs(rscratch1, octetCounter, large_loop_threshold);
4638       inflate_and_store_2_fp_registers(true, v3, v4);
4639       inflate_and_store_2_fp_registers(true, v5, v6);
4640       __ br(__ GT, LOOP_PRFM);
4641       __ cmp(octetCounter, (u1)8);
4642       __ br(__ LT, DONE);
4643     __ bind(LOOP);
4644       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4645       __ bind(LOOP_START);
4646       __ sub(octetCounter, octetCounter, 8);
4647       __ cmp(octetCounter, (u1)8);
4648       inflate_and_store_2_fp_registers(false, v3, v4);
4649       inflate_and_store_2_fp_registers(false, v5, v6);
4650       __ br(__ GE, LOOP);
4651     __ bind(DONE);
4652       __ ret(lr);
4653     return entry;
4654   }
4655 
4656   /**
4657    *  Arguments:
4658    *
4659    *  Input:
4660    *  c_rarg0   - current state address
4661    *  c_rarg1   - H key address
4662    *  c_rarg2   - data address
4663    *  c_rarg3   - number of blocks
4664    *
4665    *  Output:
4666    *  Updated state at c_rarg0
4667    */
4668   address generate_ghash_processBlocks() {
4669     // Bafflingly, GCM uses little-endian for the byte order, but
4670     // big-endian for the bit order.  For example, the polynomial 1 is
4671     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4672     //
4673     // So, we must either reverse the bytes in each word and do
4674     // everything big-endian or reverse the bits in each byte and do
4675     // it little-endian.  On AArch64 it's more idiomatic to reverse
4676     // the bits in each byte (we have an instruction, RBIT, to do
4677     // that) and keep the data in little-endian bit order through the
4678     // calculation, bit-reversing the inputs and outputs.
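    // A small C sketch (illustrative only) of the per-byte bit reflection that
    // RBIT performs, showing why the GCM encoding of the polynomial 1
    // (80 00 ... 00) becomes the ordinary integer 1 after reflection:
    //
    //   static inline unsigned char reflect8(unsigned char b) {
    //     unsigned char r = 0;
    //     for (int i = 0; i < 8; i++)
    //       r |= ((b >> i) & 1) << (7 - i);   // bit i <-> bit 7-i
    //     return r;
    //   }
    //   // reflect8(0x80) == 0x01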
4679 
4680     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4681     __ align(wordSize * 2);
4682     address p = __ pc();
4683     __ emit_int64(0x87);  // The low-order bits of the field
4684                           // polynomial (i.e. p = z^7+z^2+z+1)
4685                           // repeated in the low and high parts of a
4686                           // 128-bit vector
4687     __ emit_int64(0x87);
4688 
4689     __ align(CodeEntryAlignment);
4690     address start = __ pc();
4691 
4692     Register state   = c_rarg0;
4693     Register subkeyH = c_rarg1;
4694     Register data    = c_rarg2;
4695     Register blocks  = c_rarg3;
4696 
4697     FloatRegister vzr = v30;
4698     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4699 
4700     __ ldrq(v0, Address(state));
4701     __ ldrq(v1, Address(subkeyH));
4702 
4703     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4704     __ rbit(v0, __ T16B, v0);
4705     __ rev64(v1, __ T16B, v1);
4706     __ rbit(v1, __ T16B, v1);
4707 
4708     __ ldrq(v26, p);
4709 
4710     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4711     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4712 
4713     {
4714       Label L_ghash_loop;
4715       __ bind(L_ghash_loop);
4716 
4717       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4718                                                  // reversing each byte
4719       __ rbit(v2, __ T16B, v2);
4720       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4721 
4722       // Multiply state in v2 by subkey in v1
4723       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4724                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4725                      /*temps*/v6, v20, v18, v21);
4726       // Reduce v7:v5 by the field polynomial
4727       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4728 
4729       __ sub(blocks, blocks, 1);
4730       __ cbnz(blocks, L_ghash_loop);
4731     }
4732 
4733     // The bit-reversed result is at this point in v0
4734     __ rev64(v1, __ T16B, v0);
4735     __ rbit(v1, __ T16B, v1);
4736 
4737     __ st1(v1, __ T16B, state);
4738     __ ret(lr);
4739 
4740     return start;
4741   }
4742 
4743   // Continuation point for throwing of implicit exceptions that are
4744   // not handled in the current activation. Fabricates an exception
4745   // oop and initiates normal exception dispatching in this
4746   // frame. Since we need to preserve callee-saved values (currently
4747   // only for C2, but done for C1 as well) we need a callee-saved oop
4748   // map and therefore have to make these stubs into RuntimeStubs
4749   // rather than BufferBlobs.  If the compiler needs all registers to
4750   // be preserved between the fault point and the exception handler
4751   // then it must assume responsibility for that in
4752   // AbstractCompiler::continuation_for_implicit_null_exception or
4753   // continuation_for_implicit_division_by_zero_exception. All other
4754   // implicit exceptions (e.g., NullPointerException or
4755   // AbstractMethodError on entry) are either at call sites or
4756   // otherwise assume that stack unwinding will be initiated, so
4757   // caller saved registers were assumed volatile in the compiler.
4758 
4759 #undef __
4760 #define __ masm->
4761 
4762   address generate_throw_exception(const char* name,
4763                                    address runtime_entry,
4764                                    Register arg1 = noreg,
4765                                    Register arg2 = noreg) {
4766     // Information about frame layout at time of blocking runtime call.
4767     // Note that we only have to preserve callee-saved registers since
4768     // the compilers are responsible for supplying a continuation point
4769     // if they expect all registers to be preserved.
4770     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4771     enum layout {
4772       rfp_off = 0,
4773       rfp_off2,
4774       return_off,
4775       return_off2,
4776       framesize // inclusive of return address
4777     };
4778 
4779     int insts_size = 512;
4780     int locs_size  = 64;
4781 
4782     CodeBuffer code(name, insts_size, locs_size);
4783     OopMapSet* oop_maps  = new OopMapSet();
4784     MacroAssembler* masm = new MacroAssembler(&code);
4785 
4786     address start = __ pc();
4787 
4788     // This is an inlined and slightly modified version of call_VM
4789     // which has the ability to fetch the return PC out of
4790     // thread-local storage and also sets up last_Java_sp slightly
4791     // differently than the real call_VM
4792 
4793     __ enter(); // Save FP and LR before call
4794 
4795     assert(is_even(framesize/2), "sp not 16-byte aligned");
4796 
4797     // lr and fp are already in place
4798     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4799 
4800     int frame_complete = __ pc() - start;
4801 
4802     // Set up last_Java_sp and last_Java_fp
4803     address the_pc = __ pc();
4804     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
4805 
4806     // Call runtime
4807     if (arg1 != noreg) {
4808       assert(arg2 != c_rarg1, "clobbered");
4809       __ mov(c_rarg1, arg1);
4810     }
4811     if (arg2 != noreg) {
4812       __ mov(c_rarg2, arg2);
4813     }
4814     __ mov(c_rarg0, rthread);
4815     BLOCK_COMMENT("call runtime_entry");
4816     __ mov(rscratch1, runtime_entry);
4817     __ blr(rscratch1);
4818 
4819     // Generate oop map
4820     OopMap* map = new OopMap(framesize, 0);
4821 
4822     oop_maps->add_gc_map(the_pc - start, map);
4823 
4824     __ reset_last_Java_frame(true);
4825     __ maybe_isb();
4826 
4827     __ leave();
4828 
4829     // check for pending exceptions
4830 #ifdef ASSERT
4831     Label L;
4832     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4833     __ cbnz(rscratch1, L);
4834     __ should_not_reach_here();
4835     __ bind(L);
4836 #endif // ASSERT
4837     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4838 
4839 
4840     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4841     RuntimeStub* stub =
4842       RuntimeStub::new_runtime_stub(name,
4843                                     &code,
4844                                     frame_complete,
4845                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4846                                     oop_maps, false);
4847     return stub->entry_point();
4848   }
4849 
4850   class MontgomeryMultiplyGenerator : public MacroAssembler {
4851 
4852     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4853       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4854 
4855     RegSet _toSave;
4856     bool _squaring;
4857 
4858   public:
4859     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4860       : MacroAssembler(as->code()), _squaring(squaring) {
4861 
4862       // Register allocation
4863 
4864       Register reg = c_rarg0;
4865       Pa_base = reg;       // Argument registers
4866       if (squaring)
4867         Pb_base = Pa_base;
4868       else
4869         Pb_base = ++reg;
4870       Pn_base = ++reg;
4871       Rlen= ++reg;
4872       inv = ++reg;
4873       Pm_base = ++reg;
4874 
4875                           // Working registers:
4876       Ra =  ++reg;        // The current digit of a, b, n, and m.
4877       Rb =  ++reg;
4878       Rm =  ++reg;
4879       Rn =  ++reg;
4880 
4881       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4882       Pb =  ++reg;
4883       Pm =  ++reg;
4884       Pn =  ++reg;
4885 
4886       t0 =  ++reg;        // Three registers which form a
4887       t1 =  ++reg;        // triple-precision accumulator.
4888       t2 =  ++reg;
4889 
4890       Ri =  ++reg;        // Inner and outer loop indexes.
4891       Rj =  ++reg;
4892 
4893       Rhi_ab = ++reg;     // Product registers: low and high parts
4894       Rlo_ab = ++reg;     // of a*b and m*n.
4895       Rhi_mn = ++reg;
4896       Rlo_mn = ++reg;
4897 
4898       // r19 and up are callee-saved.
4899       _toSave = RegSet::range(r19, reg) + Pm_base;
4900     }
4901 
4902   private:
4903     void save_regs() {
4904       push(_toSave, sp);
4905     }
4906 
4907     void restore_regs() {
4908       pop(_toSave, sp);
4909     }
4910 
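    // Invoke `block` exactly `count` times, unrolled two calls per loop
    // iteration; an odd count enters the loop at its second copy of the block.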
4911     template <typename T>
4912     void unroll_2(Register count, T block) {
4913       Label loop, end, odd;
4914       tbnz(count, 0, odd);
4915       cbz(count, end);
4916       align(16);
4917       bind(loop);
4918       (this->*block)();
4919       bind(odd);
4920       (this->*block)();
4921       subs(count, count, 2);
4922       br(Assembler::GT, loop);
4923       bind(end);
4924     }
4925 
4926     template <typename T>
4927     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4928       Label loop, end, odd;
4929       tbnz(count, 0, odd);
4930       cbz(count, end);
4931       align(16);
4932       bind(loop);
4933       (this->*block)(d, s, tmp);
4934       bind(odd);
4935       (this->*block)(d, s, tmp);
4936       subs(count, count, 2);
4937       br(Assembler::GT, loop);
4938       bind(end);
4939     }
4940 
4941     void pre1(RegisterOrConstant i) {
4942       block_comment("pre1");
4943       // Pa = Pa_base;
4944       // Pb = Pb_base + i;
4945       // Pm = Pm_base;
4946       // Pn = Pn_base + i;
4947       // Ra = *Pa;
4948       // Rb = *Pb;
4949       // Rm = *Pm;
4950       // Rn = *Pn;
4951       ldr(Ra, Address(Pa_base));
4952       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4953       ldr(Rm, Address(Pm_base));
4954       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4955       lea(Pa, Address(Pa_base));
4956       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4957       lea(Pm, Address(Pm_base));
4958       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4959 
4960       // Zero the m*n result.
4961       mov(Rhi_mn, zr);
4962       mov(Rlo_mn, zr);
4963     }
4964 
4965     // The core multiply-accumulate step of a Montgomery
4966     // multiplication.  The idea is to schedule operations as a
4967     // pipeline so that instructions with long latencies (loads and
4968     // multiplies) have time to complete before their results are
4969     // used.  This most benefits in-order implementations of the
4970     // architecture but out-of-order ones also benefit.
4971     void step() {
4972       block_comment("step");
4973       // MACC(Ra, Rb, t0, t1, t2);
4974       // Ra = *++Pa;
4975       // Rb = *--Pb;
4976       umulh(Rhi_ab, Ra, Rb);
4977       mul(Rlo_ab, Ra, Rb);
4978       ldr(Ra, pre(Pa, wordSize));
4979       ldr(Rb, pre(Pb, -wordSize));
4980       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
4981                                        // previous iteration.
4982       // MACC(Rm, Rn, t0, t1, t2);
4983       // Rm = *++Pm;
4984       // Rn = *--Pn;
4985       umulh(Rhi_mn, Rm, Rn);
4986       mul(Rlo_mn, Rm, Rn);
4987       ldr(Rm, pre(Pm, wordSize));
4988       ldr(Rn, pre(Pn, -wordSize));
4989       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4990     }
4991 
4992     void post1() {
4993       block_comment("post1");
4994 
4995       // MACC(Ra, Rb, t0, t1, t2);
4996       // Ra = *++Pa;
4997       // Rb = *--Pb;
4998       umulh(Rhi_ab, Ra, Rb);
4999       mul(Rlo_ab, Ra, Rb);
5000       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5001       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5002 
5003       // *Pm = Rm = t0 * inv;
5004       mul(Rm, t0, inv);
5005       str(Rm, Address(Pm));
5006 
5007       // MACC(Rm, Rn, t0, t1, t2);
5008       // t0 = t1; t1 = t2; t2 = 0;
5009       umulh(Rhi_mn, Rm, Rn);
5010 
5011 #ifndef PRODUCT
5012       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5013       {
5014         mul(Rlo_mn, Rm, Rn);
5015         add(Rlo_mn, t0, Rlo_mn);
5016         Label ok;
5017         cbz(Rlo_mn, ok); {
5018           stop("broken Montgomery multiply");
5019         } bind(ok);
5020       }
5021 #endif
5022       // We have very carefully set things up so that
5023       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5024       // the lower half of Rm * Rn because we know the result already:
5025       // it must be -t0.  t0 + (-t0) must generate a carry iff
5026       // t0 != 0.  So, rather than do a mul and an adds we just set
5027       // the carry flag iff t0 is nonzero.
5028       //
5029       // mul(Rlo_mn, Rm, Rn);
5030       // adds(zr, t0, Rlo_mn);
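      // (Worked example with 64-bit digits: if t0 == 5 then the skipped low
      //  half Rlo_mn would be -5 mod 2^64 == 0xFFFFFFFFFFFFFFFB, and
      //  5 + 0xFFFFFFFFFFFFFFFB wraps to 0 with a carry out.  "subs zr, t0, 1"
      //  sets the carry flag in exactly the same cases, because t0 >= 1 iff
      //  t0 != 0.)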
5031       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5032       adcs(t0, t1, Rhi_mn);
5033       adc(t1, t2, zr);
5034       mov(t2, zr);
5035     }
5036 
5037     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
5038       block_comment("pre2");
5039       // Pa = Pa_base + i-len;
5040       // Pb = Pb_base + len;
5041       // Pm = Pm_base + i-len;
5042       // Pn = Pn_base + len;
5043 
5044       if (i.is_register()) {
5045         sub(Rj, i.as_register(), len);
5046       } else {
5047         mov(Rj, i.as_constant());
5048         sub(Rj, Rj, len);
5049       }
5050       // Rj == i-len
5051 
5052       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5053       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5054       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5055       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5056 
5057       // Ra = *++Pa;
5058       // Rb = *--Pb;
5059       // Rm = *++Pm;
5060       // Rn = *--Pn;
5061       ldr(Ra, pre(Pa, wordSize));
5062       ldr(Rb, pre(Pb, -wordSize));
5063       ldr(Rm, pre(Pm, wordSize));
5064       ldr(Rn, pre(Pn, -wordSize));
5065 
5066       mov(Rhi_mn, zr);
5067       mov(Rlo_mn, zr);
5068     }
5069 
5070     void post2(RegisterOrConstant i, RegisterOrConstant len) {
5071       block_comment("post2");
5072       if (i.is_constant()) {
5073         mov(Rj, i.as_constant()-len.as_constant());
5074       } else {
5075         sub(Rj, i.as_register(), len);
5076       }
5077 
5078       adds(t0, t0, Rlo_mn); // The pending m*n, low part
5079 
5080       // As soon as we know the least significant digit of our result,
5081       // store it.
5082       // Pm_base[i-len] = t0;
5083       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5084 
5085       // t0 = t1; t1 = t2; t2 = 0;
5086       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5087       adc(t1, t2, zr);
5088       mov(t2, zr);
5089     }
5090 
5091     // A carry in t0 after Montgomery multiplication means that we
5092     // should subtract multiples of n from our result in m.  We'll
5093     // keep doing that until there is no carry.
5094     void normalize(RegisterOrConstant len) {
5095       block_comment("normalize");
5096       // while (t0)
5097       //   t0 = sub(Pm_base, Pn_base, t0, len);
5098       Label loop, post, again;
5099       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5100       cbz(t0, post); {
5101         bind(again); {
5102           mov(i, zr);
5103           mov(cnt, len);
5104           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5105           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5106           subs(zr, zr, zr); // set carry flag, i.e. no borrow
5107           align(16);
5108           bind(loop); {
5109             sbcs(Rm, Rm, Rn);
5110             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5111             add(i, i, 1);
5112             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5113             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5114             sub(cnt, cnt, 1);
5115           } cbnz(cnt, loop);
5116           sbc(t0, t0, zr);
5117         } cbnz(t0, again);
5118       } bind(post);
5119     }
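    // In C, the subtraction step used by normalize() is approximately (a
    // hedged sketch of the `sub` helper referenced in the listings below):
    //
    //   static unsigned long sub(unsigned long Pm_base[], unsigned long Pn_base[],
    //                            unsigned long t0, int len) {
    //     unsigned long borrow = 0;
    //     for (int i = 0; i < len; i++) {
    //       unsigned long m = Pm_base[i], n = Pn_base[i];
    //       unsigned long d = m - n - borrow;
    //       borrow = (m < n) || (m == n && borrow);   // borrow out of this digit
    //       Pm_base[i] = d;
    //     }
    //     return t0 - borrow;   // fold the final borrow back into the carry word
    //   }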
5120 
5121     // Move memory at s to d, reversing words.
5122     //    Increments d to end of copied memory
5123     //    Destroys tmp1, tmp2
5124     //    Preserves len
5125     //    Leaves s pointing to the address which was in d at start
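    // In C, the data movement is approximately:
    //
    //   unsigned long* src_end = s + len;
    //   for (int i = 0; i < len; i++) {
    //     unsigned long w = *--src_end;
    //     d[i] = (w << 32) | (w >> 32);   // ror #32: swap the two 32-bit halves
    //   }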
5126     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
5127       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5128 
5129       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
5130       mov(tmp1, len);
5131       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
5132       sub(s, d, len, ext::uxtw, LogBytesPerWord);
5133     }
5134     // where
5135     void reverse1(Register d, Register s, Register tmp) {
5136       ldr(tmp, pre(s, -wordSize));
5137       ror(tmp, tmp, 32);
5138       str(tmp, post(d, wordSize));
5139     }
5140 
5141     void step_squaring() {
5142       // An extra ACC
5143       step();
5144       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5145     }
5146 
5147     void last_squaring(RegisterOrConstant i) {
5148       Label dont;
5149       // if ((i & 1) == 0) {
5150       tbnz(i.as_register(), 0, dont); {
5151         // MACC(Ra, Rb, t0, t1, t2);
5152         // Ra = *++Pa;
5153         // Rb = *--Pb;
5154         umulh(Rhi_ab, Ra, Rb);
5155         mul(Rlo_ab, Ra, Rb);
5156         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5157       } bind(dont);
5158     }
5159 
5160     void extra_step_squaring() {
5161       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5162 
5163       // MACC(Rm, Rn, t0, t1, t2);
5164       // Rm = *++Pm;
5165       // Rn = *--Pn;
5166       umulh(Rhi_mn, Rm, Rn);
5167       mul(Rlo_mn, Rm, Rn);
5168       ldr(Rm, pre(Pm, wordSize));
5169       ldr(Rn, pre(Pn, -wordSize));
5170     }
5171 
5172     void post1_squaring() {
5173       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5174 
5175       // *Pm = Rm = t0 * inv;
5176       mul(Rm, t0, inv);
5177       str(Rm, Address(Pm));
5178 
5179       // MACC(Rm, Rn, t0, t1, t2);
5180       // t0 = t1; t1 = t2; t2 = 0;
5181       umulh(Rhi_mn, Rm, Rn);
5182 
5183 #ifndef PRODUCT
5184       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5185       {
5186         mul(Rlo_mn, Rm, Rn);
5187         add(Rlo_mn, t0, Rlo_mn);
5188         Label ok;
5189         cbz(Rlo_mn, ok); {
5190           stop("broken Montgomery multiply");
5191         } bind(ok);
5192       }
5193 #endif
5194       // We have very carefully set things up so that
5195       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5196       // the lower half of Rm * Rn because we know the result already:
5197       // it must be -t0.  t0 + (-t0) must generate a carry iff
5198       // t0 != 0.  So, rather than do a mul and an adds we just set
5199       // the carry flag iff t0 is nonzero.
5200       //
5201       // mul(Rlo_mn, Rm, Rn);
5202       // adds(zr, t0, Rlo_mn);
5203       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5204       adcs(t0, t1, Rhi_mn);
5205       adc(t1, t2, zr);
5206       mov(t2, zr);
5207     }
5208 
5209     void acc(Register Rhi, Register Rlo,
5210              Register t0, Register t1, Register t2) {
5211       adds(t0, t0, Rlo);
5212       adcs(t1, t1, Rhi);
5213       adc(t2, t2, zr);
5214     }
5215 
5216   public:
5217     /**
5218      * Fast Montgomery multiplication.  The derivation of the
5219      * algorithm is in A Cryptographic Library for the Motorola
5220      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5221      *
5222      * Arguments:
5223      *
5224      * Inputs for multiplication:
5225      *   c_rarg0   - int array elements a
5226      *   c_rarg1   - int array elements b
5227      *   c_rarg2   - int array elements n (the modulus)
5228      *   c_rarg3   - int length
5229      *   c_rarg4   - int inv
5230      *   c_rarg5   - int array elements m (the result)
5231      *
5232      * Inputs for squaring:
5233      *   c_rarg0   - int array elements a
5234      *   c_rarg1   - int array elements n (the modulus)
5235      *   c_rarg2   - int length
5236      *   c_rarg3   - int inv
5237      *   c_rarg4   - int array elements m (the result)
5238      *
5239      */
5240     address generate_multiply() {
5241       Label argh, nothing;
5242       bind(argh);
5243       stop("MontgomeryMultiply total_allocation must be <= 8192");
5244 
5245       align(CodeEntryAlignment);
5246       address entry = pc();
5247 
5248       cbzw(Rlen, nothing);
5249 
5250       enter();
5251 
5252       // Make room.
5253       cmpw(Rlen, 512);
5254       br(Assembler::HI, argh);
5255       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5256       andr(sp, Ra, -2 * wordSize);
5257 
5258       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5259 
5260       {
5261         // Copy input args, reversing as we go.  We use Ra as a
5262         // temporary variable.
5263         reverse(Ra, Pa_base, Rlen, t0, t1);
5264         if (!_squaring)
5265           reverse(Ra, Pb_base, Rlen, t0, t1);
5266         reverse(Ra, Pn_base, Rlen, t0, t1);
5267       }
5268 
5269       // Push all call-saved registers and also Pm_base which we'll need
5270       // at the end.
5271       save_regs();
5272 
5273 #ifndef PRODUCT
5274       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
5275       {
5276         ldr(Rn, Address(Pn_base, 0));
5277         mul(Rlo_mn, Rn, inv);
5278         subs(zr, Rlo_mn, -1);
5279         Label ok;
5280         br(EQ, ok); {
5281           stop("broken inverse in Montgomery multiply");
5282         } bind(ok);
5283       }
5284 #endif
5285 
5286       mov(Pm_base, Ra);
5287 
5288       mov(t0, zr);
5289       mov(t1, zr);
5290       mov(t2, zr);
5291 
5292       block_comment("for (int i = 0; i < len; i++) {");
5293       mov(Ri, zr); {
5294         Label loop, end;
5295         cmpw(Ri, Rlen);
5296         br(Assembler::GE, end);
5297 
5298         bind(loop);
5299         pre1(Ri);
5300 
5301         block_comment("  for (j = i; j; j--) {"); {
5302           movw(Rj, Ri);
5303           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5304         } block_comment("  } // j");
5305 
5306         post1();
5307         addw(Ri, Ri, 1);
5308         cmpw(Ri, Rlen);
5309         br(Assembler::LT, loop);
5310         bind(end);
5311         block_comment("} // i");
5312       }
5313 
5314       block_comment("for (int i = len; i < 2*len; i++) {");
5315       mov(Ri, Rlen); {
5316         Label loop, end;
5317         cmpw(Ri, Rlen, Assembler::LSL, 1);
5318         br(Assembler::GE, end);
5319 
5320         bind(loop);
5321         pre2(Ri, Rlen);
5322 
5323         block_comment("  for (j = len*2-i-1; j; j--) {"); {
5324           lslw(Rj, Rlen, 1);
5325           subw(Rj, Rj, Ri);
5326           subw(Rj, Rj, 1);
5327           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5328         } block_comment("  } // j");
5329 
5330         post2(Ri, Rlen);
5331         addw(Ri, Ri, 1);
5332         cmpw(Ri, Rlen, Assembler::LSL, 1);
5333         br(Assembler::LT, loop);
5334         bind(end);
5335       }
5336       block_comment("} // i");
5337 
5338       normalize(Rlen);
5339 
5340       mov(Ra, Pm_base);  // Save Pm_base in Ra
5341       restore_regs();  // Restore caller's Pm_base
5342 
5343       // Copy our result into caller's Pm_base
5344       reverse(Pm_base, Ra, Rlen, t0, t1);
5345 
5346       leave();
5347       bind(nothing);
5348       ret(lr);
5349 
5350       return entry;
5351     }
5352     // In C, approximately:
5353 
5354     // void
5355     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
5356     //                     unsigned long Pn_base[], unsigned long Pm_base[],
5357     //                     unsigned long inv, int len) {
5358     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5359     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5360     //   unsigned long Ra, Rb, Rn, Rm;
5361 
5362     //   int i;
5363 
5364     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5365 
5366     //   for (i = 0; i < len; i++) {
5367     //     int j;
5368 
5369     //     Pa = Pa_base;
5370     //     Pb = Pb_base + i;
5371     //     Pm = Pm_base;
5372     //     Pn = Pn_base + i;
5373 
5374     //     Ra = *Pa;
5375     //     Rb = *Pb;
5376     //     Rm = *Pm;
5377     //     Rn = *Pn;
5378 
5379     //     int iters = i;
5380     //     for (j = 0; iters--; j++) {
5381     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5382     //       MACC(Ra, Rb, t0, t1, t2);
5383     //       Ra = *++Pa;
5384     //       Rb = *--Pb;
5385     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5386     //       MACC(Rm, Rn, t0, t1, t2);
5387     //       Rm = *++Pm;
5388     //       Rn = *--Pn;
5389     //     }
5390 
5391     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
5392     //     MACC(Ra, Rb, t0, t1, t2);
5393     //     *Pm = Rm = t0 * inv;
5394     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5395     //     MACC(Rm, Rn, t0, t1, t2);
5396 
5397     //     assert(t0 == 0, "broken Montgomery multiply");
5398 
5399     //     t0 = t1; t1 = t2; t2 = 0;
5400     //   }
5401 
5402     //   for (i = len; i < 2*len; i++) {
5403     //     int j;
5404 
5405     //     Pa = Pa_base + i-len;
5406     //     Pb = Pb_base + len;
5407     //     Pm = Pm_base + i-len;
5408     //     Pn = Pn_base + len;
5409 
5410     //     Ra = *++Pa;
5411     //     Rb = *--Pb;
5412     //     Rm = *++Pm;
5413     //     Rn = *--Pn;
5414 
5415     //     int iters = len*2-i-1;
5416     //     for (j = i-len+1; iters--; j++) {
5417     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5418     //       MACC(Ra, Rb, t0, t1, t2);
5419     //       Ra = *++Pa;
5420     //       Rb = *--Pb;
5421     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5422     //       MACC(Rm, Rn, t0, t1, t2);
5423     //       Rm = *++Pm;
5424     //       Rn = *--Pn;
5425     //     }
5426 
5427     //     Pm_base[i-len] = t0;
5428     //     t0 = t1; t1 = t2; t2 = 0;
5429     //   }
5430 
5431     //   while (t0)
5432     //     t0 = sub(Pm_base, Pn_base, t0, len);
5433     // }
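    // A hedged sketch of the MACC/MACC2 helpers used in the listings above and
    // below.  They update the triple-precision accumulator t2:t1:t0 in place;
    // the generated code keeps the accumulator in registers and never forms a
    // 128-bit value explicitly.

    //   // t2:t1:t0 += a * b
    //   static void MACC(unsigned long a, unsigned long b,
    //                    unsigned long& t0, unsigned long& t1, unsigned long& t2) {
    //     unsigned __int128 p = (unsigned __int128)a * b;
    //     unsigned __int128 s = (unsigned __int128)t0 + (unsigned long)p;
    //     t0 = (unsigned long)s;
    //     s = (unsigned __int128)t1 + (unsigned long)(p >> 64) + (unsigned long)(s >> 64);
    //     t1 = (unsigned long)s;
    //     t2 += (unsigned long)(s >> 64);
    //   }

    //   // t2:t1:t0 += 2 * a * b  (the squaring cross terms a[i]*a[j] and
    //   // a[j]*a[i] are equal, so they are accumulated together)
    //   static void MACC2(unsigned long a, unsigned long b,
    //                     unsigned long& t0, unsigned long& t1, unsigned long& t2) {
    //     MACC(a, b, t0, t1, t2);
    //     MACC(a, b, t0, t1, t2);
    //   }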
5434 
5435     /**
5436      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
5437      * multiplies than Montgomery multiplication so it should be up to
5438      * 25% faster.  However, its loop control is more complex and it
5439      * may actually run slower on some machines.
5440      *
5441      * Arguments:
5442      *
5443      * Inputs:
5444      *   c_rarg0   - int array elements a
5445      *   c_rarg1   - int array elements n (the modulus)
5446      *   c_rarg2   - int length
5447      *   c_rarg3   - int inv
5448      *   c_rarg4   - int array elements m (the result)
5449      *
5450      */
5451     address generate_square() {
5452       Label argh;
5453       bind(argh);
5454       stop("MontgomeryMultiply total_allocation must be <= 8192");
5455 
5456       align(CodeEntryAlignment);
5457       address entry = pc();
5458 
5459       enter();
5460 
5461       // Make room.
5462       cmpw(Rlen, 512);
5463       br(Assembler::HI, argh);
5464       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5465       andr(sp, Ra, -2 * wordSize);
5466 
5467       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5468 
5469       {
5470         // Copy input args, reversing as we go.  We use Ra as a
5471         // temporary variable.
5472         reverse(Ra, Pa_base, Rlen, t0, t1);
5473         reverse(Ra, Pn_base, Rlen, t0, t1);
5474       }
5475 
5476       // Push all call-saved registers and also Pm_base which we'll need
5477       // at the end.
5478       save_regs();
5479 
5480       mov(Pm_base, Ra);
5481 
5482       mov(t0, zr);
5483       mov(t1, zr);
5484       mov(t2, zr);
5485 
5486       block_comment("for (int i = 0; i < len; i++) {");
5487       mov(Ri, zr); {
5488         Label loop, end;
5489         bind(loop);
5490         cmp(Ri, Rlen);
5491         br(Assembler::GE, end);
5492 
5493         pre1(Ri);
5494 
5495         block_comment("for (j = (i+1)/2; j; j--) {"); {
5496           add(Rj, Ri, 1);
5497           lsr(Rj, Rj, 1);
5498           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5499         } block_comment("  } // j");
5500 
5501         last_squaring(Ri);
5502 
5503         block_comment("  for (j = i/2; j; j--) {"); {
5504           lsr(Rj, Ri, 1);
5505           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5506         } block_comment("  } // j");
5507 
5508         post1_squaring();
5509         add(Ri, Ri, 1);
5510         cmp(Ri, Rlen);
5511         br(Assembler::LT, loop);
5512 
5513         bind(end);
5514         block_comment("} // i");
5515       }
5516 
5517       block_comment("for (int i = len; i < 2*len; i++) {");
5518       mov(Ri, Rlen); {
5519         Label loop, end;
5520         bind(loop);
5521         cmp(Ri, Rlen, Assembler::LSL, 1);
5522         br(Assembler::GE, end);
5523 
5524         pre2(Ri, Rlen);
5525 
5526         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
5527           lsl(Rj, Rlen, 1);
5528           sub(Rj, Rj, Ri);
5529           sub(Rj, Rj, 1);
5530           lsr(Rj, Rj, 1);
5531           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5532         } block_comment("  } // j");
5533 
5534         last_squaring(Ri);
5535 
5536         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
5537           lsl(Rj, Rlen, 1);
5538           sub(Rj, Rj, Ri);
5539           lsr(Rj, Rj, 1);
5540           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5541         } block_comment("  } // j");
5542 
5543         post2(Ri, Rlen);
5544         add(Ri, Ri, 1);
5545         cmp(Ri, Rlen, Assembler::LSL, 1);
5546 
5547         br(Assembler::LT, loop);
5548         bind(end);
5549         block_comment("} // i");
5550       }
5551 
5552       normalize(Rlen);
5553 
5554       mov(Ra, Pm_base);  // Save Pm_base in Ra
5555       restore_regs();  // Restore caller's Pm_base
5556 
5557       // Copy our result into caller's Pm_base
5558       reverse(Pm_base, Ra, Rlen, t0, t1);
5559 
5560       leave();
5561       ret(lr);
5562 
5563       return entry;
5564     }
5565     // In C, approximately:
5566 
5567     // void
5568     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
5569     //                   unsigned long Pm_base[], unsigned long inv, int len) {
5570     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5571     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5572     //   unsigned long Ra, Rb, Rn, Rm;
5573 
5574     //   int i;
5575 
5576     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5577 
5578     //   for (i = 0; i < len; i++) {
5579     //     int j;
5580 
5581     //     Pa = Pa_base;
5582     //     Pb = Pa_base + i;
5583     //     Pm = Pm_base;
5584     //     Pn = Pn_base + i;
5585 
5586     //     Ra = *Pa;
5587     //     Rb = *Pb;
5588     //     Rm = *Pm;
5589     //     Rn = *Pn;
5590 
5591     //     int iters = (i+1)/2;
5592     //     for (j = 0; iters--; j++) {
5593     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5594     //       MACC2(Ra, Rb, t0, t1, t2);
5595     //       Ra = *++Pa;
5596     //       Rb = *--Pb;
5597     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5598     //       MACC(Rm, Rn, t0, t1, t2);
5599     //       Rm = *++Pm;
5600     //       Rn = *--Pn;
5601     //     }
5602     //     if ((i & 1) == 0) {
5603     //       assert(Ra == Pa_base[j], "must be");
5604     //       MACC(Ra, Ra, t0, t1, t2);
5605     //     }
5606     //     iters = i/2;
5607     //     assert(iters == i-j, "must be");
5608     //     for (; iters--; j++) {
5609     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5610     //       MACC(Rm, Rn, t0, t1, t2);
5611     //       Rm = *++Pm;
5612     //       Rn = *--Pn;
5613     //     }
5614 
5615     //     *Pm = Rm = t0 * inv;
5616     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5617     //     MACC(Rm, Rn, t0, t1, t2);
5618 
5619     //     assert(t0 == 0, "broken Montgomery multiply");
5620 
5621     //     t0 = t1; t1 = t2; t2 = 0;
5622     //   }
5623 
5624     //   for (i = len; i < 2*len; i++) {
5625     //     int start = i-len+1;
5626     //     int end = start + (len - start)/2;
5627     //     int j;
5628 
5629     //     Pa = Pa_base + i-len;
5630     //     Pb = Pa_base + len;
5631     //     Pm = Pm_base + i-len;
5632     //     Pn = Pn_base + len;
5633 
5634     //     Ra = *++Pa;
5635     //     Rb = *--Pb;
5636     //     Rm = *++Pm;
5637     //     Rn = *--Pn;
5638 
5639     //     int iters = (2*len-i-1)/2;
5640     //     assert(iters == end-start, "must be");
5641     //     for (j = start; iters--; j++) {
5642     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5643     //       MACC2(Ra, Rb, t0, t1, t2);
5644     //       Ra = *++Pa;
5645     //       Rb = *--Pb;
5646     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5647     //       MACC(Rm, Rn, t0, t1, t2);
5648     //       Rm = *++Pm;
5649     //       Rn = *--Pn;
5650     //     }
5651     //     if ((i & 1) == 0) {
5652     //       assert(Ra == Pa_base[j], "must be");
5653     //       MACC(Ra, Ra, t0, t1, t2);
5654     //     }
5655     //     iters =  (2*len-i)/2;
5656     //     assert(iters == len-j, "must be");
5657     //     for (; iters--; j++) {
5658     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5659     //       MACC(Rm, Rn, t0, t1, t2);
5660     //       Rm = *++Pm;
5661     //       Rn = *--Pn;
5662     //     }
5663     //     Pm_base[i-len] = t0;
5664     //     t0 = t1; t1 = t2; t2 = 0;
5665     //   }
5666 
5667     //   while (t0)
5668     //     t0 = sub(Pm_base, Pn_base, t0, len);
5669     // }
5670   };
5671 
5672 
5673   // Called from the interpreter or compiled code either to load the
5674   // multiple returned values from the value type instance being
5675   // returned into registers, or to store the returned values into a
5676   // newly allocated value type instance.
5677   address generate_return_value_stub(address destination, const char* name, bool has_res) {
5678 
5679     // Information about frame layout at time of blocking runtime call.
5680     // Note that we only have to preserve callee-saved registers since
5681     // the compilers are responsible for supplying a continuation point
5682     // if they expect all registers to be preserved.
5683     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
5684     enum layout {
5685       rfp_off = 0, rfp_off2,
5686 
5687       j_rarg7_off, j_rarg7_2,
5688       j_rarg6_off, j_rarg6_2,
5689       j_rarg5_off, j_rarg5_2,
5690       j_rarg4_off, j_rarg4_2,
5691       j_rarg3_off, j_rarg3_2,
5692       j_rarg2_off, j_rarg2_2,
5693       j_rarg1_off, j_rarg1_2,
5694       j_rarg0_off, j_rarg0_2,
5695 
5696       j_farg0_off, j_farg0_2,
5697       j_farg1_off, j_farg1_2,
5698       j_farg2_off, j_farg2_2,
5699       j_farg3_off, j_farg3_2,
5700       j_farg4_off, j_farg4_2,
5701       j_farg5_off, j_farg5_2,
5702       j_farg6_off, j_farg6_2,
5703       j_farg7_off, j_farg7_2,
5704  
5705       return_off, return_off2,
5706       framesize // inclusive of return address
5707     };
5708 
5709     int insts_size = 512;
5710     int locs_size  = 64;
5711 
5712     CodeBuffer code(name, insts_size, locs_size);
5713     OopMapSet* oop_maps  = new OopMapSet();
5714     MacroAssembler* masm = new MacroAssembler(&code);
5715 
5716     address start = __ pc();
5717 
5718     const Address f7_save       (rfp, j_farg7_off * wordSize);
5719     const Address f6_save       (rfp, j_farg6_off * wordSize);
5720     const Address f5_save       (rfp, j_farg5_off * wordSize);
5721     const Address f4_save       (rfp, j_farg4_off * wordSize);
5722     const Address f3_save       (rfp, j_farg3_off * wordSize);
5723     const Address f2_save       (rfp, j_farg2_off * wordSize);
5724     const Address f1_save       (rfp, j_farg1_off * wordSize);
5725     const Address f0_save       (rfp, j_farg0_off * wordSize);
5726 
5727     const Address r0_save      (rfp, j_rarg0_off * wordSize);
5728     const Address r1_save      (rfp, j_rarg1_off * wordSize);
5729     const Address r2_save      (rfp, j_rarg2_off * wordSize);
5730     const Address r3_save      (rfp, j_rarg3_off * wordSize);
5731     const Address r4_save      (rfp, j_rarg4_off * wordSize);
5732     const Address r5_save      (rfp, j_rarg5_off * wordSize);
5733     const Address r6_save      (rfp, j_rarg6_off * wordSize);
5734     const Address r7_save      (rfp, j_rarg7_off * wordSize);
5735 
5736     // Generate oop map
5737     OopMap* map = new OopMap(framesize, 0);
5738 
5739     map->set_callee_saved(VMRegImpl::stack2reg(rfp_off), rfp->as_VMReg());
5740     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
5741     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
5742     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
5743     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
5744     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
5745     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
5746     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
5747     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
5748 
5749     map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
5750     map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
5751     map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
5752     map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
5753     map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
5754     map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
5755     map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
5756     map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
5757 
5758     // This is an inlined and slightly modified version of call_VM
5759     // which has the ability to fetch the return PC out of
5760     // thread-local storage and also sets up last_Java_sp slightly
5761     // differently than the real call_VM
5762 
5763     __ enter(); // Save FP and LR before call
5764 
5765     assert(is_even(framesize/2), "sp not 16-byte aligned");
5766 
5767     // lr and fp are already in place
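    // enter() pushed rfp and lr (2 words == 4 slots), so only
    // framesize - 4 slots remain to be allocated below rfp; the shift by
    // LogBytesPerInt converts slots to bytes.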
5768     __ sub(sp, rfp, ((unsigned)framesize - 4) << LogBytesPerInt); // prolog
5769 
5770     __ strd(j_farg7, f7_save);
5771     __ strd(j_farg6, f6_save);
5772     __ strd(j_farg5, f5_save);
5773     __ strd(j_farg4, f4_save);
5774     __ strd(j_farg3, f3_save);
5775     __ strd(j_farg2, f2_save);
5776     __ strd(j_farg1, f1_save);
5777     __ strd(j_farg0, f0_save);
5778 
5779     __ str(j_rarg0, r0_save);
5780     __ str(j_rarg1, r1_save);
5781     __ str(j_rarg2, r2_save);
5782     __ str(j_rarg3, r3_save);
5783     __ str(j_rarg4, r4_save);
5784     __ str(j_rarg5, r5_save);
5785     __ str(j_rarg6, r6_save);
5786     __ str(j_rarg7, r7_save);
5787 
5788     int frame_complete = __ pc() - start;
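    // frame_complete marks the offset at which the frame is fully set up;
    // it is passed to RuntimeStub::new_runtime_stub below.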
5789 
5790     // Set up last_Java_sp and last_Java_fp
5791     address the_pc = __ pc();
5792     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
5793 
5794     // Call runtime
5795     __ mov(c_rarg0, rthread);
5796     __ mov(c_rarg1, r0);
5797 
5798     BLOCK_COMMENT("call runtime_entry");
5799     __ mov(rscratch1, destination);
5800     __ blr(rscratch1);
5801 
5802     oop_maps->add_gc_map(the_pc - start, map);
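    // The oop map is keyed to the_pc, the same PC recorded as
    // last_Java_pc above, so the runtime can find the saved argument
    // registers while this frame is on the stack during the call.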
5803 
5804     __ reset_last_Java_frame(false);
5805     __ maybe_isb();
5806 
5807     __ ldrd(j_farg7, f7_save);
5808     __ ldrd(j_farg6, f6_save);
5809     __ ldrd(j_farg5, f5_save);
5810     __ ldrd(j_farg4, f4_save);
5811     __ ldrd(j_farg3, f3_save);
5812     __ ldrd(j_farg2, f2_save);
5813     __ ldrd(j_farg1, f1_save);
5814     __ ldrd(j_farg0, f0_save);
5815 
5816     __ ldr(j_rarg0, r0_save);
5817     __ ldr(j_rarg1, r1_save);
5818     __ ldr(j_rarg2, r2_save);
5819     __ ldr(j_rarg3, r3_save);
5820     __ ldr(j_rarg4, r4_save);
5821     __ ldr(j_rarg5, r5_save);
5822     __ ldr(j_rarg6, r6_save);
5823     __ ldr(j_rarg7, r7_save);
5824 
5825     __ leave();
5826 
5827     // check for pending exceptions
5828     Label pending;
5829     __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
5830     __ cmp(rscratch1, (u1)NULL_WORD);
5831     __ br(Assembler::NE, pending);
5832 
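    // If the runtime call produced an oop result (the newly buffered
    // value type instance), fetch it from the thread-local vm_result
    // slot into r0 as this stub's return value.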
5833     if (has_res) {
5834       __ get_vm_result(r0, rthread);
5835     }
5836     __ ret(lr);
5837 
5838     __ bind(pending);
5839     __ ldr(r0, Address(rthread, in_bytes(Thread::pending_exception_offset())));
5840     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
5841 
5842 
5843     // codeBlob framesize is in words (not VMRegImpl::slot_size)
5844     int frame_size_in_words = (framesize >> (LogBytesPerWord - LogBytesPerInt));
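    // framesize is in 32-bit slots; shifting right by
    // (LogBytesPerWord - LogBytesPerInt) == 1 halves it to 64-bit words.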
5845     RuntimeStub* stub =
5846       RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
5847 
5848     return stub->entry_point();
5849   }
5850 
5851   // Initialization
5852   void generate_initial() {
5853     // Generate initial stubs and initialize the entry points
5854 
5855     // Entry points that exist on all platforms. Note: This is code
5856     // that could be shared among different platforms; however, the
5857     // benefit seems to be smaller than the disadvantage of having a
5858     // much more complicated generator structure. See also the comment
5859     // in stubRoutines.hpp.
5860 
5861     StubRoutines::_forward_exception_entry = generate_forward_exception();
5862 
5863     StubRoutines::_call_stub_entry =
5864       generate_call_stub(StubRoutines::_call_stub_return_address);
5865 
5866     // is referenced by megamorphic call
5867     StubRoutines::_catch_exception_entry = generate_catch_exception();
5868 
5869     // Build this early so it's available for the interpreter.
5870     StubRoutines::_throw_StackOverflowError_entry =
5871       generate_throw_exception("StackOverflowError throw_exception",
5872                                CAST_FROM_FN_PTR(address,
5873                                                 SharedRuntime::throw_StackOverflowError));
5874     StubRoutines::_throw_delayed_StackOverflowError_entry =
5875       generate_throw_exception("delayed StackOverflowError throw_exception",
5876                                CAST_FROM_FN_PTR(address,
5877                                                 SharedRuntime::throw_delayed_StackOverflowError));
5878     if (UseCRC32Intrinsics) {
5879       // Set the table address before generating the stubs that use it
5880       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5881       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5882     }
5883 
5884     if (UseCRC32CIntrinsics) {
5885       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5886     }
5887 
5888     // Disabled until JDK-8210858 is fixed
5889     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5890     //   StubRoutines::_dlog = generate_dlog();
5891     // }
5892 
5893     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5894       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5895     }
5896 
5897     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5898       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5899     }
5900 
5901 
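    // Stubs used when returning a value type: one reloads its field
    // values into registers, the other buffers them into a newly
    // allocated instance (and therefore has an oop result).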
5902     StubRoutines::_load_value_type_fields_in_regs =
5903          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_value_type_fields_in_regs), "load_value_type_fields_in_regs", false);
5904     StubRoutines::_store_value_type_fields_to_buf =
5905          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_value_type_fields_to_buf), "store_value_type_fields_to_buf", true);
5906   }
5907 
5908   void generate_all() {
5909     // support for verify_oop (must happen after universe_init)
5910     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
5911     StubRoutines::_throw_AbstractMethodError_entry =
5912       generate_throw_exception("AbstractMethodError throw_exception",
5913                                CAST_FROM_FN_PTR(address,
5914                                                 SharedRuntime::
5915                                                 throw_AbstractMethodError));
5916 
5917     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5918       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5919                                CAST_FROM_FN_PTR(address,
5920                                                 SharedRuntime::
5921                                                 throw_IncompatibleClassChangeError));
5922 
5923     StubRoutines::_throw_NullPointerException_at_call_entry =
5924       generate_throw_exception("NullPointerException at call throw_exception",
5925                                CAST_FROM_FN_PTR(address,
5926                                                 SharedRuntime::
5927                                                 throw_NullPointerException_at_call));
5928 
5929     // arraycopy stubs used by compilers
5930     generate_arraycopy_stubs();
5931 
5932     // has negatives stub for large arrays.
5933     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5934 
5935     // array equals stub for large arrays.
5936     if (!UseSimpleArrayEquals) {
5937       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5938     }
5939 
5940     generate_compare_long_strings();
5941 
5942     generate_string_indexof_stubs();
5943 
5944     // byte_array_inflate stub for large arrays.
5945     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
5946 
5947 #ifdef COMPILER2
5948     if (UseMultiplyToLenIntrinsic) {
5949       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5950     }
5951 
5952     if (UseSquareToLenIntrinsic) {
5953       StubRoutines::_squareToLen = generate_squareToLen();
5954     }
5955 
5956     if (UseMulAddIntrinsic) {
5957       StubRoutines::_mulAdd = generate_mulAdd();
5958     }
5959 
5960     if (UseMontgomeryMultiplyIntrinsic) {
5961       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5962       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5963       StubRoutines::_montgomeryMultiply = g.generate_multiply();
5964     }
5965 
5966     if (UseMontgomerySquareIntrinsic) {
5967       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5968       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5969       // We use generate_multiply() rather than generate_square()
5970       // because it's faster for the sizes of modulus we care about.
5971       StubRoutines::_montgomerySquare = g.generate_multiply();
5972     }
5973 #endif // COMPILER2
5974 
5975     // generate GHASH intrinsics code
5976     if (UseGHASHIntrinsics) {
5977       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5978     }
5979 
5980     // data cache line writeback
5981     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
5982     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
5983 
5984     if (UseAESIntrinsics) {
5985       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5986       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5987       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5988       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
5989     }
5990 
5991     if (UseSHA1Intrinsics) {
5992       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5993       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5994     }
5995     if (UseSHA256Intrinsics) {
5996       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5997       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5998     }
5999 
6000     // generate Adler32 intrinsics code
6001     if (UseAdler32Intrinsics) {
6002       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
6003     }
6004 
6005     // Safefetch stubs.
6006     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
6007                                                        &StubRoutines::_safefetch32_fault_pc,
6008                                                        &StubRoutines::_safefetch32_continuation_pc);
6009     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
6010                                                        &StubRoutines::_safefetchN_fault_pc,
6011                                                        &StubRoutines::_safefetchN_continuation_pc);
6012     StubRoutines::aarch64::set_completed();
6013   }
6014 
6015  public:
6016   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
6017     if (all) {
6018       generate_all();
6019     } else {
6020       generate_initial();
6021     }
6022   }
6023 }; // end class declaration
6024 
6025 #define UCM_TABLE_MAX_ENTRIES 8
6026 void StubGenerator_generate(CodeBuffer* code, bool all) {
6027   if (UnsafeCopyMemory::_table == NULL) {
6028     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
6029   }
6030   StubGenerator g(code, all);
6031 }