1 /*
   2  * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "memory/universe.hpp"
  33 #include "nativeInst_aarch64.hpp"
  34 #include "oops/instanceOop.hpp"
  35 #include "oops/method.hpp"
  36 #include "oops/objArrayKlass.hpp"
  37 #include "oops/oop.inline.hpp"
  38 #include "prims/methodHandles.hpp"
  39 #include "runtime/frame.inline.hpp"
  40 #include "runtime/handles.inline.hpp"
  41 #include "runtime/sharedRuntime.hpp"
  42 #include "runtime/stubCodeGenerator.hpp"
  43 #include "runtime/stubRoutines.hpp"
  44 #include "runtime/thread.inline.hpp"
  45 #include "utilities/align.hpp"
  46 #ifdef COMPILER2
  47 #include "opto/runtime.hpp"
  48 #endif
  49 #if INCLUDE_ZGC
  50 #include "gc/z/zThreadLocalData.hpp"
  51 #endif
  52 
  53 #ifdef BUILTIN_SIM
  54 #include "../../../../../../simulator/simulator.hpp"
  55 #endif
  56 
  57 // Declaration and definition of StubGenerator (no .hpp file).
  58 // For a more detailed description of the stub routine structure
  59 // see the comment in stubRoutines.hpp
  60 
  61 #undef __
  62 #define __ _masm->
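// TIMES_OOP scales a (sign-extended 32-bit) index by the in-heap oop size:
// 4 bytes when compressed oops are in use, 8 bytes otherwise.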
  63 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  64 
  65 #ifdef PRODUCT
  66 #define BLOCK_COMMENT(str) /* nothing */
  67 #else
  68 #define BLOCK_COMMENT(str) __ block_comment(str)
  69 #endif
  70 
  71 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  72 
  73 // Stub Code definitions
  74 
  75 class StubGenerator: public StubCodeGenerator {
  76  private:
  77 
  78 #ifdef PRODUCT
  79 #define inc_counter_np(counter) ((void)0)
  80 #else
  81   void inc_counter_np_(int& counter) {
  82     __ lea(rscratch2, ExternalAddress((address)&counter));
  83     __ ldrw(rscratch1, Address(rscratch2));
  84     __ addw(rscratch1, rscratch1, 1);
  85     __ strw(rscratch1, Address(rscratch2));
  86   }
  87 #define inc_counter_np(counter) \
  88   BLOCK_COMMENT("inc_counter " #counter); \
  89   inc_counter_np_(counter);
  90 #endif
  91 
  92   // Call stubs are used to call Java from C
  93   //
  94   // Arguments:
  95   //    c_rarg0:   call wrapper address                   address
  96   //    c_rarg1:   result                                 address
  97   //    c_rarg2:   result type                            BasicType
  98   //    c_rarg3:   method                                 Method*
  99   //    c_rarg4:   (interpreter) entry point              address
 100   //    c_rarg5:   parameters                             intptr_t*
 101   //    c_rarg6:   parameter size (in words)              int
 102   //    c_rarg7:   thread                                 Thread*
 103   //
 104   // There is no return from the stub itself as any Java result
 105   // is written to result
 106   //
 107   // we save r30 (lr) as the return PC at the base of the frame and
 108   // link r29 (fp) below it as the frame pointer installing sp (r31)
 109   // into fp.
 110   //
 111   // we save r0-r7, which accounts for all the c arguments.
 112   //
 113   // TODO: strictly do we need to save them all? they are treated as
 114   // volatile by C so could we omit saving the ones we are going to
 115   // place in global registers (thread? method?) or those we only use
 116   // during setup of the Java call?
 117   //
 118   // we don't need to save r8 which C uses as an indirect result location
 119   // return register.
 120   //
 121   // we don't need to save r9-r15 which both C and Java treat as
 122   // volatile
 123   //
 124   // we don't need to save r16-18 because Java does not use them
 125   //
 126   // we save r19-r28 which Java uses as scratch registers and C
 127   // expects to be callee-save
 128   //
 129   // we save the bottom 64 bits of each value stored in v8-v15; it is
 130   // the responsibility of the caller to preserve larger values.
 131   //
 132   // so the stub frame looks like this when we enter Java code
 133   //
 134   //     [ return_from_Java     ] <--- sp
 135   //     [ argument word n      ]
 136   //      ...
 137   // -27 [ argument word 1      ]
 138   // -26 [ saved v15            ] <--- sp_after_call
 139   // -25 [ saved v14            ]
 140   // -24 [ saved v13            ]
 141   // -23 [ saved v12            ]
 142   // -22 [ saved v11            ]
 143   // -21 [ saved v10            ]
 144   // -20 [ saved v9             ]
 145   // -19 [ saved v8             ]
 146   // -18 [ saved r28            ]
 147   // -17 [ saved r27            ]
 148   // -16 [ saved r26            ]
 149   // -15 [ saved r25            ]
 150   // -14 [ saved r24            ]
 151   // -13 [ saved r23            ]
 152   // -12 [ saved r22            ]
 153   // -11 [ saved r21            ]
 154   // -10 [ saved r20            ]
 155   //  -9 [ saved r19            ]
 156   //  -8 [ call wrapper    (r0) ]
 157   //  -7 [ result          (r1) ]
 158   //  -6 [ result type     (r2) ]
 159   //  -5 [ method          (r3) ]
 160   //  -4 [ entry point     (r4) ]
 161   //  -3 [ parameters      (r5) ]
 162   //  -2 [ parameter size  (r6) ]
 163   //  -1 [ thread (r7)          ]
 164   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 165   //   1 [ saved lr       (r30) ]
 166 
 167   // Call stub stack layout word offsets from fp
 168   enum call_stub_layout {
 169     sp_after_call_off = -26,
 170 
 171     d15_off            = -26,
 172     d13_off            = -24,
 173     d11_off            = -22,
 174     d9_off             = -20,
 175 
 176     r28_off            = -18,
 177     r26_off            = -16,
 178     r24_off            = -14,
 179     r22_off            = -12,
 180     r20_off            = -10,
 181     call_wrapper_off   =  -8,
 182     result_off         =  -7,
 183     result_type_off    =  -6,
 184     method_off         =  -5,
 185     entry_point_off    =  -4,
 186     parameter_size_off =  -2,
 187     thread_off         =  -1,
 188     fp_f               =   0,
 189     retaddr_off        =   1,
 190   };
 191 
 192   address generate_call_stub(address& return_address) {
 193     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 194            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 195            "adjust this code");
 196 
 197     StubCodeMark mark(this, "StubRoutines", "call_stub");
 198     address start = __ pc();
 199 
 200     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 201 
 202     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 203     const Address result        (rfp, result_off         * wordSize);
 204     const Address result_type   (rfp, result_type_off    * wordSize);
 205     const Address method        (rfp, method_off         * wordSize);
 206     const Address entry_point   (rfp, entry_point_off    * wordSize);
 207     const Address parameter_size(rfp, parameter_size_off * wordSize);
 208 
 209     const Address thread        (rfp, thread_off         * wordSize);
 210 
 211     const Address d15_save      (rfp, d15_off * wordSize);
 212     const Address d13_save      (rfp, d13_off * wordSize);
 213     const Address d11_save      (rfp, d11_off * wordSize);
 214     const Address d9_save       (rfp, d9_off * wordSize);
 215 
 216     const Address r28_save      (rfp, r28_off * wordSize);
 217     const Address r26_save      (rfp, r26_off * wordSize);
 218     const Address r24_save      (rfp, r24_off * wordSize);
 219     const Address r22_save      (rfp, r22_off * wordSize);
 220     const Address r20_save      (rfp, r20_off * wordSize);
 221 
 222     // stub code
 223 
 224     // we need a C prolog to bootstrap the x86 caller into the sim
 225     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 226 
 227     address aarch64_entry = __ pc();
 228 
 229 #ifdef BUILTIN_SIM
 230     // Save sender's SP for stack traces.
 231     __ mov(rscratch1, sp);
 232     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 233 #endif
 234     // set up frame and move sp to end of save area
 235     __ enter();
 236     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 237 
 238     // save register parameters and Java scratch/global registers
 239     // n.b. we save thread even though it gets installed in
 240     // rthread because we want to sanity check rthread later
 241     __ str(c_rarg7,  thread);
 242     __ strw(c_rarg6, parameter_size);
 243     __ stp(c_rarg4, c_rarg5,  entry_point);
 244     __ stp(c_rarg2, c_rarg3,  result_type);
 245     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 246 
 247     __ stp(r20, r19,   r20_save);
 248     __ stp(r22, r21,   r22_save);
 249     __ stp(r24, r23,   r24_save);
 250     __ stp(r26, r25,   r26_save);
 251     __ stp(r28, r27,   r28_save);
 252 
 253     __ stpd(v9,  v8,   d9_save);
 254     __ stpd(v11, v10,  d11_save);
 255     __ stpd(v13, v12,  d13_save);
 256     __ stpd(v15, v14,  d15_save);
 257 
    // install Java thread in global register now that we have saved
 259     // whatever value it held
 260     __ mov(rthread, c_rarg7);
 261     // And method
 262     __ mov(rmethod, c_rarg3);
 263 
 264     // set up the heapbase register
 265     __ reinit_heapbase();
 266 
 267 #ifdef ASSERT
 268     // make sure we have no pending exceptions
 269     {
 270       Label L;
 271       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 272       __ cmp(rscratch1, (u1)NULL_WORD);
 273       __ br(Assembler::EQ, L);
 274       __ stop("StubRoutines::call_stub: entered with pending exception");
 275       __ BIND(L);
 276     }
 277 #endif
 278     // pass parameters if any
 279     __ mov(esp, sp);
 280     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 281     __ andr(sp, rscratch1, -2 * wordSize);
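    // round the new top of stack down to a 16-byte (2-word) boundary, as
    // required for sp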
 282 
 283     BLOCK_COMMENT("pass parameters if any");
 284     Label parameters_done;
 285     // parameter count is still in c_rarg6
 286     // and parameter pointer identifying param 1 is in c_rarg5
 287     __ cbzw(c_rarg6, parameters_done);
 288 
 289     address loop = __ pc();
 290     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 291     __ subsw(c_rarg6, c_rarg6, 1);
 292     __ push(rscratch1);
 293     __ br(Assembler::GT, loop);
 294 
 295     __ BIND(parameters_done);
 296 
    // call Java entry -- passing Method*, and current sp
 298     //      rmethod: Method*
 299     //      r13: sender sp
 300     BLOCK_COMMENT("call Java function");
 301     __ mov(r13, sp);
 302     __ blr(c_rarg4);
 303 
 304     // tell the simulator we have returned to the stub
 305 
 306     // we do this here because the notify will already have been done
 307     // if we get to the next instruction via an exception
 308     //
 309     // n.b. adding this instruction here affects the calculation of
 310     // whether or not a routine returns to the call stub (used when
 311     // doing stack walks) since the normal test is to check the return
 312     // pc against the address saved below. so we may need to allow for
 313     // this extra instruction in the check.
 314 
 315     if (NotifySimulator) {
 316       __ notify(Assembler::method_reentry);
 317     }
 318     // save current address for use by exception handling code
 319 
 320     return_address = __ pc();
 321 
 322     // store result depending on type (everything that is not
 323     // T_OBJECT, T_VALUETYPE, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 324     // n.b. this assumes Java returns an integral result in r0
 325     // and a floating result in j_farg0
 326     __ ldr(j_rarg2, result);
 327     Label is_long, is_float, is_double, is_value, exit;
 328     __ ldr(j_rarg1, result_type);
 329     __ cmp(j_rarg1, (u1)T_OBJECT);
 330     __ br(Assembler::EQ, is_long);
 331     __ cmp(j_rarg1, (u1)T_VALUETYPE);
 332     __ br(Assembler::EQ, is_value);
 333     __ cmp(j_rarg1, (u1)T_LONG);
 334     __ br(Assembler::EQ, is_long);
 335     __ cmp(j_rarg1, (u1)T_FLOAT);
 336     __ br(Assembler::EQ, is_float);
 337     __ cmp(j_rarg1, (u1)T_DOUBLE);
 338     __ br(Assembler::EQ, is_double);
 339 
 340     // handle T_INT case
 341     __ strw(r0, Address(j_rarg2));
 342 
 343     __ BIND(exit);
 344 
 345     // pop parameters
 346     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 347 
 348 #ifdef ASSERT
 349     // verify that threads correspond
 350     {
 351       Label L, S;
 352       __ ldr(rscratch1, thread);
 353       __ cmp(rthread, rscratch1);
 354       __ br(Assembler::NE, S);
 355       __ get_thread(rscratch1);
 356       __ cmp(rthread, rscratch1);
 357       __ br(Assembler::EQ, L);
 358       __ BIND(S);
 359       __ stop("StubRoutines::call_stub: threads must correspond");
 360       __ BIND(L);
 361     }
 362 #endif
 363 
 364     // restore callee-save registers
 365     __ ldpd(v15, v14,  d15_save);
 366     __ ldpd(v13, v12,  d13_save);
 367     __ ldpd(v11, v10,  d11_save);
 368     __ ldpd(v9,  v8,   d9_save);
 369 
 370     __ ldp(r28, r27,   r28_save);
 371     __ ldp(r26, r25,   r26_save);
 372     __ ldp(r24, r23,   r24_save);
 373     __ ldp(r22, r21,   r22_save);
 374     __ ldp(r20, r19,   r20_save);
 375 
 376     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 377     __ ldrw(c_rarg2, result_type);
 378     __ ldr(c_rarg3,  method);
 379     __ ldp(c_rarg4, c_rarg5,  entry_point);
 380     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 381 
 382 #ifndef PRODUCT
 383     // tell the simulator we are about to end Java execution
 384     if (NotifySimulator) {
 385       __ notify(Assembler::method_exit);
 386     }
 387 #endif
 388     // leave frame and return to caller
 389     __ leave();
 390     __ ret(lr);
 391 
 392     // handle return types different from T_INT
 393     __ BIND(is_value);
 394     if (ValueTypeReturnedAsFields) {
 395       // Check for flattened return value
 396       __ cbz(r0, is_long);
 397       // Initialize pre-allocated buffer
 398       __ mov(r1, r0);
 399       __ andr(r1, r1, -2);
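      // r1 now holds r0 with its low bit cleared; the low bit appears to act
      // as a tag in the flattened-return case, and the untagged value is used
      // as an InstanceKlass* below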
 400       __ ldr(r1, Address(r1, InstanceKlass::adr_valueklass_fixed_block_offset()));
 401       __ ldr(r1, Address(r1, ValueKlass::pack_handler_offset()));
 402       __ ldr(r0, Address(j_rarg2, 0));
 403       __ blr(r1);
 404       __ b(exit);
 405     }
 406 
 407     __ BIND(is_long);
 408     __ str(r0, Address(j_rarg2, 0));
 409     __ br(Assembler::AL, exit);
 410 
 411     __ BIND(is_float);
 412     __ strs(j_farg0, Address(j_rarg2, 0));
 413     __ br(Assembler::AL, exit);
 414 
 415     __ BIND(is_double);
 416     __ strd(j_farg0, Address(j_rarg2, 0));
 417     __ br(Assembler::AL, exit);
 418 
 419     return start;
 420   }
 421 
 422   // Return point for a Java call if there's an exception thrown in
 423   // Java code.  The exception is caught and transformed into a
 424   // pending exception stored in JavaThread that can be tested from
 425   // within the VM.
 426   //
 427   // Note: Usually the parameters are removed by the callee. In case
 428   // of an exception crossing an activation frame boundary, that is
 429   // not the case if the callee is compiled code => need to setup the
 430   // rsp.
 431   //
 432   // r0: exception oop
 433 
 434   // NOTE: this is used as a target from the signal handler so it
 435   // needs an x86 prolog which returns into the current simulator
 436   // executing the generated catch_exception code. so the prolog
 437   // needs to install rax in a sim register and adjust the sim's
 438   // restart pc to enter the generated code at the start position
 439   // then return from native to simulated execution.
 440 
 441   address generate_catch_exception() {
 442     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 443     address start = __ pc();
 444 
 445     // same as in generate_call_stub():
 446     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 447     const Address thread        (rfp, thread_off         * wordSize);
 448 
 449 #ifdef ASSERT
 450     // verify that threads correspond
 451     {
 452       Label L, S;
 453       __ ldr(rscratch1, thread);
 454       __ cmp(rthread, rscratch1);
 455       __ br(Assembler::NE, S);
 456       __ get_thread(rscratch1);
 457       __ cmp(rthread, rscratch1);
 458       __ br(Assembler::EQ, L);
 459       __ bind(S);
 460       __ stop("StubRoutines::catch_exception: threads must correspond");
 461       __ bind(L);
 462     }
 463 #endif
 464 
 465     // set pending exception
 466     __ verify_oop(r0);
 467 
 468     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 469     __ mov(rscratch1, (address)__FILE__);
 470     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 471     __ movw(rscratch1, (int)__LINE__);
 472     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 473 
 474     // complete return to VM
 475     assert(StubRoutines::_call_stub_return_address != NULL,
 476            "_call_stub_return_address must have been generated before");
 477     __ b(StubRoutines::_call_stub_return_address);
 478 
 479     return start;
 480   }
 481 
 482   // Continuation point for runtime calls returning with a pending
 483   // exception.  The pending exception check happened in the runtime
 484   // or native call stub.  The pending exception in Thread is
 485   // converted into a Java-level exception.
 486   //
 487   // Contract with Java-level exception handlers:
 488   // r0: exception
 489   // r3: throwing pc
 490   //
 491   // NOTE: At entry of this stub, exception-pc must be in LR !!
 492 
 493   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog
 495 
 496   address generate_forward_exception() {
 497     StubCodeMark mark(this, "StubRoutines", "forward exception");
 498     address start = __ pc();
 499 
 500     // Upon entry, LR points to the return address returning into
 501     // Java (interpreted or compiled) code; i.e., the return address
 502     // becomes the throwing pc.
 503     //
 504     // Arguments pushed before the runtime call are still on the stack
 505     // but the exception handler will reset the stack pointer ->
 506     // ignore them.  A potential result in registers can be ignored as
 507     // well.
 508 
 509 #ifdef ASSERT
 510     // make sure this code is only executed if there is a pending exception
 511     {
 512       Label L;
 513       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 514       __ cbnz(rscratch1, L);
 515       __ stop("StubRoutines::forward exception: no pending exception (1)");
 516       __ bind(L);
 517     }
 518 #endif
 519 
 520     // compute exception handler into r19
 521 
 522     // call the VM to find the handler address associated with the
 523     // caller address. pass thread in r0 and caller pc (ret address)
 524     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 525     // the stack.
 526     __ mov(c_rarg1, lr);
 527     // lr will be trashed by the VM call so we move it to R19
 528     // (callee-saved) because we also need to pass it to the handler
 529     // returned by this call.
 530     __ mov(r19, lr);
 531     BLOCK_COMMENT("call exception_handler_for_return_address");
 532     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 533                          SharedRuntime::exception_handler_for_return_address),
 534                     rthread, c_rarg1);
 535     // we should not really care that lr is no longer the callee
 536     // address. we saved the value the handler needs in r19 so we can
 537     // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
 539     // the PC for the frame above the handler belongs to a compiled
 540     // Java method. So, we restore lr here to satisfy that assert.
 541     __ mov(lr, r19);
 542     // setup r0 & r3 & clear pending exception
 543     __ mov(r3, r19);
 544     __ mov(r19, r0);
 545     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 546     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 547 
 548 #ifdef ASSERT
 549     // make sure exception is set
 550     {
 551       Label L;
 552       __ cbnz(r0, L);
 553       __ stop("StubRoutines::forward exception: no pending exception (2)");
 554       __ bind(L);
 555     }
 556 #endif
 557 
 558     // continue at exception handler
 559     // r0: exception
 560     // r3: throwing pc
 561     // r19: exception handler
 562     __ verify_oop(r0);
 563     __ br(r19);
 564 
 565     return start;
 566   }
 567 
 568   // Non-destructive plausibility checks for oops
 569   //
 570   // Arguments:
 571   //    r0: oop to verify
 572   //    rscratch1: error message
 573   //
 574   // Stack after saving c_rarg3:
 575   //    [tos + 0]: saved c_rarg3
 576   //    [tos + 1]: saved c_rarg2
 577   //    [tos + 2]: saved lr
 578   //    [tos + 3]: saved rscratch2
 579   //    [tos + 4]: saved r0
 580   //    [tos + 5]: saved rscratch1
 581   address generate_verify_oop() {
 582 
 583     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 584     address start = __ pc();
 585 
 586     Label exit, error;
 587 
 588     // save c_rarg2 and c_rarg3
 589     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 590 
 591     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 592     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 593     __ ldr(c_rarg3, Address(c_rarg2));
 594     __ add(c_rarg3, c_rarg3, 1);
 595     __ str(c_rarg3, Address(c_rarg2));
 596 
 597     // object is in r0
 598     // make sure object is 'reasonable'
 599     __ cbz(r0, exit); // if obj is NULL it is OK
 600 
 601 #if INCLUDE_ZGC
 602     if (UseZGC) {
 603       // Check if mask is good.
 604       // verifies that ZAddressBadMask & r0 == 0
 605       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 606       __ andr(c_rarg2, r0, c_rarg3);
 607       __ cbnz(c_rarg2, error);
 608     }
 609 #endif
 610 
 611     // Check if the oop is in the right area of memory
 612     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 613     __ andr(c_rarg2, r0, c_rarg3);
 614     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 615 
 616     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 617     // instruction here because the flags register is live.
 618     __ eor(c_rarg2, c_rarg2, c_rarg3);
 619     __ cbnz(c_rarg2, error);
 620 
    // make sure klass is 'reasonable', i.e. not NULL
 622     __ load_klass(r0, r0);  // get klass
 623     __ cbz(r0, error);      // if klass is NULL it is broken
 624 
 625     // return if everything seems ok
 626     __ bind(exit);
 627 
 628     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 629     __ ret(lr);
 630 
 631     // handle errors
 632     __ bind(error);
 633     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 634 
 635     __ push(RegSet::range(r0, r29), sp);
 636     // debug(char* msg, int64_t pc, int64_t regs[])
 637     __ mov(c_rarg0, rscratch1);      // pass address of error message
 638     __ mov(c_rarg1, lr);             // pass return address
 639     __ mov(c_rarg2, sp);             // pass address of regs on stack
 640 #ifndef PRODUCT
 641     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 642 #endif
 643     BLOCK_COMMENT("call MacroAssembler::debug");
 644     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 645     __ blrt(rscratch1, 3, 0, 1);
 646 
 647     return start;
 648   }
 649 
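  // The conjoint copy stubs perform their overlap check inline (see the
  // cmp/br to nooverlap_target in generate_conjoint_copy), so this helper
  // simply branches to the no-overlap continuation.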
 650   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 651 
 652   // The inner part of zero_words().  This is the bulk operation,
 653   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 654   // caller is responsible for zeroing the last few words.
 655   //
 656   // Inputs:
 657   // r10: the HeapWord-aligned base address of an array to zero.
 658   // r11: the count in HeapWords, r11 > 0.
 659   //
 660   // Returns r10 and r11, adjusted for the caller to clear.
 661   // r10: the base address of the tail of words left to clear.
 662   // r11: the number of words in the tail.
 663   //      r11 < MacroAssembler::zero_words_block_size.
 664 
 665   address generate_zero_blocks() {
 666     Label done;
 667     Label base_aligned;
 668 
 669     Register base = r10, cnt = r11;
 670 
 671     __ align(CodeEntryAlignment);
 672     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 673     address start = __ pc();
 674 
 675     if (UseBlockZeroing) {
 676       int zva_length = VM_Version::zva_length();
 677 
      // Ensure ZVA length is a multiple of 16. This is required by
 679       // the subsequent operations.
 680       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 681 
 682       __ tbz(base, 3, base_aligned);
 683       __ str(zr, Address(__ post(base, 8)));
 684       __ sub(cnt, cnt, 1);
 685       __ bind(base_aligned);
 686 
 687       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 688       // alignment.
 689       Label small;
 690       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
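      // low_limit is in bytes but cnt is in words, hence the >> 3 below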
 691       __ subs(rscratch1, cnt, low_limit >> 3);
 692       __ br(Assembler::LT, small);
 693       __ zero_dcache_blocks(base, cnt);
 694       __ bind(small);
 695     }
 696 
 697     {
 698       // Number of stp instructions we'll unroll
 699       const int unroll =
 700         MacroAssembler::zero_words_block_size / 2;
 701       // Clear the remaining blocks.
 702       Label loop;
 703       __ subs(cnt, cnt, unroll * 2);
 704       __ br(Assembler::LT, done);
 705       __ bind(loop);
 706       for (int i = 0; i < unroll; i++)
 707         __ stp(zr, zr, __ post(base, 16));
 708       __ subs(cnt, cnt, unroll * 2);
 709       __ br(Assembler::GE, loop);
 710       __ bind(done);
 711       __ add(cnt, cnt, unroll * 2);
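      // the loop exits with cnt biased by -unroll * 2; add it back so the
      // caller sees the true number of words still left to clear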
 712     }
 713 
 714     __ ret(lr);
 715 
 716     return start;
 717   }
 718 
 719 
 720   typedef enum {
 721     copy_forwards = 1,
 722     copy_backwards = -1
 723   } copy_direction;
 724 
 725   // Bulk copy of blocks of 8 words.
 726   //
 727   // count is a count of words.
 728   //
 729   // Precondition: count >= 8
 730   //
 731   // Postconditions:
 732   //
 733   // The least significant bit of count contains the remaining count
 734   // of words to copy.  The rest of count is trash.
 735   //
 736   // s and d are adjusted to point to the remaining words to copy
 737   //
 738   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 739                            copy_direction direction) {
 740     int unit = wordSize * direction;
 741     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 742 
 743     int offset;
 744     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 745       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 746     const Register stride = r13;
 747 
 748     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 749     assert_different_registers(s, d, count, rscratch1);
 750 
 751     Label again, drain;
 752     const char *stub_name;
 753     if (direction == copy_forwards)
 754       stub_name = "forward_copy_longs";
 755     else
 756       stub_name = "backward_copy_longs";
 757 
 758     __ align(CodeEntryAlignment);
 759 
 760     StubCodeMark mark(this, "StubRoutines", stub_name);
 761 
 762     __ bind(start);
 763 
 764     Label unaligned_copy_long;
 765     if (AvoidUnalignedAccesses) {
 766       __ tbnz(d, 3, unaligned_copy_long);
 767     }
 768 
 769     if (direction == copy_forwards) {
 770       __ sub(s, s, bias);
 771       __ sub(d, d, bias);
 772     }
 773 
 774 #ifdef ASSERT
 775     // Make sure we are never given < 8 words
 776     {
 777       Label L;
 778       __ cmp(count, (u1)8);
 779       __ br(Assembler::GE, L);
 780       __ stop("genrate_copy_longs called with < 8 words");
 781       __ bind(L);
 782     }
 783 #endif
 784 
 785     // Fill 8 registers
 786     if (UseSIMDForMemoryOps) {
 787       __ ldpq(v0, v1, Address(s, 4 * unit));
 788       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 789     } else {
 790       __ ldp(t0, t1, Address(s, 2 * unit));
 791       __ ldp(t2, t3, Address(s, 4 * unit));
 792       __ ldp(t4, t5, Address(s, 6 * unit));
 793       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 794     }
 795 
 796     __ subs(count, count, 16);
 797     __ br(Assembler::LO, drain);
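    // if fewer than 16 words are to be copied, skip the main loop and just
    // store the 8 words already loaded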
 798 
 799     int prefetch = PrefetchCopyIntervalInBytes;
 800     bool use_stride = false;
 801     if (direction == copy_backwards) {
 802        use_stride = prefetch > 256;
 803        prefetch = -prefetch;
 804        if (use_stride) __ mov(stride, prefetch);
 805     }
 806 
 807     __ bind(again);
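    // software-pipelined main loop: store the 8 words loaded on the previous
    // iteration while loading the next 8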
 808 
 809     if (PrefetchCopyIntervalInBytes > 0)
 810       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 811 
 812     if (UseSIMDForMemoryOps) {
 813       __ stpq(v0, v1, Address(d, 4 * unit));
 814       __ ldpq(v0, v1, Address(s, 4 * unit));
 815       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 816       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 817     } else {
 818       __ stp(t0, t1, Address(d, 2 * unit));
 819       __ ldp(t0, t1, Address(s, 2 * unit));
 820       __ stp(t2, t3, Address(d, 4 * unit));
 821       __ ldp(t2, t3, Address(s, 4 * unit));
 822       __ stp(t4, t5, Address(d, 6 * unit));
 823       __ ldp(t4, t5, Address(s, 6 * unit));
 824       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 825       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 826     }
 827 
 828     __ subs(count, count, 8);
 829     __ br(Assembler::HS, again);
 830 
 831     // Drain
 832     __ bind(drain);
 833     if (UseSIMDForMemoryOps) {
 834       __ stpq(v0, v1, Address(d, 4 * unit));
 835       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 836     } else {
 837       __ stp(t0, t1, Address(d, 2 * unit));
 838       __ stp(t2, t3, Address(d, 4 * unit));
 839       __ stp(t4, t5, Address(d, 6 * unit));
 840       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 841     }
 842 
 843     {
 844       Label L1, L2;
 845       __ tbz(count, exact_log2(4), L1);
 846       if (UseSIMDForMemoryOps) {
 847         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 848         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 849       } else {
 850         __ ldp(t0, t1, Address(s, 2 * unit));
 851         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 852         __ stp(t0, t1, Address(d, 2 * unit));
 853         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 854       }
 855       __ bind(L1);
 856 
 857       if (direction == copy_forwards) {
 858         __ add(s, s, bias);
 859         __ add(d, d, bias);
 860       }
 861 
 862       __ tbz(count, 1, L2);
 863       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 864       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 865       __ bind(L2);
 866     }
 867 
 868     __ ret(lr);
 869 
 870     if (AvoidUnalignedAccesses) {
 871       Label drain, again;
 872       // Register order for storing. Order is different for backward copy.
 873 
 874       __ bind(unaligned_copy_long);
 875 
      // the source address is even (16-byte) aligned; the target is only odd
      // (8-byte) aligned
 877       //
 878       // when forward copying word pairs we read long pairs at offsets
 879       // {0, 2, 4, 6} (in long words). when backwards copying we read
 880       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 881       // address by -2 in the forwards case so we can compute the
 882       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 883       // or -1.
 884       //
 885       // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
 891       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 892       // offsets {1, 3, 5, 7, 8} * unit.
 893 
 894       if (direction == copy_forwards) {
 895         __ sub(s, s, 16);
 896         __ sub(d, d, 8);
 897       }
 898 
 899       // Fill 8 registers
 900       //
 901       // for forwards copy s was offset by -16 from the original input
 902       // value of s so the register contents are at these offsets
      // relative to the 64 byte block addressed by that original input
 904       // and so on for each successive 64 byte block when s is updated
 905       //
 906       // t0 at offset 0,  t1 at offset 8
 907       // t2 at offset 16, t3 at offset 24
 908       // t4 at offset 32, t5 at offset 40
 909       // t6 at offset 48, t7 at offset 56
 910 
 911       // for backwards copy s was not offset so the register contents
 912       // are at these offsets into the preceding 64 byte block
 913       // relative to that original input and so on for each successive
 914       // preceding 64 byte block when s is updated. this explains the
 915       // slightly counter-intuitive looking pattern of register usage
 916       // in the stp instructions for backwards copy.
 917       //
 918       // t0 at offset -16, t1 at offset -8
 919       // t2 at offset -32, t3 at offset -24
 920       // t4 at offset -48, t5 at offset -40
 921       // t6 at offset -64, t7 at offset -56
 922 
 923       __ ldp(t0, t1, Address(s, 2 * unit));
 924       __ ldp(t2, t3, Address(s, 4 * unit));
 925       __ ldp(t4, t5, Address(s, 6 * unit));
 926       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 927 
 928       __ subs(count, count, 16);
 929       __ br(Assembler::LO, drain);
 930 
 931       int prefetch = PrefetchCopyIntervalInBytes;
 932       bool use_stride = false;
 933       if (direction == copy_backwards) {
 934          use_stride = prefetch > 256;
 935          prefetch = -prefetch;
 936          if (use_stride) __ mov(stride, prefetch);
 937       }
 938 
 939       __ bind(again);
 940 
 941       if (PrefetchCopyIntervalInBytes > 0)
 942         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 943 
 944       if (direction == copy_forwards) {
 945        // allowing for the offset of -8 the store instructions place
       // registers into the target 64 byte block at the following
 947        // offsets
 948        //
 949        // t0 at offset 0
 950        // t1 at offset 8,  t2 at offset 16
 951        // t3 at offset 24, t4 at offset 32
 952        // t5 at offset 40, t6 at offset 48
 953        // t7 at offset 56
 954 
 955         __ str(t0, Address(d, 1 * unit));
 956         __ stp(t1, t2, Address(d, 2 * unit));
 957         __ ldp(t0, t1, Address(s, 2 * unit));
 958         __ stp(t3, t4, Address(d, 4 * unit));
 959         __ ldp(t2, t3, Address(s, 4 * unit));
 960         __ stp(t5, t6, Address(d, 6 * unit));
 961         __ ldp(t4, t5, Address(s, 6 * unit));
 962         __ str(t7, Address(__ pre(d, 8 * unit)));
 963         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 964       } else {
 965        // d was not offset when we started so the registers are
       // written into the 64 byte block preceding d with the following
 967        // offsets
 968        //
 969        // t1 at offset -8
 970        // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
 972        // t7 at offset -56, t4 at offset -48
 973        //                   t6 at offset -64
 974        //
 975        // note that this matches the offsets previously noted for the
 976        // loads
 977 
 978         __ str(t1, Address(d, 1 * unit));
 979         __ stp(t3, t0, Address(d, 3 * unit));
 980         __ ldp(t0, t1, Address(s, 2 * unit));
 981         __ stp(t5, t2, Address(d, 5 * unit));
 982         __ ldp(t2, t3, Address(s, 4 * unit));
 983         __ stp(t7, t4, Address(d, 7 * unit));
 984         __ ldp(t4, t5, Address(s, 6 * unit));
 985         __ str(t6, Address(__ pre(d, 8 * unit)));
 986         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 987       }
 988 
 989       __ subs(count, count, 8);
 990       __ br(Assembler::HS, again);
 991 
 992       // Drain
 993       //
 994       // this uses the same pattern of offsets and register arguments
 995       // as above
 996       __ bind(drain);
 997       if (direction == copy_forwards) {
 998         __ str(t0, Address(d, 1 * unit));
 999         __ stp(t1, t2, Address(d, 2 * unit));
1000         __ stp(t3, t4, Address(d, 4 * unit));
1001         __ stp(t5, t6, Address(d, 6 * unit));
1002         __ str(t7, Address(__ pre(d, 8 * unit)));
1003       } else {
1004         __ str(t1, Address(d, 1 * unit));
1005         __ stp(t3, t0, Address(d, 3 * unit));
1006         __ stp(t5, t2, Address(d, 5 * unit));
1007         __ stp(t7, t4, Address(d, 7 * unit));
1008         __ str(t6, Address(__ pre(d, 8 * unit)));
1009       }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 of the count tell us whether we have each
      // such subblock
1014       {
1015         Label L1, L2;
1016         __ tbz(count, exact_log2(4), L1);
1017        // this is the same as above but copying only 4 longs hence
       // with only one intervening stp between the str instructions
1019        // but note that the offsets and registers still follow the
1020        // same pattern
1021         __ ldp(t0, t1, Address(s, 2 * unit));
1022         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1023         if (direction == copy_forwards) {
1024           __ str(t0, Address(d, 1 * unit));
1025           __ stp(t1, t2, Address(d, 2 * unit));
1026           __ str(t3, Address(__ pre(d, 4 * unit)));
1027         } else {
1028           __ str(t1, Address(d, 1 * unit));
1029           __ stp(t3, t0, Address(d, 3 * unit));
1030           __ str(t2, Address(__ pre(d, 4 * unit)));
1031         }
1032         __ bind(L1);
1033 
1034         __ tbz(count, 1, L2);
1035        // this is the same as above but copying only 2 longs hence
1036        // there is no intervening stp between the str instructions
1037        // but note that the offset and register patterns are still
1038        // the same
1039         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1040         if (direction == copy_forwards) {
1041           __ str(t0, Address(d, 1 * unit));
1042           __ str(t1, Address(__ pre(d, 2 * unit)));
1043         } else {
1044           __ str(t1, Address(d, 1 * unit));
1045           __ str(t0, Address(__ pre(d, 2 * unit)));
1046         }
1047         __ bind(L2);
1048 
1049        // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written
1051 
1052        if (direction == copy_forwards) {
1053          __ add(s, s, 16);
1054          __ add(d, d, 8);
1055        }
1056 
1057       }
1058 
1059       __ ret(lr);
1060       }
1061   }
1062 
1063   // Small copy: less than 16 bytes.
1064   //
1065   // NB: Ignores all of the bits of count which represent more than 15
1066   // bytes, so a caller doesn't have to mask them.
1067 
1068   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1069     bool is_backwards = step < 0;
1070     size_t granularity = uabs(step);
1071     int direction = is_backwards ? -1 : 1;
1072     int unit = wordSize * direction;
1073 
1074     Label Lword, Lint, Lshort, Lbyte;
1075 
1076     assert(granularity
1077            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1078 
1079     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1080 
1081     // ??? I don't know if this bit-test-and-branch is the right thing
1082     // to do.  It does a lot of jumping, resulting in several
1083     // mispredicted branches.  It might make more sense to do this
1084     // with something like Duff's device with a single computed branch.
1085 
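    // each tbz below tests one bit of count: bit (3 - log2(granularity))
    // selects an 8-byte chunk, the next bit down a 4-byte chunk, and so on
    // down to a single byte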
1086     __ tbz(count, 3 - exact_log2(granularity), Lword);
1087     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1088     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1089     __ bind(Lword);
1090 
1091     if (granularity <= sizeof (jint)) {
1092       __ tbz(count, 2 - exact_log2(granularity), Lint);
1093       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1094       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1095       __ bind(Lint);
1096     }
1097 
1098     if (granularity <= sizeof (jshort)) {
1099       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1100       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1101       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1102       __ bind(Lshort);
1103     }
1104 
1105     if (granularity <= sizeof (jbyte)) {
1106       __ tbz(count, 0, Lbyte);
1107       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1108       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1109       __ bind(Lbyte);
1110     }
1111   }
1112 
1113   Label copy_f, copy_b;
1114 
1115   // All-singing all-dancing memory copy.
1116   //
1117   // Copy count units of memory from s to d.  The size of a unit is
1118   // step, which can be positive or negative depending on the direction
1119   // of copy.  If is_aligned is false, we align the source address.
1120   //
1121 
1122   void copy_memory(bool is_aligned, Register s, Register d,
1123                    Register count, Register tmp, int step) {
1124     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1125     bool is_backwards = step < 0;
1126     int granularity = uabs(step);
1127     const Register t0 = r3, t1 = r4;
1128 
    // Copies of <= 96 bytes are done inline. Direction doesn't matter
    // because we always load all the data before writing anything
1131     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1132     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1133     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1134     const Register send = r17, dend = r18;
1135 
1136     if (PrefetchCopyIntervalInBytes > 0)
1137       __ prfm(Address(s, 0), PLDL1KEEP);
1138     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1139     __ br(Assembler::HI, copy_big);
1140 
1141     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1142     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1143 
1144     __ cmp(count, u1(16/granularity));
1145     __ br(Assembler::LS, copy16);
1146 
1147     __ cmp(count, u1(64/granularity));
1148     __ br(Assembler::HI, copy80);
1149 
1150     __ cmp(count, u1(32/granularity));
1151     __ br(Assembler::LS, copy32);
1152 
1153     // 33..64 bytes
1154     if (UseSIMDForMemoryOps) {
1155       __ ldpq(v0, v1, Address(s, 0));
1156       __ ldpq(v2, v3, Address(send, -32));
1157       __ stpq(v0, v1, Address(d, 0));
1158       __ stpq(v2, v3, Address(dend, -32));
1159     } else {
1160       __ ldp(t0, t1, Address(s, 0));
1161       __ ldp(t2, t3, Address(s, 16));
1162       __ ldp(t4, t5, Address(send, -32));
1163       __ ldp(t6, t7, Address(send, -16));
1164 
1165       __ stp(t0, t1, Address(d, 0));
1166       __ stp(t2, t3, Address(d, 16));
1167       __ stp(t4, t5, Address(dend, -32));
1168       __ stp(t6, t7, Address(dend, -16));
1169     }
1170     __ b(finish);
1171 
1172     // 17..32 bytes
1173     __ bind(copy32);
1174     __ ldp(t0, t1, Address(s, 0));
1175     __ ldp(t2, t3, Address(send, -16));
1176     __ stp(t0, t1, Address(d, 0));
1177     __ stp(t2, t3, Address(dend, -16));
1178     __ b(finish);
1179 
1180     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
1182     __ bind(copy80);
1183     if (UseSIMDForMemoryOps) {
1184       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1185       __ ldpq(v4, v5, Address(send, -32));
1186       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1187       __ stpq(v4, v5, Address(dend, -32));
1188     } else {
1189       __ ldp(t0, t1, Address(s, 0));
1190       __ ldp(t2, t3, Address(s, 16));
1191       __ ldp(t4, t5, Address(s, 32));
1192       __ ldp(t6, t7, Address(s, 48));
1193       __ ldp(t8, t9, Address(send, -16));
1194 
1195       __ stp(t0, t1, Address(d, 0));
1196       __ stp(t2, t3, Address(d, 16));
1197       __ stp(t4, t5, Address(d, 32));
1198       __ stp(t6, t7, Address(d, 48));
1199       __ stp(t8, t9, Address(dend, -16));
1200     }
1201     __ b(finish);
1202 
1203     // 0..16 bytes
1204     __ bind(copy16);
1205     __ cmp(count, u1(8/granularity));
1206     __ br(Assembler::LO, copy8);
1207 
1208     // 8..16 bytes
1209     __ ldr(t0, Address(s, 0));
1210     __ ldr(t1, Address(send, -8));
1211     __ str(t0, Address(d, 0));
1212     __ str(t1, Address(dend, -8));
1213     __ b(finish);
1214 
1215     if (granularity < 8) {
1216       // 4..7 bytes
1217       __ bind(copy8);
1218       __ tbz(count, 2 - exact_log2(granularity), copy4);
1219       __ ldrw(t0, Address(s, 0));
1220       __ ldrw(t1, Address(send, -4));
1221       __ strw(t0, Address(d, 0));
1222       __ strw(t1, Address(dend, -4));
1223       __ b(finish);
1224       if (granularity < 4) {
1225         // 0..3 bytes
1226         __ bind(copy4);
1227         __ cbz(count, finish); // get rid of 0 case
1228         if (granularity == 2) {
1229           __ ldrh(t0, Address(s, 0));
1230           __ strh(t0, Address(d, 0));
1231         } else { // granularity == 1
1232           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1233           // the first and last byte.
1234           // Handle the 3 byte case by loading and storing base + count/2
1235           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This means that in the 1 byte case we load/store the same
1237           // byte 3 times.
1238           __ lsr(count, count, 1);
1239           __ ldrb(t0, Address(s, 0));
1240           __ ldrb(t1, Address(send, -1));
1241           __ ldrb(t2, Address(s, count));
1242           __ strb(t0, Address(d, 0));
1243           __ strb(t1, Address(dend, -1));
1244           __ strb(t2, Address(d, count));
1245         }
1246         __ b(finish);
1247       }
1248     }
1249 
1250     __ bind(copy_big);
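    // for a backwards copy, point s and d just past the end of the arrays
    // before aligning and copying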
1251     if (is_backwards) {
1252       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1253       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1254     }
1255 
    // Now that we've got the small case out of the way, we can align the
1257     // source address on a 2-word boundary.
1258 
1259     Label aligned;
1260 
1261     if (is_aligned) {
1262       // We may have to adjust by 1 word to get s 2-word-aligned.
1263       __ tbz(s, exact_log2(wordSize), aligned);
1264       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1265       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1266       __ sub(count, count, wordSize/granularity);
1267     } else {
1268       if (is_backwards) {
1269         __ andr(rscratch2, s, 2 * wordSize - 1);
1270       } else {
1271         __ neg(rscratch2, s);
1272         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1273       }
1274       // rscratch2 is the byte adjustment needed to align s.
1275       __ cbz(rscratch2, aligned);
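      // convert the byte adjustment into an element count; copy_memory_small
      // below then copies exactly that many leading elements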
1276       int shift = exact_log2(granularity);
1277       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1278       __ sub(count, count, rscratch2);
1279 
1280 #if 0
1281       // ?? This code is only correct for a disjoint copy.  It may or
1282       // may not make sense to use it in that case.
1283 
1284       // Copy the first pair; s and d may not be aligned.
1285       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1286       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1287 
1288       // Align s and d, adjust count
1289       if (is_backwards) {
1290         __ sub(s, s, rscratch2);
1291         __ sub(d, d, rscratch2);
1292       } else {
1293         __ add(s, s, rscratch2);
1294         __ add(d, d, rscratch2);
1295       }
1296 #else
1297       copy_memory_small(s, d, rscratch2, rscratch1, step);
1298 #endif
1299     }
1300 
1301     __ bind(aligned);
1302 
1303     // s is now 2-word-aligned.
1304 
1305     // We have a count of units and some trailing bytes.  Adjust the
1306     // count and do a bulk copy of words.
1307     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1308     if (direction == copy_forwards)
1309       __ bl(copy_f);
1310     else
1311       __ bl(copy_b);
1312 
1313     // And the tail.
1314     copy_memory_small(s, d, count, tmp, step);
1315 
1316     if (granularity >= 8) __ bind(copy8);
1317     if (granularity >= 4) __ bind(copy4);
1318     __ bind(finish);
1319   }
1320 
1321 
1322   void clobber_registers() {
1323 #ifdef ASSERT
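    // Fill the Java scratch registers r3..r18 (other than rscratch1, which
    // holds the value) with the recognizable poison 0xdeadbeefdeadbeef so
    // that any stale use stands out when debugging.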
1324     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1325     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1326     for (Register r = r3; r <= r18; r++)
1327       if (r != rscratch1) __ mov(r, rscratch1);
1328 #endif
1329   }
1330 
1331   // Scan over array at a for count oops, verifying each one.
1332   // Preserves a and count, clobbers rscratch1 and rscratch2.
1333   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1334     Label loop, end;
1335     __ mov(rscratch1, a);
1336     __ mov(rscratch2, zr);
1337     __ bind(loop);
1338     __ cmp(rscratch2, count);
1339     __ br(Assembler::HS, end);
1340     if (size == (size_t)wordSize) {
1341       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1342       __ verify_oop(temp);
1343     } else {
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1345       __ decode_heap_oop(temp); // calls verify_oop
1346     }
1347     __ add(rscratch2, rscratch2, size);
1348     __ b(loop);
1349     __ bind(end);
1350   }
1351 
1352   // Arguments:
1353   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1354   //             ignored
1355   //   is_oop  - true => oop array, so generate store check code
1356   //   name    - stub name string
1357   //
1358   // Inputs:
1359   //   c_rarg0   - source array address
1360   //   c_rarg1   - destination array address
1361   //   c_rarg2   - element count, treated as ssize_t, can be zero
1362   //
1363   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1364   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1366   //
1367   // Side Effects:
1368   //   disjoint_int_copy_entry is set to the no-overlap entry point
1369   //   used by generate_conjoint_int_oop_copy().
1370   //
1371   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1372                                   const char *name, bool dest_uninitialized = false) {
1373     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1374     RegSet saved_reg = RegSet::of(s, d, count);
1375     __ align(CodeEntryAlignment);
1376     StubCodeMark mark(this, "StubRoutines", name);
1377     address start = __ pc();
1378     __ enter();
1379 
1380     if (entry != NULL) {
1381       *entry = __ pc();
1382       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1383       BLOCK_COMMENT("Entry:");
1384     }
1385 
1386     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1387     if (dest_uninitialized) {
1388       decorators |= IS_DEST_UNINITIALIZED;
1389     }
1390     if (aligned) {
1391       decorators |= ARRAYCOPY_ALIGNED;
1392     }
1393 
1394     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1395     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1396 
1397     if (is_oop) {
1398       // save regs before copy_memory
1399       __ push(RegSet::of(d, count), sp);
1400     }
1401     copy_memory(aligned, s, d, count, rscratch1, size);
1402 
1403     if (is_oop) {
1404       __ pop(RegSet::of(d, count), sp);
1405       if (VerifyOops)
1406         verify_oop_array(size, d, count, r16);
1407     }
1408 
1409     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1410 
1411     __ leave();
1412     __ mov(r0, zr); // return 0
1413     __ ret(lr);
1414 #ifdef BUILTIN_SIM
1415     {
1416       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1417       sim->notifyCompile(const_cast<char*>(name), start);
1418     }
1419 #endif
1420     return start;
1421   }
1422 
1423   // Arguments:
1424   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1425   //             ignored
1426   //   is_oop  - true => oop array, so generate store check code
1427   //   name    - stub name string
1428   //
1429   // Inputs:
1430   //   c_rarg0   - source array address
1431   //   c_rarg1   - destination array address
1432   //   c_rarg2   - element count, treated as ssize_t, can be zero
1433   //
1434   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1435   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1437   //
1438   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1439                                  address *entry, const char *name,
1440                                  bool dest_uninitialized = false) {
1441     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1442     RegSet saved_regs = RegSet::of(s, d, count);
1443     StubCodeMark mark(this, "StubRoutines", name);
1444     address start = __ pc();
1445     __ enter();
1446 
1447     if (entry != NULL) {
1448       *entry = __ pc();
1449       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1450       BLOCK_COMMENT("Entry:");
1451     }
1452 
1453     // use fwd copy when (d-s) above_equal (count*size)
1454     __ sub(rscratch1, d, s);
1455     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1456     __ br(Assembler::HS, nooverlap_target);
1457 
1458     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1459     if (dest_uninitialized) {
1460       decorators |= IS_DEST_UNINITIALIZED;
1461     }
1462     if (aligned) {
1463       decorators |= ARRAYCOPY_ALIGNED;
1464     }
1465 
1466     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1467     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1468 
1469     if (is_oop) {
1470       // save regs before copy_memory
1471       __ push(RegSet::of(d, count), sp);
1472     }
1473     copy_memory(aligned, s, d, count, rscratch1, -size);
1474     if (is_oop) {
1475       __ pop(RegSet::of(d, count), sp);
1476       if (VerifyOops)
1477         verify_oop_array(size, d, count, r16);
1478     }
1479     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1480     __ leave();
1481     __ mov(r0, zr); // return 0
1482     __ ret(lr);
1483 #ifdef BUILTIN_SIM
1484     {
1485       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1486       sim->notifyCompile(const_cast<char*>(name), start);
1487     }
1488 #endif
1489     return start;
1490 }
1491 
1492   // Arguments:
1493   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1494   //             ignored
1495   //   name    - stub name string
1496   //
1497   // Inputs:
1498   //   c_rarg0   - source array address
1499   //   c_rarg1   - destination array address
1500   //   c_rarg2   - element count, treated as ssize_t, can be zero
1501   //
1502   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1503   // we let the hardware handle it.  The one to eight bytes within words,
1504   // dwords or qwords that span cache line boundaries will still be loaded
1505   // and stored atomically.
1506   //
1507   // Side Effects:
1515   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1516   //   used by generate_conjoint_byte_copy().
1517   //
1518   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1519     const bool not_oop = false;
1520     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1521   }
1522 
1523   // Arguments:
1524   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1525   //             ignored
1526   //   name    - stub name string
1527   //
1528   // Inputs:
1529   //   c_rarg0   - source array address
1530   //   c_rarg1   - destination array address
1531   //   c_rarg2   - element count, treated as ssize_t, can be zero
1532   //
1533   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1534   // we let the hardware handle it.  The one to eight bytes within words,
1535   // dwords or qwords that span cache line boundaries will still be loaded
1536   // and stored atomically.
1537   //
1538   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1539                                       address* entry, const char *name) {
1540     const bool not_oop = false;
1541     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1542   }
1543 
1544   // Arguments:
1545   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1546   //             ignored
1547   //   name    - stub name string
1548   //
1549   // Inputs:
1550   //   c_rarg0   - source array address
1551   //   c_rarg1   - destination array address
1552   //   c_rarg2   - element count, treated as ssize_t, can be zero
1553   //
1554   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1555   // let the hardware handle it.  The two or four words within dwords
1556   // or qwords that span cache line boundaries will still be loaded
1557   // and stored atomically.
1558   //
1559   // Side Effects:
1560   //   disjoint_short_copy_entry is set to the no-overlap entry point
1561   //   used by generate_conjoint_short_copy().
1562   //
1563   address generate_disjoint_short_copy(bool aligned,
1564                                        address* entry, const char *name) {
1565     const bool not_oop = false;
1566     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1567   }
1568 
1569   // Arguments:
1570   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1571   //             ignored
1572   //   name    - stub name string
1573   //
1574   // Inputs:
1575   //   c_rarg0   - source array address
1576   //   c_rarg1   - destination array address
1577   //   c_rarg2   - element count, treated as ssize_t, can be zero
1578   //
1579   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1580   // let the hardware handle it.  The two or four words within dwords
1581   // or qwords that span cache line boundaries will still be loaded
1582   // and stored atomically.
1583   //
1584   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1585                                        address *entry, const char *name) {
1586     const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
1591   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1592   //             ignored
1593   //   name    - stub name string
1594   //
1595   // Inputs:
1596   //   c_rarg0   - source array address
1597   //   c_rarg1   - destination array address
1598   //   c_rarg2   - element count, treated as ssize_t, can be zero
1599   //
1600   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1601   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1603   //
1604   // Side Effects:
1605   //   disjoint_int_copy_entry is set to the no-overlap entry point
1606   //   used by generate_conjoint_int_oop_copy().
1607   //
1608   address generate_disjoint_int_copy(bool aligned, address *entry,
1609                                          const char *name, bool dest_uninitialized = false) {
1610     const bool not_oop = false;
1611     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1612   }
1613 
1614   // Arguments:
1615   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1616   //             ignored
1617   //   name    - stub name string
1618   //
1619   // Inputs:
1620   //   c_rarg0   - source array address
1621   //   c_rarg1   - destination array address
1622   //   c_rarg2   - element count, treated as ssize_t, can be zero
1623   //
1624   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1625   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1627   //
1628   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1629                                      address *entry, const char *name,
1630                                      bool dest_uninitialized = false) {
1631     const bool not_oop = false;
1632     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1633   }
1634 
1635 
1636   // Arguments:
1637   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1638   //             ignored
1639   //   name    - stub name string
1640   //
1641   // Inputs:
1642   //   c_rarg0   - source array address
1643   //   c_rarg1   - destination array address
1644   //   c_rarg2   - element count, treated as size_t, can be zero
1645   //
1646   // Side Effects:
1647   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1648   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1649   //
1650   address generate_disjoint_long_copy(bool aligned, address *entry,
1651                                           const char *name, bool dest_uninitialized = false) {
1652     const bool not_oop = false;
1653     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1654   }
1655 
1656   // Arguments:
1657   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1658   //             ignored
1659   //   name    - stub name string
1660   //
1661   // Inputs:
1662   //   c_rarg0   - source array address
1663   //   c_rarg1   - destination array address
1664   //   c_rarg2   - element count, treated as size_t, can be zero
1665   //
1666   address generate_conjoint_long_copy(bool aligned,
1667                                       address nooverlap_target, address *entry,
1668                                       const char *name, bool dest_uninitialized = false) {
1669     const bool not_oop = false;
1670     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1671   }
1672 
1673   // Arguments:
1674   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1675   //             ignored
1676   //   name    - stub name string
1677   //
1678   // Inputs:
1679   //   c_rarg0   - source array address
1680   //   c_rarg1   - destination array address
1681   //   c_rarg2   - element count, treated as size_t, can be zero
1682   //
1683   // Side Effects:
1684   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1685   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1686   //
1687   address generate_disjoint_oop_copy(bool aligned, address *entry,
1688                                      const char *name, bool dest_uninitialized) {
1689     const bool is_oop = true;
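    // Each element is a 4-byte narrowOop when compressed oops are in use,
    // otherwise a full 8-byte oop.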
1690     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1691     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1692   }
1693 
1694   // Arguments:
1695   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1696   //             ignored
1697   //   name    - stub name string
1698   //
1699   // Inputs:
1700   //   c_rarg0   - source array address
1701   //   c_rarg1   - destination array address
1702   //   c_rarg2   - element count, treated as size_t, can be zero
1703   //
1704   address generate_conjoint_oop_copy(bool aligned,
1705                                      address nooverlap_target, address *entry,
1706                                      const char *name, bool dest_uninitialized) {
1707     const bool is_oop = true;
1708     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1709     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1710                                   name, dest_uninitialized);
1711   }
1712 
1713 
1714   // Helper for generating a dynamic type check.
1715   // Smashes rscratch1, rscratch2.
1716   void generate_type_check(Register sub_klass,
1717                            Register super_check_offset,
1718                            Register super_klass,
1719                            Label& L_success) {
1720     assert_different_registers(sub_klass, super_check_offset, super_klass);
1721 
1722     BLOCK_COMMENT("type_check:");
1723 
1724     Label L_miss;
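    // The fast path handles an exact match and the primary-supertype /
    // cached-secondary-supertype cases; everything else falls through to the
    // slow path, which scans the secondary supers list.  Both branch to
    // L_success on a hit.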
1725 
1726     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1727                                      super_check_offset);
1728     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1729 
1730     // Fall through on failure!
1731     __ BIND(L_miss);
1732   }
1733 
1734   //
1735   //  Generate checkcasting array copy stub
1736   //
1737   //  Input:
1738   //    c_rarg0   - source array address
1739   //    c_rarg1   - destination array address
1740   //    c_rarg2   - element count, treated as ssize_t, can be zero
1741   //    c_rarg3   - size_t ckoff (super_check_offset)
1742   //    c_rarg4   - oop ckval (super_klass)
1743   //
1744   //  Output:
1745   //    r0 ==  0  -  success
1746   //    r0 == -1^K - failure, where K is partial transfer count
1747   //
1748   address generate_checkcast_copy(const char *name, address *entry,
1749                                   bool dest_uninitialized = false) {
1750 
1751     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1752 
1753     // Input registers (after setup_arg_regs)
1754     const Register from        = c_rarg0;   // source array address
1755     const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
1757     const Register ckoff       = c_rarg3;   // super_check_offset
1758     const Register ckval       = c_rarg4;   // super_klass
1759 
1760     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1761     RegSet wb_post_saved_regs = RegSet::of(count);
1762 
1763     // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // orig elements count
1765     const Register start_to    = r20;       // destination array start address
1766     const Register copied_oop  = r18;       // actual oop copied
1767     const Register r19_klass   = r19;       // oop._klass
1768 
1769     //---------------------------------------------------------------
1770     // Assembler stub will be used for this call to arraycopy
1771     // if the two arrays are subtypes of Object[] but the
1772     // destination array type is not equal to or a supertype
1773     // of the source type.  Each element must be separately
1774     // checked.
1775 
1776     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1777                                copied_oop, r19_klass, count_save);
1778 
1779     __ align(CodeEntryAlignment);
1780     StubCodeMark mark(this, "StubRoutines", name);
1781     address start = __ pc();
1782 
1783     __ enter(); // required for proper stackwalking of RuntimeStub frame
1784 
1785 #ifdef ASSERT
1786     // caller guarantees that the arrays really are different
1787     // otherwise, we would have to make conjoint checks
1788     { Label L;
1789       array_overlap_test(L, TIMES_OOP);
1790       __ stop("checkcast_copy within a single array");
1791       __ bind(L);
1792     }
1793 #endif //ASSERT
1794 
1795     // Caller of this entry point must set up the argument registers.
1796     if (entry != NULL) {
1797       *entry = __ pc();
1798       BLOCK_COMMENT("Entry:");
1799     }
1800 
1801      // Empty array:  Nothing to do.
1802     __ cbz(count, L_done);
1803 
1804     __ push(RegSet::of(r18, r19, r20, r21), sp);
1805 
1806 #ifdef ASSERT
1807     BLOCK_COMMENT("assert consistent ckoff/ckval");
1808     // The ckoff and ckval must be mutually consistent,
1809     // even though caller generates both.
1810     { Label L;
1811       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1812       __ ldrw(start_to, Address(ckval, sco_offset));
1813       __ cmpw(ckoff, start_to);
1814       __ br(Assembler::EQ, L);
1815       __ stop("super_check_offset inconsistent");
1816       __ bind(L);
1817     }
1818 #endif //ASSERT
1819 
1820     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1821     bool is_oop = true;
1822     if (dest_uninitialized) {
1823       decorators |= IS_DEST_UNINITIALIZED;
1824     }
1825 
1826     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1827     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1828 
1829     // save the original count
1830     __ mov(count_save, count);
1831 
1832     // Copy from low to high addresses
1833     __ mov(start_to, to);              // Save destination array start address
1834     __ b(L_load_element);
1835 
1836     // ======== begin loop ========
1837     // (Loop is rotated; its entry is L_load_element.)
1838     // Loop control:
1839     //   for (; count != 0; count--) {
1840     //     copied_oop = load_heap_oop(from++);
1841     //     ... generate_type_check ...;
1842     //     store_heap_oop(to++, copied_oop);
1843     //   }
1844     __ align(OptoLoopAlignment);
1845 
1846     __ BIND(L_store_element);
1847     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, noreg, AS_RAW);  // store the oop 
1848     __ sub(count, count, 1);
1849     __ cbz(count, L_do_card_marks);
1850 
1851     // ======== loop entry is here ========
1852     __ BIND(L_load_element);
1853     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1854     __ cbz(copied_oop, L_store_element);
1855 
1856     __ load_klass(r19_klass, copied_oop);// query the object klass
1857     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1858     // ======== end loop ========
1859 
1860     // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_save = total oops.
1862     // Emit GC store barriers for the oops we have copied and report
1863     // their number to the caller.
1864 
1865     __ subs(count, count_save, count);     // K = partially copied oop count
1866     __ eon(count, count, zr);                   // report (-1^K) to caller
1867     __ br(Assembler::EQ, L_done_pop);
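    // The flags are still those of the subs above: EQ means K == 0, i.e. no
    // oops were copied, so the card-marking epilogue can be skipped.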
1868 
1869     __ BIND(L_do_card_marks);
1870     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1871 
1872     __ bind(L_done_pop);
1873     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1874     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1875 
1876     __ bind(L_done);
1877     __ mov(r0, count);
1878     __ leave();
1879     __ ret(lr);
1880 
1881     return start;
1882   }
1883 
1884   // Perform range checks on the proposed arraycopy.
1885   // Kills temp, but nothing else.
1886   // Also, clean the sign bits of src_pos and dst_pos.
1887   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1888                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
1890                               Register dst_pos, // destination position (c_rarg3)
1891                               Register length,
1892                               Register temp,
1893                               Label& L_failed) {
1894     BLOCK_COMMENT("arraycopy_range_checks:");
1895 
1896     assert_different_registers(rscratch1, temp);
1897 
1898     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1899     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1900     __ addw(temp, length, src_pos);
1901     __ cmpw(temp, rscratch1);
1902     __ br(Assembler::HI, L_failed);
1903 
1904     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1905     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1906     __ addw(temp, length, dst_pos);
1907     __ cmpw(temp, rscratch1);
1908     __ br(Assembler::HI, L_failed);
1909 
1910     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1911     __ movw(src_pos, src_pos);
1912     __ movw(dst_pos, dst_pos);
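    // (A 32-bit register write zero-extends into the upper 32 bits on AArch64.)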
1913 
1914     BLOCK_COMMENT("arraycopy_range_checks done");
1915   }
1916 
1917   // These stubs get called from some dumb test routine.
1918   // I'll write them properly when they're called from
1919   // something that's actually doing something.
1920   static void fake_arraycopy_stub(address src, address dst, int count) {
1921     assert(count == 0, "huh?");
1922   }
1923 
1924 
1925   //
1926   //  Generate 'unsafe' array copy stub
1927   //  Though just as safe as the other stubs, it takes an unscaled
1928   //  size_t argument instead of an element count.
1929   //
1930   //  Input:
1931   //    c_rarg0   - source array address
1932   //    c_rarg1   - destination array address
1933   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1934   //
1935   // Examines the alignment of the operands and dispatches
1936   // to a long, int, short, or byte copy loop.
1937   //
1938   address generate_unsafe_copy(const char *name,
1939                                address byte_copy_entry,
1940                                address short_copy_entry,
1941                                address int_copy_entry,
1942                                address long_copy_entry) {
1943     Label L_long_aligned, L_int_aligned, L_short_aligned;
1944     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1945 
1946     __ align(CodeEntryAlignment);
1947     StubCodeMark mark(this, "StubRoutines", name);
1948     address start = __ pc();
1949     __ enter(); // required for proper stackwalking of RuntimeStub frame
1950 
1951     // bump this on entry, not on exit:
1952     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1953 
1954     __ orr(rscratch1, s, d);
1955     __ orr(rscratch1, rscratch1, count);
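    // The union of the low bits of s, d and count determines the widest
    // element size that keeps every access aligned: try 8-byte alignment
    // first, then 4-byte, then 2-byte, and fall back to a byte copy.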
1956 
1957     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1958     __ cbz(rscratch1, L_long_aligned);
1959     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1960     __ cbz(rscratch1, L_int_aligned);
1961     __ tbz(rscratch1, 0, L_short_aligned);
1962     __ b(RuntimeAddress(byte_copy_entry));
1963 
1964     __ BIND(L_short_aligned);
1965     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1966     __ b(RuntimeAddress(short_copy_entry));
1967     __ BIND(L_int_aligned);
1968     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1969     __ b(RuntimeAddress(int_copy_entry));
1970     __ BIND(L_long_aligned);
1971     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1972     __ b(RuntimeAddress(long_copy_entry));
1973 
1974     return start;
1975   }
1976 
1977   //
1978   //  Generate generic array copy stubs
1979   //
1980   //  Input:
1981   //    c_rarg0    -  src oop
1982   //    c_rarg1    -  src_pos (32-bits)
1983   //    c_rarg2    -  dst oop
1984   //    c_rarg3    -  dst_pos (32-bits)
1985   //    c_rarg4    -  element count (32-bits)
1986   //
1987   //  Output:
1988   //    r0 ==  0  -  success
1989   //    r0 == -1^K - failure, where K is partial transfer count
1990   //
1991   address generate_generic_copy(const char *name,
1992                                 address byte_copy_entry, address short_copy_entry,
1993                                 address int_copy_entry, address oop_copy_entry,
1994                                 address long_copy_entry, address checkcast_copy_entry) {
1995 
1996     Label L_failed, L_objArray;
1997     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1998 
1999     // Input registers
2000     const Register src        = c_rarg0;  // source array oop
2001     const Register src_pos    = c_rarg1;  // source position
2002     const Register dst        = c_rarg2;  // destination array oop
2003     const Register dst_pos    = c_rarg3;  // destination position
2004     const Register length     = c_rarg4;
2005 
2006 
2007     // Registers used as temps
2008     const Register dst_klass  = c_rarg5;
2009 
2010     __ align(CodeEntryAlignment);
2011 
2012     StubCodeMark mark(this, "StubRoutines", name);
2013 
2014     address start = __ pc();
2015 
2016     __ enter(); // required for proper stackwalking of RuntimeStub frame
2017 
2018     // bump this on entry, not on exit:
2019     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2020 
2021     //-----------------------------------------------------------------------
2022     // Assembler stub will be used for this call to arraycopy
2023     // if the following conditions are met:
2024     //
2025     // (1) src and dst must not be null.
2026     // (2) src_pos must not be negative.
2027     // (3) dst_pos must not be negative.
2028     // (4) length  must not be negative.
2029     // (5) src klass and dst klass should be the same and not NULL.
2030     // (6) src and dst should be arrays.
2031     // (7) src_pos + length must not exceed length of src.
2032     // (8) dst_pos + length must not exceed length of dst.
2033     //
2034 
2035     //  if (src == NULL) return -1;
2036     __ cbz(src, L_failed);
2037 
2038     //  if (src_pos < 0) return -1;
2039     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2040 
2041     //  if (dst == NULL) return -1;
2042     __ cbz(dst, L_failed);
2043 
2044     //  if (dst_pos < 0) return -1;
2045     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2046 
2047     // registers used as temp
2048     const Register scratch_length    = r16; // elements count to copy
2049     const Register scratch_src_klass = r17; // array klass
2050     const Register lh                = r18; // layout helper
2051 
2052     //  if (length < 0) return -1;
2053     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2054     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2055 
2056     __ load_klass(scratch_src_klass, src);
2057 #ifdef ASSERT
2058     //  assert(src->klass() != NULL);
2059     {
2060       BLOCK_COMMENT("assert klasses not null {");
2061       Label L1, L2;
2062       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2063       __ bind(L1);
2064       __ stop("broken null klass");
2065       __ bind(L2);
2066       __ load_klass(rscratch1, dst);
2067       __ cbz(rscratch1, L1);     // this would be broken also
2068       BLOCK_COMMENT("} assert klasses not null done");
2069     }
2070 #endif
2071 
2072     // Load layout helper (32-bits)
2073     //
2074     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2075     // 32        30    24            16              8     2                 0
2076     //
2077     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2078     //
2079 
2080     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2081 
2082     // Handle objArrays completely differently...
2083     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2084     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2085     __ movw(rscratch1, objArray_lh);
2086     __ eorw(rscratch2, lh, rscratch1);
2087     __ cbzw(rscratch2, L_objArray);
2088 
2089     //  if (src->klass() != dst->klass()) return -1;
2090     __ load_klass(rscratch2, dst);
2091     __ eor(rscratch2, rscratch2, scratch_src_klass);
2092     __ cbnz(rscratch2, L_failed);
2093 
2094     //  if (!src->is_Array()) return -1;
2095     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
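    // Klass::layout_helper is negative for arrays (the array tag occupies the
    // top bits), so a non-negative value means 'src' is not an array at all.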
2096 
2097     // At this point, it is known to be a typeArray (array_tag 0x3).
2098 #ifdef ASSERT
2099     {
2100       BLOCK_COMMENT("assert primitive array {");
2101       Label L;
2102       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2103       __ cmpw(lh, rscratch2);
2104       __ br(Assembler::GE, L);
2105       __ stop("must be a primitive array");
2106       __ bind(L);
2107       BLOCK_COMMENT("} assert primitive array done");
2108     }
2109 #endif
2110 
2111     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2112                            rscratch2, L_failed);
2113 
2114     // TypeArrayKlass
2115     //
2116     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2117     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2118     //
2119 
2120     const Register rscratch1_offset = rscratch1;    // array offset
2121     const Register r18_elsize = lh; // element size
2122 
2123     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2124            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2125     __ add(src, src, rscratch1_offset);           // src array offset
2126     __ add(dst, dst, rscratch1_offset);           // dst array offset
2127     BLOCK_COMMENT("choose copy loop based on element size");
2128 
2129     // next registers should be set before the jump to corresponding stub
2130     const Register from     = c_rarg0;  // source array address
2131     const Register to       = c_rarg1;  // destination array address
2132     const Register count    = c_rarg2;  // elements count
2133 
    // The 'from', 'to' and 'count' registers must be written in this order,
    // since they alias 'src', 'src_pos' and 'dst' (c_rarg0..c_rarg2): each
    // write clobbers the corresponding input.
2136 
2137     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2138 
2139     // The possible values of elsize are 0-3, i.e. exact_log2(element
2140     // size in bytes).  We do a simple bitwise binary search.
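    //   elsize 0 => jbyte copy, 1 => jshort copy, 2 => jint copy, 3 => jlong copy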
2141   __ BIND(L_copy_bytes);
2142     __ tbnz(r18_elsize, 1, L_copy_ints);
2143     __ tbnz(r18_elsize, 0, L_copy_shorts);
2144     __ lea(from, Address(src, src_pos));// src_addr
2145     __ lea(to,   Address(dst, dst_pos));// dst_addr
2146     __ movw(count, scratch_length); // length
2147     __ b(RuntimeAddress(byte_copy_entry));
2148 
2149   __ BIND(L_copy_shorts);
2150     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2151     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2152     __ movw(count, scratch_length); // length
2153     __ b(RuntimeAddress(short_copy_entry));
2154 
2155   __ BIND(L_copy_ints);
2156     __ tbnz(r18_elsize, 0, L_copy_longs);
2157     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2158     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2159     __ movw(count, scratch_length); // length
2160     __ b(RuntimeAddress(int_copy_entry));
2161 
2162   __ BIND(L_copy_longs);
2163 #ifdef ASSERT
2164     {
2165       BLOCK_COMMENT("assert long copy {");
2166       Label L;
2167       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2168       __ cmpw(r18_elsize, LogBytesPerLong);
2169       __ br(Assembler::EQ, L);
2170       __ stop("must be long copy, but elsize is wrong");
2171       __ bind(L);
2172       BLOCK_COMMENT("} assert long copy done");
2173     }
2174 #endif
2175     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2176     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2177     __ movw(count, scratch_length); // length
2178     __ b(RuntimeAddress(long_copy_entry));
2179 
2180     // ObjArrayKlass
2181   __ BIND(L_objArray);
2182     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2183 
2184     Label L_plain_copy, L_checkcast_copy;
2185     //  test array classes for subtyping
2186     __ load_klass(r18, dst);
2187     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2188     __ br(Assembler::NE, L_checkcast_copy);
2189 
2190     // Identically typed arrays can be copied without element-wise checks.
2191     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2192                            rscratch2, L_failed);
2193 
2194     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2195     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2196     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2197     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2198     __ movw(count, scratch_length); // length
2199   __ BIND(L_plain_copy);
2200     __ b(RuntimeAddress(oop_copy_entry));
2201 
2202   __ BIND(L_checkcast_copy);
2203     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2204     {
2205       // Before looking at dst.length, make sure dst is also an objArray.
2206       __ ldrw(rscratch1, Address(r18, lh_offset));
2207       __ movw(rscratch2, objArray_lh);
2208       __ eorw(rscratch1, rscratch1, rscratch2);
2209       __ cbnzw(rscratch1, L_failed);
2210 
2211       // It is safe to examine both src.length and dst.length.
2212       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2213                              r18, L_failed);
2214 
2215       __ load_klass(dst_klass, dst); // reload
2216 
2217       // Marshal the base address arguments now, freeing registers.
2218       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2219       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2220       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2221       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2222       __ movw(count, length);           // length (reloaded)
2223       Register sco_temp = c_rarg3;      // this register is free now
2224       assert_different_registers(from, to, count, sco_temp,
2225                                  dst_klass, scratch_src_klass);
2226       // assert_clean_int(count, sco_temp);
2227 
2228       // Generate the type check.
2229       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2230       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2231 
2232       // Smashes rscratch1, rscratch2
2233       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2234 
2235       // Fetch destination element klass from the ObjArrayKlass header.
2236       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2237       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2238       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2239 
2240       // the checkcast_copy loop needs two extra arguments:
2241       assert(c_rarg3 == sco_temp, "#3 already in place");
2242       // Set up arguments for checkcast_copy_entry.
2243       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2244       __ b(RuntimeAddress(checkcast_copy_entry));
2245     }
2246 
2247   __ BIND(L_failed);
2248     __ mov(r0, -1);
2249     __ leave();   // required for proper stackwalking of RuntimeStub frame
2250     __ ret(lr);
2251 
2252     return start;
2253   }
2254 
2255   //
2256   // Generate stub for array fill. If "aligned" is true, the
2257   // "to" address is assumed to be heapword aligned.
2258   //
2259   // Arguments for generated stub:
2260   //   to:    c_rarg0
2261   //   value: c_rarg1
2262   //   count: c_rarg2 treated as signed
2263   //
2264   address generate_fill(BasicType t, bool aligned, const char *name) {
2265     __ align(CodeEntryAlignment);
2266     StubCodeMark mark(this, "StubRoutines", name);
2267     address start = __ pc();
2268 
2269     BLOCK_COMMENT("Entry:");
2270 
    const Register to        = c_rarg0;  // destination array address
2272     const Register value     = c_rarg1;  // value
2273     const Register count     = c_rarg2;  // elements count
2274 
2275     const Register bz_base = r10;        // base for block_zero routine
2276     const Register cnt_words = r11;      // temp register
2277 
2278     __ enter();
2279 
2280     Label L_fill_elements, L_exit1;
2281 
2282     int shift = -1;
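    // Replicate the fill value so that a single register write covers several
    // elements: bytes are widened to 16 and then 32 bits, shorts to 32 bits
    // (and everything to 64 bits just before the bulk fill below).
    // 'shift' is log2 of the element size.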
2283     switch (t) {
2284       case T_BYTE:
2285         shift = 0;
2286         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2287         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2288         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2289         __ br(Assembler::LO, L_fill_elements);
2290         break;
2291       case T_SHORT:
2292         shift = 1;
2293         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2294         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2295         __ br(Assembler::LO, L_fill_elements);
2296         break;
2297       case T_INT:
2298         shift = 2;
2299         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2300         __ br(Assembler::LO, L_fill_elements);
2301         break;
2302       default: ShouldNotReachHere();
2303     }
2304 
2305     // Align source address at 8 bytes address boundary.
2306     Label L_skip_align1, L_skip_align2, L_skip_align4;
2307     if (!aligned) {
2308       switch (t) {
2309         case T_BYTE:
2310           // One byte misalignment happens only for byte arrays.
2311           __ tbz(to, 0, L_skip_align1);
2312           __ strb(value, Address(__ post(to, 1)));
2313           __ subw(count, count, 1);
2314           __ bind(L_skip_align1);
2315           // Fallthrough
2316         case T_SHORT:
2317           // Two bytes misalignment happens only for byte and short (char) arrays.
2318           __ tbz(to, 1, L_skip_align2);
2319           __ strh(value, Address(__ post(to, 2)));
2320           __ subw(count, count, 2 >> shift);
2321           __ bind(L_skip_align2);
2322           // Fallthrough
2323         case T_INT:
2324           // Align to 8 bytes, we know we are 4 byte aligned to start.
2325           __ tbz(to, 2, L_skip_align4);
2326           __ strw(value, Address(__ post(to, 4)));
2327           __ subw(count, count, 4 >> shift);
2328           __ bind(L_skip_align4);
2329           break;
2330         default: ShouldNotReachHere();
2331       }
2332     }
2333 
2334     //
2335     //  Fill large chunks
2336     //
2337     __ lsrw(cnt_words, count, 3 - shift); // number of words
2338     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2339     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
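    // cnt_words is the number of whole 8-byte words to fill; count is reduced
    // to the remaining element count (strictly less than one word's worth).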
2340     if (UseBlockZeroing) {
2341       Label non_block_zeroing, rest;
2342       // If the fill value is zero we can use the fast zero_words().
2343       __ cbnz(value, non_block_zeroing);
2344       __ mov(bz_base, to);
2345       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2346       __ zero_words(bz_base, cnt_words);
2347       __ b(rest);
2348       __ bind(non_block_zeroing);
2349       __ fill_words(to, cnt_words, value);
2350       __ bind(rest);
2351     } else {
2352       __ fill_words(to, cnt_words, value);
2353     }
2354 
2355     // Remaining count is less than 8 bytes. Fill it by a single store.
2356     // Note that the total length is no less than 8 bytes.
2357     if (t == T_BYTE || t == T_SHORT) {
2358       Label L_exit1;
2359       __ cbzw(count, L_exit1);
2360       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2361       __ str(value, Address(to, -8));    // overwrite some elements
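      // The store ends exactly at the last element of the array; any bytes it
      // rewrites were already filled with the same value, so this is safe.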
2362       __ bind(L_exit1);
2363       __ leave();
2364       __ ret(lr);
2365     }
2366 
2367     // Handle copies less than 8 bytes.
2368     Label L_fill_2, L_fill_4, L_exit2;
2369     __ bind(L_fill_elements);
2370     switch (t) {
2371       case T_BYTE:
2372         __ tbz(count, 0, L_fill_2);
2373         __ strb(value, Address(__ post(to, 1)));
2374         __ bind(L_fill_2);
2375         __ tbz(count, 1, L_fill_4);
2376         __ strh(value, Address(__ post(to, 2)));
2377         __ bind(L_fill_4);
2378         __ tbz(count, 2, L_exit2);
2379         __ strw(value, Address(to));
2380         break;
2381       case T_SHORT:
2382         __ tbz(count, 0, L_fill_4);
2383         __ strh(value, Address(__ post(to, 2)));
2384         __ bind(L_fill_4);
2385         __ tbz(count, 1, L_exit2);
2386         __ strw(value, Address(to));
2387         break;
2388       case T_INT:
2389         __ cbzw(count, L_exit2);
2390         __ strw(value, Address(to));
2391         break;
2392       default: ShouldNotReachHere();
2393     }
2394     __ bind(L_exit2);
2395     __ leave();
2396     __ ret(lr);
2397     return start;
2398   }
2399 
2400   void generate_arraycopy_stubs() {
2401     address entry;
2402     address entry_jbyte_arraycopy;
2403     address entry_jshort_arraycopy;
2404     address entry_jint_arraycopy;
2405     address entry_oop_arraycopy;
2406     address entry_jlong_arraycopy;
2407     address entry_checkcast_arraycopy;
2408 
2409     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2410     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2411 
2412     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2413 
2414     //*** jbyte
2415     // Always need aligned and unaligned versions
2416     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2417                                                                                   "jbyte_disjoint_arraycopy");
2418     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2419                                                                                   &entry_jbyte_arraycopy,
2420                                                                                   "jbyte_arraycopy");
2421     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2422                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2423     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2424                                                                                   "arrayof_jbyte_arraycopy");
2425 
2426     //*** jshort
2427     // Always need aligned and unaligned versions
2428     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2429                                                                                     "jshort_disjoint_arraycopy");
2430     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2431                                                                                     &entry_jshort_arraycopy,
2432                                                                                     "jshort_arraycopy");
2433     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2434                                                                                     "arrayof_jshort_disjoint_arraycopy");
2435     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2436                                                                                     "arrayof_jshort_arraycopy");
2437 
2438     //*** jint
2439     // Aligned versions
2440     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2441                                                                                 "arrayof_jint_disjoint_arraycopy");
2442     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2443                                                                                 "arrayof_jint_arraycopy");
2444     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2445     // entry_jint_arraycopy always points to the unaligned version
2446     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2447                                                                                 "jint_disjoint_arraycopy");
2448     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2449                                                                                 &entry_jint_arraycopy,
2450                                                                                 "jint_arraycopy");
2451 
2452     //*** jlong
2453     // It is always aligned
2454     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2455                                                                                   "arrayof_jlong_disjoint_arraycopy");
2456     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2457                                                                                   "arrayof_jlong_arraycopy");
2458     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2459     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2460 
2461     //*** oops
2462     {
2463       // With compressed oops we need unaligned versions; notice that
2464       // we overwrite entry_oop_arraycopy.
2465       bool aligned = !UseCompressedOops;
2466 
2467       StubRoutines::_arrayof_oop_disjoint_arraycopy
2468         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2469                                      /*dest_uninitialized*/false);
2470       StubRoutines::_arrayof_oop_arraycopy
2471         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2472                                      /*dest_uninitialized*/false);
2473       // Aligned versions without pre-barriers
2474       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2475         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2476                                      /*dest_uninitialized*/true);
2477       StubRoutines::_arrayof_oop_arraycopy_uninit
2478         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2479                                      /*dest_uninitialized*/true);
2480     }
2481 
2482     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2483     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2484     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2485     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2486 
2487     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2488     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2489                                                                         /*dest_uninitialized*/true);
2490 
2491     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2492                                                               entry_jbyte_arraycopy,
2493                                                               entry_jshort_arraycopy,
2494                                                               entry_jint_arraycopy,
2495                                                               entry_jlong_arraycopy);
2496 
2497     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2498                                                                entry_jbyte_arraycopy,
2499                                                                entry_jshort_arraycopy,
2500                                                                entry_jint_arraycopy,
2501                                                                entry_oop_arraycopy,
2502                                                                entry_jlong_arraycopy,
2503                                                                entry_checkcast_arraycopy);
2504 
2505     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2506     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2507     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2508     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2509     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2510     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2511   }
2512 
2513   void generate_math_stubs() { Unimplemented(); }
2514 
2515   // Arguments:
2516   //
2517   // Inputs:
2518   //   c_rarg0   - source byte array address
2519   //   c_rarg1   - destination byte array address
2520   //   c_rarg2   - K (key) in little endian int array
2521   //
2522   address generate_aescrypt_encryptBlock() {
2523     __ align(CodeEntryAlignment);
2524     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2525 
2526     Label L_doLast;
2527 
2528     const Register from        = c_rarg0;  // source array address
2529     const Register to          = c_rarg1;  // destination array address
2530     const Register key         = c_rarg2;  // key array address
2531     const Register keylen      = rscratch1;
2532 
2533     address start = __ pc();
2534     __ enter();
2535 
2536     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
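    // keylen is the expanded key length in ints: 44, 52 or 60 for
    // AES-128, AES-192 and AES-256 (10, 12 or 14 rounds respectively).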
2537 
2538     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2539 
2540     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2541     __ rev32(v1, __ T16B, v1);
2542     __ rev32(v2, __ T16B, v2);
2543     __ rev32(v3, __ T16B, v3);
2544     __ rev32(v4, __ T16B, v4);
2545     __ aese(v0, v1);
2546     __ aesmc(v0, v0);
2547     __ aese(v0, v2);
2548     __ aesmc(v0, v0);
2549     __ aese(v0, v3);
2550     __ aesmc(v0, v0);
2551     __ aese(v0, v4);
2552     __ aesmc(v0, v0);
2553 
2554     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2555     __ rev32(v1, __ T16B, v1);
2556     __ rev32(v2, __ T16B, v2);
2557     __ rev32(v3, __ T16B, v3);
2558     __ rev32(v4, __ T16B, v4);
2559     __ aese(v0, v1);
2560     __ aesmc(v0, v0);
2561     __ aese(v0, v2);
2562     __ aesmc(v0, v0);
2563     __ aese(v0, v3);
2564     __ aesmc(v0, v0);
2565     __ aese(v0, v4);
2566     __ aesmc(v0, v0);
2567 
2568     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2569     __ rev32(v1, __ T16B, v1);
2570     __ rev32(v2, __ T16B, v2);
2571 
2572     __ cmpw(keylen, 44);
2573     __ br(Assembler::EQ, L_doLast);
2574 
2575     __ aese(v0, v1);
2576     __ aesmc(v0, v0);
2577     __ aese(v0, v2);
2578     __ aesmc(v0, v0);
2579 
2580     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2581     __ rev32(v1, __ T16B, v1);
2582     __ rev32(v2, __ T16B, v2);
2583 
2584     __ cmpw(keylen, 52);
2585     __ br(Assembler::EQ, L_doLast);
2586 
2587     __ aese(v0, v1);
2588     __ aesmc(v0, v0);
2589     __ aese(v0, v2);
2590     __ aesmc(v0, v0);
2591 
2592     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2593     __ rev32(v1, __ T16B, v1);
2594     __ rev32(v2, __ T16B, v2);
2595 
2596     __ BIND(L_doLast);
2597 
2598     __ aese(v0, v1);
2599     __ aesmc(v0, v0);
2600     __ aese(v0, v2);
2601 
2602     __ ld1(v1, __ T16B, key);
2603     __ rev32(v1, __ T16B, v1);
2604     __ eor(v0, __ T16B, v0, v1);
2605 
2606     __ st1(v0, __ T16B, to);
2607 
2608     __ mov(r0, 0);
2609 
2610     __ leave();
2611     __ ret(lr);
2612 
2613     return start;
2614   }
2615 
2616   // Arguments:
2617   //
2618   // Inputs:
2619   //   c_rarg0   - source byte array address
2620   //   c_rarg1   - destination byte array address
2621   //   c_rarg2   - K (key) in little endian int array
2622   //
2623   address generate_aescrypt_decryptBlock() {
2624     assert(UseAES, "need AES instructions and misaligned SSE support");
2625     __ align(CodeEntryAlignment);
2626     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2627     Label L_doLast;
2628 
2629     const Register from        = c_rarg0;  // source array address
2630     const Register to          = c_rarg1;  // destination array address
2631     const Register key         = c_rarg2;  // key array address
2632     const Register keylen      = rscratch1;
2633 
2634     address start = __ pc();
2635     __ enter(); // required for proper stackwalking of RuntimeStub frame
2636 
2637     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2638 
2639     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2640 
2641     __ ld1(v5, __ T16B, __ post(key, 16));
2642     __ rev32(v5, __ T16B, v5);
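    // v5 is the round key that is XOR-ed in as the final step (see the eor
    // after L_doLast).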
2643 
2644     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2645     __ rev32(v1, __ T16B, v1);
2646     __ rev32(v2, __ T16B, v2);
2647     __ rev32(v3, __ T16B, v3);
2648     __ rev32(v4, __ T16B, v4);
2649     __ aesd(v0, v1);
2650     __ aesimc(v0, v0);
2651     __ aesd(v0, v2);
2652     __ aesimc(v0, v0);
2653     __ aesd(v0, v3);
2654     __ aesimc(v0, v0);
2655     __ aesd(v0, v4);
2656     __ aesimc(v0, v0);
2657 
2658     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2659     __ rev32(v1, __ T16B, v1);
2660     __ rev32(v2, __ T16B, v2);
2661     __ rev32(v3, __ T16B, v3);
2662     __ rev32(v4, __ T16B, v4);
2663     __ aesd(v0, v1);
2664     __ aesimc(v0, v0);
2665     __ aesd(v0, v2);
2666     __ aesimc(v0, v0);
2667     __ aesd(v0, v3);
2668     __ aesimc(v0, v0);
2669     __ aesd(v0, v4);
2670     __ aesimc(v0, v0);
2671 
2672     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2673     __ rev32(v1, __ T16B, v1);
2674     __ rev32(v2, __ T16B, v2);
2675 
2676     __ cmpw(keylen, 44);
2677     __ br(Assembler::EQ, L_doLast);
2678 
2679     __ aesd(v0, v1);
2680     __ aesimc(v0, v0);
2681     __ aesd(v0, v2);
2682     __ aesimc(v0, v0);
2683 
2684     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2685     __ rev32(v1, __ T16B, v1);
2686     __ rev32(v2, __ T16B, v2);
2687 
2688     __ cmpw(keylen, 52);
2689     __ br(Assembler::EQ, L_doLast);
2690 
2691     __ aesd(v0, v1);
2692     __ aesimc(v0, v0);
2693     __ aesd(v0, v2);
2694     __ aesimc(v0, v0);
2695 
2696     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2697     __ rev32(v1, __ T16B, v1);
2698     __ rev32(v2, __ T16B, v2);
2699 
2700     __ BIND(L_doLast);
2701 
2702     __ aesd(v0, v1);
2703     __ aesimc(v0, v0);
2704     __ aesd(v0, v2);
2705 
2706     __ eor(v0, __ T16B, v0, v5);
2707 
2708     __ st1(v0, __ T16B, to);
2709 
2710     __ mov(r0, 0);
2711 
2712     __ leave();
2713     __ ret(lr);
2714 
2715     return start;
2716   }
2717 
2718   // Arguments:
2719   //
2720   // Inputs:
2721   //   c_rarg0   - source byte array address
2722   //   c_rarg1   - destination byte array address
2723   //   c_rarg2   - K (key) in little endian int array
2724   //   c_rarg3   - r vector byte array address
2725   //   c_rarg4   - input length
2726   //
2727   // Output:
2728   //   x0        - input length
2729   //
2730   address generate_cipherBlockChaining_encryptAESCrypt() {
2731     assert(UseAES, "need AES instructions and misaligned SSE support");
2732     __ align(CodeEntryAlignment);
2733     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2734 
2735     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2736 
2737     const Register from        = c_rarg0;  // source array address
2738     const Register to          = c_rarg1;  // destination array address
2739     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector (IV);
                                           // on exit it holds the last ciphertext block
2742     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2743     const Register keylen      = rscratch1;
2744 
2745     address start = __ pc();
2746 
2747       __ enter();
2748 
2749       __ movw(rscratch2, len_reg);
2750 
2751       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2752 
2753       __ ld1(v0, __ T16B, rvec);
2754 
2755       __ cmpw(keylen, 52);
2756       __ br(Assembler::CC, L_loadkeys_44);
2757       __ br(Assembler::EQ, L_loadkeys_52);
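      // keylen < 52  => AES-128: skip the first four round keys (v17..v20);
      // keylen == 52 => AES-192: skip the first two (v17, v18);
      // otherwise       AES-256: load them all.  Nothing in L_aes_loop sets
      // the condition flags, so the same comparison steers the round count
      // inside the loop.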
2758 
2759       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2760       __ rev32(v17, __ T16B, v17);
2761       __ rev32(v18, __ T16B, v18);
2762     __ BIND(L_loadkeys_52);
2763       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2764       __ rev32(v19, __ T16B, v19);
2765       __ rev32(v20, __ T16B, v20);
2766     __ BIND(L_loadkeys_44);
2767       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2768       __ rev32(v21, __ T16B, v21);
2769       __ rev32(v22, __ T16B, v22);
2770       __ rev32(v23, __ T16B, v23);
2771       __ rev32(v24, __ T16B, v24);
2772       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2773       __ rev32(v25, __ T16B, v25);
2774       __ rev32(v26, __ T16B, v26);
2775       __ rev32(v27, __ T16B, v27);
2776       __ rev32(v28, __ T16B, v28);
2777       __ ld1(v29, v30, v31, __ T16B, key);
2778       __ rev32(v29, __ T16B, v29);
2779       __ rev32(v30, __ T16B, v30);
2780       __ rev32(v31, __ T16B, v31);
2781 
2782     __ BIND(L_aes_loop);
2783       __ ld1(v1, __ T16B, __ post(from, 16));
2784       __ eor(v0, __ T16B, v0, v1);
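      // CBC: XOR the plaintext block with the previous ciphertext block
      // (v0 initially holds the IV loaded from rvec) before encrypting it.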
2785 
2786       __ br(Assembler::CC, L_rounds_44);
2787       __ br(Assembler::EQ, L_rounds_52);
2788 
2789       __ aese(v0, v17); __ aesmc(v0, v0);
2790       __ aese(v0, v18); __ aesmc(v0, v0);
2791     __ BIND(L_rounds_52);
2792       __ aese(v0, v19); __ aesmc(v0, v0);
2793       __ aese(v0, v20); __ aesmc(v0, v0);
2794     __ BIND(L_rounds_44);
2795       __ aese(v0, v21); __ aesmc(v0, v0);
2796       __ aese(v0, v22); __ aesmc(v0, v0);
2797       __ aese(v0, v23); __ aesmc(v0, v0);
2798       __ aese(v0, v24); __ aesmc(v0, v0);
2799       __ aese(v0, v25); __ aesmc(v0, v0);
2800       __ aese(v0, v26); __ aesmc(v0, v0);
2801       __ aese(v0, v27); __ aesmc(v0, v0);
2802       __ aese(v0, v28); __ aesmc(v0, v0);
2803       __ aese(v0, v29); __ aesmc(v0, v0);
2804       __ aese(v0, v30);
2805       __ eor(v0, __ T16B, v0, v31);
2806 
2807       __ st1(v0, __ T16B, __ post(to, 16));
2808 
2809       __ subw(len_reg, len_reg, 16);
2810       __ cbnzw(len_reg, L_aes_loop);
2811 
2812       __ st1(v0, __ T16B, rvec);
2813 
2814       __ mov(r0, rscratch2);
2815 
2816       __ leave();
2817       __ ret(lr);
2818 
2819       return start;
2820   }
2821 
2822   // Arguments:
2823   //
2824   // Inputs:
2825   //   c_rarg0   - source byte array address
2826   //   c_rarg1   - destination byte array address
2827   //   c_rarg2   - K (key) in little endian int array
2828   //   c_rarg3   - r vector byte array address
2829   //   c_rarg4   - input length
2830   //
2831   // Output:
2832   //   r0        - input length
2833   //
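       // For reference, the loop below implements standard CBC decryption. An
       // illustrative (not compiled) C-style sketch, where aes_decrypt_block()
       // is a hypothetical helper standing in for the aesd/aesimc round sequence:
       //
       //   uint8_t prev[16], pt[16];
       //   memcpy(prev, rvec, 16);                   // start from the IV
       //   for (int i = 0; i < len; i += 16) {
       //     aes_decrypt_block(from + i, key, pt);   // raw inverse AES on the cipher block
       //     for (int j = 0; j < 16; j++) to[i + j] = pt[j] ^ prev[j];
       //     memcpy(prev, from + i, 16);             // chain on the cipher text (kept in v1/v2 below)
       //   }
       //   memcpy(rvec, prev, 16);                   // last cipher block left in rvec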
2834   address generate_cipherBlockChaining_decryptAESCrypt() {
2835     assert(UseAES, "need AES cryptographic extension support");
2836     __ align(CodeEntryAlignment);
2837     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2838 
2839     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2840 
2841     const Register from        = c_rarg0;  // source array address
2842     const Register to          = c_rarg1;  // destination array address
2843     const Register key         = c_rarg2;  // key array address
2844     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address;
2845                                            // on exit it holds the last input (cipher text) block, the IV for a subsequent call
2846     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2847     const Register keylen      = rscratch1;
2848 
2849     address start = __ pc();
2850 
2851       __ enter();
2852 
2853       __ movw(rscratch2, len_reg);
2854 
2855       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2856 
2857       __ ld1(v2, __ T16B, rvec);
2858 
2859       __ ld1(v31, __ T16B, __ post(key, 16));
2860       __ rev32(v31, __ T16B, v31);
2861 
2862       __ cmpw(keylen, 52);
2863       __ br(Assembler::CC, L_loadkeys_44);
2864       __ br(Assembler::EQ, L_loadkeys_52);
2865 
2866       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2867       __ rev32(v17, __ T16B, v17);
2868       __ rev32(v18, __ T16B, v18);
2869     __ BIND(L_loadkeys_52);
2870       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2871       __ rev32(v19, __ T16B, v19);
2872       __ rev32(v20, __ T16B, v20);
2873     __ BIND(L_loadkeys_44);
2874       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2875       __ rev32(v21, __ T16B, v21);
2876       __ rev32(v22, __ T16B, v22);
2877       __ rev32(v23, __ T16B, v23);
2878       __ rev32(v24, __ T16B, v24);
2879       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2880       __ rev32(v25, __ T16B, v25);
2881       __ rev32(v26, __ T16B, v26);
2882       __ rev32(v27, __ T16B, v27);
2883       __ rev32(v28, __ T16B, v28);
2884       __ ld1(v29, v30, __ T16B, key);
2885       __ rev32(v29, __ T16B, v29);
2886       __ rev32(v30, __ T16B, v30);
2887 
2888     __ BIND(L_aes_loop);
2889       __ ld1(v0, __ T16B, __ post(from, 16));
2890       __ orr(v1, __ T16B, v0, v0);
2891 
2892       __ br(Assembler::CC, L_rounds_44);
2893       __ br(Assembler::EQ, L_rounds_52);
2894 
2895       __ aesd(v0, v17); __ aesimc(v0, v0);
2896       __ aesd(v0, v18); __ aesimc(v0, v0);
2897     __ BIND(L_rounds_52);
2898       __ aesd(v0, v19); __ aesimc(v0, v0);
2899       __ aesd(v0, v20); __ aesimc(v0, v0);
2900     __ BIND(L_rounds_44);
2901       __ aesd(v0, v21); __ aesimc(v0, v0);
2902       __ aesd(v0, v22); __ aesimc(v0, v0);
2903       __ aesd(v0, v23); __ aesimc(v0, v0);
2904       __ aesd(v0, v24); __ aesimc(v0, v0);
2905       __ aesd(v0, v25); __ aesimc(v0, v0);
2906       __ aesd(v0, v26); __ aesimc(v0, v0);
2907       __ aesd(v0, v27); __ aesimc(v0, v0);
2908       __ aesd(v0, v28); __ aesimc(v0, v0);
2909       __ aesd(v0, v29); __ aesimc(v0, v0);
2910       __ aesd(v0, v30);
2911       __ eor(v0, __ T16B, v0, v31);
2912       __ eor(v0, __ T16B, v0, v2);
2913 
2914       __ st1(v0, __ T16B, __ post(to, 16));
2915       __ orr(v2, __ T16B, v1, v1);
2916 
2917       __ subw(len_reg, len_reg, 16);
2918       __ cbnzw(len_reg, L_aes_loop);
2919 
2920       __ st1(v2, __ T16B, rvec);
2921 
2922       __ mov(r0, rscratch2);
2923 
2924       __ leave();
2925       __ ret(lr);
2926 
2927     return start;
2928   }
2929 
2930   // Arguments:
2931   //
2932   // Inputs:
2933   //   c_rarg0   - byte[]  source+offset
2934   //   c_rarg1   - int[]   SHA.state
2935   //   c_rarg2   - int     offset
2936   //   c_rarg3   - int     limit
2937   //
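       // A note on the structure below: each iteration of the 20-iteration round
       // loop processes four SHA-1 rounds (80 in total). The ARMv8 sha1c/sha1p/sha1m
       // instructions apply the Choose/Parity/Majority round functions, and the four
       // words emitted at the `keys` label are the standard SHA-1 round constants
       // K0..K3 (one per 20-round group), lane-replicated into v0..v3 by the ld4r below.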
2938   address generate_sha1_implCompress(bool multi_block, const char *name) {
2939     __ align(CodeEntryAlignment);
2940     StubCodeMark mark(this, "StubRoutines", name);
2941     address start = __ pc();
2942 
2943     Register buf   = c_rarg0;
2944     Register state = c_rarg1;
2945     Register ofs   = c_rarg2;
2946     Register limit = c_rarg3;
2947 
2948     Label keys;
2949     Label sha1_loop;
2950 
2951     // load the keys into v0..v3
2952     __ adr(rscratch1, keys);
2953     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2954     // load the 5-word (160-bit) SHA-1 state into v6, v7
2955     __ ldrq(v6, Address(state, 0));
2956     __ ldrs(v7, Address(state, 16));
2957 
2958 
2959     __ BIND(sha1_loop);
2960     // load 64 bytes of data into v16..v19
2961     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2962     __ rev32(v16, __ T16B, v16);
2963     __ rev32(v17, __ T16B, v17);
2964     __ rev32(v18, __ T16B, v18);
2965     __ rev32(v19, __ T16B, v19);
2966 
2967     // do the sha1
2968     __ addv(v4, __ T4S, v16, v0);
2969     __ orr(v20, __ T16B, v6, v6);
2970 
2971     FloatRegister d0 = v16;
2972     FloatRegister d1 = v17;
2973     FloatRegister d2 = v18;
2974     FloatRegister d3 = v19;
2975 
2976     for (int round = 0; round < 20; round++) {
2977       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2978       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2979       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2980       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2981       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2982 
2983       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2984       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2985       __ sha1h(tmp2, __ T4S, v20);
2986       if (round < 5)
2987         __ sha1c(v20, __ T4S, tmp3, tmp4);
2988       else if (round < 10 || round >= 15)
2989         __ sha1p(v20, __ T4S, tmp3, tmp4);
2990       else
2991         __ sha1m(v20, __ T4S, tmp3, tmp4);
2992       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2993 
2994       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2995     }
2996 
2997     __ addv(v7, __ T2S, v7, v21);
2998     __ addv(v6, __ T4S, v6, v20);
2999 
3000     if (multi_block) {
3001       __ add(ofs, ofs, 64);
3002       __ cmp(ofs, limit);
3003       __ br(Assembler::LE, sha1_loop);
3004       __ mov(c_rarg0, ofs); // return ofs
3005     }
3006 
3007     __ strq(v6, Address(state, 0));
3008     __ strs(v7, Address(state, 16));
3009 
3010     __ ret(lr);
3011 
3012     __ bind(keys);
3013     __ emit_int32(0x5a827999);
3014     __ emit_int32(0x6ed9eba1);
3015     __ emit_int32(0x8f1bbcdc);
3016     __ emit_int32(0xca62c1d6);
3017 
3018     return start;
3019   }
3020 
3021 
3022   // Arguments:
3023   //
3024   // Inputs:
3025   //   c_rarg0   - byte[]  source+offset
3026   //   c_rarg1   - int[]   SHA.state
3027   //   c_rarg2   - int     offset
3028   //   c_rarg3   - int     limit
3029   //
3030   address generate_sha256_implCompress(bool multi_block, const char *name) {
3031     static const uint32_t round_consts[64] = {
3032       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3033       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3034       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3035       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3036       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3037       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3038       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3039       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3040       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3041       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3042       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3043       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3044       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3045       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3046       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3047       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3048     };
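         // These are the standard SHA-256 round constants K[0..63] from FIPS 180-4
         // (the first 32 bits of the fractional parts of the cube roots of the first
         // 64 primes); they are loaded four-per-register into v16..v31 below.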
3049     __ align(CodeEntryAlignment);
3050     StubCodeMark mark(this, "StubRoutines", name);
3051     address start = __ pc();
3052 
3053     Register buf   = c_rarg0;
3054     Register state = c_rarg1;
3055     Register ofs   = c_rarg2;
3056     Register limit = c_rarg3;
3057 
3058     Label sha1_loop;
3059 
3060     __ stpd(v8, v9, __ pre(sp, -32));
3061     __ stpd(v10, v11, Address(sp, 16));
3062 
3063 // dga == v0
3064 // dgb == v1
3065 // dg0 == v2
3066 // dg1 == v3
3067 // dg2 == v4
3068 // t0 == v6
3069 // t1 == v7
3070 
3071     // load 16 keys to v16..v31
3072     __ lea(rscratch1, ExternalAddress((address)round_consts));
3073     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3074     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3075     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3076     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3077 
3078     // load the 8-word (256-bit) state
3079     __ ldpq(v0, v1, state);
3080 
3081     __ BIND(sha1_loop);
3082     // load 64 bytes of data into v8..v11
3083     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3084     __ rev32(v8, __ T16B, v8);
3085     __ rev32(v9, __ T16B, v9);
3086     __ rev32(v10, __ T16B, v10);
3087     __ rev32(v11, __ T16B, v11);
3088 
3089     __ addv(v6, __ T4S, v8, v16);
3090     __ orr(v2, __ T16B, v0, v0);
3091     __ orr(v3, __ T16B, v1, v1);
3092 
3093     FloatRegister d0 = v8;
3094     FloatRegister d1 = v9;
3095     FloatRegister d2 = v10;
3096     FloatRegister d3 = v11;
3097 
3098 
3099     for (int round = 0; round < 16; round++) {
3100       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3101       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3102       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3103       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3104 
3105       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3106        __ orr(v4, __ T16B, v2, v2);
3107       if (round < 15)
3108         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3109       __ sha256h(v2, __ T4S, v3, tmp2);
3110       __ sha256h2(v3, __ T4S, v4, tmp2);
3111       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3112 
3113       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3114     }
3115 
3116     __ addv(v0, __ T4S, v0, v2);
3117     __ addv(v1, __ T4S, v1, v3);
3118 
3119     if (multi_block) {
3120       __ add(ofs, ofs, 64);
3121       __ cmp(ofs, limit);
3122       __ br(Assembler::LE, sha1_loop);
3123       __ mov(c_rarg0, ofs); // return ofs
3124     }
3125 
3126     __ ldpd(v10, v11, Address(sp, 16));
3127     __ ldpd(v8, v9, __ post(sp, 32));
3128 
3129     __ stpq(v0, v1, state);
3130 
3131     __ ret(lr);
3132 
3133     return start;
3134   }
3135 
3136 #ifndef BUILTIN_SIM
3137   // Safefetch stubs.
3138   void generate_safefetch(const char* name, int size, address* entry,
3139                           address* fault_pc, address* continuation_pc) {
3140     // safefetch signatures:
3141     //   int      SafeFetch32(int*      adr, int      errValue);
3142     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3143     //
3144     // arguments:
3145     //   c_rarg0 = adr
3146     //   c_rarg1 = errValue
3147     //
3148     // result:
3149     //   r0       = *adr or errValue
3150 
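         // Usage sketch (illustrative, from the C++ side): the VM calls this stub
         // through the SafeFetch32/SafeFetchN wrappers, e.g.
         //
         //   int v = SafeFetch32((int*) maybe_unmapped, -1);
         //   // v is either the loaded value or -1 if the load faulted
         //
         // If the load at *fault_pc faults, the signal handler resumes execution at
         // *continuation_pc, so errValue (still in c_rarg1) is returned in r0.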
3151     StubCodeMark mark(this, "StubRoutines", name);
3152 
3153     // Entry point, pc or function descriptor.
3154     *entry = __ pc();
3155 
3156     // Load *adr into c_rarg1, may fault.
3157     *fault_pc = __ pc();
3158     switch (size) {
3159       case 4:
3160         // int32_t
3161         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3162         break;
3163       case 8:
3164         // int64_t
3165         __ ldr(c_rarg1, Address(c_rarg0, 0));
3166         break;
3167       default:
3168         ShouldNotReachHere();
3169     }
3170 
3171     // return errValue or *adr
3172     *continuation_pc = __ pc();
3173     __ mov(r0, c_rarg1);
3174     __ ret(lr);
3175   }
3176 #endif
3177 
3178   /**
3179    *  Arguments:
3180    *
3181    * Inputs:
3182    *   c_rarg0   - int crc
3183    *   c_rarg1   - byte* buf
3184    *   c_rarg2   - int length
3185    *
3186    * Output:
3187    *       r0    - int crc result
3188    */
3189   address generate_updateBytesCRC32() {
3190     assert(UseCRC32Intrinsics, "what are we doing here?");
3191 
3192     __ align(CodeEntryAlignment);
3193     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3194 
3195     address start = __ pc();
3196 
3197     const Register crc   = c_rarg0;  // crc
3198     const Register buf   = c_rarg1;  // source java byte array address
3199     const Register len   = c_rarg2;  // length
3200     const Register table0 = c_rarg3; // crc_table address
3201     const Register table1 = c_rarg4;
3202     const Register table2 = c_rarg5;
3203     const Register table3 = c_rarg6;
3204     const Register tmp3 = c_rarg7;
3205 
3206     BLOCK_COMMENT("Entry:");
3207     __ enter(); // required for proper stackwalking of RuntimeStub frame
3208 
3209     __ kernel_crc32(crc, buf, len,
3210               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3211 
3212     __ leave(); // required for proper stackwalking of RuntimeStub frame
3213     __ ret(lr);
3214 
3215     return start;
3216   }
3217 
3218   /**
3219    *  Arguments:
3220    *
3221    * Inputs:
3222    *   c_rarg0   - int crc
3223    *   c_rarg1   - byte* buf
3224    *   c_rarg2   - int length
3225    *   c_rarg3   - int* table
3226    *
3227    * Output:
3228    *       r0   - int crc result
3229    */
3230   address generate_updateBytesCRC32C() {
3231     assert(UseCRC32CIntrinsics, "what are we doing here?");
3232 
3233     __ align(CodeEntryAlignment);
3234     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3235 
3236     address start = __ pc();
3237 
3238     const Register crc   = c_rarg0;  // crc
3239     const Register buf   = c_rarg1;  // source java byte array address
3240     const Register len   = c_rarg2;  // length
3241     const Register table0 = c_rarg3; // crc_table address
3242     const Register table1 = c_rarg4;
3243     const Register table2 = c_rarg5;
3244     const Register table3 = c_rarg6;
3245     const Register tmp3 = c_rarg7;
3246 
3247     BLOCK_COMMENT("Entry:");
3248     __ enter(); // required for proper stackwalking of RuntimeStub frame
3249 
3250     __ kernel_crc32c(crc, buf, len,
3251               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3252 
3253     __ leave(); // required for proper stackwalking of RuntimeStub frame
3254     __ ret(lr);
3255 
3256     return start;
3257   }
3258 
3259   /***
3260    *  Arguments:
3261    *
3262    *  Inputs:
3263    *   c_rarg0   - int   adler
3264    *   c_rarg1   - byte* buff
3265    *   c_rarg2   - int   len
3266    *
3267    * Output:
3268    *   c_rarg0   - int adler result
3269    */
3270   address generate_updateBytesAdler32() {
3271     __ align(CodeEntryAlignment);
3272     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3273     address start = __ pc();
3274 
3275     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3276 
3277     // Aliases
3278     Register adler  = c_rarg0;
3279     Register s1     = c_rarg0;
3280     Register s2     = c_rarg3;
3281     Register buff   = c_rarg1;
3282     Register len    = c_rarg2;
3283     Register nmax  = r4;
3284     Register base  = r5;
3285     Register count = r6;
3286     Register temp0 = rscratch1;
3287     Register temp1 = rscratch2;
3288     FloatRegister vbytes = v0;
3289     FloatRegister vs1acc = v1;
3290     FloatRegister vs2acc = v2;
3291     FloatRegister vtable = v3;
3292 
3293     // Max number of bytes we can process before having to take the mod
3294     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3295     unsigned long BASE = 0xfff1;
3296     unsigned long NMAX = 0x15B0;
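         // The mod-BASE reductions below avoid division by using 2^16 == 15 (mod 65521):
         // for x = (hi << 16) + lo, x == hi * 15 + lo (mod BASE). For example,
         // x = 0x12345 folds to 0x1 * 15 + 0x2345 = 0x2354, and indeed
         // 0x12345 % 0xfff1 == 0x2354. One or two such folds plus a conditional
         // subtract of BASE are enough for the value ranges that occur here.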
3297 
3298     __ mov(base, BASE);
3299     __ mov(nmax, NMAX);
3300 
3301     // Load accumulation coefficients for the upper 16 bits
3302     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3303     __ ld1(vtable, __ T16B, Address(temp0));
3304 
3305     // s1 is initialized to the lower 16 bits of adler
3306     // s2 is initialized to the upper 16 bits of adler
3307     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3308     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3309 
3310     // The pipelined loop needs at least 16 elements for one iteration.
3311     // It would detect this itself, but it is cheaper to branch straight to the by-one cleanup loop here.
3312     __ cmp(len, (u1)16);
3313     __ br(Assembler::HS, L_nmax);
3314     __ cbz(len, L_combine);
3315 
3316     __ bind(L_simple_by1_loop);
3317     __ ldrb(temp0, Address(__ post(buff, 1)));
3318     __ add(s1, s1, temp0);
3319     __ add(s2, s2, s1);
3320     __ subs(len, len, 1);
3321     __ br(Assembler::HI, L_simple_by1_loop);
3322 
3323     // s1 = s1 % BASE
3324     __ subs(temp0, s1, base);
3325     __ csel(s1, temp0, s1, Assembler::HS);
3326 
3327     // s2 = s2 % BASE
3328     __ lsr(temp0, s2, 16);
3329     __ lsl(temp1, temp0, 4);
3330     __ sub(temp1, temp1, temp0);
3331     __ add(s2, temp1, s2, ext::uxth);
3332 
3333     __ subs(temp0, s2, base);
3334     __ csel(s2, temp0, s2, Assembler::HS);
3335 
3336     __ b(L_combine);
3337 
3338     __ bind(L_nmax);
3339     __ subs(len, len, nmax);
3340     __ sub(count, nmax, 16);
3341     __ br(Assembler::LO, L_by16);
3342 
3343     __ bind(L_nmax_loop);
3344 
3345     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3346                                       vbytes, vs1acc, vs2acc, vtable);
3347 
3348     __ subs(count, count, 16);
3349     __ br(Assembler::HS, L_nmax_loop);
3350 
3351     // s1 = s1 % BASE
3352     __ lsr(temp0, s1, 16);
3353     __ lsl(temp1, temp0, 4);
3354     __ sub(temp1, temp1, temp0);
3355     __ add(temp1, temp1, s1, ext::uxth);
3356 
3357     __ lsr(temp0, temp1, 16);
3358     __ lsl(s1, temp0, 4);
3359     __ sub(s1, s1, temp0);
3360     __ add(s1, s1, temp1, ext::uxth);
3361 
3362     __ subs(temp0, s1, base);
3363     __ csel(s1, temp0, s1, Assembler::HS);
3364 
3365     // s2 = s2 % BASE
3366     __ lsr(temp0, s2, 16);
3367     __ lsl(temp1, temp0, 4);
3368     __ sub(temp1, temp1, temp0);
3369     __ add(temp1, temp1, s2, ext::uxth);
3370 
3371     __ lsr(temp0, temp1, 16);
3372     __ lsl(s2, temp0, 4);
3373     __ sub(s2, s2, temp0);
3374     __ add(s2, s2, temp1, ext::uxth);
3375 
3376     __ subs(temp0, s2, base);
3377     __ csel(s2, temp0, s2, Assembler::HS);
3378 
3379     __ subs(len, len, nmax);
3380     __ sub(count, nmax, 16);
3381     __ br(Assembler::HS, L_nmax_loop);
3382 
3383     __ bind(L_by16);
3384     __ adds(len, len, count);
3385     __ br(Assembler::LO, L_by1);
3386 
3387     __ bind(L_by16_loop);
3388 
3389     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3390                                       vbytes, vs1acc, vs2acc, vtable);
3391 
3392     __ subs(len, len, 16);
3393     __ br(Assembler::HS, L_by16_loop);
3394 
3395     __ bind(L_by1);
3396     __ adds(len, len, 15);
3397     __ br(Assembler::LO, L_do_mod);
3398 
3399     __ bind(L_by1_loop);
3400     __ ldrb(temp0, Address(__ post(buff, 1)));
3401     __ add(s1, temp0, s1);
3402     __ add(s2, s2, s1);
3403     __ subs(len, len, 1);
3404     __ br(Assembler::HS, L_by1_loop);
3405 
3406     __ bind(L_do_mod);
3407     // s1 = s1 % BASE
3408     __ lsr(temp0, s1, 16);
3409     __ lsl(temp1, temp0, 4);
3410     __ sub(temp1, temp1, temp0);
3411     __ add(temp1, temp1, s1, ext::uxth);
3412 
3413     __ lsr(temp0, temp1, 16);
3414     __ lsl(s1, temp0, 4);
3415     __ sub(s1, s1, temp0);
3416     __ add(s1, s1, temp1, ext::uxth);
3417 
3418     __ subs(temp0, s1, base);
3419     __ csel(s1, temp0, s1, Assembler::HS);
3420 
3421     // s2 = s2 % BASE
3422     __ lsr(temp0, s2, 16);
3423     __ lsl(temp1, temp0, 4);
3424     __ sub(temp1, temp1, temp0);
3425     __ add(temp1, temp1, s2, ext::uxth);
3426 
3427     __ lsr(temp0, temp1, 16);
3428     __ lsl(s2, temp0, 4);
3429     __ sub(s2, s2, temp0);
3430     __ add(s2, s2, temp1, ext::uxth);
3431 
3432     __ subs(temp0, s2, base);
3433     __ csel(s2, temp0, s2, Assembler::HS);
3434 
3435     // Combine lower bits and higher bits
3436     __ bind(L_combine);
3437     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3438 
3439     __ ret(lr);
3440 
3441     return start;
3442   }
3443 
3444   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
3445           Register temp0, Register temp1, FloatRegister vbytes,
3446           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
3447     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
3448     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
3449     // In non-vectorized code, we update s1 and s2 as:
3450     //   s1 <- s1 + b1
3451     //   s2 <- s2 + s1
3452     //   s1 <- s1 + b2
3453     //   s2 <- s2 + s1
3454     //   ...
3455     //   s1 <- s1 + b16
3456     //   s2 <- s2 + s1
3457     // Putting above assignments together, we have:
3458     //   s1_new = s1 + b1 + b2 + ... + b16
3459     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
3460     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
3461     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
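         //
         // A scalar sketch of what one call to this helper computes (illustrative
         // only; b[0..15] are the 16 bytes loaded below, vtable holds 16..1):
         //
         //   unsigned sum = 0, dot = 0;
         //   for (int i = 0; i < 16; i++) {
         //     sum += b[i];               // uaddlv
         //     dot += b[i] * (16 - i);    // umullv/umlalv against vtable
         //   }
         //   s2 += 16 * s1 + dot;
         //   s1 += sum;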
3462     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
3463 
3464     // s2 = s2 + s1 * 16
3465     __ add(s2, s2, s1, Assembler::LSL, 4);
3466 
3467     // vs1acc = b1 + b2 + b3 + ... + b16
3468     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
3469     __ umullv(vs2acc, __ T8B, vtable, vbytes);
3470     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
3471     __ uaddlv(vs1acc, __ T16B, vbytes);
3472     __ uaddlv(vs2acc, __ T8H, vs2acc);
3473 
3474     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
3475     __ fmovd(temp0, vs1acc);
3476     __ fmovd(temp1, vs2acc);
3477     __ add(s1, s1, temp0);
3478     __ add(s2, s2, temp1);
3479   }
3480 
3481   /**
3482    *  Arguments:
3483    *
3484    *  Input:
3485    *    c_rarg0   - x address
3486    *    c_rarg1   - x length
3487    *    c_rarg2   - y address
3488    *   c_rarg3   - y length
3489    *    c_rarg4   - z address
3490    *    c_rarg5   - z length
3491    */
3492   address generate_multiplyToLen() {
3493     __ align(CodeEntryAlignment);
3494     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3495 
3496     address start = __ pc();
3497     const Register x     = r0;
3498     const Register xlen  = r1;
3499     const Register y     = r2;
3500     const Register ylen  = r3;
3501     const Register z     = r4;
3502     const Register zlen  = r5;
3503 
3504     const Register tmp1  = r10;
3505     const Register tmp2  = r11;
3506     const Register tmp3  = r12;
3507     const Register tmp4  = r13;
3508     const Register tmp5  = r14;
3509     const Register tmp6  = r15;
3510     const Register tmp7  = r16;
3511 
3512     BLOCK_COMMENT("Entry:");
3513     __ enter(); // required for proper stackwalking of RuntimeStub frame
3514     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3515     __ leave(); // required for proper stackwalking of RuntimeStub frame
3516     __ ret(lr);
3517 
3518     return start;
3519   }
3520 
3521   address generate_squareToLen() {
3522     // The squareToLen algorithm for sizes 1..127 described in the Java code is
3523     // faster than multiply_to_len on some CPUs and slower on others, but
3524     // multiply_to_len gives slightly better results overall, so it is used here
3525     __ align(CodeEntryAlignment);
3526     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3527     address start = __ pc();
3528 
3529     const Register x     = r0;
3530     const Register xlen  = r1;
3531     const Register z     = r2;
3532     const Register zlen  = r3;
3533     const Register y     = r4; // == x
3534     const Register ylen  = r5; // == xlen
3535 
3536     const Register tmp1  = r10;
3537     const Register tmp2  = r11;
3538     const Register tmp3  = r12;
3539     const Register tmp4  = r13;
3540     const Register tmp5  = r14;
3541     const Register tmp6  = r15;
3542     const Register tmp7  = r16;
3543 
3544     RegSet spilled_regs = RegSet::of(y, ylen);
3545     BLOCK_COMMENT("Entry:");
3546     __ enter();
3547     __ push(spilled_regs, sp);
3548     __ mov(y, x);
3549     __ mov(ylen, xlen);
3550     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3551     __ pop(spilled_regs, sp);
3552     __ leave();
3553     __ ret(lr);
3554     return start;
3555   }
3556 
3557   address generate_mulAdd() {
3558     __ align(CodeEntryAlignment);
3559     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3560 
3561     address start = __ pc();
3562 
3563     const Register out     = r0;
3564     const Register in      = r1;
3565     const Register offset  = r2;
3566     const Register len     = r3;
3567     const Register k       = r4;
3568 
3569     BLOCK_COMMENT("Entry:");
3570     __ enter();
3571     __ mul_add(out, in, offset, len, k);
3572     __ leave();
3573     __ ret(lr);
3574 
3575     return start;
3576   }
3577 
3578   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3579                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3580                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3581     // Karatsuba multiplication performs a 128*128 -> 256-bit
3582     // multiplication in three 128-bit multiplications and a few
3583     // additions.
3584     //
3585     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3586     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3587     //
3588     // Inputs:
3589     //
3590     // A0 in a.d[0]     (subkey)
3591     // A1 in a.d[1]
3592     // (A1+A0) in a1_xor_a0.d[0]
3593     //
3594     // B0 in b.d[0]     (state)
3595     // B1 in b.d[1]
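         //
         // Since addition in GF(2)[z] is XOR, the middle 128-bit term is
         //   A1*B0 + A0*B1 = (A1+A0)*(B1+B0) + A1*B1 + A0*B0 = E + C + D,
         // which the eor sequence on tmp2/tmp3/tmp4 below computes before its two
         // 64-bit halves are inserted into <result_hi:result_lo>.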
3596 
3597     __ ext(tmp1, __ T16B, b, b, 0x08);
3598     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3599     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3600     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3601     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3602 
3603     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3604     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3605     __ eor(tmp2, __ T16B, tmp2, tmp4);
3606     __ eor(tmp2, __ T16B, tmp2, tmp3);
3607 
3608     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3609     __ ins(result_hi, __ D, tmp2, 0, 1);
3610     __ ins(result_lo, __ D, tmp2, 1, 0);
3611   }
3612 
3613   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3614                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3615     const FloatRegister t0 = result;
3616 
3617     // The GCM field polynomial f is z^128 + p(z), where p =
3618     // z^7+z^2+z+1.
3619     //
3620     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3621     //
3622     // so, given that the product we're reducing is
3623     //    a == lo + hi * z^128
3624     // substituting,
3625     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3626     //
3627     // we reduce by multiplying hi by p(z) and subtracting the result
3628     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3629     // bits we can do this with two 64-bit multiplications, one for each
3630     // 64-bit half of hi (the pmull2 and pmull below).
3631 
3632     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3633     __ ext(t1, __ T16B, t0, z, 8);
3634     __ eor(hi, __ T16B, hi, t1);
3635     __ ext(t1, __ T16B, z, t0, 8);
3636     __ eor(lo, __ T16B, lo, t1);
3637     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3638     __ eor(result, __ T16B, lo, t0);
3639   }
3640 
3641   address generate_has_negatives(address &has_negatives_long) {
3642     const u1 large_loop_size = 64;
3643     const uint64_t UPPER_BIT_MASK = 0x8080808080808080;
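         // A 64-bit word ANDed with this mask is non-zero iff at least one of its
         // 8 bytes has the sign bit set, i.e. iff any byte is negative as a Java
         // byte; the stub ORs loaded words together so one tst per chunk suffices.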
3644     int dcache_line = VM_Version::dcache_line_size();
3645 
3646     Register ary1 = r1, len = r2, result = r0;
3647 
3648     __ align(CodeEntryAlignment);
3649 
3650     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3651 
3652     address entry = __ pc();
3653 
3654     __ enter();
3655 
3656   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3657         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3658 
3659   __ cmp(len, (u1)15);
3660   __ br(Assembler::GT, LEN_OVER_15);
3661   // The only case in which execution falls into this code is when the pointer is
3662   // near the end of a memory page and we must avoid reading past it into the next page
3663   __ add(ary1, ary1, len);
3664   __ subs(len, len, 8);
3665   __ br(Assembler::GT, LEN_OVER_8);
3666   __ ldr(rscratch2, Address(ary1, -8));
3667   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3668   __ lsrv(rscratch2, rscratch2, rscratch1);
3669   __ tst(rscratch2, UPPER_BIT_MASK);
3670   __ cset(result, Assembler::NE);
3671   __ leave();
3672   __ ret(lr);
3673   __ bind(LEN_OVER_8);
3674   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3675   __ sub(len, len, 8); // no data dependency, so the sub can execute while loading
3676   __ tst(rscratch2, UPPER_BIT_MASK);
3677   __ br(Assembler::NE, RET_TRUE_NO_POP);
3678   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3679   __ lsrv(rscratch1, rscratch1, rscratch2);
3680   __ tst(rscratch1, UPPER_BIT_MASK);
3681   __ cset(result, Assembler::NE);
3682   __ leave();
3683   __ ret(lr);
3684 
3685   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3686   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3687 
3688   has_negatives_long = __ pc(); // 2nd entry point
3689 
3690   __ enter();
3691 
3692   __ bind(LEN_OVER_15);
3693     __ push(spilled_regs, sp);
3694     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3695     __ cbz(rscratch2, ALIGNED);
3696     __ ldp(tmp6, tmp1, Address(ary1));
3697     __ mov(tmp5, 16);
3698     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3699     __ add(ary1, ary1, rscratch1);
3700     __ sub(len, len, rscratch1);
3701     __ orr(tmp6, tmp6, tmp1);
3702     __ tst(tmp6, UPPER_BIT_MASK);
3703     __ br(Assembler::NE, RET_TRUE);
3704 
3705   __ bind(ALIGNED);
3706     __ cmp(len, large_loop_size);
3707     __ br(Assembler::LT, CHECK_16);
3708     // Perform a 16-byte load as an early return in the pre-loop to handle the
3709     // case where an initially aligned large array has negative values in its
3710     // starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in the
3711     // worst case, which is slower. Cases with negative bytes further ahead are
3712     // not affected much; in fact they get faster due to the early loads and the
3713     // fewer instructions and branches in LARGE_LOOP.
3714     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3715     __ sub(len, len, 16);
3716     __ orr(tmp6, tmp6, tmp1);
3717     __ tst(tmp6, UPPER_BIT_MASK);
3718     __ br(Assembler::NE, RET_TRUE);
3719     __ cmp(len, large_loop_size);
3720     __ br(Assembler::LT, CHECK_16);
3721 
3722     if (SoftwarePrefetchHintDistance >= 0
3723         && SoftwarePrefetchHintDistance >= dcache_line) {
3724       // initial prefetch
3725       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3726     }
3727   __ bind(LARGE_LOOP);
3728     if (SoftwarePrefetchHintDistance >= 0) {
3729       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3730     }
3731     // Issue the load instructions first, since that can save a few CPU/memory
3732     // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one
3733     // per ldp), it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...),
3734     // which saves 3 instructions per iteration and has fewer branches; the downside
3735     // is that it disables early return, so all 64 bytes are loaded and checked every time.
3736     __ ldp(tmp2, tmp3, Address(ary1));
3737     __ ldp(tmp4, tmp5, Address(ary1, 16));
3738     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3739     __ ldp(tmp6, tmp1, Address(ary1, 48));
3740     __ add(ary1, ary1, large_loop_size);
3741     __ sub(len, len, large_loop_size);
3742     __ orr(tmp2, tmp2, tmp3);
3743     __ orr(tmp4, tmp4, tmp5);
3744     __ orr(rscratch1, rscratch1, rscratch2);
3745     __ orr(tmp6, tmp6, tmp1);
3746     __ orr(tmp2, tmp2, tmp4);
3747     __ orr(rscratch1, rscratch1, tmp6);
3748     __ orr(tmp2, tmp2, rscratch1);
3749     __ tst(tmp2, UPPER_BIT_MASK);
3750     __ br(Assembler::NE, RET_TRUE);
3751     __ cmp(len, large_loop_size);
3752     __ br(Assembler::GE, LARGE_LOOP);
3753 
3754   __ bind(CHECK_16); // small 16-byte load pre-loop
3755     __ cmp(len, (u1)16);
3756     __ br(Assembler::LT, POST_LOOP16);
3757 
3758   __ bind(LOOP16); // small 16-byte load loop
3759     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3760     __ sub(len, len, 16);
3761     __ orr(tmp2, tmp2, tmp3);
3762     __ tst(tmp2, UPPER_BIT_MASK);
3763     __ br(Assembler::NE, RET_TRUE);
3764     __ cmp(len, (u1)16);
3765     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3766 
3767   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3768     __ cmp(len, (u1)8);
3769     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3770     __ ldr(tmp3, Address(__ post(ary1, 8)));
3771     __ sub(len, len, 8);
3772     __ tst(tmp3, UPPER_BIT_MASK);
3773     __ br(Assembler::NE, RET_TRUE);
3774 
3775   __ bind(POST_LOOP16_LOAD_TAIL);
3776     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3777     __ ldr(tmp1, Address(ary1));
3778     __ mov(tmp2, 64);
3779     __ sub(tmp4, tmp2, len, __ LSL, 3);
3780     __ lslv(tmp1, tmp1, tmp4);
3781     __ tst(tmp1, UPPER_BIT_MASK);
3782     __ br(Assembler::NE, RET_TRUE);
3783     // Fallthrough
3784 
3785   __ bind(RET_FALSE);
3786     __ pop(spilled_regs, sp);
3787     __ leave();
3788     __ mov(result, zr);
3789     __ ret(lr);
3790 
3791   __ bind(RET_TRUE);
3792     __ pop(spilled_regs, sp);
3793   __ bind(RET_TRUE_NO_POP);
3794     __ leave();
3795     __ mov(result, 1);
3796     __ ret(lr);
3797 
3798   __ bind(DONE);
3799     __ pop(spilled_regs, sp);
3800     __ leave();
3801     __ ret(lr);
3802     return entry;
3803   }
3804 
3805   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3806         bool usePrefetch, Label &NOT_EQUAL) {
3807     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3808         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3809         tmp7 = r12, tmp8 = r13;
3810     Label LOOP;
3811 
3812     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3813     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3814     __ bind(LOOP);
3815     if (usePrefetch) {
3816       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3817       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3818     }
3819     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3820     __ eor(tmp1, tmp1, tmp2);
3821     __ eor(tmp3, tmp3, tmp4);
3822     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3823     __ orr(tmp1, tmp1, tmp3);
3824     __ cbnz(tmp1, NOT_EQUAL);
3825     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3826     __ eor(tmp5, tmp5, tmp6);
3827     __ eor(tmp7, tmp7, tmp8);
3828     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3829     __ orr(tmp5, tmp5, tmp7);
3830     __ cbnz(tmp5, NOT_EQUAL);
3831     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3832     __ eor(tmp1, tmp1, tmp2);
3833     __ eor(tmp3, tmp3, tmp4);
3834     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3835     __ orr(tmp1, tmp1, tmp3);
3836     __ cbnz(tmp1, NOT_EQUAL);
3837     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3838     __ eor(tmp5, tmp5, tmp6);
3839     __ sub(cnt1, cnt1, 8 * wordSize);
3840     __ eor(tmp7, tmp7, tmp8);
3841     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3842     // tmp6 is not used. MacroAssembler::subs is used here (rather than
3843     // cmp) because subs allows an unlimited range of immediate operands.
3844     __ subs(tmp6, cnt1, loopThreshold);
3845     __ orr(tmp5, tmp5, tmp7);
3846     __ cbnz(tmp5, NOT_EQUAL);
3847     __ br(__ GE, LOOP);
3848     // post-loop
3849     __ eor(tmp1, tmp1, tmp2);
3850     __ eor(tmp3, tmp3, tmp4);
3851     __ orr(tmp1, tmp1, tmp3);
3852     __ sub(cnt1, cnt1, 2 * wordSize);
3853     __ cbnz(tmp1, NOT_EQUAL);
3854   }
3855 
3856   void generate_large_array_equals_loop_simd(int loopThreshold,
3857         bool usePrefetch, Label &NOT_EQUAL) {
3858     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3859         tmp2 = rscratch2;
3860     Label LOOP;
3861 
3862     __ bind(LOOP);
3863     if (usePrefetch) {
3864       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3865       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3866     }
3867     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3868     __ sub(cnt1, cnt1, 8 * wordSize);
3869     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3870     __ subs(tmp1, cnt1, loopThreshold);
3871     __ eor(v0, __ T16B, v0, v4);
3872     __ eor(v1, __ T16B, v1, v5);
3873     __ eor(v2, __ T16B, v2, v6);
3874     __ eor(v3, __ T16B, v3, v7);
3875     __ orr(v0, __ T16B, v0, v1);
3876     __ orr(v1, __ T16B, v2, v3);
3877     __ orr(v0, __ T16B, v0, v1);
3878     __ umov(tmp1, v0, __ D, 0);
3879     __ umov(tmp2, v0, __ D, 1);
3880     __ orr(tmp1, tmp1, tmp2);
3881     __ cbnz(tmp1, NOT_EQUAL);
3882     __ br(__ GE, LOOP);
3883   }
3884 
3885   // a1 = r1 - array1 address
3886   // a2 = r2 - array2 address
3887   // result = r0 - return value. Already contains "false"
3888   // cnt1 = r10 - number of elements left to check, reduced by wordSize
3889   // r3-r5 are reserved temporary registers
3890   address generate_large_array_equals() {
3891     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3892         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3893         tmp7 = r12, tmp8 = r13;
3894     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3895         SMALL_LOOP, POST_LOOP;
3896     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3897     // loop threshold: ensures at least 32 of the prefetched bytes are actually used
3898     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3899     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3900     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3901     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3902         tmp5, tmp6, tmp7, tmp8);
3903 
3904     __ align(CodeEntryAlignment);
3905 
3906     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3907 
3908     address entry = __ pc();
3909     __ enter();
3910     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
3911     // also advance pointers to use post-increment instead of pre-increment
3912     __ add(a1, a1, wordSize);
3913     __ add(a2, a2, wordSize);
3914     if (AvoidUnalignedAccesses) {
3915       // Both implementations (SIMD and non-SIMD) use relatively wide load
3916       // instructions (ld1/ldp), which carry a large penalty (up to 2x execution
3917       // time) on some CPUs when the address is not at least 16-byte aligned.
3918       // Arrays are currently 8-byte aligned, so do an additional 8-byte load if
3919       // needed to make at least the 1st address 16-byte aligned.
3920       Label ALIGNED16;
3921       __ tbz(a1, 3, ALIGNED16);
3922       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3923       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3924       __ sub(cnt1, cnt1, wordSize);
3925       __ eor(tmp1, tmp1, tmp2);
3926       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3927       __ bind(ALIGNED16);
3928     }
3929     if (UseSIMDForArrayEquals) {
3930       if (SoftwarePrefetchHintDistance >= 0) {
3931         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3932         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3933         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3934             /* prfm = */ true, NOT_EQUAL);
3935         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3936         __ br(__ LT, TAIL);
3937       }
3938       __ bind(NO_PREFETCH_LARGE_LOOP);
3939       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3940           /* prfm = */ false, NOT_EQUAL);
3941     } else {
3942       __ push(spilled_regs, sp);
3943       if (SoftwarePrefetchHintDistance >= 0) {
3944         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3945         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3946         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3947             /* prfm = */ true, NOT_EQUAL);
3948         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3949         __ br(__ LT, TAIL);
3950       }
3951       __ bind(NO_PREFETCH_LARGE_LOOP);
3952       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3953           /* prfm = */ false, NOT_EQUAL);
3954     }
3955     __ bind(TAIL);
3956       __ cbz(cnt1, EQUAL);
3957       __ subs(cnt1, cnt1, wordSize);
3958       __ br(__ LE, POST_LOOP);
3959     __ bind(SMALL_LOOP);
3960       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3961       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3962       __ subs(cnt1, cnt1, wordSize);
3963       __ eor(tmp1, tmp1, tmp2);
3964       __ cbnz(tmp1, NOT_EQUAL);
3965       __ br(__ GT, SMALL_LOOP);
3966     __ bind(POST_LOOP);
3967       __ ldr(tmp1, Address(a1, cnt1));
3968       __ ldr(tmp2, Address(a2, cnt1));
3969       __ eor(tmp1, tmp1, tmp2);
3970       __ cbnz(tmp1, NOT_EQUAL);
3971     __ bind(EQUAL);
3972       __ mov(result, true);
3973     __ bind(NOT_EQUAL);
3974       if (!UseSIMDForArrayEquals) {
3975         __ pop(spilled_regs, sp);
3976       }
3977     __ bind(NOT_EQUAL_NO_POP);
3978     __ leave();
3979     __ ret(lr);
3980     return entry;
3981   }
3982 
3983   address generate_dsin_dcos(bool isCos) {
3984     __ align(CodeEntryAlignment);
3985     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
3986     address start = __ pc();
3987     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
3988         (address)StubRoutines::aarch64::_two_over_pi,
3989         (address)StubRoutines::aarch64::_pio2,
3990         (address)StubRoutines::aarch64::_dsin_coef,
3991         (address)StubRoutines::aarch64::_dcos_coef);
3992     return start;
3993   }
3994 
3995   address generate_dlog() {
3996     __ align(CodeEntryAlignment);
3997     StubCodeMark mark(this, "StubRoutines", "dlog");
3998     address entry = __ pc();
3999     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
4000         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
4001     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
4002     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
4003         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
4004     return entry;
4005   }
4006 
4007   // code for comparing 16 bytes of strings with same encoding
4008   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
4009     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
4010     __ ldr(rscratch1, Address(__ post(str1, 8)));
4011     __ eor(rscratch2, tmp1, tmp2);
4012     __ ldr(cnt1, Address(__ post(str2, 8)));
4013     __ cbnz(rscratch2, DIFF1);
4014     __ ldr(tmp1, Address(__ post(str1, 8)));
4015     __ eor(rscratch2, rscratch1, cnt1);
4016     __ ldr(tmp2, Address(__ post(str2, 8)));
4017     __ cbnz(rscratch2, DIFF2);
4018   }
4019 
4020   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
4021   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4022       Label &DIFF2) {
4023     Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12;
4024     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
4025 
4026     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4027     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4028     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4029     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4030 
4031     __ fmovd(tmpL, vtmp3);
4032     __ eor(rscratch2, tmp3, tmpL);
4033     __ cbnz(rscratch2, DIFF2);
4034 
4035     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4036     __ umov(tmpL, vtmp3, __ D, 1);
4037     __ eor(rscratch2, tmpU, tmpL);
4038     __ cbnz(rscratch2, DIFF1);
4039 
4040     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4041     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4042     __ fmovd(tmpL, vtmp);
4043     __ eor(rscratch2, tmp3, tmpL);
4044     __ cbnz(rscratch2, DIFF2);
4045 
4046     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4047     __ umov(tmpL, vtmp, __ D, 1);
4048     __ eor(rscratch2, tmpU, tmpL);
4049     __ cbnz(rscratch2, DIFF1);
4050   }
4051 
4052   // r0  = result
4053   // r1  = str1
4054   // r2  = cnt1
4055   // r3  = str2
4056   // r4  = cnt2
4057   // r10 = tmp1
4058   // r11 = tmp2
4059   address generate_compare_long_string_different_encoding(bool isLU) {
4060     __ align(CodeEntryAlignment);
4061     StubCodeMark mark(this, "StubRoutines", isLU
4062         ? "compare_long_string_different_encoding LU"
4063         : "compare_long_string_different_encoding UL");
4064     address entry = __ pc();
4065     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4066         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
4067         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4068     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4069         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4070     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4071     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4072 
4073     int prefetchLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance/2);
4074 
4075     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4076     // cnt2 == number of characters left to compare
4077     // Check the already loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
4078     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4079     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4080     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4081     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4082     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4083     __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1);
4084     __ eor(rscratch2, tmp1, tmp2);
4085     __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0);
4086     __ mov(rscratch1, tmp2);
4087     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4088     Register strU = isLU ? str2 : str1,
4089              strL = isLU ? str1 : str2,
4090              tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4091              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4092     __ push(spilled_regs, sp);
4093     __ sub(tmp2, strL, cnt2); // strL pointer to load from
4094     __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from
4095 
4096     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4097 
4098     if (SoftwarePrefetchHintDistance >= 0) {
4099       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4100       __ br(__ LT, NO_PREFETCH);
4101       __ bind(LARGE_LOOP_PREFETCH);
4102         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4103         __ mov(tmp4, 2);
4104         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4105         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4106           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4107           __ subs(tmp4, tmp4, 1);
4108           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4109           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4110           __ mov(tmp4, 2);
4111         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4112           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4113           __ subs(tmp4, tmp4, 1);
4114           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4115           __ sub(cnt2, cnt2, 64);
4116           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4117           __ br(__ GE, LARGE_LOOP_PREFETCH);
4118     }
4119     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4120     __ bind(NO_PREFETCH);
4121     __ subs(cnt2, cnt2, 16);
4122     __ br(__ LT, TAIL);
4123     __ bind(SMALL_LOOP); // smaller loop
4124       __ subs(cnt2, cnt2, 16);
4125       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4126       __ br(__ GE, SMALL_LOOP);
4127       __ cmn(cnt2, (u1)16);
4128       __ br(__ EQ, LOAD_LAST);
4129     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
4130       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 8 bytes before last 4 characters in UTF-16 string
4131       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
4132       __ ldr(tmp3, Address(cnt1, -8));
4133       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
4134       __ b(LOAD_LAST);
4135     __ bind(DIFF2);
4136       __ mov(tmpU, tmp3);
4137     __ bind(DIFF1);
4138       __ pop(spilled_regs, sp);
4139       __ b(CALCULATE_DIFFERENCE);
4140     __ bind(LOAD_LAST);
4141       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
4142       // No need to load them again.
4143       __ mov(tmpU, tmp3);
4144       __ pop(spilled_regs, sp);
4145 
4146       __ ldrs(vtmp, Address(strL));
4147       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4148       __ fmovd(tmpL, vtmp);
4149 
4150       __ eor(rscratch2, tmpU, tmpL);
4151       __ cbz(rscratch2, DONE);
4152 
4153     // Find the first different characters in the longwords and
4154     // compute their difference.
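         // rscratch2 holds the XOR of the two 8-byte chunks. The characters sit
         // little-endian in the registers, so rev + clz (rounded down to a multiple
         // of 16 by the andr) gives 16 * <number of equal leading characters>; the
         // lsrv/uxthw pairs then pull out the first differing character of each
         // operand for the final subtraction.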
4155     __ bind(CALCULATE_DIFFERENCE);
4156       __ rev(rscratch2, rscratch2);
4157       __ clz(rscratch2, rscratch2);
4158       __ andr(rscratch2, rscratch2, -16);
4159       __ lsrv(tmp1, tmp1, rscratch2);
4160       __ uxthw(tmp1, tmp1);
4161       __ lsrv(rscratch1, rscratch1, rscratch2);
4162       __ uxthw(rscratch1, rscratch1);
4163       __ subw(result, tmp1, rscratch1);
4164     __ bind(DONE);
4165       __ ret(lr);
4166     return entry;
4167   }
4168 
4169   // r0  = result
4170   // r1  = str1
4171   // r2  = cnt1
4172   // r3  = str2
4173   // r4  = cnt2
4174   // r10 = tmp1
4175   // r11 = tmp2
4176   address generate_compare_long_string_same_encoding(bool isLL) {
4177     __ align(CodeEntryAlignment);
4178     StubCodeMark mark(this, "StubRoutines", isLL
4179         ? "compare_long_string_same_encoding LL"
4180         : "compare_long_string_same_encoding UU");
4181     address entry = __ pc();
4182     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4183         tmp1 = r10, tmp2 = r11;
4184     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4185         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4186         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4187     // exit from the large loop when fewer than 64 bytes are left to read or we are
4188     // about to prefetch memory beyond the array boundary
4189     int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4190     // cnt1/cnt2 contain the number of characters to compare; cnt1 can be re-used.
4191     // Update the cnt2 counter to account for the 8 bytes already loaded.
4192     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4193     // update pointers, because of previous read
4194     __ add(str1, str1, wordSize);
4195     __ add(str2, str2, wordSize);
4196     if (SoftwarePrefetchHintDistance >= 0) {
4197       __ bind(LARGE_LOOP_PREFETCH);
4198         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4199         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4200         compare_string_16_bytes_same(DIFF, DIFF2);
4201         compare_string_16_bytes_same(DIFF, DIFF2);
4202         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4203         compare_string_16_bytes_same(DIFF, DIFF2);
4204         __ subs(rscratch2, cnt2, largeLoopExitCondition);
4205         compare_string_16_bytes_same(DIFF, DIFF2);
4206         __ br(__ GT, LARGE_LOOP_PREFETCH);
4207         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4208     }
4209     // less than 16 bytes left?
4210     __ subs(cnt2, cnt2, isLL ? 16 : 8);
4211     __ br(__ LT, TAIL);
4212     __ bind(SMALL_LOOP);
4213       compare_string_16_bytes_same(DIFF, DIFF2);
4214       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4215       __ br(__ GE, SMALL_LOOP);
4216     __ bind(TAIL);
4217       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4218       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4219       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4220       __ br(__ LE, CHECK_LAST);
4221       __ eor(rscratch2, tmp1, tmp2);
4222       __ cbnz(rscratch2, DIFF);
4223       __ ldr(tmp1, Address(__ post(str1, 8)));
4224       __ ldr(tmp2, Address(__ post(str2, 8)));
4225       __ sub(cnt2, cnt2, isLL ? 8 : 4);
4226     __ bind(CHECK_LAST);
4227       if (!isLL) {
4228         __ add(cnt2, cnt2, cnt2); // now in bytes
4229       }
4230       __ eor(rscratch2, tmp1, tmp2);
4231       __ cbnz(rscratch2, DIFF);
4232       __ ldr(rscratch1, Address(str1, cnt2));
4233       __ ldr(cnt1, Address(str2, cnt2));
4234       __ eor(rscratch2, rscratch1, cnt1);
4235       __ cbz(rscratch2, LENGTH_DIFF);
4236       // Find the first different characters in the longwords and
4237       // compute their difference.
4238     __ bind(DIFF2);
4239       __ rev(rscratch2, rscratch2);
4240       __ clz(rscratch2, rscratch2);
4241       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4242       __ lsrv(rscratch1, rscratch1, rscratch2);
4243       if (isLL) {
4244         __ lsrv(cnt1, cnt1, rscratch2);
4245         __ uxtbw(rscratch1, rscratch1);
4246         __ uxtbw(cnt1, cnt1);
4247       } else {
4248         __ lsrv(cnt1, cnt1, rscratch2);
4249         __ uxthw(rscratch1, rscratch1);
4250         __ uxthw(cnt1, cnt1);
4251       }
4252       __ subw(result, rscratch1, cnt1);
4253       __ b(LENGTH_DIFF);
4254     __ bind(DIFF);
4255       __ rev(rscratch2, rscratch2);
4256       __ clz(rscratch2, rscratch2);
4257       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4258       __ lsrv(tmp1, tmp1, rscratch2);
4259       if (isLL) {
4260         __ lsrv(tmp2, tmp2, rscratch2);
4261         __ uxtbw(tmp1, tmp1);
4262         __ uxtbw(tmp2, tmp2);
4263       } else {
4264         __ lsrv(tmp2, tmp2, rscratch2);
4265         __ uxthw(tmp1, tmp1);
4266         __ uxthw(tmp2, tmp2);
4267       }
4268       __ subw(result, tmp1, tmp2);
4269       __ b(LENGTH_DIFF);
4270     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4271       __ eor(rscratch2, tmp1, tmp2);
4272       __ cbnz(rscratch2, DIFF);
4273     __ bind(LENGTH_DIFF);
4274       __ ret(lr);
4275     return entry;
4276   }
4277 
4278   void generate_compare_long_strings() {
4279       StubRoutines::aarch64::_compare_long_string_LL
4280           = generate_compare_long_string_same_encoding(true);
4281       StubRoutines::aarch64::_compare_long_string_UU
4282           = generate_compare_long_string_same_encoding(false);
4283       StubRoutines::aarch64::_compare_long_string_LU
4284           = generate_compare_long_string_different_encoding(true);
4285       StubRoutines::aarch64::_compare_long_string_UL
4286           = generate_compare_long_string_different_encoding(false);
4287   }
4288 
4289   // R0 = result
4290   // R1 = str2
4291   // R2 = cnt1
4292   // R3 = str1
4293   // R4 = cnt2
  // This generic linear code uses a few additional ideas which make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since
  // length >= 8) in order to skip the initial load (helps on systems with a
  // single load pipeline)
  // 2) we can use the "fast" single-character search algorithm to find the
  // first symbol with fewer branches (one branch per loaded register instead
  // of one branch per symbol); this is where constants like
  // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  // 3) after loading and analyzing the 1st register of the source string, it
  // can be used to search for every occurrence of the 1st character, saving a
  // few loads compared with a simpler-but-slower implementation
  // 4) in order to avoid lots of push/pop operations, the code below heavily
  // re-uses/re-initializes/compresses register values, which makes the code
  // larger and a bit less readable; however, most of the extra operations are
  // issued during loads or branches, so the penalty is minimal
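  //
  // For reference, the "does this register contain the first character?"
  // check used below is the classic zero-byte detection trick. In C,
  // approximately (LL case; the helper name is illustrative, not taken from
  // this file):
  //
  //   // nonzero iff some byte of w equals the byte c
  //   static inline uint64_t has_char(uint64_t w, uint8_t c) {
  //     uint64_t x = w ^ (0x0101010101010101ULL * c); // zero byte <=> match
  //     return (x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
  //   }
  //
  // The UU/UL cases use 0x0001000100010001 and 0x7fff7fff7fff7fff instead,
  // treating the register as four 16-bit characters.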
4308   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4309     const char* stubName = str1_isL
4310         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4311         : "indexof_linear_uu";
4312     __ align(CodeEntryAlignment);
4313     StubCodeMark mark(this, "StubRoutines", stubName);
4314     address entry = __ pc();
4315 
4316     int str1_chr_size = str1_isL ? 1 : 2;
4317     int str2_chr_size = str2_isL ? 1 : 2;
4318     int str1_chr_shift = str1_isL ? 0 : 1;
4319     int str2_chr_shift = str2_isL ? 0 : 1;
4320     bool isL = str1_isL && str2_isL;
    // parameters
4322     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4323     // temporary registers
4324     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4325     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4326     // redefinitions
4327     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4328 
4329     __ push(spilled_regs, sp);
4330     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
4331         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4332         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4333         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4334         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4335         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
4336     // Read whole register from str1. It is safe, because length >=8 here
4337     __ ldr(ch1, Address(str1));
4338     // Read whole register from str2. It is safe, because length >=8 here
4339     __ ldr(ch2, Address(str2));
4340     __ sub(cnt2, cnt2, cnt1);
4341     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4342     if (str1_isL != str2_isL) {
4343       __ eor(v0, __ T16B, v0, v0);
4344     }
4345     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4346     __ mul(first, first, tmp1);
4347     // check if we have less than 1 register to check
4348     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4349     if (str1_isL != str2_isL) {
4350       __ fmovd(v1, ch1);
4351     }
4352     __ br(__ LE, L_SMALL);
4353     __ eor(ch2, first, ch2);
4354     if (str1_isL != str2_isL) {
4355       __ zip1(v1, __ T16B, v1, v0);
4356     }
4357     __ sub(tmp2, ch2, tmp1);
4358     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4359     __ bics(tmp2, tmp2, ch2);
4360     if (str1_isL != str2_isL) {
4361       __ fmovd(ch1, v1);
4362     }
4363     __ br(__ NE, L_HAS_ZERO);
4364     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4365     __ add(result, result, wordSize/str2_chr_size);
4366     __ add(str2, str2, wordSize);
4367     __ br(__ LT, L_POST_LOOP);
4368     __ BIND(L_LOOP);
4369       __ ldr(ch2, Address(str2));
4370       __ eor(ch2, first, ch2);
4371       __ sub(tmp2, ch2, tmp1);
4372       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4373       __ bics(tmp2, tmp2, ch2);
4374       __ br(__ NE, L_HAS_ZERO);
4375     __ BIND(L_LOOP_PROCEED);
4376       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4377       __ add(str2, str2, wordSize);
4378       __ add(result, result, wordSize/str2_chr_size);
4379       __ br(__ GE, L_LOOP);
4380     __ BIND(L_POST_LOOP);
4381       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
4382       __ br(__ LE, NOMATCH);
4383       __ ldr(ch2, Address(str2));
4384       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4385       __ eor(ch2, first, ch2);
4386       __ sub(tmp2, ch2, tmp1);
4387       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4388       __ mov(tmp4, -1); // all bits set
4389       __ b(L_SMALL_PROCEED);
4390     __ align(OptoLoopAlignment);
4391     __ BIND(L_SMALL);
4392       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4393       __ eor(ch2, first, ch2);
4394       if (str1_isL != str2_isL) {
4395         __ zip1(v1, __ T16B, v1, v0);
4396       }
4397       __ sub(tmp2, ch2, tmp1);
4398       __ mov(tmp4, -1); // all bits set
4399       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4400       if (str1_isL != str2_isL) {
4401         __ fmovd(ch1, v1); // move converted 4 symbols
4402       }
4403     __ BIND(L_SMALL_PROCEED);
4404       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
4405       __ bic(tmp2, tmp2, ch2);
4406       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
4407       __ rbit(tmp2, tmp2);
4408       __ br(__ EQ, NOMATCH);
4409     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long: up to 4 cycles on some CPUs
4411       __ cmp(cnt1, u1(wordSize/str2_chr_size));
4412       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4413       if (str2_isL) { // LL
4414         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4415         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4416         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4417         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4418         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4419       } else {
4420         __ mov(ch2, 0xE); // all bits in byte set except last one
4421         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4422         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4423         __ lslv(tmp2, tmp2, tmp4);
4424         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4425         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4426         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4427         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4428       }
4429       __ cmp(ch1, ch2);
4430       __ mov(tmp4, wordSize/str2_chr_size);
4431       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4432     __ BIND(L_SMALL_CMP_LOOP);
4433       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4434                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4435       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4436                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4437       __ add(tmp4, tmp4, 1);
4438       __ cmp(tmp4, cnt1);
4439       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4440       __ cmp(first, ch2);
4441       __ br(__ EQ, L_SMALL_CMP_LOOP);
4442     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4443       __ cbz(tmp2, NOMATCH); // no more matches. exit
4444       __ clz(tmp4, tmp2);
4445       __ add(result, result, 1); // advance index
4446       __ add(str2, str2, str2_chr_size); // advance pointer
4447       __ b(L_SMALL_HAS_ZERO_LOOP);
4448     __ align(OptoLoopAlignment);
4449     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4450       __ cmp(first, ch2);
4451       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4452       __ b(DONE);
4453     __ align(OptoLoopAlignment);
4454     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4455       if (str2_isL) { // LL
4456         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4457         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4458         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4459         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4460         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4461       } else {
4462         __ mov(ch2, 0xE); // all bits in byte set except last one
4463         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4464         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4465         __ lslv(tmp2, tmp2, tmp4);
4466         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4467         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4468         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4469         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4470       }
4471       __ cmp(ch1, ch2);
4472       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4473       __ b(DONE);
4474     __ align(OptoLoopAlignment);
4475     __ BIND(L_HAS_ZERO);
4476       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long: up to 4 cycles on some CPUs
      // Now compress the two counters (cnt2 and cnt1) into one register.
      // This is fine because both counters are 32-bit and are not changed in
      // this loop; we just restore them on exit, so cnt1 can be re-used here.
4481       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
4482       __ sub(result, result, 1);
4483     __ BIND(L_HAS_ZERO_LOOP);
4484       __ mov(cnt1, wordSize/str2_chr_size);
4485       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4486       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4487       if (str2_isL) {
4488         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4489         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4490         __ lslv(tmp2, tmp2, tmp4);
4491         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4492         __ add(tmp4, tmp4, 1);
4493         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4494         __ lsl(tmp2, tmp2, 1);
4495         __ mov(tmp4, wordSize/str2_chr_size);
4496       } else {
4497         __ mov(ch2, 0xE);
4498         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4499         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4500         __ lslv(tmp2, tmp2, tmp4);
4501         __ add(tmp4, tmp4, 1);
4502         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4503         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4504         __ lsl(tmp2, tmp2, 1);
4505         __ mov(tmp4, wordSize/str2_chr_size);
4506         __ sub(str2, str2, str2_chr_size);
4507       }
4508       __ cmp(ch1, ch2);
4509       __ mov(tmp4, wordSize/str2_chr_size);
4510       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4511     __ BIND(L_CMP_LOOP);
4512       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4513                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4514       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4515                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4516       __ add(tmp4, tmp4, 1);
4517       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4518       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4519       __ cmp(cnt1, ch2);
4520       __ br(__ EQ, L_CMP_LOOP);
4521     __ BIND(L_CMP_LOOP_NOMATCH);
      // we did not find a match here
4523       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4524       __ clz(tmp4, tmp2);
4525       __ add(str2, str2, str2_chr_size); // advance pointer
4526       __ b(L_HAS_ZERO_LOOP);
4527     __ align(OptoLoopAlignment);
4528     __ BIND(L_CMP_LOOP_LAST_CMP);
4529       __ cmp(cnt1, ch2);
4530       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4531       __ b(DONE);
4532     __ align(OptoLoopAlignment);
4533     __ BIND(L_CMP_LOOP_LAST_CMP2);
4534       if (str2_isL) {
4535         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4536         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4537         __ lslv(tmp2, tmp2, tmp4);
4538         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4539         __ add(tmp4, tmp4, 1);
4540         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4541         __ lsl(tmp2, tmp2, 1);
4542       } else {
4543         __ mov(ch2, 0xE);
4544         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4545         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4546         __ lslv(tmp2, tmp2, tmp4);
4547         __ add(tmp4, tmp4, 1);
4548         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4549         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4550         __ lsl(tmp2, tmp2, 1);
4551         __ sub(str2, str2, str2_chr_size);
4552       }
4553       __ cmp(ch1, ch2);
4554       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4555       __ b(DONE);
4556     __ align(OptoLoopAlignment);
4557     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. It was a multiple of
      // wordSize/str2_chr_size until the L_HAS_ZERO block. The byte octet was
      // analyzed in L_HAS_ZERO_LOOP, so result was increased by at most
      // wordSize/str2_chr_size - 1 and the respective higher bits were not
      // changed. L_LOOP_PROCEED will increase result by the number of analyzed
      // characters, so we can just clear the lower bits of result here:
      // 2 lower bits for UU/UL and 3 bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the
      // index of the last analyzed substring inside the current octet, so str2
      // is at the respective start address; we need to advance it to the next
      // octet.
4568       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4569       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4570       __ bfm(result, zr, 0, 2 - str2_chr_shift);
4571       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4572       __ movw(cnt2, cnt2);
4573       __ b(L_LOOP_PROCEED);
4574     __ align(OptoLoopAlignment);
4575     __ BIND(NOMATCH);
4576       __ mov(result, -1);
4577     __ BIND(DONE);
4578       __ pop(spilled_regs, sp);
4579       __ ret(lr);
4580     return entry;
4581   }
4582 
4583   void generate_string_indexof_stubs() {
4584     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
4585     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
4586     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
4587   }
4588 
4589   void inflate_and_store_2_fp_registers(bool generatePrfm,
4590       FloatRegister src1, FloatRegister src2) {
4591     Register dst = r1;
4592     __ zip1(v1, __ T16B, src1, v0);
4593     __ zip2(v2, __ T16B, src1, v0);
4594     if (generatePrfm) {
4595       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4596     }
4597     __ zip1(v3, __ T16B, src2, v0);
4598     __ zip2(v4, __ T16B, src2, v0);
4599     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4600   }
4601 
4602   // R0 = src
4603   // R1 = dst
4604   // R2 = len
4605   // R3 = len >> 3
4606   // V0 = 0
4607   // v1 = loaded 8 bytes
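  //
  // In C, the inflation performed here is approximately (a sketch; names are
  // illustrative, not taken from this file):
  //
  //   void inflate(const unsigned char *src, unsigned short *dst, int len) {
  //     for (int i = 0; i < len; i++) {
  //       dst[i] = src[i];   // zero-extend each byte to a 16-bit char
  //     }
  //   }
  //
  // The zip1/zip2 instructions with the zero register v0 interleave each
  // source byte with a zero byte, producing the 16-bit characters.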
4608   address generate_large_byte_array_inflate() {
4609     __ align(CodeEntryAlignment);
4610     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4611     address entry = __ pc();
4612     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4613     Register src = r0, dst = r1, len = r2, octetCounter = r3;
4614     const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
4615 
    // do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also allows a single store instruction to be used
4618     __ ldrd(v2, __ post(src, 8));
4619     __ sub(octetCounter, octetCounter, 2);
4620     __ zip1(v1, __ T16B, v1, v0);
4621     __ zip1(v2, __ T16B, v2, v0);
4622     __ st1(v1, v2, __ T16B, __ post(dst, 32));
4623     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4624     __ subs(rscratch1, octetCounter, large_loop_threshold);
4625     __ br(__ LE, LOOP_START);
4626     __ b(LOOP_PRFM_START);
4627     __ bind(LOOP_PRFM);
4628       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4629     __ bind(LOOP_PRFM_START);
4630       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4631       __ sub(octetCounter, octetCounter, 8);
4632       __ subs(rscratch1, octetCounter, large_loop_threshold);
4633       inflate_and_store_2_fp_registers(true, v3, v4);
4634       inflate_and_store_2_fp_registers(true, v5, v6);
4635       __ br(__ GT, LOOP_PRFM);
4636       __ cmp(octetCounter, (u1)8);
4637       __ br(__ LT, DONE);
4638     __ bind(LOOP);
4639       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4640       __ bind(LOOP_START);
4641       __ sub(octetCounter, octetCounter, 8);
4642       __ cmp(octetCounter, (u1)8);
4643       inflate_and_store_2_fp_registers(false, v3, v4);
4644       inflate_and_store_2_fp_registers(false, v5, v6);
4645       __ br(__ GE, LOOP);
4646     __ bind(DONE);
4647       __ ret(lr);
4648     return entry;
4649   }
4650 
4651   /**
4652    *  Arguments:
4653    *
4654    *  Input:
4655    *  c_rarg0   - current state address
4656    *  c_rarg1   - H key address
4657    *  c_rarg2   - data address
4658    *  c_rarg3   - number of blocks
4659    *
4660    *  Output:
4661    *  Updated state at c_rarg0
4662    */
4663   address generate_ghash_processBlocks() {
4664     // Bafflingly, GCM uses little-endian for the byte order, but
4665     // big-endian for the bit order.  For example, the polynomial 1 is
4666     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4667     //
4668     // So, we must either reverse the bytes in each word and do
4669     // everything big-endian or reverse the bits in each byte and do
4670     // it little-endian.  On AArch64 it's more idiomatic to reverse
4671     // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order throughout the
4673     // calculation, bit-reversing the inputs and outputs.
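    //
    // For illustration (not from this file): under the GCM convention the
    // polynomial z is the byte string 40 00 ... 00, and the low-degree part
    // of the field polynomial, z^7+z^2+z+1, is the byte E1 followed by
    // zeros. After per-byte bit reversal (RBIT) that byte becomes 0x87,
    // which is why the reduction constant emitted below is 0x87.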
4674 
4675     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4676     __ align(wordSize * 2);
4677     address p = __ pc();
4678     __ emit_int64(0x87);  // The low-order bits of the field
4679                           // polynomial (i.e. p = z^7+z^2+z+1)
4680                           // repeated in the low and high parts of a
4681                           // 128-bit vector
4682     __ emit_int64(0x87);
4683 
4684     __ align(CodeEntryAlignment);
4685     address start = __ pc();
4686 
4687     Register state   = c_rarg0;
4688     Register subkeyH = c_rarg1;
4689     Register data    = c_rarg2;
4690     Register blocks  = c_rarg3;
4691 
4692     FloatRegister vzr = v30;
4693     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4694 
4695     __ ldrq(v0, Address(state));
4696     __ ldrq(v1, Address(subkeyH));
4697 
4698     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4699     __ rbit(v0, __ T16B, v0);
4700     __ rev64(v1, __ T16B, v1);
4701     __ rbit(v1, __ T16B, v1);
4702 
4703     __ ldrq(v26, p);
4704 
    __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH (v1) into v16
4706     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4707 
4708     {
4709       Label L_ghash_loop;
4710       __ bind(L_ghash_loop);
4711 
4712       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4713                                                  // reversing each byte
4714       __ rbit(v2, __ T16B, v2);
4715       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4716 
4717       // Multiply state in v2 by subkey in v1
4718       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4719                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4720                      /*temps*/v6, v20, v18, v21);
4721       // Reduce v7:v5 by the field polynomial
4722       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4723 
4724       __ sub(blocks, blocks, 1);
4725       __ cbnz(blocks, L_ghash_loop);
4726     }
4727 
4728     // The bit-reversed result is at this point in v0
4729     __ rev64(v1, __ T16B, v0);
4730     __ rbit(v1, __ T16B, v1);
4731 
4732     __ st1(v1, __ T16B, state);
4733     __ ret(lr);
4734 
4735     return start;
4736   }
4737 
4738   // Continuation point for throwing of implicit exceptions that are
4739   // not handled in the current activation. Fabricates an exception
4740   // oop and initiates normal exception dispatching in this
4741   // frame. Since we need to preserve callee-saved values (currently
4742   // only for C2, but done for C1 as well) we need a callee-saved oop
4743   // map and therefore have to make these stubs into RuntimeStubs
4744   // rather than BufferBlobs.  If the compiler needs all registers to
4745   // be preserved between the fault point and the exception handler
4746   // then it must assume responsibility for that in
4747   // AbstractCompiler::continuation_for_implicit_null_exception or
4748   // continuation_for_implicit_division_by_zero_exception. All other
4749   // implicit exceptions (e.g., NullPointerException or
4750   // AbstractMethodError on entry) are either at call sites or
4751   // otherwise assume that stack unwinding will be initiated, so
4752   // caller saved registers were assumed volatile in the compiler.
4753 
4754 #undef __
4755 #define __ masm->
4756 
4757   address generate_throw_exception(const char* name,
4758                                    address runtime_entry,
4759                                    Register arg1 = noreg,
4760                                    Register arg2 = noreg) {
4761     // Information about frame layout at time of blocking runtime call.
4762     // Note that we only have to preserve callee-saved registers since
4763     // the compilers are responsible for supplying a continuation point
4764     // if they expect all registers to be preserved.
4765     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4766     enum layout {
4767       rfp_off = 0,
4768       rfp_off2,
4769       return_off,
4770       return_off2,
4771       framesize // inclusive of return address
4772     };
4773 
4774     int insts_size = 512;
4775     int locs_size  = 64;
4776 
4777     CodeBuffer code(name, insts_size, locs_size);
4778     OopMapSet* oop_maps  = new OopMapSet();
4779     MacroAssembler* masm = new MacroAssembler(&code);
4780 
4781     address start = __ pc();
4782 
4783     // This is an inlined and slightly modified version of call_VM
4784     // which has the ability to fetch the return PC out of
4785     // thread-local storage and also sets up last_Java_sp slightly
4786     // differently than the real call_VM
4787 
4788     __ enter(); // Save FP and LR before call
4789 
4790     assert(is_even(framesize/2), "sp not 16-byte aligned");
4791 
4792     // lr and fp are already in place
4793     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4794 
4795     int frame_complete = __ pc() - start;
4796 
4797     // Set up last_Java_sp and last_Java_fp
4798     address the_pc = __ pc();
4799     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
4800 
4801     // Call runtime
4802     if (arg1 != noreg) {
4803       assert(arg2 != c_rarg1, "clobbered");
4804       __ mov(c_rarg1, arg1);
4805     }
4806     if (arg2 != noreg) {
4807       __ mov(c_rarg2, arg2);
4808     }
4809     __ mov(c_rarg0, rthread);
4810     BLOCK_COMMENT("call runtime_entry");
4811     __ mov(rscratch1, runtime_entry);
4812     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
4813 
4814     // Generate oop map
4815     OopMap* map = new OopMap(framesize, 0);
4816 
4817     oop_maps->add_gc_map(the_pc - start, map);
4818 
4819     __ reset_last_Java_frame(true);
4820     __ maybe_isb();
4821 
4822     __ leave();
4823 
4824     // check for pending exceptions
4825 #ifdef ASSERT
4826     Label L;
4827     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4828     __ cbnz(rscratch1, L);
4829     __ should_not_reach_here();
4830     __ bind(L);
4831 #endif // ASSERT
4832     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4833 
4834 
4835     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4836     RuntimeStub* stub =
4837       RuntimeStub::new_runtime_stub(name,
4838                                     &code,
4839                                     frame_complete,
4840                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4841                                     oop_maps, false);
4842     return stub->entry_point();
4843   }
4844 
4845   class MontgomeryMultiplyGenerator : public MacroAssembler {
4846 
4847     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4848       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4849 
4850     RegSet _toSave;
4851     bool _squaring;
4852 
4853   public:
4854     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4855       : MacroAssembler(as->code()), _squaring(squaring) {
4856 
4857       // Register allocation
4858 
4859       Register reg = c_rarg0;
4860       Pa_base = reg;       // Argument registers
4861       if (squaring)
4862         Pb_base = Pa_base;
4863       else
4864         Pb_base = ++reg;
4865       Pn_base = ++reg;
4866       Rlen= ++reg;
4867       inv = ++reg;
4868       Pm_base = ++reg;
4869 
4870                           // Working registers:
4871       Ra =  ++reg;        // The current digit of a, b, n, and m.
4872       Rb =  ++reg;
4873       Rm =  ++reg;
4874       Rn =  ++reg;
4875 
4876       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4877       Pb =  ++reg;
4878       Pm =  ++reg;
4879       Pn =  ++reg;
4880 
4881       t0 =  ++reg;        // Three registers which form a
      t1 =  ++reg;        // triple-precision accumulator.
4883       t2 =  ++reg;
4884 
4885       Ri =  ++reg;        // Inner and outer loop indexes.
4886       Rj =  ++reg;
4887 
4888       Rhi_ab = ++reg;     // Product registers: low and high parts
4889       Rlo_ab = ++reg;     // of a*b and m*n.
4890       Rhi_mn = ++reg;
4891       Rlo_mn = ++reg;
4892 
4893       // r19 and up are callee-saved.
4894       _toSave = RegSet::range(r19, reg) + Pm_base;
4895     }
4896 
4897   private:
4898     void save_regs() {
4899       push(_toSave, sp);
4900     }
4901 
4902     void restore_regs() {
4903       pop(_toSave, sp);
4904     }
4905 
4906     template <typename T>
4907     void unroll_2(Register count, T block) {
4908       Label loop, end, odd;
4909       tbnz(count, 0, odd);
4910       cbz(count, end);
4911       align(16);
4912       bind(loop);
4913       (this->*block)();
4914       bind(odd);
4915       (this->*block)();
4916       subs(count, count, 2);
4917       br(Assembler::GT, loop);
4918       bind(end);
4919     }
4920 
4921     template <typename T>
4922     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4923       Label loop, end, odd;
4924       tbnz(count, 0, odd);
4925       cbz(count, end);
4926       align(16);
4927       bind(loop);
4928       (this->*block)(d, s, tmp);
4929       bind(odd);
4930       (this->*block)(d, s, tmp);
4931       subs(count, count, 2);
4932       br(Assembler::GT, loop);
4933       bind(end);
4934     }
4935 
4936     void pre1(RegisterOrConstant i) {
4937       block_comment("pre1");
4938       // Pa = Pa_base;
4939       // Pb = Pb_base + i;
4940       // Pm = Pm_base;
4941       // Pn = Pn_base + i;
4942       // Ra = *Pa;
4943       // Rb = *Pb;
4944       // Rm = *Pm;
4945       // Rn = *Pn;
4946       ldr(Ra, Address(Pa_base));
4947       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4948       ldr(Rm, Address(Pm_base));
4949       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4950       lea(Pa, Address(Pa_base));
4951       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4952       lea(Pm, Address(Pm_base));
4953       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4954 
4955       // Zero the m*n result.
4956       mov(Rhi_mn, zr);
4957       mov(Rlo_mn, zr);
4958     }
4959 
4960     // The core multiply-accumulate step of a Montgomery
4961     // multiplication.  The idea is to schedule operations as a
4962     // pipeline so that instructions with long latencies (loads and
4963     // multiplies) have time to complete before their results are
4964     // used.  This most benefits in-order implementations of the
4965     // architecture but out-of-order ones also benefit.
4966     void step() {
4967       block_comment("step");
4968       // MACC(Ra, Rb, t0, t1, t2);
4969       // Ra = *++Pa;
4970       // Rb = *--Pb;
4971       umulh(Rhi_ab, Ra, Rb);
4972       mul(Rlo_ab, Ra, Rb);
4973       ldr(Ra, pre(Pa, wordSize));
4974       ldr(Rb, pre(Pb, -wordSize));
4975       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
4976                                        // previous iteration.
4977       // MACC(Rm, Rn, t0, t1, t2);
4978       // Rm = *++Pm;
4979       // Rn = *--Pn;
4980       umulh(Rhi_mn, Rm, Rn);
4981       mul(Rlo_mn, Rm, Rn);
4982       ldr(Rm, pre(Pm, wordSize));
4983       ldr(Rn, pre(Pn, -wordSize));
4984       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4985     }
4986 
4987     void post1() {
4988       block_comment("post1");
4989 
4990       // MACC(Ra, Rb, t0, t1, t2);
4991       // Ra = *++Pa;
4992       // Rb = *--Pb;
4993       umulh(Rhi_ab, Ra, Rb);
4994       mul(Rlo_ab, Ra, Rb);
4995       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4996       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4997 
4998       // *Pm = Rm = t0 * inv;
4999       mul(Rm, t0, inv);
5000       str(Rm, Address(Pm));
5001 
5002       // MACC(Rm, Rn, t0, t1, t2);
5003       // t0 = t1; t1 = t2; t2 = 0;
5004       umulh(Rhi_mn, Rm, Rn);
5005 
5006 #ifndef PRODUCT
5007       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5008       {
5009         mul(Rlo_mn, Rm, Rn);
5010         add(Rlo_mn, t0, Rlo_mn);
5011         Label ok;
5012         cbz(Rlo_mn, ok); {
5013           stop("broken Montgomery multiply");
5014         } bind(ok);
5015       }
5016 #endif
5017       // We have very carefully set things up so that
5018       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5019       // the lower half of Rm * Rn because we know the result already:
5020       // it must be -t0.  t0 + (-t0) must generate a carry iff
5021       // t0 != 0.  So, rather than do a mul and an adds we just set
5022       // the carry flag iff t0 is nonzero.
5023       //
5024       // mul(Rlo_mn, Rm, Rn);
5025       // adds(zr, t0, Rlo_mn);
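      //
      // Worked example with 8-bit words for readability (illustrative): if
      // t0 == 0x3c then the low half of Rm*Rn is -t0 == 0xc4, and
      // 0x3c + 0xc4 == 0x100, i.e. a carry out; if t0 == 0 the sum is 0 and
      // there is no carry.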
5026       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5027       adcs(t0, t1, Rhi_mn);
5028       adc(t1, t2, zr);
5029       mov(t2, zr);
5030     }
5031 
5032     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
5033       block_comment("pre2");
5034       // Pa = Pa_base + i-len;
5035       // Pb = Pb_base + len;
5036       // Pm = Pm_base + i-len;
5037       // Pn = Pn_base + len;
5038 
5039       if (i.is_register()) {
5040         sub(Rj, i.as_register(), len);
5041       } else {
5042         mov(Rj, i.as_constant());
5043         sub(Rj, Rj, len);
5044       }
5045       // Rj == i-len
5046 
5047       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5048       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5049       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5050       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5051 
5052       // Ra = *++Pa;
5053       // Rb = *--Pb;
5054       // Rm = *++Pm;
5055       // Rn = *--Pn;
5056       ldr(Ra, pre(Pa, wordSize));
5057       ldr(Rb, pre(Pb, -wordSize));
5058       ldr(Rm, pre(Pm, wordSize));
5059       ldr(Rn, pre(Pn, -wordSize));
5060 
5061       mov(Rhi_mn, zr);
5062       mov(Rlo_mn, zr);
5063     }
5064 
5065     void post2(RegisterOrConstant i, RegisterOrConstant len) {
5066       block_comment("post2");
5067       if (i.is_constant()) {
5068         mov(Rj, i.as_constant()-len.as_constant());
5069       } else {
5070         sub(Rj, i.as_register(), len);
5071       }
5072 
5073       adds(t0, t0, Rlo_mn); // The pending m*n, low part
5074 
5075       // As soon as we know the least significant digit of our result,
5076       // store it.
5077       // Pm_base[i-len] = t0;
5078       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5079 
5080       // t0 = t1; t1 = t2; t2 = 0;
5081       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5082       adc(t1, t2, zr);
5083       mov(t2, zr);
5084     }
5085 
5086     // A carry in t0 after Montgomery multiplication means that we
5087     // should subtract multiples of n from our result in m.  We'll
5088     // keep doing that until there is no carry.
5089     void normalize(RegisterOrConstant len) {
5090       block_comment("normalize");
5091       // while (t0)
5092       //   t0 = sub(Pm_base, Pn_base, t0, len);
5093       Label loop, post, again;
5094       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5095       cbz(t0, post); {
5096         bind(again); {
5097           mov(i, zr);
5098           mov(cnt, len);
5099           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5100           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5101           subs(zr, zr, zr); // set carry flag, i.e. no borrow
5102           align(16);
5103           bind(loop); {
5104             sbcs(Rm, Rm, Rn);
5105             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5106             add(i, i, 1);
5107             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5108             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5109             sub(cnt, cnt, 1);
5110           } cbnz(cnt, loop);
5111           sbc(t0, t0, zr);
5112         } cbnz(t0, again);
5113       } bind(post);
5114     }
5115 
5116     // Move memory at s to d, reversing words.
5117     //    Increments d to end of copied memory
5118     //    Destroys tmp1, tmp2
5119     //    Preserves len
5120     //    Leaves s pointing to the address which was in d at start
5121     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
5122       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5123 
5124       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
5125       mov(tmp1, len);
5126       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
5127       sub(s, d, len, ext::uxtw, LogBytesPerWord);
5128     }
5129     // where
5130     void reverse1(Register d, Register s, Register tmp) {
5131       ldr(tmp, pre(s, -wordSize));
5132       ror(tmp, tmp, 32);
5133       str(tmp, post(d, wordSize));
5134     }
5135 
5136     void step_squaring() {
5137       // An extra ACC
5138       step();
5139       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5140     }
5141 
5142     void last_squaring(RegisterOrConstant i) {
5143       Label dont;
5144       // if ((i & 1) == 0) {
5145       tbnz(i.as_register(), 0, dont); {
5146         // MACC(Ra, Rb, t0, t1, t2);
5147         // Ra = *++Pa;
5148         // Rb = *--Pb;
5149         umulh(Rhi_ab, Ra, Rb);
5150         mul(Rlo_ab, Ra, Rb);
5151         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5152       } bind(dont);
5153     }
5154 
5155     void extra_step_squaring() {
5156       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5157 
5158       // MACC(Rm, Rn, t0, t1, t2);
5159       // Rm = *++Pm;
5160       // Rn = *--Pn;
5161       umulh(Rhi_mn, Rm, Rn);
5162       mul(Rlo_mn, Rm, Rn);
5163       ldr(Rm, pre(Pm, wordSize));
5164       ldr(Rn, pre(Pn, -wordSize));
5165     }
5166 
5167     void post1_squaring() {
5168       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5169 
5170       // *Pm = Rm = t0 * inv;
5171       mul(Rm, t0, inv);
5172       str(Rm, Address(Pm));
5173 
5174       // MACC(Rm, Rn, t0, t1, t2);
5175       // t0 = t1; t1 = t2; t2 = 0;
5176       umulh(Rhi_mn, Rm, Rn);
5177 
5178 #ifndef PRODUCT
5179       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5180       {
5181         mul(Rlo_mn, Rm, Rn);
5182         add(Rlo_mn, t0, Rlo_mn);
5183         Label ok;
5184         cbz(Rlo_mn, ok); {
5185           stop("broken Montgomery multiply");
5186         } bind(ok);
5187       }
5188 #endif
5189       // We have very carefully set things up so that
5190       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5191       // the lower half of Rm * Rn because we know the result already:
5192       // it must be -t0.  t0 + (-t0) must generate a carry iff
5193       // t0 != 0.  So, rather than do a mul and an adds we just set
5194       // the carry flag iff t0 is nonzero.
5195       //
5196       // mul(Rlo_mn, Rm, Rn);
5197       // adds(zr, t0, Rlo_mn);
5198       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5199       adcs(t0, t1, Rhi_mn);
5200       adc(t1, t2, zr);
5201       mov(t2, zr);
5202     }
5203 
5204     void acc(Register Rhi, Register Rlo,
5205              Register t0, Register t1, Register t2) {
5206       adds(t0, t0, Rlo);
5207       adcs(t1, t1, Rhi);
5208       adc(t2, t2, zr);
5209     }
5210 
5211   public:
5212     /**
5213      * Fast Montgomery multiplication.  The derivation of the
5214      * algorithm is in A Cryptographic Library for the Motorola
5215      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5216      *
5217      * Arguments:
5218      *
5219      * Inputs for multiplication:
5220      *   c_rarg0   - int array elements a
5221      *   c_rarg1   - int array elements b
5222      *   c_rarg2   - int array elements n (the modulus)
5223      *   c_rarg3   - int length
5224      *   c_rarg4   - int inv
5225      *   c_rarg5   - int array elements m (the result)
5226      *
5227      * Inputs for squaring:
5228      *   c_rarg0   - int array elements a
5229      *   c_rarg1   - int array elements n (the modulus)
5230      *   c_rarg2   - int length
5231      *   c_rarg3   - int inv
5232      *   c_rarg4   - int array elements m (the result)
5233      *
5234      */
5235     address generate_multiply() {
5236       Label argh, nothing;
5237       bind(argh);
5238       stop("MontgomeryMultiply total_allocation must be <= 8192");
5239 
5240       align(CodeEntryAlignment);
5241       address entry = pc();
5242 
5243       cbzw(Rlen, nothing);
5244 
5245       enter();
5246 
5247       // Make room.
5248       cmpw(Rlen, 512);
5249       br(Assembler::HI, argh);
5250       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5251       andr(sp, Ra, -2 * wordSize);
5252 
5253       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5254 
5255       {
5256         // Copy input args, reversing as we go.  We use Ra as a
5257         // temporary variable.
5258         reverse(Ra, Pa_base, Rlen, t0, t1);
5259         if (!_squaring)
5260           reverse(Ra, Pb_base, Rlen, t0, t1);
5261         reverse(Ra, Pn_base, Rlen, t0, t1);
5262       }
5263 
5264       // Push all call-saved registers and also Pm_base which we'll need
5265       // at the end.
5266       save_regs();
5267 
5268 #ifndef PRODUCT
5269       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
5270       {
5271         ldr(Rn, Address(Pn_base, 0));
5272         mul(Rlo_mn, Rn, inv);
5273         subs(zr, Rlo_mn, -1);
5274         Label ok;
5275         br(EQ, ok); {
5276           stop("broken inverse in Montgomery multiply");
5277         } bind(ok);
5278       }
5279 #endif
5280 
5281       mov(Pm_base, Ra);
5282 
5283       mov(t0, zr);
5284       mov(t1, zr);
5285       mov(t2, zr);
5286 
5287       block_comment("for (int i = 0; i < len; i++) {");
5288       mov(Ri, zr); {
5289         Label loop, end;
5290         cmpw(Ri, Rlen);
5291         br(Assembler::GE, end);
5292 
5293         bind(loop);
5294         pre1(Ri);
5295 
5296         block_comment("  for (j = i; j; j--) {"); {
5297           movw(Rj, Ri);
5298           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5299         } block_comment("  } // j");
5300 
5301         post1();
5302         addw(Ri, Ri, 1);
5303         cmpw(Ri, Rlen);
5304         br(Assembler::LT, loop);
5305         bind(end);
5306         block_comment("} // i");
5307       }
5308 
5309       block_comment("for (int i = len; i < 2*len; i++) {");
5310       mov(Ri, Rlen); {
5311         Label loop, end;
5312         cmpw(Ri, Rlen, Assembler::LSL, 1);
5313         br(Assembler::GE, end);
5314 
5315         bind(loop);
5316         pre2(Ri, Rlen);
5317 
5318         block_comment("  for (j = len*2-i-1; j; j--) {"); {
5319           lslw(Rj, Rlen, 1);
5320           subw(Rj, Rj, Ri);
5321           subw(Rj, Rj, 1);
5322           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5323         } block_comment("  } // j");
5324 
5325         post2(Ri, Rlen);
5326         addw(Ri, Ri, 1);
5327         cmpw(Ri, Rlen, Assembler::LSL, 1);
5328         br(Assembler::LT, loop);
5329         bind(end);
5330       }
5331       block_comment("} // i");
5332 
5333       normalize(Rlen);
5334 
5335       mov(Ra, Pm_base);  // Save Pm_base in Ra
5336       restore_regs();  // Restore caller's Pm_base
5337 
5338       // Copy our result into caller's Pm_base
5339       reverse(Pm_base, Ra, Rlen, t0, t1);
5340 
5341       leave();
5342       bind(nothing);
5343       ret(lr);
5344 
5345       return entry;
5346     }
5347     // In C, approximately:
5348 
5349     // void
5350     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
5351     //                     unsigned long Pn_base[], unsigned long Pm_base[],
5352     //                     unsigned long inv, int len) {
5353     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5354     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5355     //   unsigned long Ra, Rb, Rn, Rm;
5356 
5357     //   int i;
5358 
5359     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5360 
5361     //   for (i = 0; i < len; i++) {
5362     //     int j;
5363 
5364     //     Pa = Pa_base;
5365     //     Pb = Pb_base + i;
5366     //     Pm = Pm_base;
5367     //     Pn = Pn_base + i;
5368 
5369     //     Ra = *Pa;
5370     //     Rb = *Pb;
5371     //     Rm = *Pm;
5372     //     Rn = *Pn;
5373 
5374     //     int iters = i;
5375     //     for (j = 0; iters--; j++) {
5376     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5377     //       MACC(Ra, Rb, t0, t1, t2);
5378     //       Ra = *++Pa;
5379     //       Rb = *--Pb;
5380     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5381     //       MACC(Rm, Rn, t0, t1, t2);
5382     //       Rm = *++Pm;
5383     //       Rn = *--Pn;
5384     //     }
5385 
5386     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
5387     //     MACC(Ra, Rb, t0, t1, t2);
5388     //     *Pm = Rm = t0 * inv;
5389     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5390     //     MACC(Rm, Rn, t0, t1, t2);
5391 
5392     //     assert(t0 == 0, "broken Montgomery multiply");
5393 
5394     //     t0 = t1; t1 = t2; t2 = 0;
5395     //   }
5396 
5397     //   for (i = len; i < 2*len; i++) {
5398     //     int j;
5399 
5400     //     Pa = Pa_base + i-len;
5401     //     Pb = Pb_base + len;
5402     //     Pm = Pm_base + i-len;
5403     //     Pn = Pn_base + len;
5404 
5405     //     Ra = *++Pa;
5406     //     Rb = *--Pb;
5407     //     Rm = *++Pm;
5408     //     Rn = *--Pn;
5409 
5410     //     int iters = len*2-i-1;
5411     //     for (j = i-len+1; iters--; j++) {
5412     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5413     //       MACC(Ra, Rb, t0, t1, t2);
5414     //       Ra = *++Pa;
5415     //       Rb = *--Pb;
5416     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5417     //       MACC(Rm, Rn, t0, t1, t2);
5418     //       Rm = *++Pm;
5419     //       Rn = *--Pn;
5420     //     }
5421 
5422     //     Pm_base[i-len] = t0;
5423     //     t0 = t1; t1 = t2; t2 = 0;
5424     //   }
5425 
5426     //   while (t0)
5427     //     t0 = sub(Pm_base, Pn_base, t0, len);
5428     // }
5429 
5430     /**
5431      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
5432      * multiplies than Montgomery multiplication so it should be up to
5433      * 25% faster.  However, its loop control is more complex and it
5434      * may actually run slower on some machines.
5435      *
5436      * Arguments:
5437      *
5438      * Inputs:
5439      *   c_rarg0   - int array elements a
5440      *   c_rarg1   - int array elements n (the modulus)
5441      *   c_rarg2   - int length
5442      *   c_rarg3   - int inv
5443      *   c_rarg4   - int array elements m (the result)
5444      *
5445      */
5446     address generate_square() {
5447       Label argh;
5448       bind(argh);
5449       stop("MontgomeryMultiply total_allocation must be <= 8192");
5450 
5451       align(CodeEntryAlignment);
5452       address entry = pc();
5453 
5454       enter();
5455 
5456       // Make room.
5457       cmpw(Rlen, 512);
5458       br(Assembler::HI, argh);
5459       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5460       andr(sp, Ra, -2 * wordSize);
5461 
5462       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5463 
5464       {
5465         // Copy input args, reversing as we go.  We use Ra as a
5466         // temporary variable.
5467         reverse(Ra, Pa_base, Rlen, t0, t1);
5468         reverse(Ra, Pn_base, Rlen, t0, t1);
5469       }
5470 
5471       // Push all call-saved registers and also Pm_base which we'll need
5472       // at the end.
5473       save_regs();
5474 
5475       mov(Pm_base, Ra);
5476 
5477       mov(t0, zr);
5478       mov(t1, zr);
5479       mov(t2, zr);
5480 
5481       block_comment("for (int i = 0; i < len; i++) {");
5482       mov(Ri, zr); {
5483         Label loop, end;
5484         bind(loop);
5485         cmp(Ri, Rlen);
5486         br(Assembler::GE, end);
5487 
5488         pre1(Ri);
5489 
5490         block_comment("for (j = (i+1)/2; j; j--) {"); {
5491           add(Rj, Ri, 1);
5492           lsr(Rj, Rj, 1);
5493           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5494         } block_comment("  } // j");
5495 
5496         last_squaring(Ri);
5497 
5498         block_comment("  for (j = i/2; j; j--) {"); {
5499           lsr(Rj, Ri, 1);
5500           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5501         } block_comment("  } // j");
5502 
5503         post1_squaring();
5504         add(Ri, Ri, 1);
5505         cmp(Ri, Rlen);
5506         br(Assembler::LT, loop);
5507 
5508         bind(end);
5509         block_comment("} // i");
5510       }
5511 
5512       block_comment("for (int i = len; i < 2*len; i++) {");
5513       mov(Ri, Rlen); {
5514         Label loop, end;
5515         bind(loop);
5516         cmp(Ri, Rlen, Assembler::LSL, 1);
5517         br(Assembler::GE, end);
5518 
5519         pre2(Ri, Rlen);
5520 
5521         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
5522           lsl(Rj, Rlen, 1);
5523           sub(Rj, Rj, Ri);
5524           sub(Rj, Rj, 1);
5525           lsr(Rj, Rj, 1);
5526           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5527         } block_comment("  } // j");
5528 
5529         last_squaring(Ri);
5530 
5531         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
5532           lsl(Rj, Rlen, 1);
5533           sub(Rj, Rj, Ri);
5534           lsr(Rj, Rj, 1);
5535           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5536         } block_comment("  } // j");
5537 
5538         post2(Ri, Rlen);
5539         add(Ri, Ri, 1);
5540         cmp(Ri, Rlen, Assembler::LSL, 1);
5541 
5542         br(Assembler::LT, loop);
5543         bind(end);
5544         block_comment("} // i");
5545       }
5546 
5547       normalize(Rlen);
5548 
5549       mov(Ra, Pm_base);  // Save Pm_base in Ra
5550       restore_regs();  // Restore caller's Pm_base
5551 
5552       // Copy our result into caller's Pm_base
5553       reverse(Pm_base, Ra, Rlen, t0, t1);
5554 
5555       leave();
5556       ret(lr);
5557 
5558       return entry;
5559     }
5560     // In C, approximately:
5561 
5562     // void
5563     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
5564     //                   unsigned long Pm_base[], unsigned long inv, int len) {
5565     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5566     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5567     //   unsigned long Ra, Rb, Rn, Rm;
5568 
5569     //   int i;
5570 
5571     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5572 
5573     //   for (i = 0; i < len; i++) {
5574     //     int j;
5575 
5576     //     Pa = Pa_base;
5577     //     Pb = Pa_base + i;
5578     //     Pm = Pm_base;
5579     //     Pn = Pn_base + i;
5580 
5581     //     Ra = *Pa;
5582     //     Rb = *Pb;
5583     //     Rm = *Pm;
5584     //     Rn = *Pn;
5585 
5586     //     int iters = (i+1)/2;
5587     //     for (j = 0; iters--; j++) {
5588     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5589     //       MACC2(Ra, Rb, t0, t1, t2);
5590     //       Ra = *++Pa;
5591     //       Rb = *--Pb;
5592     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5593     //       MACC(Rm, Rn, t0, t1, t2);
5594     //       Rm = *++Pm;
5595     //       Rn = *--Pn;
5596     //     }
5597     //     if ((i & 1) == 0) {
5598     //       assert(Ra == Pa_base[j], "must be");
5599     //       MACC(Ra, Ra, t0, t1, t2);
5600     //     }
5601     //     iters = i/2;
5602     //     assert(iters == i-j, "must be");
5603     //     for (; iters--; j++) {
5604     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5605     //       MACC(Rm, Rn, t0, t1, t2);
5606     //       Rm = *++Pm;
5607     //       Rn = *--Pn;
5608     //     }
5609 
5610     //     *Pm = Rm = t0 * inv;
5611     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5612     //     MACC(Rm, Rn, t0, t1, t2);
5613 
5614     //     assert(t0 == 0, "broken Montgomery multiply");
5615 
5616     //     t0 = t1; t1 = t2; t2 = 0;
5617     //   }
5618 
5619     //   for (i = len; i < 2*len; i++) {
5620     //     int start = i-len+1;
5621     //     int end = start + (len - start)/2;
5622     //     int j;
5623 
5624     //     Pa = Pa_base + i-len;
5625     //     Pb = Pa_base + len;
5626     //     Pm = Pm_base + i-len;
5627     //     Pn = Pn_base + len;
5628 
5629     //     Ra = *++Pa;
5630     //     Rb = *--Pb;
5631     //     Rm = *++Pm;
5632     //     Rn = *--Pn;
5633 
5634     //     int iters = (2*len-i-1)/2;
5635     //     assert(iters == end-start, "must be");
5636     //     for (j = start; iters--; j++) {
5637     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5638     //       MACC2(Ra, Rb, t0, t1, t2);
5639     //       Ra = *++Pa;
5640     //       Rb = *--Pb;
5641     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5642     //       MACC(Rm, Rn, t0, t1, t2);
5643     //       Rm = *++Pm;
5644     //       Rn = *--Pn;
5645     //     }
5646     //     if ((i & 1) == 0) {
5647     //       assert(Ra == Pa_base[j], "must be");
5648     //       MACC(Ra, Ra, t0, t1, t2);
5649     //     }
5650     //     iters =  (2*len-i)/2;
5651     //     assert(iters == len-j, "must be");
5652     //     for (; iters--; j++) {
5653     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5654     //       MACC(Rm, Rn, t0, t1, t2);
5655     //       Rm = *++Pm;
5656     //       Rn = *--Pn;
5657     //     }
5658     //     Pm_base[i-len] = t0;
5659     //     t0 = t1; t1 = t2; t2 = 0;
5660     //   }
5661 
5662     //   while (t0)
5663     //     t0 = sub(Pm_base, Pn_base, t0, len);
5664     // }
5665   };
5666 
5667 
  // Called from the interpreter or compiled code either to load the
  // multiple returned values of the value type instance being returned
  // into registers, or to store the returned values into a newly
  // allocated value type instance.
5672   address generate_return_value_stub(address destination, const char* name, bool has_res) {
5673 
5674     // Information about frame layout at time of blocking runtime call.
5675     // Note that we only have to preserve callee-saved registers since
5676     // the compilers are responsible for supplying a continuation point
5677     // if they expect all registers to be preserved.
5678     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
5679     enum layout {
5680       rfp_off = 0, rfp_off2,
5681 
5682       j_rarg7_off, j_rarg7_2,
5683       j_rarg6_off, j_rarg6_2,
5684       j_rarg5_off, j_rarg5_2,
5685       j_rarg4_off, j_rarg4_2,
5686       j_rarg3_off, j_rarg3_2,
5687       j_rarg2_off, j_rarg2_2,
5688       j_rarg1_off, j_rarg1_2,
5689       j_rarg0_off, j_rarg0_2,
5690 
5691       j_farg0_off, j_farg0_2,
5692       j_farg1_off, j_farg1_2,
5693       j_farg2_off, j_farg2_2,
5694       j_farg3_off, j_farg3_2,
5695       j_farg4_off, j_farg4_2,
5696       j_farg5_off, j_farg5_2,
5697       j_farg6_off, j_farg6_2,
5698       j_farg7_off, j_farg7_2,
5699  
5700       return_off, return_off2,
5701       framesize // inclusive of return address
5702     };
5703 
5704     int insts_size = 512;
5705     int locs_size  = 64;
5706 
5707     CodeBuffer code(name, insts_size, locs_size);
5708     OopMapSet* oop_maps  = new OopMapSet();
5709     MacroAssembler* masm = new MacroAssembler(&code);
5710 
5711     address start = __ pc();
5712 
5713     const Address f7_save       (rfp, j_farg7_off * wordSize);
5714     const Address f6_save       (rfp, j_farg6_off * wordSize);
5715     const Address f5_save       (rfp, j_farg5_off * wordSize);
5716     const Address f4_save       (rfp, j_farg4_off * wordSize);
5717     const Address f3_save       (rfp, j_farg3_off * wordSize);
5718     const Address f2_save       (rfp, j_farg2_off * wordSize);
5719     const Address f1_save       (rfp, j_farg1_off * wordSize);
5720     const Address f0_save       (rfp, j_farg0_off * wordSize);
5721 
5722     const Address r0_save      (rfp, j_rarg0_off * wordSize);
5723     const Address r1_save      (rfp, j_rarg1_off * wordSize);
5724     const Address r2_save      (rfp, j_rarg2_off * wordSize);
5725     const Address r3_save      (rfp, j_rarg3_off * wordSize);
5726     const Address r4_save      (rfp, j_rarg4_off * wordSize);
5727     const Address r5_save      (rfp, j_rarg5_off * wordSize);
5728     const Address r6_save      (rfp, j_rarg6_off * wordSize);
5729     const Address r7_save      (rfp, j_rarg7_off * wordSize);
5730 
5731     // Generate oop map
5732     OopMap* map = new OopMap(framesize, 0);
5733 
5734     map->set_callee_saved(VMRegImpl::stack2reg(rfp_off), rfp->as_VMReg());
5735     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
5736     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
5737     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
5738     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
5739     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
5740     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
5741     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
5742     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
5743 
5744     map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
5745     map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
5746     map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
5747     map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
5748     map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
5749     map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
5750     map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
5751     map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
5752 
5753     // This is an inlined and slightly modified version of call_VM
5754     // which has the ability to fetch the return PC out of
5755     // thread-local storage and also sets up last_Java_sp slightly
5756     // differently than the real call_VM
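         //
         // The code below extends the frame, spills the Java argument registers
         // (j_rarg0-7 and j_farg0-7), records the frame anchor, calls
         // 'destination' with (current thread, r0), restores the registers and
         // then either returns or forwards a pending exception to the caller.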
5757 
5758     __ enter(); // Save FP and LR before call
5759 
5760     assert(is_even(framesize/2), "sp not 16-byte aligned");
5761 
5762     // lr and fp are already in place
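         // (enter() pushed them, accounting for 4 of the framesize slots, so
         // only the remaining framesize - 4 slots of 4 bytes are allocated here)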
5763     __ sub(sp, rfp, ((unsigned)framesize - 4) << LogBytesPerInt); // prolog
5764 
5765     __ strd(j_farg7, f7_save);
5766     __ strd(j_farg6, f6_save);
5767     __ strd(j_farg5, f5_save);
5768     __ strd(j_farg4, f4_save);
5769     __ strd(j_farg3, f3_save);
5770     __ strd(j_farg2, f2_save);
5771     __ strd(j_farg1, f1_save);
5772     __ strd(j_farg0, f0_save);
5773 
5774     __ str(j_rarg0, r0_save);
5775     __ str(j_rarg1, r1_save);
5776     __ str(j_rarg2, r2_save);
5777     __ str(j_rarg3, r3_save);
5778     __ str(j_rarg4, r4_save);
5779     __ str(j_rarg5, r5_save);
5780     __ str(j_rarg6, r6_save);
5781     __ str(j_rarg7, r7_save);
5782 
5783     int frame_complete = __ pc() - start;
5784 
5785     // Set up last_Java_sp and last_Java_fp
5786     address the_pc = __ pc();
5787     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
5788 
5789     // Call runtime
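         // c_rarg0 = current thread, c_rarg1 = the incoming value in r0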
5790     __ mov(c_rarg0, rthread);
5791     __ mov(c_rarg1, r0);
5792 
5793     BLOCK_COMMENT("call runtime_entry");
5794     __ mov(rscratch1, destination);
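         // blrt is blr plus the argument count/return type information needed
         // when running on the builtin simulator (BUILTIN_SIM); outside the
         // simulator it behaves like a plain blr.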
5795     __ blrt(rscratch1, 2 /* number_of_arguments */, 0, 1);
5796 
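         // Register the oop map at the PC recorded in the frame anchor so that
         // a stack walk during the runtime call can find the saved registers.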
5797     oop_maps->add_gc_map(the_pc - start, map);
5798 
5799     __ reset_last_Java_frame(false);
5800     __ maybe_isb();
5801 
5802     __ ldrd(j_farg7, f7_save);
5803     __ ldrd(j_farg6, f6_save);
5804     __ ldrd(j_farg5, f5_save);
5805     __ ldrd(j_farg4, f4_save);
5806     __ ldrd(j_farg3, f3_save);
5807     __ ldrd(j_farg2, f2_save);
5808     __ ldrd(j_farg1, f1_save);
5809     __ ldrd(j_farg0, f0_save);
5810 
5811     __ ldr(j_rarg0, r0_save);
5812     __ ldr(j_rarg1, r1_save);
5813     __ ldr(j_rarg2, r2_save);
5814     __ ldr(j_rarg3, r3_save);
5815     __ ldr(j_rarg4, r4_save);
5816     __ ldr(j_rarg5, r5_save);
5817     __ ldr(j_rarg6, r6_save);
5818     __ ldr(j_rarg7, r7_save);
5819 
5820     __ leave();
5821 
5822     // check for pending exceptions
5823     Label pending;
5824     __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
5825     __ cmp(rscratch1, (u1)NULL_WORD);
5826     __ br(Assembler::NE, pending);
5827 
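         // Only the has_res flavor (store_value_type_fields_to_buf) produces a
         // result; the runtime leaves it in the thread's vm_result field.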
5828     if (has_res) {
5829       __ get_vm_result(r0, rthread);
5830     }
5831     __ ret(lr);
5832 
5833     __ bind(pending);
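         // The runtime call set a pending exception: load it into r0 and let
         // the forward_exception stub rethrow it in the caller.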
5834     __ ldr(r0, Address(rthread, in_bytes(Thread::pending_exception_offset())));
5835     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
5836 
5837 
5838     // codeBlob framesize is in words (not VMRegImpl::slot_size)
5839     int frame_size_in_words = (framesize >> (LogBytesPerWord - LogBytesPerInt));
5840     RuntimeStub* stub =
5841       RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
5842 
5843     return stub->entry_point();
5844   }
5845 
5846   // Initialization
5847   void generate_initial() {
5848     // Generate the initial stubs and initialize their entry points
5849 
5850     // Entry points that exist on all platforms. Note: this is code
5851     // that could be shared among different platforms - however the
5852     // benefit seems to be smaller than the disadvantage of having a
5853     // much more complicated generator structure. See also comment in
5854     // stubRoutines.hpp.
5855 
5856     StubRoutines::_forward_exception_entry = generate_forward_exception();
5857 
5858     StubRoutines::_call_stub_entry =
5859       generate_call_stub(StubRoutines::_call_stub_return_address);
5860 
5861     // is referenced by megamorphic call
5862     StubRoutines::_catch_exception_entry = generate_catch_exception();
5863 
5864     // Build this early so it's available for the interpreter.
5865     StubRoutines::_throw_StackOverflowError_entry =
5866       generate_throw_exception("StackOverflowError throw_exception",
5867                                CAST_FROM_FN_PTR(address,
5868                                                 SharedRuntime::throw_StackOverflowError));
5869     StubRoutines::_throw_delayed_StackOverflowError_entry =
5870       generate_throw_exception("delayed StackOverflowError throw_exception",
5871                                CAST_FROM_FN_PTR(address,
5872                                                 SharedRuntime::throw_delayed_StackOverflowError));
5873     if (UseCRC32Intrinsics) {
5874       // set the table address before generating the stubs that use it
5875       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5876       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5877     }
5878 
5879     if (UseCRC32CIntrinsics) {
5880       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5881     }
5882 
5883     // Disabled until JDK-8210858 is fixed
5884     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5885     //   StubRoutines::_dlog = generate_dlog();
5886     // }
5887 
5888     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5889       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5890     }
5891 
5892     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5893       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5894     }
5895 
5896 
5897     StubRoutines::_load_value_type_fields_in_regs =
5898          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_value_type_fields_in_regs), "load_value_type_fields_in_regs", false);
5899     StubRoutines::_store_value_type_fields_to_buf =
5900          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_value_type_fields_to_buf), "store_value_type_fields_to_buf", true);
5901   }
5902 
5903   void generate_all() {
5904     // support for verify_oop (must happen after universe_init)
5905     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
5906     StubRoutines::_throw_AbstractMethodError_entry =
5907       generate_throw_exception("AbstractMethodError throw_exception",
5908                                CAST_FROM_FN_PTR(address,
5909                                                 SharedRuntime::
5910                                                 throw_AbstractMethodError));
5911 
5912     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5913       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5914                                CAST_FROM_FN_PTR(address,
5915                                                 SharedRuntime::
5916                                                 throw_IncompatibleClassChangeError));
5917 
5918     StubRoutines::_throw_NullPointerException_at_call_entry =
5919       generate_throw_exception("NullPointerException at call throw_exception",
5920                                CAST_FROM_FN_PTR(address,
5921                                                 SharedRuntime::
5922                                                 throw_NullPointerException_at_call));
5923 
5924     // arraycopy stubs used by compilers
5925     generate_arraycopy_stubs();
5926 
5927     // has negatives stub for large arrays.
5928     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5929 
5930     // array equals stub for large arrays.
5931     if (!UseSimpleArrayEquals) {
5932       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5933     }
5934 
5935     generate_compare_long_strings();
5936 
5937     generate_string_indexof_stubs();
5938 
5939     // byte_array_inflate stub for large arrays.
5940     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
5941 
5942 #ifdef COMPILER2
5943     if (UseMultiplyToLenIntrinsic) {
5944       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5945     }
5946 
5947     if (UseSquareToLenIntrinsic) {
5948       StubRoutines::_squareToLen = generate_squareToLen();
5949     }
5950 
5951     if (UseMulAddIntrinsic) {
5952       StubRoutines::_mulAdd = generate_mulAdd();
5953     }
5954 
5955     if (UseMontgomeryMultiplyIntrinsic) {
5956       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5957       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5958       StubRoutines::_montgomeryMultiply = g.generate_multiply();
5959     }
5960 
5961     if (UseMontgomerySquareIntrinsic) {
5962       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5963       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5964       // We use generate_multiply() rather than generate_square()
5965       // because it's faster for the sizes of modulus we care about.
5966       StubRoutines::_montgomerySquare = g.generate_multiply();
5967     }
5968 #endif // COMPILER2
5969 
5970 #ifndef BUILTIN_SIM
5971     // generate GHASH intrinsics code
5972     if (UseGHASHIntrinsics) {
5973       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5974     }
5975 
5976     if (UseAESIntrinsics) {
5977       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5978       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5979       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5980       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
5981     }
5982 
5983     if (UseSHA1Intrinsics) {
5984       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5985       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5986     }
5987     if (UseSHA256Intrinsics) {
5988       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5989       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5990     }
5991 
5992     // generate Adler32 intrinsics code
5993     if (UseAdler32Intrinsics) {
5994       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
5995     }
5996 
5997     // Safefetch stubs.
5998     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5999                                                        &StubRoutines::_safefetch32_fault_pc,
6000                                                        &StubRoutines::_safefetch32_continuation_pc);
6001     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
6002                                                        &StubRoutines::_safefetchN_fault_pc,
6003                                                        &StubRoutines::_safefetchN_continuation_pc);
6004 #endif
6005     StubRoutines::aarch64::set_completed();
6006   }
6007 
6008  public:
6009   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
6010     if (all) {
6011       generate_all();
6012     } else {
6013       generate_initial();
6014     }
6015   }
6016 }; // end class declaration
6017 
6018 void StubGenerator_generate(CodeBuffer* code, bool all) {
6019   StubGenerator g(code, all);
6020 }