1 /*
   2  * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "nativeInst_aarch64.hpp"
  33 #include "oops/instanceOop.hpp"
  34 #include "oops/method.hpp"
  35 #include "oops/objArrayKlass.hpp"
  36 #include "oops/oop.inline.hpp"
  37 #include "prims/methodHandles.hpp"
  38 #include "runtime/frame.inline.hpp"
  39 #include "runtime/handles.inline.hpp"
  40 #include "runtime/sharedRuntime.hpp"
  41 #include "runtime/stubCodeGenerator.hpp"
  42 #include "runtime/stubRoutines.hpp"
  43 #include "runtime/thread.inline.hpp"
  44 #include "utilities/align.hpp"
  45 #include "utilities/macros.hpp"
  46 #ifdef COMPILER2
  47 #include "opto/runtime.hpp"
  48 #endif
  49 #if INCLUDE_SHENANDOAHGC
  50 #include "gc/shenandoah/brooksPointer.hpp"
  51 #include "gc/shenandoah/shenandoahHeap.hpp"
  52 #include "gc/shenandoah/shenandoahHeapRegion.hpp"
  53 #include "gc/shenandoah/shenandoahRuntime.hpp"
  54 #endif
  55 
  56 #ifdef BUILTIN_SIM
  57 #include "../../../../../../simulator/simulator.hpp"
  58 #endif
  59 
  60 // Declaration and definition of StubGenerator (no .hpp file).
  61 // For a more detailed description of the stub routine structure
  62 // see the comment in stubRoutines.hpp
  63 
  64 #undef __
  65 #define __ _masm->
  66 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
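// TIMES_OOP scales an array index by the in-heap oop size: 4 bytes when
// compressed oops are in use, 8 otherwise.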
  67 
  68 #ifdef PRODUCT
  69 #define BLOCK_COMMENT(str) /* nothing */
  70 #else
  71 #define BLOCK_COMMENT(str) __ block_comment(str)
  72 #endif
  73 
  74 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  75 
  76 // Stub Code definitions
  77 
  78 class StubGenerator: public StubCodeGenerator {
  79  private:
  80 
  81 #ifdef PRODUCT
  82 #define inc_counter_np(counter) ((void)0)
  83 #else
  84   void inc_counter_np_(int& counter) {
  85     __ lea(rscratch2, ExternalAddress((address)&counter));
  86     __ ldrw(rscratch1, Address(rscratch2));
  87     __ addw(rscratch1, rscratch1, 1);
  88     __ strw(rscratch1, Address(rscratch2));
  89   }
  90 #define inc_counter_np(counter) \
  91   BLOCK_COMMENT("inc_counter " #counter); \
  92   inc_counter_np_(counter);
  93 #endif
  94 
  95   // Call stubs are used to call Java from C
  96   //
  97   // Arguments:
  98   //    c_rarg0:   call wrapper address                   address
  99   //    c_rarg1:   result                                 address
 100   //    c_rarg2:   result type                            BasicType
 101   //    c_rarg3:   method                                 Method*
 102   //    c_rarg4:   (interpreter) entry point              address
 103   //    c_rarg5:   parameters                             intptr_t*
 104   //    c_rarg6:   parameter size (in words)              int
 105   //    c_rarg7:   thread                                 Thread*
 106   //
 107   // There is no return from the stub itself as any Java result
 108   // is written to result
 109   //
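  // For reference, the C++ side reaches this stub through the CallStub
  // typedef declared in stubRoutines.hpp; roughly (argument names may
  // differ slightly between releases):
  //
  //   typedef void (*CallStub)(
  //     address   link,               // call wrapper       -> c_rarg0
  //     intptr_t* result,             //                    -> c_rarg1
  //     BasicType result_type,        //                    -> c_rarg2
  //     Method*   method,             //                    -> c_rarg3
  //     address   entry_point,        //                    -> c_rarg4
  //     intptr_t* parameters,         //                    -> c_rarg5
  //     int       size_of_parameters, // in words           -> c_rarg6
  //     TRAPS);                       // current thread     -> c_rarg7
  //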
 110   // we save r30 (lr) as the return PC at the base of the frame and
 111   // link r29 (fp) below it as the frame pointer installing sp (r31)
 112   // into fp.
 113   //
 114   // we save r0-r7, which accounts for all the c arguments.
 115   //
 116   // TODO: strictly do we need to save them all? they are treated as
 117   // volatile by C so could we omit saving the ones we are going to
 118   // place in global registers (thread? method?) or those we only use
 119   // during setup of the Java call?
 120   //
 121   // we don't need to save r8 which C uses as an indirect result
 122   // location register.
 123   //
 124   // we don't need to save r9-r15 which both C and Java treat as
 125   // volatile
 126   //
 127   // we don't need to save r16-18 because Java does not use them
 128   //
 129   // we save r19-r28 which Java uses as scratch registers and C
 130   // expects to be callee-save
 131   //
 132   // we save the bottom 64 bits of each value stored in v8-v15; it is
 133   // the responsibility of the caller to preserve larger values.
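  // (under AAPCS64 only the low 64 bits of v8-v15 are callee-saved, which
  // is why the stpd/ldpd pairs below operate on the D views of those
  // registers)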
 134   //
 135   // so the stub frame looks like this when we enter Java code
 136   //
 137   //     [ return_from_Java     ] <--- sp
 138   //     [ argument word n      ]
 139   //      ...
 140   // -27 [ argument word 1      ]
 141   // -26 [ saved v15            ] <--- sp_after_call
 142   // -25 [ saved v14            ]
 143   // -24 [ saved v13            ]
 144   // -23 [ saved v12            ]
 145   // -22 [ saved v11            ]
 146   // -21 [ saved v10            ]
 147   // -20 [ saved v9             ]
 148   // -19 [ saved v8             ]
 149   // -18 [ saved r28            ]
 150   // -17 [ saved r27            ]
 151   // -16 [ saved r26            ]
 152   // -15 [ saved r25            ]
 153   // -14 [ saved r24            ]
 154   // -13 [ saved r23            ]
 155   // -12 [ saved r22            ]
 156   // -11 [ saved r21            ]
 157   // -10 [ saved r20            ]
 158   //  -9 [ saved r19            ]
 159   //  -8 [ call wrapper    (r0) ]
 160   //  -7 [ result          (r1) ]
 161   //  -6 [ result type     (r2) ]
 162   //  -5 [ method          (r3) ]
 163   //  -4 [ entry point     (r4) ]
 164   //  -3 [ parameters      (r5) ]
 165   //  -2 [ parameter size  (r6) ]
 166   //  -1 [ thread (r7)          ]
 167   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 168   //   1 [ saved lr       (r30) ]
 169 
 170   // Call stub stack layout word offsets from fp
 171   enum call_stub_layout {
 172     sp_after_call_off = -26,
 173 
 174     d15_off            = -26,
 175     d13_off            = -24,
 176     d11_off            = -22,
 177     d9_off             = -20,
 178 
 179     r28_off            = -18,
 180     r26_off            = -16,
 181     r24_off            = -14,
 182     r22_off            = -12,
 183     r20_off            = -10,
 184     call_wrapper_off   =  -8,
 185     result_off         =  -7,
 186     result_type_off    =  -6,
 187     method_off         =  -5,
 188     entry_point_off    =  -4,
 189     parameter_size_off =  -2,
 190     thread_off         =  -1,
 191     fp_f               =   0,
 192     retaddr_off        =   1,
 193   };
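  // n.b. only the even-numbered save slots are named above: each stp/stpd
  // in generate_call_stub() stores a register pair, so e.g. d14 lands one
  // word above d15_off and r19 one word above r20_off.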
 194 
 195   address generate_call_stub(address& return_address) {
 196     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 197            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 198            "adjust this code");
 199 
 200     StubCodeMark mark(this, "StubRoutines", "call_stub");
 201     address start = __ pc();
 202 
 203     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 204 
 205     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 206     const Address result        (rfp, result_off         * wordSize);
 207     const Address result_type   (rfp, result_type_off    * wordSize);
 208     const Address method        (rfp, method_off         * wordSize);
 209     const Address entry_point   (rfp, entry_point_off    * wordSize);
 210     const Address parameter_size(rfp, parameter_size_off * wordSize);
 211 
 212     const Address thread        (rfp, thread_off         * wordSize);
 213 
 214     const Address d15_save      (rfp, d15_off * wordSize);
 215     const Address d13_save      (rfp, d13_off * wordSize);
 216     const Address d11_save      (rfp, d11_off * wordSize);
 217     const Address d9_save       (rfp, d9_off * wordSize);
 218 
 219     const Address r28_save      (rfp, r28_off * wordSize);
 220     const Address r26_save      (rfp, r26_off * wordSize);
 221     const Address r24_save      (rfp, r24_off * wordSize);
 222     const Address r22_save      (rfp, r22_off * wordSize);
 223     const Address r20_save      (rfp, r20_off * wordSize);
 224 
 225     // stub code
 226 
 227     // we need a C prolog to bootstrap the x86 caller into the sim
 228     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 229 
 230     address aarch64_entry = __ pc();
 231 
 232 #ifdef BUILTIN_SIM
 233     // Save sender's SP for stack traces.
 234     __ mov(rscratch1, sp);
 235     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 236 #endif
 237     // set up frame and move sp to end of save area
 238     __ enter();
 239     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 240 
 241     // save register parameters and Java scratch/global registers
 242     // n.b. we save thread even though it gets installed in
 243     // rthread because we want to sanity check rthread later
 244     __ str(c_rarg7,  thread);
 245     __ strw(c_rarg6, parameter_size);
 246     __ stp(c_rarg4, c_rarg5,  entry_point);
 247     __ stp(c_rarg2, c_rarg3,  result_type);
 248     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 249 
 250     __ stp(r20, r19,   r20_save);
 251     __ stp(r22, r21,   r22_save);
 252     __ stp(r24, r23,   r24_save);
 253     __ stp(r26, r25,   r26_save);
 254     __ stp(r28, r27,   r28_save);
 255 
 256     __ stpd(v9,  v8,   d9_save);
 257     __ stpd(v11, v10,  d11_save);
 258     __ stpd(v13, v12,  d13_save);
 259     __ stpd(v15, v14,  d15_save);
 260 
 261     // install Java thread in global register now we have saved
 262     // whatever value it held
 263     __ mov(rthread, c_rarg7);
 264     // And method
 265     __ mov(rmethod, c_rarg3);
 266 
 267     // set up the heapbase register
 268     __ reinit_heapbase();
 269 
 270 #ifdef ASSERT
 271     // make sure we have no pending exceptions
 272     {
 273       Label L;
 274       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 275       __ cmp(rscratch1, (unsigned)NULL_WORD);
 276       __ br(Assembler::EQ, L);
 277       __ stop("StubRoutines::call_stub: entered with pending exception");
 278       __ BIND(L);
 279     }
 280 #endif
 281     // pass parameters if any
 282     __ mov(esp, sp);
 283     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 284     __ andr(sp, rscratch1, -2 * wordSize);
 285 
 286     BLOCK_COMMENT("pass parameters if any");
 287     Label parameters_done;
 288     // parameter count is still in c_rarg6
 289     // and parameter pointer identifying param 1 is in c_rarg5
 290     __ cbzw(c_rarg6, parameters_done);
 291 
 292     address loop = __ pc();
 293     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 294     __ subsw(c_rarg6, c_rarg6, 1);
 295     __ push(rscratch1);
 296     __ br(Assembler::GT, loop);
 297 
 298     __ BIND(parameters_done);
 299 
 300     // call Java entry -- passing Method* and current sp
 301     //      rmethod: Method*
 302     //      r13: sender sp
 303     BLOCK_COMMENT("call Java function");
 304     __ mov(r13, sp);
 305     __ blr(c_rarg4);
 306 
 307     // tell the simulator we have returned to the stub
 308 
 309     // we do this here because the notify will already have been done
 310     // if we get to the next instruction via an exception
 311     //
 312     // n.b. adding this instruction here affects the calculation of
 313     // whether or not a routine returns to the call stub (used when
 314     // doing stack walks) since the normal test is to check the return
 315     // pc against the address saved below. so we may need to allow for
 316     // this extra instruction in the check.
 317 
 318     if (NotifySimulator) {
 319       __ notify(Assembler::method_reentry);
 320     }
 321     // save current address for use by exception handling code
 322 
 323     return_address = __ pc();
 324 
 325     // store result depending on type (everything that is not
 326     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 327     // n.b. this assumes Java returns an integral result in r0
 328     // and a floating result in j_farg0
 329     __ ldr(j_rarg2, result);
 330     Label is_long, is_float, is_double, exit;
 331     __ ldr(j_rarg1, result_type);
 332     __ cmp(j_rarg1, T_OBJECT);
 333     __ br(Assembler::EQ, is_long);
 334     __ cmp(j_rarg1, T_LONG);
 335     __ br(Assembler::EQ, is_long);
 336     __ cmp(j_rarg1, T_FLOAT);
 337     __ br(Assembler::EQ, is_float);
 338     __ cmp(j_rarg1, T_DOUBLE);
 339     __ br(Assembler::EQ, is_double);
 340 
 341     // handle T_INT case
 342     __ strw(r0, Address(j_rarg2));
 343 
 344     __ BIND(exit);
 345 
 346     // pop parameters
 347     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 348 
 349 #ifdef ASSERT
 350     // verify that threads correspond
 351     {
 352       Label L, S;
 353       __ ldr(rscratch1, thread);
 354       __ cmp(rthread, rscratch1);
 355       __ br(Assembler::NE, S);
 356       __ get_thread(rscratch1);
 357       __ cmp(rthread, rscratch1);
 358       __ br(Assembler::EQ, L);
 359       __ BIND(S);
 360       __ stop("StubRoutines::call_stub: threads must correspond");
 361       __ BIND(L);
 362     }
 363 #endif
 364 
 365     // restore callee-save registers
 366     __ ldpd(v15, v14,  d15_save);
 367     __ ldpd(v13, v12,  d13_save);
 368     __ ldpd(v11, v10,  d11_save);
 369     __ ldpd(v9,  v8,   d9_save);
 370 
 371     __ ldp(r28, r27,   r28_save);
 372     __ ldp(r26, r25,   r26_save);
 373     __ ldp(r24, r23,   r24_save);
 374     __ ldp(r22, r21,   r22_save);
 375     __ ldp(r20, r19,   r20_save);
 376 
 377     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 378     __ ldrw(c_rarg2, result_type);
 379     __ ldr(c_rarg3,  method);
 380     __ ldp(c_rarg4, c_rarg5,  entry_point);
 381     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 382 
 383 #ifndef PRODUCT
 384     // tell the simulator we are about to end Java execution
 385     if (NotifySimulator) {
 386       __ notify(Assembler::method_exit);
 387     }
 388 #endif
 389     // leave frame and return to caller
 390     __ leave();
 391     __ ret(lr);
 392 
 393     // handle return types different from T_INT
 394 
 395     __ BIND(is_long);
 396     __ str(r0, Address(j_rarg2, 0));
 397     __ br(Assembler::AL, exit);
 398 
 399     __ BIND(is_float);
 400     __ strs(j_farg0, Address(j_rarg2, 0));
 401     __ br(Assembler::AL, exit);
 402 
 403     __ BIND(is_double);
 404     __ strd(j_farg0, Address(j_rarg2, 0));
 405     __ br(Assembler::AL, exit);
 406 
 407     return start;
 408   }
 409 
 410   // Return point for a Java call if there's an exception thrown in
 411   // Java code.  The exception is caught and transformed into a
 412   // pending exception stored in JavaThread that can be tested from
 413   // within the VM.
 414   //
 415   // Note: Usually the parameters are removed by the callee. In case
 416   // of an exception crossing an activation frame boundary, that is
 417   // not the case if the callee is compiled code => need to setup the
 418   // rsp.
 419   //
 420   // r0: exception oop
 421 
 422   // NOTE: this is used as a target from the signal handler so it
 423   // needs an x86 prolog which returns into the current simulator
 424   // executing the generated catch_exception code. so the prolog
 425   // needs to install rax in a sim register and adjust the sim's
 426   // restart pc to enter the generated code at the start position
 427   // then return from native to simulated execution.
 428 
 429   address generate_catch_exception() {
 430     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 431     address start = __ pc();
 432 
 433     // same as in generate_call_stub():
 434     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 435     const Address thread        (rfp, thread_off         * wordSize);
 436 
 437 #ifdef ASSERT
 438     // verify that threads correspond
 439     {
 440       Label L, S;
 441       __ ldr(rscratch1, thread);
 442       __ cmp(rthread, rscratch1);
 443       __ br(Assembler::NE, S);
 444       __ get_thread(rscratch1);
 445       __ cmp(rthread, rscratch1);
 446       __ br(Assembler::EQ, L);
 447       __ bind(S);
 448       __ stop("StubRoutines::catch_exception: threads must correspond");
 449       __ bind(L);
 450     }
 451 #endif
 452 
 453     // set pending exception
 454     __ verify_oop(r0);
 455 
 456     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 457     __ mov(rscratch1, (address)__FILE__);
 458     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 459     __ movw(rscratch1, (int)__LINE__);
 460     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 461 
 462     // complete return to VM
 463     assert(StubRoutines::_call_stub_return_address != NULL,
 464            "_call_stub_return_address must have been generated before");
 465     __ b(StubRoutines::_call_stub_return_address);
 466 
 467     return start;
 468   }
 469 
 470   // Continuation point for runtime calls returning with a pending
 471   // exception.  The pending exception check happened in the runtime
 472   // or native call stub.  The pending exception in Thread is
 473   // converted into a Java-level exception.
 474   //
 475   // Contract with Java-level exception handlers:
 476   // r0: exception
 477   // r3: throwing pc
 478   //
 479   // NOTE: At entry of this stub, exception-pc must be in LR !!
 480 
 481   // NOTE: this is always used as a jump target within generated code
 482   // so it just needs to be generated code with no x86 prolog
 483 
 484   address generate_forward_exception() {
 485     StubCodeMark mark(this, "StubRoutines", "forward exception");
 486     address start = __ pc();
 487 
 488     // Upon entry, LR points to the return address returning into
 489     // Java (interpreted or compiled) code; i.e., the return address
 490     // becomes the throwing pc.
 491     //
 492     // Arguments pushed before the runtime call are still on the stack
 493     // but the exception handler will reset the stack pointer ->
 494     // ignore them.  A potential result in registers can be ignored as
 495     // well.
 496 
 497 #ifdef ASSERT
 498     // make sure this code is only executed if there is a pending exception
 499     {
 500       Label L;
 501       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 502       __ cbnz(rscratch1, L);
 503       __ stop("StubRoutines::forward exception: no pending exception (1)");
 504       __ bind(L);
 505     }
 506 #endif
 507 
 508     // compute exception handler into r19
 509 
 510     // call the VM to find the handler address associated with the
 511     // caller address. pass thread in r0 and caller pc (ret address)
 512     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 513     // the stack.
 514     __ mov(c_rarg1, lr);
 515     // lr will be trashed by the VM call so we move it to R19
 516     // (callee-saved) because we also need to pass it to the handler
 517     // returned by this call.
 518     __ mov(r19, lr);
 519     BLOCK_COMMENT("call exception_handler_for_return_address");
 520     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 521                          SharedRuntime::exception_handler_for_return_address),
 522                     rthread, c_rarg1);
 523     // we should not really care that lr is no longer the callee
 524     // address. we saved the value the handler needs in r19 so we can
 525     // just copy it to r3. however, the C2 handler will push its own
 526     // frame and then calls into the VM and the VM code asserts that
 527     // the PC for the frame above the handler belongs to a compiled
 528     // Java method. So, we restore lr here to satisfy that assert.
 529     __ mov(lr, r19);
 530     // setup r0 & r3 & clear pending exception
 531     __ mov(r3, r19);
 532     __ mov(r19, r0);
 533     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 534     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 535 
 536 #ifdef ASSERT
 537     // make sure exception is set
 538     {
 539       Label L;
 540       __ cbnz(r0, L);
 541       __ stop("StubRoutines::forward exception: no pending exception (2)");
 542       __ bind(L);
 543     }
 544 #endif
 545 
 546     // continue at exception handler
 547     // r0: exception
 548     // r3: throwing pc
 549     // r19: exception handler
 550     __ verify_oop(r0);
 551     __ br(r19);
 552 
 553     return start;
 554   }
 555 
 556 #if INCLUDE_SHENANDOAHGC
 557   // Shenandoah write barrier.
 558   //
 559   // Input:
 560   //   r0: OOP to evacuate.  Not null.
 561   //
 562   // Output:
 563   //   r0: Pointer to evacuated OOP.
 564   //
 565   // Trash rscratch1, rscratch2.  Preserve everything else.
 566 
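  // The optional in-collection-set fast test below indexes a byte map by
  // the object's region number (its address shifted right by the region
  // size shift); if the low bit of that byte is clear the object is not
  // in the collection set and the barrier returns immediately.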
 567   address generate_shenandoah_wb(bool c_abi, bool do_cset_test) {
 568     StubCodeMark mark(this, "StubRoutines", "shenandoah_wb");
 569 
 570     __ align(6);
 571     address start = __ pc();
 572 
 573     if (do_cset_test) {
 574       Label work;
 575       __ mov(rscratch2, ShenandoahHeap::in_cset_fast_test_addr());
 576       __ lsr(rscratch1, r0, ShenandoahHeapRegion::region_size_bytes_shift_jint());
 577       __ ldrb(rscratch2, Address(rscratch2, rscratch1));
 578       __ tbnz(rscratch2, 0, work);
 579       __ ret(lr);
 580       __ bind(work);
 581     }
 582 
 583     Register obj = r0;
 584 
 585     __ enter(); // required for proper stackwalking of RuntimeStub frame
 586 
 587     if (!c_abi) {
 588       __ push_call_clobbered_registers();
 589     } else {
 590       __ push_call_clobbered_fp_registers();
 591     }
 592 
 593     __ mov(lr, CAST_FROM_FN_PTR(address, ShenandoahRuntime::write_barrier_JRT));
 594     __ blrt(lr, 1, 0, MacroAssembler::ret_type_integral);
 595     if (!c_abi) {
 596       __ mov(rscratch1, obj);
 597       __ pop_call_clobbered_registers();
 598       __ mov(obj, rscratch1);
 599     } else {
 600       __ pop_call_clobbered_fp_registers();
 601     }
 602 
 603     __ leave(); // required for proper stackwalking of RuntimeStub frame
 604     __ ret(lr);
 605 
 606     return start;
 607   }
 608 #endif
 609 
 610   // Non-destructive plausibility checks for oops
 611   //
 612   // Arguments:
 613   //    r0: oop to verify
 614   //    rscratch1: error message
 615   //
 616   // Stack after saving c_rarg3:
 617   //    [tos + 0]: saved c_rarg3
 618   //    [tos + 1]: saved c_rarg2
 619   //    [tos + 2]: saved lr
 620   //    [tos + 3]: saved rscratch2
 621   //    [tos + 4]: saved r0
 622   //    [tos + 5]: saved rscratch1
 623   address generate_verify_oop() {
 624 
 625     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 626     address start = __ pc();
 627 
 628     Label exit, error;
 629 
 630     // save c_rarg2 and c_rarg3
 631     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 632 
 633     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 634     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 635     __ ldr(c_rarg3, Address(c_rarg2));
 636     __ add(c_rarg3, c_rarg3, 1);
 637     __ str(c_rarg3, Address(c_rarg2));
 638 
 639     // object is in r0
 640     // make sure object is 'reasonable'
 641     __ cbz(r0, exit); // if obj is NULL it is OK
 642 
 643     // Check if the oop is in the right area of memory
 644     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 645     __ andr(c_rarg2, r0, c_rarg3);
 646     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 647 
 648     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 649     // instruction here because the flags register is live.
 650     __ eor(c_rarg2, c_rarg2, c_rarg3);
 651     __ cbnz(c_rarg2, error);
 652 
 653     // make sure klass is 'reasonable', which is not zero.
 654     __ load_klass(r0, r0);  // get klass
 655     __ cbz(r0, error);      // if klass is NULL it is broken
 656 
 657     // return if everything seems ok
 658     __ bind(exit);
 659 
 660     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 661     __ ret(lr);
 662 
 663     // handle errors
 664     __ bind(error);
 665     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 666 
 667     __ push(RegSet::range(r0, r29), sp);
 668     // debug(char* msg, int64_t pc, int64_t regs[])
 669     __ mov(c_rarg0, rscratch1);      // pass address of error message
 670     __ mov(c_rarg1, lr);             // pass return address
 671     __ mov(c_rarg2, sp);             // pass address of regs on stack
 672 #ifndef PRODUCT
 673     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 674 #endif
 675     BLOCK_COMMENT("call MacroAssembler::debug");
 676     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 677     __ blrt(rscratch1, 3, 0, 1);
 678 
 679     return start;
 680   }
 681 
 682   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 683 
 684   // The inner part of zero_words().  This is the bulk operation,
 685   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 686   // caller is responsible for zeroing the last few words.
 687   //
 688   // Inputs:
 689   // r10: the HeapWord-aligned base address of an array to zero.
 690   // r11: the count in HeapWords, r11 > 0.
 691   //
 692   // Returns r10 and r11, adjusted for the caller to clear.
 693   // r10: the base address of the tail of words left to clear.
 694   // r11: the number of words in the tail.
 695   //      r11 < MacroAssembler::zero_words_block_size.
 696 
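  // For example, assuming MacroAssembler::zero_words_block_size == 8 and
  // the DC ZVA path is not taken: with r11 == 19 the stub clears 16 words
  // in the unrolled stp loop and returns with r10 advanced by 16 words
  // and r11 == 3 for the caller to finish.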
 697   address generate_zero_blocks() {
 698     Label store_pair, loop_store_pair, done;
 699     Label base_aligned;
 700 
 701     Register base = r10, cnt = r11;
 702 
 703     __ align(CodeEntryAlignment);
 704     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 705     address start = __ pc();
 706 
 707     if (UseBlockZeroing) {
 708       int zva_length = VM_Version::zva_length();
 709 
 710       // Ensure ZVA length can be divided by 16. This is required by
 711       // the subsequent operations.
 712       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 713 
 714       __ tbz(base, 3, base_aligned);
 715       __ str(zr, Address(__ post(base, 8)));
 716       __ sub(cnt, cnt, 1);
 717       __ bind(base_aligned);
 718 
 719       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 720       // alignment.
 721       Label small;
 722       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 723       __ subs(rscratch1, cnt, low_limit >> 3);
 724       __ br(Assembler::LT, small);
 725       __ zero_dcache_blocks(base, cnt);
 726       __ bind(small);
 727     }
 728 
 729     {
 730       // Number of stp instructions we'll unroll
 731       const int unroll =
 732         MacroAssembler::zero_words_block_size / 2;
 733       // Clear the remaining blocks.
 734       Label loop;
 735       __ subs(cnt, cnt, unroll * 2);
 736       __ br(Assembler::LT, done);
 737       __ bind(loop);
 738       for (int i = 0; i < unroll; i++)
 739         __ stp(zr, zr, __ post(base, 16));
 740       __ subs(cnt, cnt, unroll * 2);
 741       __ br(Assembler::GE, loop);
 742       __ bind(done);
 743       __ add(cnt, cnt, unroll * 2);
 744     }
 745 
 746     __ ret(lr);
 747 
 748     return start;
 749   }
 750 
 751 
 752   typedef enum {
 753     copy_forwards = 1,
 754     copy_backwards = -1
 755   } copy_direction;
 756 
 757   // Bulk copy of blocks of 8 words.
 758   //
 759   // count is a count of words.
 760   //
 761   // Precondition: count >= 8
 762   //
 763   // Postconditions:
 764   //
 765   // The least significant bit of count contains the remaining count
 766   // of words to copy.  The rest of count is trash.
 767   //
 768   // s and d are adjusted to point to the remaining words to copy
 769   //
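  // For example, with count == 23 the main loop and drain copy 16 words,
  // the 4-word and 2-word sub-block tests copy 6 more, and bit 0 of count
  // is left set so the caller copies the final word.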
 770   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 771                            copy_direction direction) {
 772     int unit = wordSize * direction;
 773     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 774 
 775     int offset;
 776     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 777       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 778     const Register stride = r13;
 779 
 780     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 781     assert_different_registers(s, d, count, rscratch1);
 782 
 783     Label again, drain;
 784     const char *stub_name;
 785     if (direction == copy_forwards)
 786       stub_name = "forward_copy_longs";
 787     else
 788       stub_name = "backward_copy_longs";
 789     StubCodeMark mark(this, "StubRoutines", stub_name);
 790     __ align(CodeEntryAlignment);
 791     __ bind(start);
 792 
 793     Label unaligned_copy_long;
 794     if (AvoidUnalignedAccesses) {
 795       __ tbnz(d, 3, unaligned_copy_long);
 796     }
 797 
 798     if (direction == copy_forwards) {
 799       __ sub(s, s, bias);
 800       __ sub(d, d, bias);
 801     }
 802 
 803 #ifdef ASSERT
 804     // Make sure we are never given < 8 words
 805     {
 806       Label L;
 807       __ cmp(count, 8);
 808       __ br(Assembler::GE, L);
 809       __ stop("generate_copy_longs called with < 8 words");
 810       __ bind(L);
 811     }
 812 #endif
 813 
 814     // Fill 8 registers
 815     if (UseSIMDForMemoryOps) {
 816       __ ldpq(v0, v1, Address(s, 4 * unit));
 817       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 818     } else {
 819       __ ldp(t0, t1, Address(s, 2 * unit));
 820       __ ldp(t2, t3, Address(s, 4 * unit));
 821       __ ldp(t4, t5, Address(s, 6 * unit));
 822       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 823     }
 824 
 825     __ subs(count, count, 16);
 826     __ br(Assembler::LO, drain);
 827 
 828     int prefetch = PrefetchCopyIntervalInBytes;
 829     bool use_stride = false;
 830     if (direction == copy_backwards) {
 831        use_stride = prefetch > 256;
 832        prefetch = -prefetch;
 833        if (use_stride) __ mov(stride, prefetch);
 834     }
 835 
 836     __ bind(again);
 837 
 838     if (PrefetchCopyIntervalInBytes > 0)
 839       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 840 
 841     if (UseSIMDForMemoryOps) {
 842       __ stpq(v0, v1, Address(d, 4 * unit));
 843       __ ldpq(v0, v1, Address(s, 4 * unit));
 844       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 845       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 846     } else {
 847       __ stp(t0, t1, Address(d, 2 * unit));
 848       __ ldp(t0, t1, Address(s, 2 * unit));
 849       __ stp(t2, t3, Address(d, 4 * unit));
 850       __ ldp(t2, t3, Address(s, 4 * unit));
 851       __ stp(t4, t5, Address(d, 6 * unit));
 852       __ ldp(t4, t5, Address(s, 6 * unit));
 853       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 854       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 855     }
 856 
 857     __ subs(count, count, 8);
 858     __ br(Assembler::HS, again);
 859 
 860     // Drain
 861     __ bind(drain);
 862     if (UseSIMDForMemoryOps) {
 863       __ stpq(v0, v1, Address(d, 4 * unit));
 864       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 865     } else {
 866       __ stp(t0, t1, Address(d, 2 * unit));
 867       __ stp(t2, t3, Address(d, 4 * unit));
 868       __ stp(t4, t5, Address(d, 6 * unit));
 869       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 870     }
 871 
 872     {
 873       Label L1, L2;
 874       __ tbz(count, exact_log2(4), L1);
 875       if (UseSIMDForMemoryOps) {
 876         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 877         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 878       } else {
 879         __ ldp(t0, t1, Address(s, 2 * unit));
 880         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 881         __ stp(t0, t1, Address(d, 2 * unit));
 882         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 883       }
 884       __ bind(L1);
 885 
 886       if (direction == copy_forwards) {
 887         __ add(s, s, bias);
 888         __ add(d, d, bias);
 889       }
 890 
 891       __ tbz(count, 1, L2);
 892       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 893       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 894       __ bind(L2);
 895     }
 896 
 897     __ ret(lr);
 898 
 899     if (AvoidUnalignedAccesses) {
 900       Label drain, again;
 901       // Register order for storing. Order is different for backward copy.
 902 
 903       __ bind(unaligned_copy_long);
 904 
 905       // source address is even aligned, target odd aligned
 906       //
 907       // when forward copying word pairs we read long pairs at offsets
 908       // {0, 2, 4, 6} (in long words). when backwards copying we read
 909       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 910       // address by -2 in the forwards case so we can compute the
 911       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 912       // or -1.
 913       //
 914       // when forward copying we need to store 1 word, 3 pairs and
 915       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 916       // zero offset we adjust the destination by -1, which means we
 917       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 918       //
 919       // When backwards copying we need to store 1 word, 3 pairs and
 920       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 921       // offsets {1, 3, 5, 7, 8} * unit.
 922 
 923       if (direction == copy_forwards) {
 924         __ sub(s, s, 16);
 925         __ sub(d, d, 8);
 926       }
 927 
 928       // Fill 8 registers
 929       //
 930       // for forwards copy s was offset by -16 from the original input
 931       // value of s so the register contents are at these offsets
 932       // relative to the 64 byte block addressed by that original input
 933       // and so on for each successive 64 byte block when s is updated
 934       //
 935       // t0 at offset 0,  t1 at offset 8
 936       // t2 at offset 16, t3 at offset 24
 937       // t4 at offset 32, t5 at offset 40
 938       // t6 at offset 48, t7 at offset 56
 939 
 940       // for backwards copy s was not offset so the register contents
 941       // are at these offsets into the preceding 64 byte block
 942       // relative to that original input and so on for each successive
 943       // preceding 64 byte block when s is updated. this explains the
 944       // slightly counter-intuitive looking pattern of register usage
 945       // in the stp instructions for backwards copy.
 946       //
 947       // t0 at offset -16, t1 at offset -8
 948       // t2 at offset -32, t3 at offset -24
 949       // t4 at offset -48, t5 at offset -40
 950       // t6 at offset -64, t7 at offset -56
 951 
 952       __ ldp(t0, t1, Address(s, 2 * unit));
 953       __ ldp(t2, t3, Address(s, 4 * unit));
 954       __ ldp(t4, t5, Address(s, 6 * unit));
 955       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 956 
 957       __ subs(count, count, 16);
 958       __ br(Assembler::LO, drain);
 959 
 960       int prefetch = PrefetchCopyIntervalInBytes;
 961       bool use_stride = false;
 962       if (direction == copy_backwards) {
 963          use_stride = prefetch > 256;
 964          prefetch = -prefetch;
 965          if (use_stride) __ mov(stride, prefetch);
 966       }
 967 
 968       __ bind(again);
 969 
 970       if (PrefetchCopyIntervalInBytes > 0)
 971         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 972 
 973       if (direction == copy_forwards) {
 974        // allowing for the offset of -8 the store instructions place
 975        // registers into the target 64 byte block at the following
 976        // offsets
 977        //
 978        // t0 at offset 0
 979        // t1 at offset 8,  t2 at offset 16
 980        // t3 at offset 24, t4 at offset 32
 981        // t5 at offset 40, t6 at offset 48
 982        // t7 at offset 56
 983 
 984         __ str(t0, Address(d, 1 * unit));
 985         __ stp(t1, t2, Address(d, 2 * unit));
 986         __ ldp(t0, t1, Address(s, 2 * unit));
 987         __ stp(t3, t4, Address(d, 4 * unit));
 988         __ ldp(t2, t3, Address(s, 4 * unit));
 989         __ stp(t5, t6, Address(d, 6 * unit));
 990         __ ldp(t4, t5, Address(s, 6 * unit));
 991         __ str(t7, Address(__ pre(d, 8 * unit)));
 992         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 993       } else {
 994        // d was not offset when we started so the registers are
 995        // written into the 64 byte block preceding d with the following
 996        // offsets
 997        //
 998        // t1 at offset -8
 999        // t3 at offset -24, t0 at offset -16
1000        // t5 at offset -40, t2 at offset -32
1001        // t7 at offset -56, t4 at offset -48
1002        //                   t6 at offset -64
1003        //
1004        // note that this matches the offsets previously noted for the
1005        // loads
1006 
1007         __ str(t1, Address(d, 1 * unit));
1008         __ stp(t3, t0, Address(d, 3 * unit));
1009         __ ldp(t0, t1, Address(s, 2 * unit));
1010         __ stp(t5, t2, Address(d, 5 * unit));
1011         __ ldp(t2, t3, Address(s, 4 * unit));
1012         __ stp(t7, t4, Address(d, 7 * unit));
1013         __ ldp(t4, t5, Address(s, 6 * unit));
1014         __ str(t6, Address(__ pre(d, 8 * unit)));
1015         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1016       }
1017 
1018       __ subs(count, count, 8);
1019       __ br(Assembler::HS, again);
1020 
1021       // Drain
1022       //
1023       // this uses the same pattern of offsets and register arguments
1024       // as above
1025       __ bind(drain);
1026       if (direction == copy_forwards) {
1027         __ str(t0, Address(d, 1 * unit));
1028         __ stp(t1, t2, Address(d, 2 * unit));
1029         __ stp(t3, t4, Address(d, 4 * unit));
1030         __ stp(t5, t6, Address(d, 6 * unit));
1031         __ str(t7, Address(__ pre(d, 8 * unit)));
1032       } else {
1033         __ str(t1, Address(d, 1 * unit));
1034         __ stp(t3, t0, Address(d, 3 * unit));
1035         __ stp(t5, t2, Address(d, 5 * unit));
1036         __ stp(t7, t4, Address(d, 7 * unit));
1037         __ str(t6, Address(__ pre(d, 8 * unit)));
1038       }
1039       // now we need to copy any remaining part block which may
1040       // include a 4 word subblock and/or a 2 word subblock.
1041       // bits 2 and 1 in the count are the tell-tale for whether we
1042       // have each such subblock
1043       {
1044         Label L1, L2;
1045         __ tbz(count, exact_log2(4), L1);
1046        // this is the same as above but copying only 4 longs hence
1047        // with only one intervening stp between the str instructions
1048        // but note that the offsets and registers still follow the
1049        // same pattern
1050         __ ldp(t0, t1, Address(s, 2 * unit));
1051         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1052         if (direction == copy_forwards) {
1053           __ str(t0, Address(d, 1 * unit));
1054           __ stp(t1, t2, Address(d, 2 * unit));
1055           __ str(t3, Address(__ pre(d, 4 * unit)));
1056         } else {
1057           __ str(t1, Address(d, 1 * unit));
1058           __ stp(t3, t0, Address(d, 3 * unit));
1059           __ str(t2, Address(__ pre(d, 4 * unit)));
1060         }
1061         __ bind(L1);
1062 
1063         __ tbz(count, 1, L2);
1064        // this is the same as above but copying only 2 longs hence
1065        // there is no intervening stp between the str instructions
1066        // but note that the offset and register patterns are still
1067        // the same
1068         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1069         if (direction == copy_forwards) {
1070           __ str(t0, Address(d, 1 * unit));
1071           __ str(t1, Address(__ pre(d, 2 * unit)));
1072         } else {
1073           __ str(t1, Address(d, 1 * unit));
1074           __ str(t0, Address(__ pre(d, 2 * unit)));
1075         }
1076         __ bind(L2);
1077 
1078        // for forwards copy we need to re-adjust the offsets we
1079        // applied so that s and d follow the last words written
1080 
1081        if (direction == copy_forwards) {
1082          __ add(s, s, 16);
1083          __ add(d, d, 8);
1084        }
1085 
1086       }
1087 
1088       __ ret(lr);
1089       }
1090   }
1091 
1092   // Small copy: less than 16 bytes.
1093   //
1094   // NB: Ignores all of the bits of count which represent more than 15
1095   // bytes, so a caller doesn't have to mask them.
1096 
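  //
  // For example, with step == 1 (a byte copy) and count == 13 (0b1101) the
  // bit tests below copy 8 + 4 + 1 bytes: the word and int tests fire, the
  // short test is skipped and the final byte test fires.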
1097   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1098     bool is_backwards = step < 0;
1099     size_t granularity = uabs(step);
1100     int direction = is_backwards ? -1 : 1;
1101     int unit = wordSize * direction;
1102 
1103     Label Lpair, Lword, Lint, Lshort, Lbyte;
1104 
1105     assert(granularity
1106            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1107 
1108     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1109 
1110     // ??? I don't know if this bit-test-and-branch is the right thing
1111     // to do.  It does a lot of jumping, resulting in several
1112     // mispredicted branches.  It might make more sense to do this
1113     // with something like Duff's device with a single computed branch.
1114 
1115     __ tbz(count, 3 - exact_log2(granularity), Lword);
1116     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1117     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1118     __ bind(Lword);
1119 
1120     if (granularity <= sizeof (jint)) {
1121       __ tbz(count, 2 - exact_log2(granularity), Lint);
1122       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1123       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1124       __ bind(Lint);
1125     }
1126 
1127     if (granularity <= sizeof (jshort)) {
1128       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1129       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1130       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1131       __ bind(Lshort);
1132     }
1133 
1134     if (granularity <= sizeof (jbyte)) {
1135       __ tbz(count, 0, Lbyte);
1136       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1137       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1138       __ bind(Lbyte);
1139     }
1140   }
1141 
1142   Label copy_f, copy_b;
1143 
1144   // All-singing all-dancing memory copy.
1145   //
1146   // Copy count units of memory from s to d.  The size of a unit is
1147   // step, which can be positive or negative depending on the direction
1148   // of copy.  If is_aligned is false, we align the source address.
1149   //
1150 
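  // Roughly: counts of at most 80 bytes (96 with SIMD) are handled by the
  // overlapping load/store sequences below; anything larger aligns the
  // source and calls the bulk copy_f/copy_b routines generated by
  // generate_copy_longs(), with copy_memory_small() cleaning up the tail.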
1151   void copy_memory(bool is_aligned, Register s, Register d,
1152                    Register count, Register tmp, int step) {
1153     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1154     bool is_backwards = step < 0;
1155     int granularity = uabs(step);
1156     const Register t0 = r3, t1 = r4;
1157 
1158     // Copies of at most 80 bytes (96 with SIMD) are done inline. Direction
1159     // doesn't matter because we load all the data before writing anything.
1160     Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
1161     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1162     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1163     const Register send = r17, dend = r18;
1164 
1165     if (PrefetchCopyIntervalInBytes > 0)
1166       __ prfm(Address(s, 0), PLDL1KEEP);
1167     __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
1168     __ br(Assembler::HI, copy_big);
1169 
1170     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1171     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1172 
1173     __ cmp(count, 16/granularity);
1174     __ br(Assembler::LS, copy16);
1175 
1176     __ cmp(count, 64/granularity);
1177     __ br(Assembler::HI, copy80);
1178 
1179     __ cmp(count, 32/granularity);
1180     __ br(Assembler::LS, copy32);
1181 
1182     // 33..64 bytes
1183     if (UseSIMDForMemoryOps) {
1184       __ ldpq(v0, v1, Address(s, 0));
1185       __ ldpq(v2, v3, Address(send, -32));
1186       __ stpq(v0, v1, Address(d, 0));
1187       __ stpq(v2, v3, Address(dend, -32));
1188     } else {
1189       __ ldp(t0, t1, Address(s, 0));
1190       __ ldp(t2, t3, Address(s, 16));
1191       __ ldp(t4, t5, Address(send, -32));
1192       __ ldp(t6, t7, Address(send, -16));
1193 
1194       __ stp(t0, t1, Address(d, 0));
1195       __ stp(t2, t3, Address(d, 16));
1196       __ stp(t4, t5, Address(dend, -32));
1197       __ stp(t6, t7, Address(dend, -16));
1198     }
1199     __ b(finish);
1200 
1201     // 17..32 bytes
1202     __ bind(copy32);
1203     __ ldp(t0, t1, Address(s, 0));
1204     __ ldp(t2, t3, Address(send, -16));
1205     __ stp(t0, t1, Address(d, 0));
1206     __ stp(t2, t3, Address(dend, -16));
1207     __ b(finish);
1208 
1209     // 65..80/96 bytes
1210     // (96 bytes if SIMD because we do 32 bytes per instruction)
1211     __ bind(copy80);
1212     if (UseSIMDForMemoryOps) {
1213       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1214       __ ldpq(v4, v5, Address(send, -32));
1215       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1216       __ stpq(v4, v5, Address(dend, -32));
1217     } else {
1218       __ ldp(t0, t1, Address(s, 0));
1219       __ ldp(t2, t3, Address(s, 16));
1220       __ ldp(t4, t5, Address(s, 32));
1221       __ ldp(t6, t7, Address(s, 48));
1222       __ ldp(t8, t9, Address(send, -16));
1223 
1224       __ stp(t0, t1, Address(d, 0));
1225       __ stp(t2, t3, Address(d, 16));
1226       __ stp(t4, t5, Address(d, 32));
1227       __ stp(t6, t7, Address(d, 48));
1228       __ stp(t8, t9, Address(dend, -16));
1229     }
1230     __ b(finish);
1231 
1232     // 0..16 bytes
1233     __ bind(copy16);
1234     __ cmp(count, 8/granularity);
1235     __ br(Assembler::LO, copy8);
1236 
1237     // 8..16 bytes
1238     __ ldr(t0, Address(s, 0));
1239     __ ldr(t1, Address(send, -8));
1240     __ str(t0, Address(d, 0));
1241     __ str(t1, Address(dend, -8));
1242     __ b(finish);
1243 
1244     if (granularity < 8) {
1245       // 4..7 bytes
1246       __ bind(copy8);
1247       __ tbz(count, 2 - exact_log2(granularity), copy4);
1248       __ ldrw(t0, Address(s, 0));
1249       __ ldrw(t1, Address(send, -4));
1250       __ strw(t0, Address(d, 0));
1251       __ strw(t1, Address(dend, -4));
1252       __ b(finish);
1253       if (granularity < 4) {
1254         // 0..3 bytes
1255         __ bind(copy4);
1256         __ cbz(count, finish); // get rid of 0 case
1257         if (granularity == 2) {
1258           __ ldrh(t0, Address(s, 0));
1259           __ strh(t0, Address(d, 0));
1260         } else { // granularity == 1
1261           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1262           // the first and last byte.
1263           // Handle the 3 byte case by loading and storing base + count/2
1264           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1265           // This does mean in the 1 byte case we load/store the same
1266           // byte 3 times.
1267           __ lsr(count, count, 1);
1268           __ ldrb(t0, Address(s, 0));
1269           __ ldrb(t1, Address(send, -1));
1270           __ ldrb(t2, Address(s, count));
1271           __ strb(t0, Address(d, 0));
1272           __ strb(t1, Address(dend, -1));
1273           __ strb(t2, Address(d, count));
1274         }
1275         __ b(finish);
1276       }
1277     }
1278 
1279     __ bind(copy_big);
1280     if (is_backwards) {
1281       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1282       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1283     }
1284 
1285     // Now that we've got the small case out of the way, we can align
1286     // the source address on a 2-word boundary.
1287 
1288     Label aligned;
1289 
1290     if (is_aligned) {
1291       // We may have to adjust by 1 word to get s 2-word-aligned.
1292       __ tbz(s, exact_log2(wordSize), aligned);
1293       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1294       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1295       __ sub(count, count, wordSize/granularity);
1296     } else {
1297       if (is_backwards) {
1298         __ andr(rscratch2, s, 2 * wordSize - 1);
1299       } else {
1300         __ neg(rscratch2, s);
1301         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1302       }
1303       // rscratch2 is the byte adjustment needed to align s.
1304       __ cbz(rscratch2, aligned);
1305       int shift = exact_log2(granularity);
1306       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1307       __ sub(count, count, rscratch2);
1308 
1309 #if 0
1310       // ?? This code is only correct for a disjoint copy.  It may or
1311       // may not make sense to use it in that case.
1312 
1313       // Copy the first pair; s and d may not be aligned.
1314       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1315       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1316 
1317       // Align s and d, adjust count
1318       if (is_backwards) {
1319         __ sub(s, s, rscratch2);
1320         __ sub(d, d, rscratch2);
1321       } else {
1322         __ add(s, s, rscratch2);
1323         __ add(d, d, rscratch2);
1324       }
1325 #else
1326       copy_memory_small(s, d, rscratch2, rscratch1, step);
1327 #endif
1328     }
1329 
1330     __ bind(aligned);
1331 
1332     // s is now 2-word-aligned.
1333 
1334     // We have a count of units and some trailing bytes.  Adjust the
1335     // count and do a bulk copy of words.
1336     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1337     if (direction == copy_forwards)
1338       __ bl(copy_f);
1339     else
1340       __ bl(copy_b);
1341 
1342     // And the tail.
1343     copy_memory_small(s, d, count, tmp, step);
1344 
1345     if (granularity >= 8) __ bind(copy8);
1346     if (granularity >= 4) __ bind(copy4);
1347     __ bind(finish);
1348   }
1349 
1350 
1351   void clobber_registers() {
1352 #ifdef ASSERT
1353     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1354     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1355     for (Register r = r3; r <= r18; r++)
1356       if (r != rscratch1) __ mov(r, rscratch1);
1357 #endif
1358   }
1359 
1360   // Scan over array at a for count oops, verifying each one.
1361   // Preserves a and count, clobbers rscratch1 and rscratch2.
1362   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1363     Label loop, end;
1364     __ mov(rscratch1, a);
1365     __ mov(rscratch2, zr);
1366     __ bind(loop);
1367     __ cmp(rscratch2, count);
1368     __ br(Assembler::HS, end);
1369     if (size == (size_t)wordSize) {
1370       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1371       __ verify_oop(temp);
1372     } else {
1373       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1374       __ decode_heap_oop(temp); // calls verify_oop
1375     }
1376     __ add(rscratch2, rscratch2, size);
1377     __ b(loop);
1378     __ bind(end);
1379   }
1380 
1381   // Arguments:
1382   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1383   //             ignored
1384   //   is_oop  - true => oop array, so generate store check code
1385   //   name    - stub name string
1386   //
1387   // Inputs:
1388   //   c_rarg0   - source array address
1389   //   c_rarg1   - destination array address
1390   //   c_rarg2   - element count, treated as ssize_t, can be zero
1391   //
1392   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1393   // the hardware handle it.  The two dwords within qwords that span
1394   // cache line boundaries will still be loaded and stored atomically.
1395   //
1396   // Side Effects:
1397   //   disjoint_int_copy_entry is set to the no-overlap entry point
1398   //   used by generate_conjoint_int_oop_copy().
1399   //
1400   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1401                                   const char *name, bool dest_uninitialized = false) {
1402     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1403     RegSet saved_reg = RegSet::of(s, d, count);
1404     __ align(CodeEntryAlignment);
1405     StubCodeMark mark(this, "StubRoutines", name);
1406     address start = __ pc();
1407     __ enter();
1408 
1409     if (entry != NULL) {
1410       *entry = __ pc();
1411       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1412       BLOCK_COMMENT("Entry:");
1413     }
1414 
1415     DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY | ARRAYCOPY_DISJOINT;
1416     if (dest_uninitialized) {
1417       decorators |= AS_DEST_NOT_INITIALIZED;
1418     }
1419     if (aligned) {
1420       decorators |= ARRAYCOPY_ALIGNED;
1421     }
1422 
1423     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1424     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg);
1425 
1426     if (is_oop) {
1427       // save regs before copy_memory
1428       __ push(RegSet::of(d, count), sp);
1429     }
1430     copy_memory(aligned, s, d, count, rscratch1, size);
1431 
1432     if (is_oop) {
1433       __ pop(RegSet::of(d, count), sp);
1434       if (VerifyOops)
1435         verify_oop_array(size, d, count, r16);
1436       __ sub(count, count, 1); // make an inclusive end pointer
1437       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1438     }
1439 
1440     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1441 
1442     __ leave();
1443     __ mov(r0, zr); // return 0
1444     __ ret(lr);
1445 #ifdef BUILTIN_SIM
1446     {
1447       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1448       sim->notifyCompile(const_cast<char*>(name), start);
1449     }
1450 #endif
1451     return start;
1452   }
1453 
1454   // Arguments:
1455   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1456   //             ignored
1457   //   is_oop  - true => oop array, so generate store check code
1458   //   name    - stub name string
1459   //
1460   // Inputs:
1461   //   c_rarg0   - source array address
1462   //   c_rarg1   - destination array address
1463   //   c_rarg2   - element count, treated as ssize_t, can be zero
1464   //
1465   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1466   // the hardware handle it.  The two dwords within qwords that span
1467   // cache line boundaries will still be loaded and stored atomically.
1468   //
1469   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1470                                  address *entry, const char *name,
1471                                  bool dest_uninitialized = false) {
1472     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1473     RegSet saved_regs = RegSet::of(s, d, count);
1474     StubCodeMark mark(this, "StubRoutines", name);
1475     address start = __ pc();
1476     __ enter();
1477 
1478     if (entry != NULL) {
1479       *entry = __ pc();
1480       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1481       BLOCK_COMMENT("Entry:");
1482     }
1483 
1484     // use fwd copy when (d-s) above_equal (count*size)
1485     __ sub(rscratch1, d, s);
1486     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1487     __ br(Assembler::HS, nooverlap_target);
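         // The unsigned compare also covers d < s, since d - s then wraps to a
         // large unsigned value; roughly:
         //   if ((uint64_t)(d - s) >= ((uint64_t)count << log2(size)))
         //     goto nooverlap_target;   // forward (disjoint) copy is safe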
1488 
1489     DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY;
1490     if (dest_uninitialized) {
1491       decorators |= AS_DEST_NOT_INITIALIZED;
1492     }
1493     if (aligned) {
1494       decorators |= ARRAYCOPY_ALIGNED;
1495     }
1496 
1497     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1498     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs);
1499 
1500     if (is_oop) {
1501       // save regs before copy_memory
1502       __ push(RegSet::of(d, count), sp);
1503     }
1504     copy_memory(aligned, s, d, count, rscratch1, -size);
1505     if (is_oop) {
1506       __ pop(RegSet::of(d, count), sp);
1507       if (VerifyOops)
1508         verify_oop_array(size, d, count, r16);
1509       __ sub(count, count, 1); // make an inclusive end pointer
1510       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1511     }
1512     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1513     __ leave();
1514     __ mov(r0, zr); // return 0
1515     __ ret(lr);
1516 #ifdef BUILTIN_SIM
1517     {
1518       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1519       sim->notifyCompile(const_cast<char*>(name), start);
1520     }
1521 #endif
1522     return start;
1523   }
1524 
1525   // Arguments:
1526   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1527   //             ignored
1528   //   name    - stub name string
1529   //
1530   // Inputs:
1531   //   c_rarg0   - source array address
1532   //   c_rarg1   - destination array address
1533   //   c_rarg2   - element count, treated as ssize_t, can be zero
1534   //
1535   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1536   // we let the hardware handle it.  The one to eight bytes within words,
1537   // dwords or qwords that span cache line boundaries will still be loaded
1538   // and stored atomically.
1539   //
1540   // Side Effects:
1548   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1549   //   used by generate_conjoint_byte_copy().
1550   //
1551   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1552     const bool not_oop = false;
1553     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1554   }
1555 
1556   // Arguments:
1557   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1558   //             ignored
1559   //   name    - stub name string
1560   //
1561   // Inputs:
1562   //   c_rarg0   - source array address
1563   //   c_rarg1   - destination array address
1564   //   c_rarg2   - element count, treated as ssize_t, can be zero
1565   //
1566   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1567   // we let the hardware handle it.  The one to eight bytes within words,
1568   // dwords or qwords that span cache line boundaries will still be loaded
1569   // and stored atomically.
1570   //
1571   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1572                                       address* entry, const char *name) {
1573     const bool not_oop = false;
1574     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1575   }
1576 
1577   // Arguments:
1578   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1579   //             ignored
1580   //   name    - stub name string
1581   //
1582   // Inputs:
1583   //   c_rarg0   - source array address
1584   //   c_rarg1   - destination array address
1585   //   c_rarg2   - element count, treated as ssize_t, can be zero
1586   //
1587   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1588   // let the hardware handle it.  The two or four words within dwords
1589   // or qwords that span cache line boundaries will still be loaded
1590   // and stored atomically.
1591   //
1592   // Side Effects:
1593   //   disjoint_short_copy_entry is set to the no-overlap entry point
1594   //   used by generate_conjoint_short_copy().
1595   //
1596   address generate_disjoint_short_copy(bool aligned,
1597                                        address* entry, const char *name) {
1598     const bool not_oop = false;
1599     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1600   }
1601 
1602   // Arguments:
1603   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1604   //             ignored
1605   //   name    - stub name string
1606   //
1607   // Inputs:
1608   //   c_rarg0   - source array address
1609   //   c_rarg1   - destination array address
1610   //   c_rarg2   - element count, treated as ssize_t, can be zero
1611   //
1612   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1613   // let the hardware handle it.  The two or four words within dwords
1614   // or qwords that span cache line boundaries will still be loaded
1615   // and stored atomically.
1616   //
1617   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1618                                        address *entry, const char *name) {
1619     const bool not_oop = false;
1620     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1621   }
1622 
1623   // Arguments:
1624   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1625   //             ignored
1626   //   name    - stub name string
1627   //
1628   // Inputs:
1629   //   c_rarg0   - source array address
1630   //   c_rarg1   - destination array address
1631   //   c_rarg2   - element count, treated as ssize_t, can be zero
1632   //
1633   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1634   // the hardware handle it.  The two dwords within qwords that span
1635   // cache line boundaries will still be loaded and stored atomically.
1636   //
1637   // Side Effects:
1638   //   disjoint_int_copy_entry is set to the no-overlap entry point
1639   //   used by generate_conjoint_int_copy().
1640   //
1641   address generate_disjoint_int_copy(bool aligned, address *entry,
1642                                          const char *name, bool dest_uninitialized = false) {
1643     const bool not_oop = false;
1644     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1645   }
1646 
1647   // Arguments:
1648   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1649   //             ignored
1650   //   name    - stub name string
1651   //
1652   // Inputs:
1653   //   c_rarg0   - source array address
1654   //   c_rarg1   - destination array address
1655   //   c_rarg2   - element count, treated as ssize_t, can be zero
1656   //
1657   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1658   // the hardware handle it.  The two dwords within qwords that span
1659   // cache line boundaries will still be loaded and stored atomically.
1660   //
1661   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1662                                      address *entry, const char *name,
1663                                      bool dest_uninitialized = false) {
1664     const bool not_oop = false;
1665     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1666   }
1667 
1668 
1669   // Arguments:
1670   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1671   //             ignored
1672   //   name    - stub name string
1673   //
1674   // Inputs:
1675   //   c_rarg0   - source array address
1676   //   c_rarg1   - destination array address
1677   //   c_rarg2   - element count, treated as size_t, can be zero
1678   //
1679   // Side Effects:
1680   //   disjoint_long_copy_entry is set to the no-overlap entry point
1681   //   used by generate_conjoint_long_copy().
1682   //
1683   address generate_disjoint_long_copy(bool aligned, address *entry,
1684                                           const char *name, bool dest_uninitialized = false) {
1685     const bool not_oop = false;
1686     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1687   }
1688 
1689   // Arguments:
1690   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1691   //             ignored
1692   //   name    - stub name string
1693   //
1694   // Inputs:
1695   //   c_rarg0   - source array address
1696   //   c_rarg1   - destination array address
1697   //   c_rarg2   - element count, treated as size_t, can be zero
1698   //
1699   address generate_conjoint_long_copy(bool aligned,
1700                                       address nooverlap_target, address *entry,
1701                                       const char *name, bool dest_uninitialized = false) {
1702     const bool not_oop = false;
1703     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1704   }
1705 
1706   // Arguments:
1707   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1708   //             ignored
1709   //   name    - stub name string
1710   //
1711   // Inputs:
1712   //   c_rarg0   - source array address
1713   //   c_rarg1   - destination array address
1714   //   c_rarg2   - element count, treated as size_t, can be zero
1715   //
1716   // Side Effects:
1717   //   disjoint_oop_copy_entry is set to the no-overlap entry point
1718   //   used by generate_conjoint_oop_copy().
1719   //
1720   address generate_disjoint_oop_copy(bool aligned, address *entry,
1721                                      const char *name, bool dest_uninitialized) {
1722     const bool is_oop = true;
1723     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1724     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1725   }
1726 
1727   // Arguments:
1728   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1729   //             ignored
1730   //   name    - stub name string
1731   //
1732   // Inputs:
1733   //   c_rarg0   - source array address
1734   //   c_rarg1   - destination array address
1735   //   c_rarg2   - element count, treated as size_t, can be zero
1736   //
1737   address generate_conjoint_oop_copy(bool aligned,
1738                                      address nooverlap_target, address *entry,
1739                                      const char *name, bool dest_uninitialized) {
1740     const bool is_oop = true;
1741     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1742     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1743                                   name, dest_uninitialized);
1744   }
1745 
1746 
1747   // Helper for generating a dynamic type check.
1748   // Smashes rscratch1.
1749   void generate_type_check(Register sub_klass,
1750                            Register super_check_offset,
1751                            Register super_klass,
1752                            Label& L_success) {
1753     assert_different_registers(sub_klass, super_check_offset, super_klass);
1754 
1755     BLOCK_COMMENT("type_check:");
1756 
1757     Label L_miss;
1758 
1759     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1760                                      super_check_offset);
1761     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1762 
1763     // Fall through on failure!
1764     __ BIND(L_miss);
1765   }
1766 
1767   //
1768   //  Generate checkcasting array copy stub
1769   //
1770   //  Input:
1771   //    c_rarg0   - source array address
1772   //    c_rarg1   - destination array address
1773   //    c_rarg2   - element count, treated as ssize_t, can be zero
1774   //    c_rarg3   - size_t ckoff (super_check_offset)
1775   //    c_rarg4   - oop ckval (super_klass)
1776   //
1777   //  Output:
1778   //    r0 ==  0  -  success
1779   //    r0 == -1^K - failure, where K is partial transfer count
1780   //
1781   address generate_checkcast_copy(const char *name, address *entry,
1782                                   bool dest_uninitialized = false) {
1783 
1784     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1785 
1786     // Input registers (after setup_arg_regs)
1787     const Register from        = c_rarg0;   // source array address
1788     const Register to          = c_rarg1;   // destination array address
1789     const Register count       = c_rarg2;   // elements count
1790     const Register ckoff       = c_rarg3;   // super_check_offset
1791     const Register ckval       = c_rarg4;   // super_klass
1792 
1793     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1794     RegSet wb_post_saved_regs = RegSet::of(count);
1795 
1796     // Registers used as temps (r18, r19, r20 are save-on-entry)
1797     const Register count_save  = r21;       // orig elements count
1798     const Register start_to    = r20;       // destination array start address
1799     const Register copied_oop  = r18;       // actual oop copied
1800     const Register r19_klass   = r19;       // oop._klass
1801 
1802     //---------------------------------------------------------------
1803     // Assembler stub will be used for this call to arraycopy
1804     // if the two arrays are subtypes of Object[] but the
1805     // destination array type is not equal to or a supertype
1806     // of the source type.  Each element must be separately
1807     // checked.
1808 
1809     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1810                                copied_oop, r19_klass, count_save);
1811 
1812     __ align(CodeEntryAlignment);
1813     StubCodeMark mark(this, "StubRoutines", name);
1814     address start = __ pc();
1815 
1816     __ enter(); // required for proper stackwalking of RuntimeStub frame
1817 
1818 #ifdef ASSERT
1819     // caller guarantees that the arrays really are different
1820     // otherwise, we would have to make conjoint checks
1821     { Label L;
1822       array_overlap_test(L, TIMES_OOP);
1823       __ stop("checkcast_copy within a single array");
1824       __ bind(L);
1825     }
1826 #endif //ASSERT
1827 
1828     // Caller of this entry point must set up the argument registers.
1829     if (entry != NULL) {
1830       *entry = __ pc();
1831       BLOCK_COMMENT("Entry:");
1832     }
1833 
1834      // Empty array:  Nothing to do.
1835     __ cbz(count, L_done);
1836 
1837     __ push(RegSet::of(r18, r19, r20, r21), sp);
1838 
1839 #ifdef ASSERT
1840     BLOCK_COMMENT("assert consistent ckoff/ckval");
1841     // The ckoff and ckval must be mutually consistent,
1842     // even though caller generates both.
1843     { Label L;
1844       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1845       __ ldrw(start_to, Address(ckval, sco_offset));
1846       __ cmpw(ckoff, start_to);
1847       __ br(Assembler::EQ, L);
1848       __ stop("super_check_offset inconsistent");
1849       __ bind(L);
1850     }
1851 #endif //ASSERT
1852 
1853     DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY | ARRAYCOPY_CHECKCAST;
1854     bool is_oop = true;
1855     if (dest_uninitialized) {
1856       decorators |= AS_DEST_NOT_INITIALIZED;
1857     }
1858 
1859     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1860     bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs);
1861 
1862     // save the original count
1863     __ mov(count_save, count);
1864 
1865     // Copy from low to high addresses
1866     __ mov(start_to, to);              // Save destination array start address
1867     __ b(L_load_element);
1868 
1869     // ======== begin loop ========
1870     // (Loop is rotated; its entry is L_load_element.)
1871     // Loop control:
1872     //   for (; count != 0; count--) {
1873     //     copied_oop = load_heap_oop(from++);
1874     //     ... generate_type_check ...;
1875     //     store_heap_oop(to++, copied_oop);
1876     //   }
1877     __ align(OptoLoopAlignment);
1878 
1879     __ BIND(L_store_element);
1880     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1881     __ sub(count, count, 1);
1882     __ cbz(count, L_do_card_marks);
1883 
1884     // ======== loop entry is here ========
1885     __ BIND(L_load_element);
1886     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1887     __ cbz(copied_oop, L_store_element);
1888 
1889     __ load_klass(r19_klass, copied_oop);// query the object klass
1890     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1891     // ======== end loop ========
1892 
1893     // It was a real error; we must depend on the caller to finish the job.
1894     // Register count = remaining oops, count_save = total oops.
1895     // Emit GC store barriers for the oops we have copied and report
1896     // their number to the caller.
1897 
1898     __ subs(count, count_save, count);     // K = partially copied oop count
1899     __ eon(count, count, zr);                   // report (-1^K) to caller
1900     __ br(Assembler::EQ, L_done_pop);
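         // If K == 0 (the EQ case from the subs above) no oops were stored, so
         // the card-marking epilogue is skipped.  Either way r0 ends up holding
         // ~K, so the caller recovers the partial transfer count as ~r0,
         // matching the "r0 == -1^K" convention in the stub header.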
1901 
1902     __ BIND(L_do_card_marks);
1903     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1904     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1, wb_post_saved_regs);
1905 
1906     __ bind(L_done_pop);
1907     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1908     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1909 
1910     __ bind(L_done);
1911     __ mov(r0, count);
1912     __ leave();
1913     __ ret(lr);
1914 
1915     return start;
1916   }
1917 
1918   // Perform range checks on the proposed arraycopy.
1919   // Kills temp, but nothing else.
1920   // Also, clean the sign bits of src_pos and dst_pos.
1921   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1922                               Register src_pos, // source position (c_rarg1)
1923                               Register dst,     // destination array oop (c_rarg2)
1924                               Register dst_pos, // destination position (c_rarg3)
1925                               Register length,
1926                               Register temp,
1927                               Label& L_failed) {
1928     BLOCK_COMMENT("arraycopy_range_checks:");
1929 
1930     assert_different_registers(rscratch1, temp);
1931 
1932     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1933     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1934     __ addw(temp, length, src_pos);
1935     __ cmpw(temp, rscratch1);
1936     __ br(Assembler::HI, L_failed);
1937 
1938     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1939     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1940     __ addw(temp, length, dst_pos);
1941     __ cmpw(temp, rscratch1);
1942     __ br(Assembler::HI, L_failed);
1943 
1944     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1945     __ movw(src_pos, src_pos);
1946     __ movw(dst_pos, dst_pos);
1947 
1948     BLOCK_COMMENT("arraycopy_range_checks done");
1949   }
1950 
1951   // These stubs get called from some dumb test routine.
1952   // I'll write them properly when they're called from
1953   // something that's actually doing something.
1954   static void fake_arraycopy_stub(address src, address dst, int count) {
1955     assert(count == 0, "huh?");
1956   }
1957 
1958 
1959   //
1960   //  Generate 'unsafe' array copy stub
1961   //  Though just as safe as the other stubs, it takes an unscaled
1962   //  size_t argument instead of an element count.
1963   //
1964   //  Input:
1965   //    c_rarg0   - source array address
1966   //    c_rarg1   - destination array address
1967   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1968   //
1969   // Examines the alignment of the operands and dispatches
1970   // to a long, int, short, or byte copy loop.
1971   //
1972   address generate_unsafe_copy(const char *name,
1973                                address byte_copy_entry,
1974                                address short_copy_entry,
1975                                address int_copy_entry,
1976                                address long_copy_entry) {
1977     Label L_long_aligned, L_int_aligned, L_short_aligned;
1978     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1979 
1980     __ align(CodeEntryAlignment);
1981     StubCodeMark mark(this, "StubRoutines", name);
1982     address start = __ pc();
1983     __ enter(); // required for proper stackwalking of RuntimeStub frame
1984 
1985     // bump this on entry, not on exit:
1986     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1987 
1988     __ orr(rscratch1, s, d);
1989     __ orr(rscratch1, rscratch1, count);
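         // rscratch1 now has a low bit set iff any of s, d or count has that
         // bit set, so testing its low bits checks the alignment of the source,
         // the destination and the byte count all at once.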
1990 
1991     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1992     __ cbz(rscratch1, L_long_aligned);
1993     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1994     __ cbz(rscratch1, L_int_aligned);
1995     __ tbz(rscratch1, 0, L_short_aligned);
1996     __ b(RuntimeAddress(byte_copy_entry));
1997 
1998     __ BIND(L_short_aligned);
1999     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2000     __ b(RuntimeAddress(short_copy_entry));
2001     __ BIND(L_int_aligned);
2002     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2003     __ b(RuntimeAddress(int_copy_entry));
2004     __ BIND(L_long_aligned);
2005     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2006     __ b(RuntimeAddress(long_copy_entry));
2007 
2008     return start;
2009   }
2010 
2011   //
2012   //  Generate generic array copy stubs
2013   //
2014   //  Input:
2015   //    c_rarg0    -  src oop
2016   //    c_rarg1    -  src_pos (32-bits)
2017   //    c_rarg2    -  dst oop
2018   //    c_rarg3    -  dst_pos (32-bits)
2019   //    c_rarg4    -  element count (32-bits)
2020   //
2021   //  Output:
2022   //    r0 ==  0  -  success
2023   //    r0 == -1^K - failure, where K is partial transfer count
2024   //
2025   address generate_generic_copy(const char *name,
2026                                 address byte_copy_entry, address short_copy_entry,
2027                                 address int_copy_entry, address oop_copy_entry,
2028                                 address long_copy_entry, address checkcast_copy_entry) {
2029 
2030     Label L_failed, L_failed_0, L_objArray;
2031     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2032 
2033     // Input registers
2034     const Register src        = c_rarg0;  // source array oop
2035     const Register src_pos    = c_rarg1;  // source position
2036     const Register dst        = c_rarg2;  // destination array oop
2037     const Register dst_pos    = c_rarg3;  // destination position
2038     const Register length     = c_rarg4;
2039 
2040     StubCodeMark mark(this, "StubRoutines", name);
2041 
2042     __ align(CodeEntryAlignment);
2043     address start = __ pc();
2044 
2045     __ enter(); // required for proper stackwalking of RuntimeStub frame
2046 
2047     // bump this on entry, not on exit:
2048     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2049 
2050     //-----------------------------------------------------------------------
2051     // Assembler stub will be used for this call to arraycopy
2052     // if the following conditions are met:
2053     //
2054     // (1) src and dst must not be null.
2055     // (2) src_pos must not be negative.
2056     // (3) dst_pos must not be negative.
2057     // (4) length  must not be negative.
2058     // (5) src klass and dst klass should be the same and not NULL.
2059     // (6) src and dst should be arrays.
2060     // (7) src_pos + length must not exceed length of src.
2061     // (8) dst_pos + length must not exceed length of dst.
2062     //
2063 
2064     //  if (src == NULL) return -1;
2065     __ cbz(src, L_failed);
2066 
2067     //  if (src_pos < 0) return -1;
2068     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2069 
2070     //  if (dst == NULL) return -1;
2071     __ cbz(dst, L_failed);
2072 
2073     //  if (dst_pos < 0) return -1;
2074     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2075 
2076     // registers used as temp
2077     const Register scratch_length    = r16; // elements count to copy
2078     const Register scratch_src_klass = r17; // array klass
2079     const Register lh                = r18; // layout helper
2080 
2081     //  if (length < 0) return -1;
2082     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2083     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2084 
2085     __ load_klass(scratch_src_klass, src);
2086 #ifdef ASSERT
2087     //  assert(src->klass() != NULL);
2088     {
2089       BLOCK_COMMENT("assert klasses not null {");
2090       Label L1, L2;
2091       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2092       __ bind(L1);
2093       __ stop("broken null klass");
2094       __ bind(L2);
2095       __ load_klass(rscratch1, dst);
2096       __ cbz(rscratch1, L1);     // this would be broken also
2097       BLOCK_COMMENT("} assert klasses not null done");
2098     }
2099 #endif
2100 
2101     // Load layout helper (32-bits)
2102     //
2103     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2104     // 32        30    24            16              8     2                 0
2105     //
2106     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2107     //
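         // Rough sketch of how the fields are decoded below (constants from
         // Klass, field positions as in the diagram above):
         //   tag        = lh >> Klass::_lh_array_tag_shift;      // 0x3, 0x2 or 0x0
         //   hsize      = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
         //   log2_esize = lh & Klass::_lh_log2_element_size_mask;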
2108 
2109     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2110 
2111     // Handle objArrays completely differently...
2112     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2113     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2114     __ movw(rscratch1, objArray_lh);
2115     __ eorw(rscratch2, lh, rscratch1);
2116     __ cbzw(rscratch2, L_objArray);
2117 
2118     //  if (src->klass() != dst->klass()) return -1;
2119     __ load_klass(rscratch2, dst);
2120     __ eor(rscratch2, rscratch2, scratch_src_klass);
2121     __ cbnz(rscratch2, L_failed);
2122 
2123     //  if (!src->is_Array()) return -1;
2124     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2125 
2126     // At this point, it is known to be a typeArray (array_tag 0x3).
2127 #ifdef ASSERT
2128     {
2129       BLOCK_COMMENT("assert primitive array {");
2130       Label L;
2131       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2132       __ cmpw(lh, rscratch2);
2133       __ br(Assembler::GE, L);
2134       __ stop("must be a primitive array");
2135       __ bind(L);
2136       BLOCK_COMMENT("} assert primitive array done");
2137     }
2138 #endif
2139 
2140     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2141                            rscratch2, L_failed);
2142 
2143     // TypeArrayKlass
2144     //
2145     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2146     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2147     //
2148 
2149     const Register rscratch1_offset = rscratch1;    // array offset
2150     const Register r18_elsize = lh; // element size
2151 
2152     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2153            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2154     __ add(src, src, rscratch1_offset);           // src array offset
2155     __ add(dst, dst, rscratch1_offset);           // dst array offset
2156     BLOCK_COMMENT("choose copy loop based on element size");
2157 
2158     // next registers should be set before the jump to corresponding stub
2159     const Register from     = c_rarg0;  // source array address
2160     const Register to       = c_rarg1;  // destination array address
2161     const Register count    = c_rarg2;  // elements count
2162 
2163     // 'from', 'to', 'count' registers should be set in such order
2164     // since they are the same as 'src', 'src_pos', 'dst'.
2165 
2166     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2167 
2168     // The possible values of elsize are 0-3, i.e. exact_log2(element
2169     // size in bytes).  We do a simple bitwise binary search.
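         // Dispatch implied by the two tbnz tests below:
         //   bit1 bit0   elsize   copy loop
         //    0    0       0      L_copy_bytes
         //    0    1       1      L_copy_shorts
         //    1    0       2      L_copy_ints
         //    1    1       3      L_copy_longs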
2170   __ BIND(L_copy_bytes);
2171     __ tbnz(r18_elsize, 1, L_copy_ints);
2172     __ tbnz(r18_elsize, 0, L_copy_shorts);
2173     __ lea(from, Address(src, src_pos));// src_addr
2174     __ lea(to,   Address(dst, dst_pos));// dst_addr
2175     __ movw(count, scratch_length); // length
2176     __ b(RuntimeAddress(byte_copy_entry));
2177 
2178   __ BIND(L_copy_shorts);
2179     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2180     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2181     __ movw(count, scratch_length); // length
2182     __ b(RuntimeAddress(short_copy_entry));
2183 
2184   __ BIND(L_copy_ints);
2185     __ tbnz(r18_elsize, 0, L_copy_longs);
2186     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2187     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2188     __ movw(count, scratch_length); // length
2189     __ b(RuntimeAddress(int_copy_entry));
2190 
2191   __ BIND(L_copy_longs);
2192 #ifdef ASSERT
2193     {
2194       BLOCK_COMMENT("assert long copy {");
2195       Label L;
2196       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2197       __ cmpw(r18_elsize, LogBytesPerLong);
2198       __ br(Assembler::EQ, L);
2199       __ stop("must be long copy, but elsize is wrong");
2200       __ bind(L);
2201       BLOCK_COMMENT("} assert long copy done");
2202     }
2203 #endif
2204     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2205     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2206     __ movw(count, scratch_length); // length
2207     __ b(RuntimeAddress(long_copy_entry));
2208 
2209     // ObjArrayKlass
2210   __ BIND(L_objArray);
2211     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2212 
2213     Label L_plain_copy, L_checkcast_copy;
2214     //  test array classes for subtyping
2215     __ load_klass(r18, dst);
2216     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2217     __ br(Assembler::NE, L_checkcast_copy);
2218 
2219     // Identically typed arrays can be copied without element-wise checks.
2220     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2221                            rscratch2, L_failed);
2222 
2223     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2224     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2225     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2226     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2227     __ movw(count, scratch_length); // length
2228   __ BIND(L_plain_copy);
2229     __ b(RuntimeAddress(oop_copy_entry));
2230 
2231   __ BIND(L_checkcast_copy);
2232     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2233     {
2234       // Before looking at dst.length, make sure dst is also an objArray.
2235       __ ldrw(rscratch1, Address(r18, lh_offset));
2236       __ movw(rscratch2, objArray_lh);
2237       __ eorw(rscratch1, rscratch1, rscratch2);
2238       __ cbnzw(rscratch1, L_failed);
2239 
2240       // It is safe to examine both src.length and dst.length.
2241       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2242                              r18, L_failed);
2243 
2244       const Register rscratch2_dst_klass = rscratch2;
2245       __ load_klass(rscratch2_dst_klass, dst); // reload
2246 
2247       // Marshal the base address arguments now, freeing registers.
2248       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2249       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2250       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2251       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2252       __ movw(count, length);           // length (reloaded)
2253       Register sco_temp = c_rarg3;      // this register is free now
2254       assert_different_registers(from, to, count, sco_temp,
2255                                  rscratch2_dst_klass, scratch_src_klass);
2256       // assert_clean_int(count, sco_temp);
2257 
2258       // Generate the type check.
2259       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2260       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2261       // assert_clean_int(sco_temp, r18);
2262       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2263 
2264       // Fetch destination element klass from the ObjArrayKlass header.
2265       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2266       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2267       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2268 
2269       // the checkcast_copy loop needs two extra arguments:
2270       assert(c_rarg3 == sco_temp, "#3 already in place");
2271       // Set up arguments for checkcast_copy_entry.
2272       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2273       __ b(RuntimeAddress(checkcast_copy_entry));
2274     }
2275 
2276   __ BIND(L_failed);
2277     __ mov(r0, -1);
2278     __ leave();   // required for proper stackwalking of RuntimeStub frame
2279     __ ret(lr);
2280 
2281     return start;
2282   }
2283 
2284   //
2285   // Generate stub for array fill. If "aligned" is true, the
2286   // "to" address is assumed to be heapword aligned.
2287   //
2288   // Arguments for generated stub:
2289   //   to:    c_rarg0
2290   //   value: c_rarg1
2291   //   count: c_rarg2 treated as signed
2292   //
2293   address generate_fill(BasicType t, bool aligned, const char *name) {
2294     __ align(CodeEntryAlignment);
2295     StubCodeMark mark(this, "StubRoutines", name);
2296     address start = __ pc();
2297 
2298     BLOCK_COMMENT("Entry:");
2299 
2300     const Register to        = c_rarg0;  // source array address
2301     const Register value     = c_rarg1;  // value
2302     const Register count     = c_rarg2;  // elements count
2303 
2304     const Register bz_base = r10;        // base for block_zero routine
2305     const Register cnt_words = r11;      // temp register
2306 
2307     __ enter();
2308 
2309     Label L_fill_elements, L_exit1;
2310 
2311     int shift = -1;
2312     switch (t) {
2313       case T_BYTE:
2314         shift = 0;
2315         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2316         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2317         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2318         __ br(Assembler::LO, L_fill_elements);
2319         break;
2320       case T_SHORT:
2321         shift = 1;
2322         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2323         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2324         __ br(Assembler::LO, L_fill_elements);
2325         break;
2326       case T_INT:
2327         shift = 2;
2328         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2329         __ br(Assembler::LO, L_fill_elements);
2330         break;
2331       default: ShouldNotReachHere();
2332     }
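         // 'value' now holds the fill pattern replicated to 32 bits (via the
         // bfi sequences above for byte and short fills); it is widened to
         // 64 bits just before the bulk word fill further down.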
2333 
2334     // Align source address at 8 bytes address boundary.
2335     Label L_skip_align1, L_skip_align2, L_skip_align4;
2336     if (!aligned) {
2337       switch (t) {
2338         case T_BYTE:
2339           // A one-byte misalignment happens only for byte arrays.
2340           __ tbz(to, 0, L_skip_align1);
2341           __ strb(value, Address(__ post(to, 1)));
2342           __ subw(count, count, 1);
2343           __ bind(L_skip_align1);
2344           // Fallthrough
2345         case T_SHORT:
2346           // A two-byte misalignment happens only for byte and short (char) arrays.
2347           __ tbz(to, 1, L_skip_align2);
2348           __ strh(value, Address(__ post(to, 2)));
2349           __ subw(count, count, 2 >> shift);
2350           __ bind(L_skip_align2);
2351           // Fallthrough
2352         case T_INT:
2353           // Align to 8 bytes, we know we are 4 byte aligned to start.
2354           __ tbz(to, 2, L_skip_align4);
2355           __ strw(value, Address(__ post(to, 4)));
2356           __ subw(count, count, 4 >> shift);
2357           __ bind(L_skip_align4);
2358           break;
2359         default: ShouldNotReachHere();
2360       }
2361     }
2362 
2363     //
2364     //  Fill large chunks
2365     //
2366     __ lsrw(cnt_words, count, 3 - shift); // number of words
2367     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2368     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
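         // cnt_words = count >> (3 - shift) is the number of 8-byte words to
         // fill; count now keeps only the tail elements (less than one word's
         // worth), handled after the bulk fill.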
2369     if (UseBlockZeroing) {
2370       Label non_block_zeroing, rest;
2371       // If the fill value is zero we can use the fast zero_words().
2372       __ cbnz(value, non_block_zeroing);
2373       __ mov(bz_base, to);
2374       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2375       __ zero_words(bz_base, cnt_words);
2376       __ b(rest);
2377       __ bind(non_block_zeroing);
2378       __ fill_words(to, cnt_words, value);
2379       __ bind(rest);
2380     } else {
2381       __ fill_words(to, cnt_words, value);
2382     }
2383 
2384     // Remaining count is less than 8 bytes. Fill it by a single store.
2385     // Note that the total length is no less than 8 bytes.
2386     if (t == T_BYTE || t == T_SHORT) {
2387       Label L_exit1;
2388       __ cbzw(count, L_exit1);
2389       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2390       __ str(value, Address(to, -8));    // overwrite some elements
2391       __ bind(L_exit1);
2392       __ leave();
2393       __ ret(lr);
2394     }
2395 
2396     // Handle copies less than 8 bytes.
2397     Label L_fill_2, L_fill_4, L_exit2;
2398     __ bind(L_fill_elements);
2399     switch (t) {
2400       case T_BYTE:
2401         __ tbz(count, 0, L_fill_2);
2402         __ strb(value, Address(__ post(to, 1)));
2403         __ bind(L_fill_2);
2404         __ tbz(count, 1, L_fill_4);
2405         __ strh(value, Address(__ post(to, 2)));
2406         __ bind(L_fill_4);
2407         __ tbz(count, 2, L_exit2);
2408         __ strw(value, Address(to));
2409         break;
2410       case T_SHORT:
2411         __ tbz(count, 0, L_fill_4);
2412         __ strh(value, Address(__ post(to, 2)));
2413         __ bind(L_fill_4);
2414         __ tbz(count, 1, L_exit2);
2415         __ strw(value, Address(to));
2416         break;
2417       case T_INT:
2418         __ cbzw(count, L_exit2);
2419         __ strw(value, Address(to));
2420         break;
2421       default: ShouldNotReachHere();
2422     }
2423     __ bind(L_exit2);
2424     __ leave();
2425     __ ret(lr);
2426     return start;
2427   }
2428 
2429   void generate_arraycopy_stubs() {
2430     address entry;
2431     address entry_jbyte_arraycopy;
2432     address entry_jshort_arraycopy;
2433     address entry_jint_arraycopy;
2434     address entry_oop_arraycopy;
2435     address entry_jlong_arraycopy;
2436     address entry_checkcast_arraycopy;
2437 
2438     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2439     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2440 
2441     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2442 
2443     //*** jbyte
2444     // Always need aligned and unaligned versions
2445     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2446                                                                                   "jbyte_disjoint_arraycopy");
2447     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2448                                                                                   &entry_jbyte_arraycopy,
2449                                                                                   "jbyte_arraycopy");
2450     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2451                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2452     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2453                                                                                   "arrayof_jbyte_arraycopy");
2454 
2455     //*** jshort
2456     // Always need aligned and unaligned versions
2457     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2458                                                                                     "jshort_disjoint_arraycopy");
2459     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2460                                                                                     &entry_jshort_arraycopy,
2461                                                                                     "jshort_arraycopy");
2462     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2463                                                                                     "arrayof_jshort_disjoint_arraycopy");
2464     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2465                                                                                     "arrayof_jshort_arraycopy");
2466 
2467     //*** jint
2468     // Aligned versions
2469     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2470                                                                                 "arrayof_jint_disjoint_arraycopy");
2471     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2472                                                                                 "arrayof_jint_arraycopy");
2473     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2474     // entry_jint_arraycopy always points to the unaligned version
2475     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2476                                                                                 "jint_disjoint_arraycopy");
2477     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2478                                                                                 &entry_jint_arraycopy,
2479                                                                                 "jint_arraycopy");
2480 
2481     //*** jlong
2482     // It is always aligned
2483     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2484                                                                                   "arrayof_jlong_disjoint_arraycopy");
2485     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2486                                                                                   "arrayof_jlong_arraycopy");
2487     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2488     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2489 
2490     //*** oops
2491     {
2492       // With compressed oops we need unaligned versions; notice that
2493       // we overwrite entry_oop_arraycopy.
2494       bool aligned = !UseCompressedOops;
2495 
2496       StubRoutines::_arrayof_oop_disjoint_arraycopy
2497         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2498                                      /*dest_uninitialized*/false);
2499       StubRoutines::_arrayof_oop_arraycopy
2500         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2501                                      /*dest_uninitialized*/false);
2502       // Aligned versions without pre-barriers
2503       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2504         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2505                                      /*dest_uninitialized*/true);
2506       StubRoutines::_arrayof_oop_arraycopy_uninit
2507         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2508                                      /*dest_uninitialized*/true);
2509     }
2510 
2511     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2512     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2513     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2514     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2515 
2516     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2517     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2518                                                                         /*dest_uninitialized*/true);
2519 
2520     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2521                                                               entry_jbyte_arraycopy,
2522                                                               entry_jshort_arraycopy,
2523                                                               entry_jint_arraycopy,
2524                                                               entry_jlong_arraycopy);
2525 
2526     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2527                                                                entry_jbyte_arraycopy,
2528                                                                entry_jshort_arraycopy,
2529                                                                entry_jint_arraycopy,
2530                                                                entry_oop_arraycopy,
2531                                                                entry_jlong_arraycopy,
2532                                                                entry_checkcast_arraycopy);
2533 
2534     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2535     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2536     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2537     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2538     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2539     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2540   }
2541 
2542   void generate_math_stubs() { Unimplemented(); }
2543 
2544   // Arguments:
2545   //
2546   // Inputs:
2547   //   c_rarg0   - source byte array address
2548   //   c_rarg1   - destination byte array address
2549   //   c_rarg2   - K (key) in little endian int array
2550   //
2551   address generate_aescrypt_encryptBlock() {
2552     __ align(CodeEntryAlignment);
2553     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2554 
2555     Label L_doLast;
2556 
2557     const Register from        = c_rarg0;  // source array address
2558     const Register to          = c_rarg1;  // destination array address
2559     const Register key         = c_rarg2;  // key array address
2560     const Register keylen      = rscratch1;
2561 
2562     address start = __ pc();
2563     __ enter();
2564 
2565     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
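         // keylen is the expanded key length in ints: 44, 52 or 60 for
         // AES-128, AES-192 and AES-256 respectively, so the comparisons
         // against 44 and 52 below decide how many rounds to run.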
2566 
2567     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2568 
2569     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2570     __ rev32(v1, __ T16B, v1);
2571     __ rev32(v2, __ T16B, v2);
2572     __ rev32(v3, __ T16B, v3);
2573     __ rev32(v4, __ T16B, v4);
2574     __ aese(v0, v1);
2575     __ aesmc(v0, v0);
2576     __ aese(v0, v2);
2577     __ aesmc(v0, v0);
2578     __ aese(v0, v3);
2579     __ aesmc(v0, v0);
2580     __ aese(v0, v4);
2581     __ aesmc(v0, v0);
2582 
2583     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2584     __ rev32(v1, __ T16B, v1);
2585     __ rev32(v2, __ T16B, v2);
2586     __ rev32(v3, __ T16B, v3);
2587     __ rev32(v4, __ T16B, v4);
2588     __ aese(v0, v1);
2589     __ aesmc(v0, v0);
2590     __ aese(v0, v2);
2591     __ aesmc(v0, v0);
2592     __ aese(v0, v3);
2593     __ aesmc(v0, v0);
2594     __ aese(v0, v4);
2595     __ aesmc(v0, v0);
2596 
2597     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2598     __ rev32(v1, __ T16B, v1);
2599     __ rev32(v2, __ T16B, v2);
2600 
2601     __ cmpw(keylen, 44);
2602     __ br(Assembler::EQ, L_doLast);
2603 
2604     __ aese(v0, v1);
2605     __ aesmc(v0, v0);
2606     __ aese(v0, v2);
2607     __ aesmc(v0, v0);
2608 
2609     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2610     __ rev32(v1, __ T16B, v1);
2611     __ rev32(v2, __ T16B, v2);
2612 
2613     __ cmpw(keylen, 52);
2614     __ br(Assembler::EQ, L_doLast);
2615 
2616     __ aese(v0, v1);
2617     __ aesmc(v0, v0);
2618     __ aese(v0, v2);
2619     __ aesmc(v0, v0);
2620 
2621     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2622     __ rev32(v1, __ T16B, v1);
2623     __ rev32(v2, __ T16B, v2);
2624 
2625     __ BIND(L_doLast);
2626 
2627     __ aese(v0, v1);
2628     __ aesmc(v0, v0);
2629     __ aese(v0, v2);
2630 
2631     __ ld1(v1, __ T16B, key);
2632     __ rev32(v1, __ T16B, v1);
2633     __ eor(v0, __ T16B, v0, v1);
2634 
2635     __ st1(v0, __ T16B, to);
2636 
2637     __ mov(r0, 0);
2638 
2639     __ leave();
2640     __ ret(lr);
2641 
2642     return start;
2643   }
2644 
2645   // Arguments:
2646   //
2647   // Inputs:
2648   //   c_rarg0   - source byte array address
2649   //   c_rarg1   - destination byte array address
2650   //   c_rarg2   - K (key) in little endian int array
2651   //
2652   address generate_aescrypt_decryptBlock() {
2653     assert(UseAES, "need AES cryptographic extension support");
2654     __ align(CodeEntryAlignment);
2655     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2656     Label L_doLast;
2657 
2658     const Register from        = c_rarg0;  // source array address
2659     const Register to          = c_rarg1;  // destination array address
2660     const Register key         = c_rarg2;  // key array address
2661     const Register keylen      = rscratch1;
2662 
2663     address start = __ pc();
2664     __ enter(); // required for proper stackwalking of RuntimeStub frame
2665 
2666     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2667 
2668     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2669 
2670     __ ld1(v5, __ T16B, __ post(key, 16));
2671     __ rev32(v5, __ T16B, v5);
2672 
2673     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2674     __ rev32(v1, __ T16B, v1);
2675     __ rev32(v2, __ T16B, v2);
2676     __ rev32(v3, __ T16B, v3);
2677     __ rev32(v4, __ T16B, v4);
2678     __ aesd(v0, v1);
2679     __ aesimc(v0, v0);
2680     __ aesd(v0, v2);
2681     __ aesimc(v0, v0);
2682     __ aesd(v0, v3);
2683     __ aesimc(v0, v0);
2684     __ aesd(v0, v4);
2685     __ aesimc(v0, v0);
2686 
2687     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2688     __ rev32(v1, __ T16B, v1);
2689     __ rev32(v2, __ T16B, v2);
2690     __ rev32(v3, __ T16B, v3);
2691     __ rev32(v4, __ T16B, v4);
2692     __ aesd(v0, v1);
2693     __ aesimc(v0, v0);
2694     __ aesd(v0, v2);
2695     __ aesimc(v0, v0);
2696     __ aesd(v0, v3);
2697     __ aesimc(v0, v0);
2698     __ aesd(v0, v4);
2699     __ aesimc(v0, v0);
2700 
2701     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2702     __ rev32(v1, __ T16B, v1);
2703     __ rev32(v2, __ T16B, v2);
2704 
2705     __ cmpw(keylen, 44);
2706     __ br(Assembler::EQ, L_doLast);
2707 
2708     __ aesd(v0, v1);
2709     __ aesimc(v0, v0);
2710     __ aesd(v0, v2);
2711     __ aesimc(v0, v0);
2712 
2713     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2714     __ rev32(v1, __ T16B, v1);
2715     __ rev32(v2, __ T16B, v2);
2716 
2717     __ cmpw(keylen, 52);
2718     __ br(Assembler::EQ, L_doLast);
2719 
2720     __ aesd(v0, v1);
2721     __ aesimc(v0, v0);
2722     __ aesd(v0, v2);
2723     __ aesimc(v0, v0);
2724 
2725     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2726     __ rev32(v1, __ T16B, v1);
2727     __ rev32(v2, __ T16B, v2);
2728 
2729     __ BIND(L_doLast);
2730 
2731     __ aesd(v0, v1);
2732     __ aesimc(v0, v0);
2733     __ aesd(v0, v2);
2734 
2735     __ eor(v0, __ T16B, v0, v5);
2736 
2737     __ st1(v0, __ T16B, to);
2738 
2739     __ mov(r0, 0);
2740 
2741     __ leave();
2742     __ ret(lr);
2743 
2744     return start;
2745   }
2746 
2747   // Arguments:
2748   //
2749   // Inputs:
2750   //   c_rarg0   - source byte array address
2751   //   c_rarg1   - destination byte array address
2752   //   c_rarg2   - K (key) in little endian int array
2753   //   c_rarg3   - r vector byte array address
2754   //   c_rarg4   - input length
2755   //
2756   // Output:
2757   //   r0        - input length
2758   //
2759   address generate_cipherBlockChaining_encryptAESCrypt() {
2760     assert(UseAES, "need AES cryptographic extension support");
2761     __ align(CodeEntryAlignment);
2762     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2763 
2764     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2765 
2766     const Register from        = c_rarg0;  // source array address
2767     const Register to          = c_rarg1;  // destination array address
2768     const Register key         = c_rarg2;  // key array address
2769     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2770                                            // and left with the results of the last encryption block
2771     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2772     const Register keylen      = rscratch1;
2773 
2774     address start = __ pc();
2775 
2776       __ enter();
2777 
2778       __ movw(rscratch2, len_reg);
2779 
2780       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2781 
2782       __ ld1(v0, __ T16B, rvec);
2783 
2784       __ cmpw(keylen, 52);
2785       __ br(Assembler::CC, L_loadkeys_44);
2786       __ br(Assembler::EQ, L_loadkeys_52);
2787 
2788       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2789       __ rev32(v17, __ T16B, v17);
2790       __ rev32(v18, __ T16B, v18);
2791     __ BIND(L_loadkeys_52);
2792       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2793       __ rev32(v19, __ T16B, v19);
2794       __ rev32(v20, __ T16B, v20);
2795     __ BIND(L_loadkeys_44);
2796       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2797       __ rev32(v21, __ T16B, v21);
2798       __ rev32(v22, __ T16B, v22);
2799       __ rev32(v23, __ T16B, v23);
2800       __ rev32(v24, __ T16B, v24);
2801       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2802       __ rev32(v25, __ T16B, v25);
2803       __ rev32(v26, __ T16B, v26);
2804       __ rev32(v27, __ T16B, v27);
2805       __ rev32(v28, __ T16B, v28);
2806       __ ld1(v29, v30, v31, __ T16B, key);
2807       __ rev32(v29, __ T16B, v29);
2808       __ rev32(v30, __ T16B, v30);
2809       __ rev32(v31, __ T16B, v31);
2810 
2811     __ BIND(L_aes_loop);
2812       __ ld1(v1, __ T16B, __ post(from, 16));
2813       __ eor(v0, __ T16B, v0, v1);
2814 
2815       __ br(Assembler::CC, L_rounds_44);
2816       __ br(Assembler::EQ, L_rounds_52);
2817 
2818       __ aese(v0, v17); __ aesmc(v0, v0);
2819       __ aese(v0, v18); __ aesmc(v0, v0);
2820     __ BIND(L_rounds_52);
2821       __ aese(v0, v19); __ aesmc(v0, v0);
2822       __ aese(v0, v20); __ aesmc(v0, v0);
2823     __ BIND(L_rounds_44);
2824       __ aese(v0, v21); __ aesmc(v0, v0);
2825       __ aese(v0, v22); __ aesmc(v0, v0);
2826       __ aese(v0, v23); __ aesmc(v0, v0);
2827       __ aese(v0, v24); __ aesmc(v0, v0);
2828       __ aese(v0, v25); __ aesmc(v0, v0);
2829       __ aese(v0, v26); __ aesmc(v0, v0);
2830       __ aese(v0, v27); __ aesmc(v0, v0);
2831       __ aese(v0, v28); __ aesmc(v0, v0);
2832       __ aese(v0, v29); __ aesmc(v0, v0);
2833       __ aese(v0, v30);
2834       __ eor(v0, __ T16B, v0, v31);
2835 
2836       __ st1(v0, __ T16B, __ post(to, 16));
2837 
2838       __ subw(len_reg, len_reg, 16);
2839       __ cbnzw(len_reg, L_aes_loop);
2840 
2841       __ st1(v0, __ T16B, rvec);
2842 
2843       __ mov(r0, rscratch2);
2844 
2845       __ leave();
2846       __ ret(lr);
2847 
2848       return start;
2849   }
2850 
2851   // Arguments:
2852   //
2853   // Inputs:
2854   //   c_rarg0   - source byte array address
2855   //   c_rarg1   - destination byte array address
2856   //   c_rarg2   - K (key) in little endian int array
2857   //   c_rarg3   - r vector byte array address
2858   //   c_rarg4   - input length
2859   //
2860   // Output:
2861   //   r0        - input length
2862   //
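       // CBC decryption applies the inverse chaining:
       //
       //   P[i] = D_K(C[i]) ^ C[i-1]       (with C[-1] = IV)
       //
       // The loop below keeps the previous ciphertext block in v2 and saves a
       // copy of the current ciphertext block in v1, so that rvec can be
       // updated with the last input block after it has been decrypted.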
2863   address generate_cipherBlockChaining_decryptAESCrypt() {
2864     assert(UseAES, "need AES instruction support");
2865     __ align(CodeEntryAlignment);
2866     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2867 
2868     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2869 
2870     const Register from        = c_rarg0;  // source array address
2871     const Register to          = c_rarg1;  // destination array address
2872     const Register key         = c_rarg2;  // key array address
2873     const Register rvec        = c_rarg3;  // r byte array, initialized from the init-vector (IV) address,
2874                                            // and left holding the last input (ciphertext) block on exit
2875     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2876     const Register keylen      = rscratch1;
2877 
2878     address start = __ pc();
2879 
2880       __ enter();
2881 
2882       __ movw(rscratch2, len_reg);
2883 
2884       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2885 
2886       __ ld1(v2, __ T16B, rvec);
2887 
2888       __ ld1(v31, __ T16B, __ post(key, 16));
2889       __ rev32(v31, __ T16B, v31);
2890 
2891       __ cmpw(keylen, 52);
2892       __ br(Assembler::CC, L_loadkeys_44);
2893       __ br(Assembler::EQ, L_loadkeys_52);
2894 
2895       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2896       __ rev32(v17, __ T16B, v17);
2897       __ rev32(v18, __ T16B, v18);
2898     __ BIND(L_loadkeys_52);
2899       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2900       __ rev32(v19, __ T16B, v19);
2901       __ rev32(v20, __ T16B, v20);
2902     __ BIND(L_loadkeys_44);
2903       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2904       __ rev32(v21, __ T16B, v21);
2905       __ rev32(v22, __ T16B, v22);
2906       __ rev32(v23, __ T16B, v23);
2907       __ rev32(v24, __ T16B, v24);
2908       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2909       __ rev32(v25, __ T16B, v25);
2910       __ rev32(v26, __ T16B, v26);
2911       __ rev32(v27, __ T16B, v27);
2912       __ rev32(v28, __ T16B, v28);
2913       __ ld1(v29, v30, __ T16B, key);
2914       __ rev32(v29, __ T16B, v29);
2915       __ rev32(v30, __ T16B, v30);
2916 
2917     __ BIND(L_aes_loop);
2918       __ ld1(v0, __ T16B, __ post(from, 16));
2919       __ orr(v1, __ T16B, v0, v0);
2920 
2921       __ br(Assembler::CC, L_rounds_44);
2922       __ br(Assembler::EQ, L_rounds_52);
2923 
2924       __ aesd(v0, v17); __ aesimc(v0, v0);
2925       __ aesd(v0, v18); __ aesimc(v0, v0);
2926     __ BIND(L_rounds_52);
2927       __ aesd(v0, v19); __ aesimc(v0, v0);
2928       __ aesd(v0, v20); __ aesimc(v0, v0);
2929     __ BIND(L_rounds_44);
2930       __ aesd(v0, v21); __ aesimc(v0, v0);
2931       __ aesd(v0, v22); __ aesimc(v0, v0);
2932       __ aesd(v0, v23); __ aesimc(v0, v0);
2933       __ aesd(v0, v24); __ aesimc(v0, v0);
2934       __ aesd(v0, v25); __ aesimc(v0, v0);
2935       __ aesd(v0, v26); __ aesimc(v0, v0);
2936       __ aesd(v0, v27); __ aesimc(v0, v0);
2937       __ aesd(v0, v28); __ aesimc(v0, v0);
2938       __ aesd(v0, v29); __ aesimc(v0, v0);
2939       __ aesd(v0, v30);
2940       __ eor(v0, __ T16B, v0, v31);
2941       __ eor(v0, __ T16B, v0, v2);
2942 
2943       __ st1(v0, __ T16B, __ post(to, 16));
2944       __ orr(v2, __ T16B, v1, v1);
2945 
2946       __ subw(len_reg, len_reg, 16);
2947       __ cbnzw(len_reg, L_aes_loop);
2948 
2949       __ st1(v2, __ T16B, rvec);
2950 
2951       __ mov(r0, rscratch2);
2952 
2953       __ leave();
2954       __ ret(lr);
2955 
2956     return start;
2957   }
2958 
2959   // Arguments:
2960   //
2961   // Inputs:
2962   //   c_rarg0   - byte[]  source+offset
2963   //   c_rarg1   - int[]   SHA.state
2964   //   c_rarg2   - int     offset
2965   //   c_rarg3   - int     limit
2966   //
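       // Implements the SHA-1 compression function (one block, or all blocks
       // up to limit when multi_block is true) using the ARMv8
       // SHA1C/SHA1P/SHA1M/SHA1H and SHA1SU0/SHA1SU1 instructions.  The four
       // 32-bit words emitted at the "keys" label are the standard SHA-1
       // round constants K0..K3 (FIPS 180-4), one per group of 20 of the 80
       // rounds; ld4r below replicates each constant across a vector.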
2967   address generate_sha1_implCompress(bool multi_block, const char *name) {
2968     __ align(CodeEntryAlignment);
2969     StubCodeMark mark(this, "StubRoutines", name);
2970     address start = __ pc();
2971 
2972     Register buf   = c_rarg0;
2973     Register state = c_rarg1;
2974     Register ofs   = c_rarg2;
2975     Register limit = c_rarg3;
2976 
2977     Label keys;
2978     Label sha1_loop;
2979 
2980     // load the keys into v0..v3
2981     __ adr(rscratch1, keys);
2982     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2983     // load the 5-word state into v6, v7
2984     __ ldrq(v6, Address(state, 0));
2985     __ ldrs(v7, Address(state, 16));
2986 
2987 
2988     __ BIND(sha1_loop);
2989     // load 64 bytes of data into v16..v19
2990     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2991     __ rev32(v16, __ T16B, v16);
2992     __ rev32(v17, __ T16B, v17);
2993     __ rev32(v18, __ T16B, v18);
2994     __ rev32(v19, __ T16B, v19);
2995 
2996     // do the sha1
2997     __ addv(v4, __ T4S, v16, v0);
2998     __ orr(v20, __ T16B, v6, v6);
2999 
3000     FloatRegister d0 = v16;
3001     FloatRegister d1 = v17;
3002     FloatRegister d2 = v18;
3003     FloatRegister d3 = v19;
3004 
3005     for (int round = 0; round < 20; round++) {
3006       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3007       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3008       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3009       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3010       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3011 
3012       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3013       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3014       __ sha1h(tmp2, __ T4S, v20);
3015       if (round < 5)
3016         __ sha1c(v20, __ T4S, tmp3, tmp4);
3017       else if (round < 10 || round >= 15)
3018         __ sha1p(v20, __ T4S, tmp3, tmp4);
3019       else
3020         __ sha1m(v20, __ T4S, tmp3, tmp4);
3021       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3022 
3023       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3024     }
3025 
3026     __ addv(v7, __ T2S, v7, v21);
3027     __ addv(v6, __ T4S, v6, v20);
3028 
3029     if (multi_block) {
3030       __ add(ofs, ofs, 64);
3031       __ cmp(ofs, limit);
3032       __ br(Assembler::LE, sha1_loop);
3033       __ mov(c_rarg0, ofs); // return ofs
3034     }
3035 
3036     __ strq(v6, Address(state, 0));
3037     __ strs(v7, Address(state, 16));
3038 
3039     __ ret(lr);
3040 
3041     __ bind(keys);
3042     __ emit_int32(0x5a827999);
3043     __ emit_int32(0x6ed9eba1);
3044     __ emit_int32(0x8f1bbcdc);
3045     __ emit_int32(0xca62c1d6);
3046 
3047     return start;
3048   }
3049 
3050 
3051   // Arguments:
3052   //
3053   // Inputs:
3054   //   c_rarg0   - byte[]  source+offset
3055   //   c_rarg1   - int[]   SHA.state
3056   //   c_rarg2   - int     offset
3057   //   c_rarg3   - int     limit
3058   //
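       // Implements the SHA-256 compression function using the ARMv8
       // SHA256H/SHA256H2 and SHA256SU0/SHA256SU1 instructions.  The
       // round_consts table below holds the 64 K constants from FIPS 180-4;
       // each iteration of the loop below processes four rounds, so 16
       // iterations cover all 64 rounds.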
3059   address generate_sha256_implCompress(bool multi_block, const char *name) {
3060     static const uint32_t round_consts[64] = {
3061       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3062       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3063       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3064       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3065       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3066       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3067       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3068       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3069       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3070       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3071       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3072       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3073       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3074       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3075       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3076       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3077     };
3078     __ align(CodeEntryAlignment);
3079     StubCodeMark mark(this, "StubRoutines", name);
3080     address start = __ pc();
3081 
3082     Register buf   = c_rarg0;
3083     Register state = c_rarg1;
3084     Register ofs   = c_rarg2;
3085     Register limit = c_rarg3;
3086 
3087     Label sha1_loop;
3088 
3089     __ stpd(v8, v9, __ pre(sp, -32));
3090     __ stpd(v10, v11, Address(sp, 16));
3091 
3092 // dga == v0
3093 // dgb == v1
3094 // dg0 == v2
3095 // dg1 == v3
3096 // dg2 == v4
3097 // t0 == v6
3098 // t1 == v7
3099 
3100     // load 16 keys to v16..v31
3101     __ lea(rscratch1, ExternalAddress((address)round_consts));
3102     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3103     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3104     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3105     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3106 
3107     // load 8 words (256 bits) state
3108     __ ldpq(v0, v1, state);
3109 
3110     __ BIND(sha1_loop);
3111     // load 64 bytes of data into v8..v11
3112     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3113     __ rev32(v8, __ T16B, v8);
3114     __ rev32(v9, __ T16B, v9);
3115     __ rev32(v10, __ T16B, v10);
3116     __ rev32(v11, __ T16B, v11);
3117 
3118     __ addv(v6, __ T4S, v8, v16);
3119     __ orr(v2, __ T16B, v0, v0);
3120     __ orr(v3, __ T16B, v1, v1);
3121 
3122     FloatRegister d0 = v8;
3123     FloatRegister d1 = v9;
3124     FloatRegister d2 = v10;
3125     FloatRegister d3 = v11;
3126 
3127 
3128     for (int round = 0; round < 16; round++) {
3129       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3130       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3131       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3132       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3133 
3134       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3135        __ orr(v4, __ T16B, v2, v2);
3136       if (round < 15)
3137         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3138       __ sha256h(v2, __ T4S, v3, tmp2);
3139       __ sha256h2(v3, __ T4S, v4, tmp2);
3140       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3141 
3142       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3143     }
3144 
3145     __ addv(v0, __ T4S, v0, v2);
3146     __ addv(v1, __ T4S, v1, v3);
3147 
3148     if (multi_block) {
3149       __ add(ofs, ofs, 64);
3150       __ cmp(ofs, limit);
3151       __ br(Assembler::LE, sha1_loop);
3152       __ mov(c_rarg0, ofs); // return ofs
3153     }
3154 
3155     __ ldpd(v10, v11, Address(sp, 16));
3156     __ ldpd(v8, v9, __ post(sp, 32));
3157 
3158     __ stpq(v0, v1, state);
3159 
3160     __ ret(lr);
3161 
3162     return start;
3163   }
3164 
3165 #ifndef BUILTIN_SIM
3166   // Safefetch stubs.
3167   void generate_safefetch(const char* name, int size, address* entry,
3168                           address* fault_pc, address* continuation_pc) {
3169     // safefetch signatures:
3170     //   int      SafeFetch32(int*      adr, int      errValue);
3171     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3172     //
3173     // arguments:
3174     //   c_rarg0 = adr
3175     //   c_rarg1 = errValue
3176     //
3177     // result:
3178     //   r0       = *adr or errValue
3179 
3180     StubCodeMark mark(this, "StubRoutines", name);
3181 
3182     // Entry point, pc or function descriptor.
3183     *entry = __ pc();
3184 
3185     // Load *adr into c_rarg1, may fault.
3186     *fault_pc = __ pc();
3187     switch (size) {
3188       case 4:
3189         // int32_t
3190         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3191         break;
3192       case 8:
3193         // int64_t
3194         __ ldr(c_rarg1, Address(c_rarg0, 0));
3195         break;
3196       default:
3197         ShouldNotReachHere();
3198     }
3199 
3200     // return errValue or *adr
3201     *continuation_pc = __ pc();
3202     __ mov(r0, c_rarg1);
3203     __ ret(lr);
3204   }
3205 #endif
3206 
3207   /**
3208    *  Arguments:
3209    *
3210    * Inputs:
3211    *   c_rarg0   - int crc
3212    *   c_rarg1   - byte* buf
3213    *   c_rarg2   - int length
3214    *
3215    * Output:
3216    *       r0    - int crc result
3217    */
3218   address generate_updateBytesCRC32() {
3219     assert(UseCRC32Intrinsics, "what are we doing here?");
3220 
3221     __ align(CodeEntryAlignment);
3222     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3223 
3224     address start = __ pc();
3225 
3226     const Register crc   = c_rarg0;  // crc
3227     const Register buf   = c_rarg1;  // source java byte array address
3228     const Register len   = c_rarg2;  // length
3229     const Register table0 = c_rarg3; // crc_table address
3230     const Register table1 = c_rarg4;
3231     const Register table2 = c_rarg5;
3232     const Register table3 = c_rarg6;
3233     const Register tmp3 = c_rarg7;
3234 
3235     BLOCK_COMMENT("Entry:");
3236     __ enter(); // required for proper stackwalking of RuntimeStub frame
3237 
3238     __ kernel_crc32(crc, buf, len,
3239               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3240 
3241     __ leave(); // required for proper stackwalking of RuntimeStub frame
3242     __ ret(lr);
3243 
3244     return start;
3245   }
3246 
3247   /**
3248    *  Arguments:
3249    *
3250    * Inputs:
3251    *   c_rarg0   - int crc
3252    *   c_rarg1   - byte* buf
3253    *   c_rarg2   - int length
3254    *   c_rarg3   - int* table
3255    *
3256    * Output:
3257    *       r0   - int crc result
3258    */
3259   address generate_updateBytesCRC32C() {
3260     assert(UseCRC32CIntrinsics, "what are we doing here?");
3261 
3262     __ align(CodeEntryAlignment);
3263     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3264 
3265     address start = __ pc();
3266 
3267     const Register crc   = c_rarg0;  // crc
3268     const Register buf   = c_rarg1;  // source java byte array address
3269     const Register len   = c_rarg2;  // length
3270     const Register table0 = c_rarg3; // crc_table address
3271     const Register table1 = c_rarg4;
3272     const Register table2 = c_rarg5;
3273     const Register table3 = c_rarg6;
3274     const Register tmp3 = c_rarg7;
3275 
3276     BLOCK_COMMENT("Entry:");
3277     __ enter(); // required for proper stackwalking of RuntimeStub frame
3278 
3279     __ kernel_crc32c(crc, buf, len,
3280               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3281 
3282     __ leave(); // required for proper stackwalking of RuntimeStub frame
3283     __ ret(lr);
3284 
3285     return start;
3286   }
3287 
3288   /***
3289    *  Arguments:
3290    *
3291    *  Inputs:
3292    *   c_rarg0   - int   adler
3293    *   c_rarg1   - byte* buff
3294    *   c_rarg2   - int   len
3295    *
3296    * Output:
3297    *   c_rarg0   - int adler result
3298    */
3299   address generate_updateBytesAdler32() {
3300     __ align(CodeEntryAlignment);
3301     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3302     address start = __ pc();
3303 
3304     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3305 
3306     // Aliases
3307     Register adler  = c_rarg0;
3308     Register s1     = c_rarg0;
3309     Register s2     = c_rarg3;
3310     Register buff   = c_rarg1;
3311     Register len    = c_rarg2;
3312     Register nmax  = r4;
3313     Register base = r5;
3314     Register count = r6;
3315     Register temp0 = rscratch1;
3316     Register temp1 = rscratch2;
3317     Register temp2 = r7;
3318 
3319     // Max number of bytes we can process before having to take the mod
3320     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3321     unsigned long BASE = 0xfff1;
3322     unsigned long NMAX = 0x15B0;
3323 
3324     __ mov(base, BASE);
3325     __ mov(nmax, NMAX);
3326 
3327     // s1 is initialized to the lower 16 bits of adler
3328     // s2 is initialized to the upper 16 bits of adler
3329     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3330     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3331 
3332     // The pipelined loop needs at least 16 elements for one iteration;
3333     // it would check this itself, but it is more efficient to skip straight to the cleanup loop
3334     __ cmp(len, 16);
3335     __ br(Assembler::HS, L_nmax);
3336     __ cbz(len, L_combine);
3337 
3338     __ bind(L_simple_by1_loop);
3339     __ ldrb(temp0, Address(__ post(buff, 1)));
3340     __ add(s1, s1, temp0);
3341     __ add(s2, s2, s1);
3342     __ subs(len, len, 1);
3343     __ br(Assembler::HI, L_simple_by1_loop);
3344 
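         // The "% BASE" sequences below avoid a division by exploiting
         // 2^16 == 15 (mod 65521), so for any x
         //
         //   x mod 65521 == ((x >> 16) * 15 + (x & 0xffff)) mod 65521
         //
         // One or two applications of this fold bring the sums close enough
         // to BASE that a single conditional subtract (subs/csel) finishes
         // the reduction.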
3345     // s1 = s1 % BASE
3346     __ subs(temp0, s1, base);
3347     __ csel(s1, temp0, s1, Assembler::HS);
3348 
3349     // s2 = s2 % BASE
3350     __ lsr(temp0, s2, 16);
3351     __ lsl(temp1, temp0, 4);
3352     __ sub(temp1, temp1, temp0);
3353     __ add(s2, temp1, s2, ext::uxth);
3354 
3355     __ subs(temp0, s2, base);
3356     __ csel(s2, temp0, s2, Assembler::HS);
3357 
3358     __ b(L_combine);
3359 
3360     __ bind(L_nmax);
3361     __ subs(len, len, nmax);
3362     __ sub(count, nmax, 16);
3363     __ br(Assembler::LO, L_by16);
3364 
3365     __ bind(L_nmax_loop);
3366 
3367     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3368 
3369     __ add(s1, s1, temp0, ext::uxtb);
3370     __ ubfx(temp2, temp0, 8, 8);
3371     __ add(s2, s2, s1);
3372     __ add(s1, s1, temp2);
3373     __ ubfx(temp2, temp0, 16, 8);
3374     __ add(s2, s2, s1);
3375     __ add(s1, s1, temp2);
3376     __ ubfx(temp2, temp0, 24, 8);
3377     __ add(s2, s2, s1);
3378     __ add(s1, s1, temp2);
3379     __ ubfx(temp2, temp0, 32, 8);
3380     __ add(s2, s2, s1);
3381     __ add(s1, s1, temp2);
3382     __ ubfx(temp2, temp0, 40, 8);
3383     __ add(s2, s2, s1);
3384     __ add(s1, s1, temp2);
3385     __ ubfx(temp2, temp0, 48, 8);
3386     __ add(s2, s2, s1);
3387     __ add(s1, s1, temp2);
3388     __ add(s2, s2, s1);
3389     __ add(s1, s1, temp0, Assembler::LSR, 56);
3390     __ add(s2, s2, s1);
3391 
3392     __ add(s1, s1, temp1, ext::uxtb);
3393     __ ubfx(temp2, temp1, 8, 8);
3394     __ add(s2, s2, s1);
3395     __ add(s1, s1, temp2);
3396     __ ubfx(temp2, temp1, 16, 8);
3397     __ add(s2, s2, s1);
3398     __ add(s1, s1, temp2);
3399     __ ubfx(temp2, temp1, 24, 8);
3400     __ add(s2, s2, s1);
3401     __ add(s1, s1, temp2);
3402     __ ubfx(temp2, temp1, 32, 8);
3403     __ add(s2, s2, s1);
3404     __ add(s1, s1, temp2);
3405     __ ubfx(temp2, temp1, 40, 8);
3406     __ add(s2, s2, s1);
3407     __ add(s1, s1, temp2);
3408     __ ubfx(temp2, temp1, 48, 8);
3409     __ add(s2, s2, s1);
3410     __ add(s1, s1, temp2);
3411     __ add(s2, s2, s1);
3412     __ add(s1, s1, temp1, Assembler::LSR, 56);
3413     __ add(s2, s2, s1);
3414 
3415     __ subs(count, count, 16);
3416     __ br(Assembler::HS, L_nmax_loop);
3417 
3418     // s1 = s1 % BASE
3419     __ lsr(temp0, s1, 16);
3420     __ lsl(temp1, temp0, 4);
3421     __ sub(temp1, temp1, temp0);
3422     __ add(temp1, temp1, s1, ext::uxth);
3423 
3424     __ lsr(temp0, temp1, 16);
3425     __ lsl(s1, temp0, 4);
3426     __ sub(s1, s1, temp0);
3427     __ add(s1, s1, temp1, ext:: uxth);
3428 
3429     __ subs(temp0, s1, base);
3430     __ csel(s1, temp0, s1, Assembler::HS);
3431 
3432     // s2 = s2 % BASE
3433     __ lsr(temp0, s2, 16);
3434     __ lsl(temp1, temp0, 4);
3435     __ sub(temp1, temp1, temp0);
3436     __ add(temp1, temp1, s2, ext::uxth);
3437 
3438     __ lsr(temp0, temp1, 16);
3439     __ lsl(s2, temp0, 4);
3440     __ sub(s2, s2, temp0);
3441     __ add(s2, s2, temp1, ext:: uxth);
3442 
3443     __ subs(temp0, s2, base);
3444     __ csel(s2, temp0, s2, Assembler::HS);
3445 
3446     __ subs(len, len, nmax);
3447     __ sub(count, nmax, 16);
3448     __ br(Assembler::HS, L_nmax_loop);
3449 
3450     __ bind(L_by16);
3451     __ adds(len, len, count);
3452     __ br(Assembler::LO, L_by1);
3453 
3454     __ bind(L_by16_loop);
3455 
3456     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3457 
3458     __ add(s1, s1, temp0, ext::uxtb);
3459     __ ubfx(temp2, temp0, 8, 8);
3460     __ add(s2, s2, s1);
3461     __ add(s1, s1, temp2);
3462     __ ubfx(temp2, temp0, 16, 8);
3463     __ add(s2, s2, s1);
3464     __ add(s1, s1, temp2);
3465     __ ubfx(temp2, temp0, 24, 8);
3466     __ add(s2, s2, s1);
3467     __ add(s1, s1, temp2);
3468     __ ubfx(temp2, temp0, 32, 8);
3469     __ add(s2, s2, s1);
3470     __ add(s1, s1, temp2);
3471     __ ubfx(temp2, temp0, 40, 8);
3472     __ add(s2, s2, s1);
3473     __ add(s1, s1, temp2);
3474     __ ubfx(temp2, temp0, 48, 8);
3475     __ add(s2, s2, s1);
3476     __ add(s1, s1, temp2);
3477     __ add(s2, s2, s1);
3478     __ add(s1, s1, temp0, Assembler::LSR, 56);
3479     __ add(s2, s2, s1);
3480 
3481     __ add(s1, s1, temp1, ext::uxtb);
3482     __ ubfx(temp2, temp1, 8, 8);
3483     __ add(s2, s2, s1);
3484     __ add(s1, s1, temp2);
3485     __ ubfx(temp2, temp1, 16, 8);
3486     __ add(s2, s2, s1);
3487     __ add(s1, s1, temp2);
3488     __ ubfx(temp2, temp1, 24, 8);
3489     __ add(s2, s2, s1);
3490     __ add(s1, s1, temp2);
3491     __ ubfx(temp2, temp1, 32, 8);
3492     __ add(s2, s2, s1);
3493     __ add(s1, s1, temp2);
3494     __ ubfx(temp2, temp1, 40, 8);
3495     __ add(s2, s2, s1);
3496     __ add(s1, s1, temp2);
3497     __ ubfx(temp2, temp1, 48, 8);
3498     __ add(s2, s2, s1);
3499     __ add(s1, s1, temp2);
3500     __ add(s2, s2, s1);
3501     __ add(s1, s1, temp1, Assembler::LSR, 56);
3502     __ add(s2, s2, s1);
3503 
3504     __ subs(len, len, 16);
3505     __ br(Assembler::HS, L_by16_loop);
3506 
3507     __ bind(L_by1);
3508     __ adds(len, len, 15);
3509     __ br(Assembler::LO, L_do_mod);
3510 
3511     __ bind(L_by1_loop);
3512     __ ldrb(temp0, Address(__ post(buff, 1)));
3513     __ add(s1, temp0, s1);
3514     __ add(s2, s2, s1);
3515     __ subs(len, len, 1);
3516     __ br(Assembler::HS, L_by1_loop);
3517 
3518     __ bind(L_do_mod);
3519     // s1 = s1 % BASE
3520     __ lsr(temp0, s1, 16);
3521     __ lsl(temp1, temp0, 4);
3522     __ sub(temp1, temp1, temp0);
3523     __ add(temp1, temp1, s1, ext::uxth);
3524 
3525     __ lsr(temp0, temp1, 16);
3526     __ lsl(s1, temp0, 4);
3527     __ sub(s1, s1, temp0);
3528     __ add(s1, s1, temp1, ext:: uxth);
3529 
3530     __ subs(temp0, s1, base);
3531     __ csel(s1, temp0, s1, Assembler::HS);
3532 
3533     // s2 = s2 % BASE
3534     __ lsr(temp0, s2, 16);
3535     __ lsl(temp1, temp0, 4);
3536     __ sub(temp1, temp1, temp0);
3537     __ add(temp1, temp1, s2, ext::uxth);
3538 
3539     __ lsr(temp0, temp1, 16);
3540     __ lsl(s2, temp0, 4);
3541     __ sub(s2, s2, temp0);
3542     __ add(s2, s2, temp1, ext:: uxth);
3543 
3544     __ subs(temp0, s2, base);
3545     __ csel(s2, temp0, s2, Assembler::HS);
3546 
3547     // Combine lower bits and higher bits
3548     __ bind(L_combine);
3549     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3550 
3551     __ ret(lr);
3552 
3553     return start;
3554   }
3555 
3556   /**
3557    *  Arguments:
3558    *
3559    *  Input:
3560    *    c_rarg0   - x address
3561    *    c_rarg1   - x length
3562    *    c_rarg2   - y address
3563    *    c_rarg3   - y length
3564    *    c_rarg4   - z address
3565    *    c_rarg5   - z length
3566    */
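       // Computes z = x * y for multi-precision values stored as int arrays
       // (the BigInteger.multiplyToLen intrinsic).  The multiply loop itself
       // is in MacroAssembler::multiply_to_len; this stub only sets up and
       // tears down the frame.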
3567   address generate_multiplyToLen() {
3568     __ align(CodeEntryAlignment);
3569     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3570 
3571     address start = __ pc();
3572     const Register x     = r0;
3573     const Register xlen  = r1;
3574     const Register y     = r2;
3575     const Register ylen  = r3;
3576     const Register z     = r4;
3577     const Register zlen  = r5;
3578 
3579     const Register tmp1  = r10;
3580     const Register tmp2  = r11;
3581     const Register tmp3  = r12;
3582     const Register tmp4  = r13;
3583     const Register tmp5  = r14;
3584     const Register tmp6  = r15;
3585     const Register tmp7  = r16;
3586 
3587     BLOCK_COMMENT("Entry:");
3588     __ enter(); // required for proper stackwalking of RuntimeStub frame
3589     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3590     __ leave(); // required for proper stackwalking of RuntimeStub frame
3591     __ ret(lr);
3592 
3593     return start;
3594   }
3595 
3596   address generate_squareToLen() {
3597     // The squareToLen algorithm for sizes 1..127 described in the Java code is
3598     // faster than multiply_to_len on some CPUs and slower on others; overall,
3599     // multiply_to_len gives slightly better results, so we simply delegate to it here.
3600     __ align(CodeEntryAlignment);
3601     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3602     address start = __ pc();
3603 
3604     const Register x     = r0;
3605     const Register xlen  = r1;
3606     const Register z     = r2;
3607     const Register zlen  = r3;
3608     const Register y     = r4; // == x
3609     const Register ylen  = r5; // == xlen
3610 
3611     const Register tmp1  = r10;
3612     const Register tmp2  = r11;
3613     const Register tmp3  = r12;
3614     const Register tmp4  = r13;
3615     const Register tmp5  = r14;
3616     const Register tmp6  = r15;
3617     const Register tmp7  = r16;
3618 
3619     RegSet spilled_regs = RegSet::of(y, ylen);
3620     BLOCK_COMMENT("Entry:");
3621     __ enter();
3622     __ push(spilled_regs, sp);
3623     __ mov(y, x);
3624     __ mov(ylen, xlen);
3625     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3626     __ pop(spilled_regs, sp);
3627     __ leave();
3628     __ ret(lr);
3629     return start;
3630   }
3631 
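       // Stub for the BigInteger.mulAdd intrinsic (out[] += in[] * k over
       // ints, with carry propagation); the work is done by
       // MacroAssembler::mul_add and the final carry is returned in r0.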
3632   address generate_mulAdd() {
3633     __ align(CodeEntryAlignment);
3634     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3635 
3636     address start = __ pc();
3637 
3638     const Register out     = r0;
3639     const Register in      = r1;
3640     const Register offset  = r2;
3641     const Register len     = r3;
3642     const Register k       = r4;
3643 
3644     BLOCK_COMMENT("Entry:");
3645     __ enter();
3646     __ mul_add(out, in, offset, len, k);
3647     __ leave();
3648     __ ret(lr);
3649 
3650     return start;
3651   }
3652 
3653   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3654                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3655                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3656     // Karatsuba multiplication performs a 128*128 -> 256-bit
3657     // multiplication in three 128-bit multiplications and a few
3658     // additions.
3659     //
3660     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3661     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3662     //
3663     // Inputs:
3664     //
3665     // A0 in a.d[0]     (subkey)
3666     // A1 in a.d[1]
3667     // (A1+A0) in a1_xor_a0.d[0]
3668     //
3669     // B0 in b.d[0]     (state)
3670     // B1 in b.d[1]
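         //
         // The middle 128 bits of the product follow from
         //   (A1+A0)(B1+B0) == A1*B1 + A0*B0 + (A1*B0 + A0*B1)
         // i.e. the cross terms are E ^ C ^ D, which is what the eor
         // sequence below computes before the two ins instructions merge
         // everything into <result_hi:result_lo>.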
3671 
3672     __ ext(tmp1, __ T16B, b, b, 0x08);
3673     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3674     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3675     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3676     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3677 
3678     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3679     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3680     __ eor(tmp2, __ T16B, tmp2, tmp4);
3681     __ eor(tmp2, __ T16B, tmp2, tmp3);
3682 
3683     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3684     __ ins(result_hi, __ D, tmp2, 0, 1);
3685     __ ins(result_lo, __ D, tmp2, 1, 0);
3686   }
3687 
3688   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3689                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3690     const FloatRegister t0 = result;
3691 
3692     // The GCM field polynomial f is z^128 + p(z), where p =
3693     // z^7+z^2+z+1.
3694     //
3695     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3696     //
3697     // so, given that the product we're reducing is
3698     //    a == lo + hi * z^128
3699     // substituting,
3700     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3701     //
3702     // we reduce by multiplying hi by p(z) and subtracting the result
3703     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3704     // bits we can do this with two 64-bit multiplications, lo*p and
3705     // hi*p.
3706 
3707     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3708     __ ext(t1, __ T16B, t0, z, 8);
3709     __ eor(hi, __ T16B, hi, t1);
3710     __ ext(t1, __ T16B, z, t0, 8);
3711     __ eor(lo, __ T16B, lo, t1);
3712     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3713     __ eor(result, __ T16B, lo, t0);
3714   }
3715 
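       // Returns 1 in r0 if any byte of the array has its sign bit set
       // (i.e. is negative), 0 otherwise.  Words are OR-ed together and
       // tested against 0x8080808080808080, 8 or 64 bytes at a time.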
3716   address generate_has_negatives(address &has_negatives_long) {
3717     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3718     const int large_loop_size = 64;
3719     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3720     int dcache_line = VM_Version::dcache_line_size();
3721 
3722     Register ary1 = r1, len = r2, result = r0;
3723 
3724     __ align(CodeEntryAlignment);
3725     address entry = __ pc();
3726 
3727     __ enter();
3728 
3729   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3730         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3731 
3732   __ cmp(len, 15);
3733   __ br(Assembler::GT, LEN_OVER_15);
3734   // The only case in which execution falls into this code is when the pointer is near
3735   // the end of a memory page and we must avoid reading past it into the next page
3736   __ add(ary1, ary1, len);
3737   __ subs(len, len, 8);
3738   __ br(Assembler::GT, LEN_OVER_8);
3739   __ ldr(rscratch2, Address(ary1, -8));
3740   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3741   __ lsrv(rscratch2, rscratch2, rscratch1);
3742   __ tst(rscratch2, UPPER_BIT_MASK);
3743   __ cset(result, Assembler::NE);
3744   __ leave();
3745   __ ret(lr);
3746   __ bind(LEN_OVER_8);
3747   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3748   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
3749   __ tst(rscratch2, UPPER_BIT_MASK);
3750   __ br(Assembler::NE, RET_TRUE_NO_POP);
3751   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3752   __ lsrv(rscratch1, rscratch1, rscratch2);
3753   __ tst(rscratch1, UPPER_BIT_MASK);
3754   __ cset(result, Assembler::NE);
3755   __ leave();
3756   __ ret(lr);
3757 
3758   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3759   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3760 
3761   has_negatives_long = __ pc(); // 2nd entry point
3762 
3763   __ enter();
3764 
3765   __ bind(LEN_OVER_15);
3766     __ push(spilled_regs, sp);
3767     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3768     __ cbz(rscratch2, ALIGNED);
3769     __ ldp(tmp6, tmp1, Address(ary1));
3770     __ mov(tmp5, 16);
3771     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3772     __ add(ary1, ary1, rscratch1);
3773     __ sub(len, len, rscratch1);
3774     __ orr(tmp6, tmp6, tmp1);
3775     __ tst(tmp6, UPPER_BIT_MASK);
3776     __ br(Assembler::NE, RET_TRUE);
3777 
3778   __ bind(ALIGNED);
3779     __ cmp(len, large_loop_size);
3780     __ br(Assembler::LT, CHECK_16);
3781     // Perform a 16-byte load as an early-return check in the pre-loop. This handles
3782     // the situation where an initially aligned large array has negative values in its
3783     // first bytes, in which case LARGE_LOOP would do 4 reads instead of 1 (in the worst
3784     // case), which is slower. Cases with negative bytes further ahead are not affected
3785     // much; in fact they are faster due to the early loads, fewer instructions and
3786     // fewer branches in LARGE_LOOP.
3787     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3788     __ sub(len, len, 16);
3789     __ orr(tmp6, tmp6, tmp1);
3790     __ tst(tmp6, UPPER_BIT_MASK);
3791     __ br(Assembler::NE, RET_TRUE);
3792     __ cmp(len, large_loop_size);
3793     __ br(Assembler::LT, CHECK_16);
3794 
3795     if (SoftwarePrefetchHintDistance >= 0
3796         && SoftwarePrefetchHintDistance >= dcache_line) {
3797       // initial prefetch
3798       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3799     }
3800   __ bind(LARGE_LOOP);
3801     if (SoftwarePrefetchHintDistance >= 0) {
3802       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3803     }
3804     // Issue the load instructions first, since that can save a few CPU/memory cycles.
3805     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one per ldp), it is
3806     // better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves 3
3807     // instructions and has fewer branches. The downside is that this approach disables
3808     // the early return, so all 64 bytes are loaded and checked every time.
3809     __ ldp(tmp2, tmp3, Address(ary1));
3810     __ ldp(tmp4, tmp5, Address(ary1, 16));
3811     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3812     __ ldp(tmp6, tmp1, Address(ary1, 48));
3813     __ add(ary1, ary1, large_loop_size);
3814     __ sub(len, len, large_loop_size);
3815     __ orr(tmp2, tmp2, tmp3);
3816     __ orr(tmp4, tmp4, tmp5);
3817     __ orr(rscratch1, rscratch1, rscratch2);
3818     __ orr(tmp6, tmp6, tmp1);
3819     __ orr(tmp2, tmp2, tmp4);
3820     __ orr(rscratch1, rscratch1, tmp6);
3821     __ orr(tmp2, tmp2, rscratch1);
3822     __ tst(tmp2, UPPER_BIT_MASK);
3823     __ br(Assembler::NE, RET_TRUE);
3824     __ cmp(len, large_loop_size);
3825     __ br(Assembler::GE, LARGE_LOOP);
3826 
3827   __ bind(CHECK_16); // small 16-byte load pre-loop
3828     __ cmp(len, 16);
3829     __ br(Assembler::LT, POST_LOOP16);
3830 
3831   __ bind(LOOP16); // small 16-byte load loop
3832     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3833     __ sub(len, len, 16);
3834     __ orr(tmp2, tmp2, tmp3);
3835     __ tst(tmp2, UPPER_BIT_MASK);
3836     __ br(Assembler::NE, RET_TRUE);
3837     __ cmp(len, 16);
3838     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3839 
3840   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3841     __ cmp(len, 8);
3842     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3843     __ ldr(tmp3, Address(__ post(ary1, 8)));
3844     __ sub(len, len, 8);
3845     __ tst(tmp3, UPPER_BIT_MASK);
3846     __ br(Assembler::NE, RET_TRUE);
3847 
3848   __ bind(POST_LOOP16_LOAD_TAIL);
3849     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3850     __ ldr(tmp1, Address(ary1));
3851     __ mov(tmp2, 64);
3852     __ sub(tmp4, tmp2, len, __ LSL, 3);
3853     __ lslv(tmp1, tmp1, tmp4);
3854     __ tst(tmp1, UPPER_BIT_MASK);
3855     __ br(Assembler::NE, RET_TRUE);
3856     // Fallthrough
3857 
3858   __ bind(RET_FALSE);
3859     __ pop(spilled_regs, sp);
3860     __ leave();
3861     __ mov(result, zr);
3862     __ ret(lr);
3863 
3864   __ bind(RET_TRUE);
3865     __ pop(spilled_regs, sp);
3866   __ bind(RET_TRUE_NO_POP);
3867     __ leave();
3868     __ mov(result, 1);
3869     __ ret(lr);
3870 
3871   __ bind(DONE);
3872     __ pop(spilled_regs, sp);
3873     __ leave();
3874     __ ret(lr);
3875     return entry;
3876   }
3877 
3878   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3879         bool usePrefetch, Label &NOT_EQUAL) {
3880     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3881         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3882         tmp7 = r12, tmp8 = r13;
3883     Label LOOP;
3884 
3885     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3886     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3887     __ bind(LOOP);
3888     if (usePrefetch) {
3889       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3890       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3891     }
3892     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3893     __ eor(tmp1, tmp1, tmp2);
3894     __ eor(tmp3, tmp3, tmp4);
3895     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3896     __ orr(tmp1, tmp1, tmp3);
3897     __ cbnz(tmp1, NOT_EQUAL);
3898     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3899     __ eor(tmp5, tmp5, tmp6);
3900     __ eor(tmp7, tmp7, tmp8);
3901     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3902     __ orr(tmp5, tmp5, tmp7);
3903     __ cbnz(tmp5, NOT_EQUAL);
3904     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3905     __ eor(tmp1, tmp1, tmp2);
3906     __ eor(tmp3, tmp3, tmp4);
3907     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3908     __ orr(tmp1, tmp1, tmp3);
3909     __ cbnz(tmp1, NOT_EQUAL);
3910     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3911     __ eor(tmp5, tmp5, tmp6);
3912     __ sub(cnt1, cnt1, 8 * wordSize);
3913     __ eor(tmp7, tmp7, tmp8);
3914     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3915     // tmp6 is not used. MacroAssembler::subs is used here (rather than
3916     // cmp) because subs allows an unlimited range of immediate operands.
3917     __ subs(tmp6, cnt1, loopThreshold);
3918     __ orr(tmp5, tmp5, tmp7);
3919     __ cbnz(tmp5, NOT_EQUAL);
3920     __ br(__ GE, LOOP);
3921     // post-loop
3922     __ eor(tmp1, tmp1, tmp2);
3923     __ eor(tmp3, tmp3, tmp4);
3924     __ orr(tmp1, tmp1, tmp3);
3925     __ sub(cnt1, cnt1, 2 * wordSize);
3926     __ cbnz(tmp1, NOT_EQUAL);
3927   }
3928 
3929   void generate_large_array_equals_loop_simd(int loopThreshold,
3930         bool usePrefetch, Label &NOT_EQUAL) {
3931     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3932         tmp2 = rscratch2;
3933     Label LOOP;
3934 
3935     __ bind(LOOP);
3936     if (usePrefetch) {
3937       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3938       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3939     }
3940     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3941     __ sub(cnt1, cnt1, 8 * wordSize);
3942     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3943     __ subs(tmp1, cnt1, loopThreshold);
3944     __ eor(v0, __ T16B, v0, v4);
3945     __ eor(v1, __ T16B, v1, v5);
3946     __ eor(v2, __ T16B, v2, v6);
3947     __ eor(v3, __ T16B, v3, v7);
3948     __ orr(v0, __ T16B, v0, v1);
3949     __ orr(v1, __ T16B, v2, v3);
3950     __ orr(v0, __ T16B, v0, v1);
3951     __ umov(tmp1, v0, __ D, 0);
3952     __ umov(tmp2, v0, __ D, 1);
3953     __ orr(tmp1, tmp1, tmp2);
3954     __ cbnz(tmp1, NOT_EQUAL);
3955     __ br(__ GE, LOOP);
3956   }
3957 
3958   // a1 = r1 - array1 address
3959   // a2 = r2 - array2 address
3960   // result = r0 - return value. Already contains "false"
3961   // cnt1 = r10 - number of elements left to check, reduced by wordSize
3962   // r3-r5 are reserved temporary registers
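       // The main loops compare 64 bytes (8 words) per iteration, using
       // either ldp/eor/orr on general registers or ld1/eor/orr on SIMD
       // registers (UseSIMDForArrayEquals), with optional software prefetch
       // when SoftwarePrefetchHintDistance >= 0.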
3963   address generate_large_array_equals() {
3964     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3965     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3966         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3967         tmp7 = r12, tmp8 = r13;
3968     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3969         SMALL_LOOP, POST_LOOP;
3970     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3971     // use the prefetching loop only if at least 32 of the prefetched bytes are used
3972     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3973     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3974     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3975     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3976         tmp5, tmp6, tmp7, tmp8);
3977 
3978     __ align(CodeEntryAlignment);
3979     address entry = __ pc();
3980     __ enter();
3981     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
3982     // also advance pointers to use post-increment instead of pre-increment
3983     __ add(a1, a1, wordSize);
3984     __ add(a2, a2, wordSize);
3985     if (AvoidUnalignedAccesses) {
3986       // Both implementations (SIMD and non-SIMD) use relatively large load
3987       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution time)
3988       // on some CPUs when the address is not at least 16-byte aligned.
3989       // Arrays are currently 8-byte aligned, so if needed we do one additional 8-byte
3990       // load to make at least the first array's address 16-byte aligned.
3991       Label ALIGNED16;
3992       __ tbz(a1, 3, ALIGNED16);
3993       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3994       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3995       __ sub(cnt1, cnt1, wordSize);
3996       __ eor(tmp1, tmp1, tmp2);
3997       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3998       __ bind(ALIGNED16);
3999     }
4000     if (UseSIMDForArrayEquals) {
4001       if (SoftwarePrefetchHintDistance >= 0) {
4002         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4003         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4004         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
4005             /* prfm = */ true, NOT_EQUAL);
4006         __ cmp(cnt1, nonPrefetchLoopThreshold);
4007         __ br(__ LT, TAIL);
4008       }
4009       __ bind(NO_PREFETCH_LARGE_LOOP);
4010       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
4011           /* prfm = */ false, NOT_EQUAL);
4012     } else {
4013       __ push(spilled_regs, sp);
4014       if (SoftwarePrefetchHintDistance >= 0) {
4015         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4016         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4017         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
4018             /* prfm = */ true, NOT_EQUAL);
4019         __ cmp(cnt1, nonPrefetchLoopThreshold);
4020         __ br(__ LT, TAIL);
4021       }
4022       __ bind(NO_PREFETCH_LARGE_LOOP);
4023       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
4024           /* prfm = */ false, NOT_EQUAL);
4025     }
4026     __ bind(TAIL);
4027       __ cbz(cnt1, EQUAL);
4028       __ subs(cnt1, cnt1, wordSize);
4029       __ br(__ LE, POST_LOOP);
4030     __ bind(SMALL_LOOP);
4031       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4032       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4033       __ subs(cnt1, cnt1, wordSize);
4034       __ eor(tmp1, tmp1, tmp2);
4035       __ cbnz(tmp1, NOT_EQUAL);
4036       __ br(__ GT, SMALL_LOOP);
4037     __ bind(POST_LOOP);
4038       __ ldr(tmp1, Address(a1, cnt1));
4039       __ ldr(tmp2, Address(a2, cnt1));
4040       __ eor(tmp1, tmp1, tmp2);
4041       __ cbnz(tmp1, NOT_EQUAL);
4042     __ bind(EQUAL);
4043       __ mov(result, true);
4044     __ bind(NOT_EQUAL);
4045       if (!UseSIMDForArrayEquals) {
4046         __ pop(spilled_regs, sp);
4047       }
4048     __ bind(NOT_EQUAL_NO_POP);
4049     __ leave();
4050     __ ret(lr);
4051     return entry;
4052   }
4053 
4054 
4055   /**
4056    *  Arguments:
4057    *
4058    *  Input:
4059    *  c_rarg0   - current state address
4060    *  c_rarg1   - H key address
4061    *  c_rarg2   - data address
4062    *  c_rarg3   - number of blocks
4063    *
4064    *  Output:
4065    *  Updated state at c_rarg0
4066    */
4067   address generate_ghash_processBlocks() {
4068     // Bafflingly, GCM uses little-endian for the byte order, but
4069     // big-endian for the bit order.  For example, the polynomial 1 is
4070     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4071     //
4072     // So, we must either reverse the bytes in each word and do
4073     // everything big-endian or reverse the bits in each byte and do
4074     // it little-endian.  On AArch64 it's more idiomatic to reverse
4075     // the bits in each byte (we have an instruction, RBIT, to do
4076     // that) and keep the data in little-endian bit order throughout the
4077     // calculation, bit-reversing the inputs and outputs.
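         // Each iteration of the loop below performs one GHASH step:
         //
         //   state = (state ^ data[i]) * H    in GF(2^128), reduced modulo
         //                                    the field polynomial f
         //
         // using ghash_multiply for the carry-less product and ghash_reduce
         // for the reduction.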
4078 
4079     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4080     __ align(wordSize * 2);
4081     address p = __ pc();
4082     __ emit_int64(0x87);  // The low-order bits of the field
4083                           // polynomial (i.e. p = z^7+z^2+z+1)
4084                           // repeated in the low and high parts of a
4085                           // 128-bit vector
4086     __ emit_int64(0x87);
4087 
4088     __ align(CodeEntryAlignment);
4089     address start = __ pc();
4090 
4091     Register state   = c_rarg0;
4092     Register subkeyH = c_rarg1;
4093     Register data    = c_rarg2;
4094     Register blocks  = c_rarg3;
4095 
4096     FloatRegister vzr = v30;
4097     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4098 
4099     __ ldrq(v0, Address(state));
4100     __ ldrq(v1, Address(subkeyH));
4101 
4102     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4103     __ rbit(v0, __ T16B, v0);
4104     __ rev64(v1, __ T16B, v1);
4105     __ rbit(v1, __ T16B, v1);
4106 
4107     __ ldrq(v26, p);
4108 
4109     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4110     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4111 
4112     {
4113       Label L_ghash_loop;
4114       __ bind(L_ghash_loop);
4115 
4116       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4117                                                  // reversing each byte
4118       __ rbit(v2, __ T16B, v2);
4119       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4120 
4121       // Multiply state in v2 by subkey in v1
4122       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4123                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4124                      /*temps*/v6, v20, v18, v21);
4125       // Reduce v7:v5 by the field polynomial
4126       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4127 
4128       __ sub(blocks, blocks, 1);
4129       __ cbnz(blocks, L_ghash_loop);
4130     }
4131 
4132     // The bit-reversed result is at this point in v0
4133     __ rev64(v1, __ T16B, v0);
4134     __ rbit(v1, __ T16B, v1);
4135 
4136     __ st1(v1, __ T16B, state);
4137     __ ret(lr);
4138 
4139     return start;
4140   }
4141 
4142   // Continuation point for throwing of implicit exceptions that are
4143   // not handled in the current activation. Fabricates an exception
4144   // oop and initiates normal exception dispatching in this
4145   // frame. Since we need to preserve callee-saved values (currently
4146   // only for C2, but done for C1 as well) we need a callee-saved oop
4147   // map and therefore have to make these stubs into RuntimeStubs
4148   // rather than BufferBlobs.  If the compiler needs all registers to
4149   // be preserved between the fault point and the exception handler
4150   // then it must assume responsibility for that in
4151   // AbstractCompiler::continuation_for_implicit_null_exception or
4152   // continuation_for_implicit_division_by_zero_exception. All other
4153   // implicit exceptions (e.g., NullPointerException or
4154   // AbstractMethodError on entry) are either at call sites or
4155   // otherwise assume that stack unwinding will be initiated, so
4156   // caller saved registers were assumed volatile in the compiler.
4157 
4158 #undef __
4159 #define __ masm->
4160 
4161   address generate_throw_exception(const char* name,
4162                                    address runtime_entry,
4163                                    Register arg1 = noreg,
4164                                    Register arg2 = noreg) {
4165     // Information about frame layout at time of blocking runtime call.
4166     // Note that we only have to preserve callee-saved registers since
4167     // the compilers are responsible for supplying a continuation point
4168     // if they expect all registers to be preserved.
4169     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4170     enum layout {
4171       rfp_off = 0,
4172       rfp_off2,
4173       return_off,
4174       return_off2,
4175       framesize // inclusive of return address
4176     };
4177 
4178     int insts_size = 512;
4179     int locs_size  = 64;
4180 
4181     CodeBuffer code(name, insts_size, locs_size);
4182     OopMapSet* oop_maps  = new OopMapSet();
4183     MacroAssembler* masm = new MacroAssembler(&code);
4184 
4185     address start = __ pc();
4186 
4187     // This is an inlined and slightly modified version of call_VM
4188     // which has the ability to fetch the return PC out of
4189     // thread-local storage and also sets up last_Java_sp slightly
4190     // differently than the real call_VM
4191 
4192     __ enter(); // Save FP and LR before call
4193 
4194     assert(is_even(framesize/2), "sp not 16-byte aligned");
4195 
4196     // lr and fp are already in place
4197     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4198 
4199     int frame_complete = __ pc() - start;
4200 
4201     // Set up last_Java_sp and last_Java_fp
4202     address the_pc = __ pc();
4203     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
4204 
4205     // Call runtime
4206     if (arg1 != noreg) {
4207       assert(arg2 != c_rarg1, "clobbered");
4208       __ mov(c_rarg1, arg1);
4209     }
4210     if (arg2 != noreg) {
4211       __ mov(c_rarg2, arg2);
4212     }
4213     __ mov(c_rarg0, rthread);
4214     BLOCK_COMMENT("call runtime_entry");
4215     __ mov(rscratch1, runtime_entry);
4216     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
4217 
4218     // Generate oop map
4219     OopMap* map = new OopMap(framesize, 0);
4220 
4221     oop_maps->add_gc_map(the_pc - start, map);
4222 
4223     __ reset_last_Java_frame(true);
4224     __ maybe_isb();
4225 
4226     __ leave();
4227 
4228     // check for pending exceptions
4229 #ifdef ASSERT
4230     Label L;
4231     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4232     __ cbnz(rscratch1, L);
4233     __ should_not_reach_here();
4234     __ bind(L);
4235 #endif // ASSERT
4236     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4237 
4238 
4239     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4240     RuntimeStub* stub =
4241       RuntimeStub::new_runtime_stub(name,
4242                                     &code,
4243                                     frame_complete,
4244                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4245                                     oop_maps, false);
4246     return stub->entry_point();
4247   }
4248 
4249   class MontgomeryMultiplyGenerator : public MacroAssembler {
4250 
4251     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4252       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4253 
4254     RegSet _toSave;
4255     bool _squaring;
4256 
4257   public:
4258     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4259       : MacroAssembler(as->code()), _squaring(squaring) {
4260 
4261       // Register allocation
4262 
4263       Register reg = c_rarg0;
4264       Pa_base = reg;       // Argument registers
4265       if (squaring)
4266         Pb_base = Pa_base;
4267       else
4268         Pb_base = ++reg;
4269       Pn_base = ++reg;
4270       Rlen= ++reg;
4271       inv = ++reg;
4272       Pm_base = ++reg;
4273 
4274                           // Working registers:
4275       Ra =  ++reg;        // The current digit of a, b, n, and m.
4276       Rb =  ++reg;
4277       Rm =  ++reg;
4278       Rn =  ++reg;
4279 
4280       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4281       Pb =  ++reg;
4282       Pm =  ++reg;
4283       Pn =  ++reg;
4284 
4285       t0 =  ++reg;        // Three registers which form a
4286       t1 =  ++reg;        // triple-precision accumulator.
4287       t2 =  ++reg;
4288 
4289       Ri =  ++reg;        // Inner and outer loop indexes.
4290       Rj =  ++reg;
4291 
4292       Rhi_ab = ++reg;     // Product registers: low and high parts
4293       Rlo_ab = ++reg;     // of a*b and m*n.
4294       Rhi_mn = ++reg;
4295       Rlo_mn = ++reg;
4296 
4297       // r19 and up are callee-saved.
4298       _toSave = RegSet::range(r19, reg) + Pm_base;
4299     }
4300 
4301   private:
4302     void save_regs() {
4303       push(_toSave, sp);
4304     }
4305 
4306     void restore_regs() {
4307       pop(_toSave, sp);
4308     }
4309 
4310     template <typename T>
4311     void unroll_2(Register count, T block) {
4312       Label loop, end, odd;
4313       tbnz(count, 0, odd);
4314       cbz(count, end);
4315       align(16);
4316       bind(loop);
4317       (this->*block)();
4318       bind(odd);
4319       (this->*block)();
4320       subs(count, count, 2);
4321       br(Assembler::GT, loop);
4322       bind(end);
4323     }
4324 
4325     template <typename T>
4326     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4327       Label loop, end, odd;
4328       tbnz(count, 0, odd);
4329       cbz(count, end);
4330       align(16);
4331       bind(loop);
4332       (this->*block)(d, s, tmp);
4333       bind(odd);
4334       (this->*block)(d, s, tmp);
4335       subs(count, count, 2);
4336       br(Assembler::GT, loop);
4337       bind(end);
4338     }
4339 
4340     void pre1(RegisterOrConstant i) {
4341       block_comment("pre1");
4342       // Pa = Pa_base;
4343       // Pb = Pb_base + i;
4344       // Pm = Pm_base;
4345       // Pn = Pn_base + i;
4346       // Ra = *Pa;
4347       // Rb = *Pb;
4348       // Rm = *Pm;
4349       // Rn = *Pn;
4350       ldr(Ra, Address(Pa_base));
4351       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4352       ldr(Rm, Address(Pm_base));
4353       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4354       lea(Pa, Address(Pa_base));
4355       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4356       lea(Pm, Address(Pm_base));
4357       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4358 
4359       // Zero the m*n result.
4360       mov(Rhi_mn, zr);
4361       mov(Rlo_mn, zr);
4362     }
4363 
4364     // The core multiply-accumulate step of a Montgomery
4365     // multiplication.  The idea is to schedule operations as a
4366     // pipeline so that instructions with long latencies (loads and
4367     // multiplies) have time to complete before their results are
4368     // used.  This benefits in-order implementations of the
4369     // architecture the most, but out-of-order ones benefit as well.
4370     void step() {
4371       block_comment("step");
4372       // MACC(Ra, Rb, t0, t1, t2);
4373       // Ra = *++Pa;
4374       // Rb = *--Pb;
4375       umulh(Rhi_ab, Ra, Rb);
4376       mul(Rlo_ab, Ra, Rb);
4377       ldr(Ra, pre(Pa, wordSize));
4378       ldr(Rb, pre(Pb, -wordSize));
4379       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
4380                                        // previous iteration.
4381       // MACC(Rm, Rn, t0, t1, t2);
4382       // Rm = *++Pm;
4383       // Rn = *--Pn;
4384       umulh(Rhi_mn, Rm, Rn);
4385       mul(Rlo_mn, Rm, Rn);
4386       ldr(Rm, pre(Pm, wordSize));
4387       ldr(Rn, pre(Pn, -wordSize));
4388       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4389     }
4390 
4391     void post1() {
4392       block_comment("post1");
4393 
4394       // MACC(Ra, Rb, t0, t1, t2);
4395       // Ra = *++Pa;
4396       // Rb = *--Pb;
4397       umulh(Rhi_ab, Ra, Rb);
4398       mul(Rlo_ab, Ra, Rb);
4399       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4400       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4401 
4402       // *Pm = Rm = t0 * inv;
4403       mul(Rm, t0, inv);
4404       str(Rm, Address(Pm));
4405 
4406       // MACC(Rm, Rn, t0, t1, t2);
4407       // t0 = t1; t1 = t2; t2 = 0;
4408       umulh(Rhi_mn, Rm, Rn);
4409 
4410 #ifndef PRODUCT
4411       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4412       {
4413         mul(Rlo_mn, Rm, Rn);
4414         add(Rlo_mn, t0, Rlo_mn);
4415         Label ok;
4416         cbz(Rlo_mn, ok); {
4417           stop("broken Montgomery multiply");
4418         } bind(ok);
4419       }
4420 #endif
4421       // We have very carefully set things up so that
4422       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4423       // the lower half of Rm * Rn because we know the result already:
4424       // it must be -t0.  t0 + (-t0) must generate a carry iff
4425       // t0 != 0.  So, rather than do a mul and an adds we just set
4426       // the carry flag iff t0 is nonzero.
4427       //
4428       // mul(Rlo_mn, Rm, Rn);
4429       // adds(zr, t0, Rlo_mn);
4430       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4431       adcs(t0, t1, Rhi_mn);
4432       adc(t1, t2, zr);
4433       mov(t2, zr);
4434     }
4435 
4436     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
4437       block_comment("pre2");
4438       // Pa = Pa_base + i-len;
4439       // Pb = Pb_base + len;
4440       // Pm = Pm_base + i-len;
4441       // Pn = Pn_base + len;
4442 
4443       if (i.is_register()) {
4444         sub(Rj, i.as_register(), len);
4445       } else {
4446         mov(Rj, i.as_constant());
4447         sub(Rj, Rj, len);
4448       }
4449       // Rj == i-len
4450 
4451       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
4452       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
4453       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4454       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
4455 
4456       // Ra = *++Pa;
4457       // Rb = *--Pb;
4458       // Rm = *++Pm;
4459       // Rn = *--Pn;
4460       ldr(Ra, pre(Pa, wordSize));
4461       ldr(Rb, pre(Pb, -wordSize));
4462       ldr(Rm, pre(Pm, wordSize));
4463       ldr(Rn, pre(Pn, -wordSize));
4464 
4465       mov(Rhi_mn, zr);
4466       mov(Rlo_mn, zr);
4467     }
4468 
4469     void post2(RegisterOrConstant i, RegisterOrConstant len) {
4470       block_comment("post2");
4471       if (i.is_constant()) {
4472         mov(Rj, i.as_constant()-len.as_constant());
4473       } else {
4474         sub(Rj, i.as_register(), len);
4475       }
4476 
4477       adds(t0, t0, Rlo_mn); // The pending m*n, low part
4478 
4479       // As soon as we know the least significant digit of our result,
4480       // store it.
4481       // Pm_base[i-len] = t0;
4482       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4483 
4484       // t0 = t1; t1 = t2; t2 = 0;
4485       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
4486       adc(t1, t2, zr);
4487       mov(t2, zr);
4488     }
4489 
4490     // A carry in t0 after Montgomery multiplication means that we
4491     // should subtract multiples of n from our result in m.  We'll
4492     // keep doing that until there is no carry.
4493     void normalize(RegisterOrConstant len) {
4494       block_comment("normalize");
4495       // while (t0)
4496       //   t0 = sub(Pm_base, Pn_base, t0, len);
4497       Label loop, post, again;
4498       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
4499       cbz(t0, post); {
4500         bind(again); {
4501           mov(i, zr);
4502           mov(cnt, len);
4503           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4504           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4505           subs(zr, zr, zr); // set carry flag, i.e. no borrow
4506           align(16);
4507           bind(loop); {
4508             sbcs(Rm, Rm, Rn);
4509             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4510             add(i, i, 1);
4511             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4512             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4513             sub(cnt, cnt, 1);
4514           } cbnz(cnt, loop);
4515           sbc(t0, t0, zr);
4516         } cbnz(t0, again);
4517       } bind(post);
4518     }
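         // For reference (not in the original source): a plausible C
         // rendering of the sub() referenced in the pseudocode above.  It
         // matches what the generated loop does: a multi-word subtract of
         // n from m with borrow propagation, with the final borrow then
         // deducted from t0.
         //
         //   unsigned long sub(unsigned long Pm[], unsigned long Pn[],
         //                     unsigned long t0, int len) {
         //     unsigned long borrow = 0;
         //     for (int i = 0; i < len; i++) {
         //       unsigned long a = Pm[i], b = Pn[i];
         //       Pm[i] = a - b - borrow;
         //       borrow = (a < b) || (a == b && borrow);
         //     }
         //     return t0 - borrow;
         //   }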
4519 
4520     // Move memory at s to d, reversing words.
4521     //    Increments d to end of copied memory
4522     //    Destroys tmp1, tmp2
4523     //    Preserves len
4524     //    Leaves s pointing to the address which was in d at start
4525     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
4526       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
4527 
4528       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
4529       mov(tmp1, len);
4530       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
4531       sub(s, d, len, ext::uxtw, LogBytesPerWord);
4532     }
4533     // where
4534     void reverse1(Register d, Register s, Register tmp) {
4535       ldr(tmp, pre(s, -wordSize));
4536       ror(tmp, tmp, 32);
4537       str(tmp, post(d, wordSize));
4538     }
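         // In C, approximately (not in the original source; d and s play
         // the roles described in the comment above reverse()):
         //
         //   void reverse(unsigned long *&d, unsigned long *&s, int len) {
         //     unsigned long *src = s + len;
         //     unsigned long *dst0 = d;
         //     for (int i = 0; i < len; i++) {
         //       unsigned long w = *--src;       // read source last-to-first
         //       *d++ = (w << 32) | (w >> 32);   // ror(w, 32)
         //     }
         //     s = dst0;  // s ends up holding the address that was in d
         //   }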
4539 
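         // Squaring support.  When squaring, a[j]*a[i-j] and a[i-j]*a[j]
         // are the same product, so step_squaring() computes it once and
         // accumulates it twice (MACC2 in the pseudocode below);
         // extra_step_squaring() handles the remaining m*n terms on their
         // own, and last_squaring() adds the diagonal term a[i/2]*a[i/2]
         // once when i is even (Ra and Rb both hold that digit then).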
4540     void step_squaring() {
4541       // An extra ACC
4542       step();
4543       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4544     }
4545 
4546     void last_squaring(RegisterOrConstant i) {
4547       Label dont;
4548       // if ((i & 1) == 0) {
4549       tbnz(i.as_register(), 0, dont); {
4550         // MACC(Ra, Rb, t0, t1, t2);
4551         // Ra = *++Pa;
4552         // Rb = *--Pb;
4553         umulh(Rhi_ab, Ra, Rb);
4554         mul(Rlo_ab, Ra, Rb);
4555         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4556       } bind(dont);
4557     }
4558 
4559     void extra_step_squaring() {
4560       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4561 
4562       // MACC(Rm, Rn, t0, t1, t2);
4563       // Rm = *++Pm;
4564       // Rn = *--Pn;
4565       umulh(Rhi_mn, Rm, Rn);
4566       mul(Rlo_mn, Rm, Rn);
4567       ldr(Rm, pre(Pm, wordSize));
4568       ldr(Rn, pre(Pn, -wordSize));
4569     }
4570 
4571     void post1_squaring() {
4572       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4573 
4574       // *Pm = Rm = t0 * inv;
4575       mul(Rm, t0, inv);
4576       str(Rm, Address(Pm));
4577 
4578       // MACC(Rm, Rn, t0, t1, t2);
4579       // t0 = t1; t1 = t2; t2 = 0;
4580       umulh(Rhi_mn, Rm, Rn);
4581 
4582 #ifndef PRODUCT
4583       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4584       {
4585         mul(Rlo_mn, Rm, Rn);
4586         add(Rlo_mn, t0, Rlo_mn);
4587         Label ok;
4588         cbz(Rlo_mn, ok); {
4589           stop("broken Montgomery multiply");
4590         } bind(ok);
4591       }
4592 #endif
4593       // We have very carefully set things up so that
4594       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4595       // the lower half of Rm * Rn because we know the result already:
4596       // it must be -t0.  t0 + (-t0) must generate a carry iff
4597       // t0 != 0.  So, rather than do a mul and an adds we just set
4598       // the carry flag iff t0 is nonzero.
4599       //
4600       // mul(Rlo_mn, Rm, Rn);
4601       // adds(zr, t0, Rlo_mn);
4602       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4603       adcs(t0, t1, Rhi_mn);
4604       adc(t1, t2, zr);
4605       mov(t2, zr);
4606     }
4607 
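         // Add the 128-bit product Rhi:Rlo into the 192-bit accumulator
         // t2:t1:t0.  This is the accumulate half of the MACC step shown
         // in the C pseudocode below.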
4608     void acc(Register Rhi, Register Rlo,
4609              Register t0, Register t1, Register t2) {
4610       adds(t0, t0, Rlo);
4611       adcs(t1, t1, Rhi);
4612       adc(t2, t2, zr);
4613     }
4614 
4615   public:
4616     /**
4617      * Fast Montgomery multiplication.  The derivation of the
4618      * algorithm is in A Cryptographic Library for the Motorola
4619      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
4620      *
4621      * Arguments:
4622      *
4623      * Inputs for multiplication:
4624      *   c_rarg0   - int array elements a
4625      *   c_rarg1   - int array elements b
4626      *   c_rarg2   - int array elements n (the modulus)
4627      *   c_rarg3   - int length
4628      *   c_rarg4   - int inv
4629      *   c_rarg5   - int array elements m (the result)
4630      *
4631      * Inputs for squaring:
4632      *   c_rarg0   - int array elements a
4633      *   c_rarg1   - int array elements n (the modulus)
4634      *   c_rarg2   - int length
4635      *   c_rarg3   - int inv
4636      *   c_rarg4   - int array elements m (the result)
4637      *
4638      */
4639     address generate_multiply() {
4640       Label argh, nothing;
4641       bind(argh);
4642       stop("MontgomeryMultiply total_allocation must be <= 8192");
4643 
4644       align(CodeEntryAlignment);
4645       address entry = pc();
4646 
4647       cbzw(Rlen, nothing);
4648 
4649       enter();
4650 
4651       // Make room.
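           // Rlen is still the length in ints here.  The scratch area is
           // Rlen * 4 * sizeof(jint) = Rlen * 16 bytes: room for reversed
           // copies of a, b and n plus the result m.  With Rlen capped at
           // 512 that is at most 8192 bytes, and sp is realigned down to a
           // 16-byte boundary.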
4652       cmpw(Rlen, 512);
4653       br(Assembler::HI, argh);
4654       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4655       andr(sp, Ra, -2 * wordSize);
4656 
4657       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4658 
4659       {
4660         // Copy input args, reversing as we go.  We use Ra as a
4661         // temporary variable.
4662         reverse(Ra, Pa_base, Rlen, t0, t1);
4663         if (!_squaring)
4664           reverse(Ra, Pb_base, Rlen, t0, t1);
4665         reverse(Ra, Pn_base, Rlen, t0, t1);
4666       }
4667 
4668       // Push all call-saved registers and also Pm_base which we'll need
4669       // at the end.
4670       save_regs();
4671 
4672 #ifndef PRODUCT
4673       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
4674       {
4675         ldr(Rn, Address(Pn_base, 0));
4676         mul(Rlo_mn, Rn, inv);
4677         cmp(Rlo_mn, -1);
4678         Label ok;
4679         br(EQ, ok); {
4680           stop("broken inverse in Montgomery multiply");
4681         } bind(ok);
4682       }
4683 #endif
4684 
4685       mov(Pm_base, Ra);
4686 
4687       mov(t0, zr);
4688       mov(t1, zr);
4689       mov(t2, zr);
4690 
4691       block_comment("for (int i = 0; i < len; i++) {");
4692       mov(Ri, zr); {
4693         Label loop, end;
4694         cmpw(Ri, Rlen);
4695         br(Assembler::GE, end);
4696 
4697         bind(loop);
4698         pre1(Ri);
4699 
4700         block_comment("  for (j = i; j; j--) {"); {
4701           movw(Rj, Ri);
4702           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4703         } block_comment("  } // j");
4704 
4705         post1();
4706         addw(Ri, Ri, 1);
4707         cmpw(Ri, Rlen);
4708         br(Assembler::LT, loop);
4709         bind(end);
4710         block_comment("} // i");
4711       }
4712 
4713       block_comment("for (int i = len; i < 2*len; i++) {");
4714       mov(Ri, Rlen); {
4715         Label loop, end;
4716         cmpw(Ri, Rlen, Assembler::LSL, 1);
4717         br(Assembler::GE, end);
4718 
4719         bind(loop);
4720         pre2(Ri, Rlen);
4721 
4722         block_comment("  for (j = len*2-i-1; j; j--) {"); {
4723           lslw(Rj, Rlen, 1);
4724           subw(Rj, Rj, Ri);
4725           subw(Rj, Rj, 1);
4726           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4727         } block_comment("  } // j");
4728 
4729         post2(Ri, Rlen);
4730         addw(Ri, Ri, 1);
4731         cmpw(Ri, Rlen, Assembler::LSL, 1);
4732         br(Assembler::LT, loop);
4733         bind(end);
4734       }
4735       block_comment("} // i");
4736 
4737       normalize(Rlen);
4738 
4739       mov(Ra, Pm_base);  // Save Pm_base in Ra
4740       restore_regs();  // Restore caller's Pm_base
4741 
4742       // Copy our result into caller's Pm_base
4743       reverse(Pm_base, Ra, Rlen, t0, t1);
4744 
4745       leave();
4746       bind(nothing);
4747       ret(lr);
4748 
4749       return entry;
4750     }
4751     // In C, approximately:
4752 
4753     // void
4754     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
4755     //                     unsigned long Pn_base[], unsigned long Pm_base[],
4756     //                     unsigned long inv, int len) {
4757     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4758     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4759     //   unsigned long Ra, Rb, Rn, Rm;
4760 
4761     //   int i;
4762 
4763     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4764 
4765     //   for (i = 0; i < len; i++) {
4766     //     int j;
4767 
4768     //     Pa = Pa_base;
4769     //     Pb = Pb_base + i;
4770     //     Pm = Pm_base;
4771     //     Pn = Pn_base + i;
4772 
4773     //     Ra = *Pa;
4774     //     Rb = *Pb;
4775     //     Rm = *Pm;
4776     //     Rn = *Pn;
4777 
4778     //     int iters = i;
4779     //     for (j = 0; iters--; j++) {
4780     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4781     //       MACC(Ra, Rb, t0, t1, t2);
4782     //       Ra = *++Pa;
4783     //       Rb = *--Pb;
4784     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4785     //       MACC(Rm, Rn, t0, t1, t2);
4786     //       Rm = *++Pm;
4787     //       Rn = *--Pn;
4788     //     }
4789 
4790     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
4791     //     MACC(Ra, Rb, t0, t1, t2);
4792     //     *Pm = Rm = t0 * inv;
4793     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4794     //     MACC(Rm, Rn, t0, t1, t2);
4795 
4796     //     assert(t0 == 0, "broken Montgomery multiply");
4797 
4798     //     t0 = t1; t1 = t2; t2 = 0;
4799     //   }
4800 
4801     //   for (i = len; i < 2*len; i++) {
4802     //     int j;
4803 
4804     //     Pa = Pa_base + i-len;
4805     //     Pb = Pb_base + len;
4806     //     Pm = Pm_base + i-len;
4807     //     Pn = Pn_base + len;
4808 
4809     //     Ra = *++Pa;
4810     //     Rb = *--Pb;
4811     //     Rm = *++Pm;
4812     //     Rn = *--Pn;
4813 
4814     //     int iters = len*2-i-1;
4815     //     for (j = i-len+1; iters--; j++) {
4816     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4817     //       MACC(Ra, Rb, t0, t1, t2);
4818     //       Ra = *++Pa;
4819     //       Rb = *--Pb;
4820     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4821     //       MACC(Rm, Rn, t0, t1, t2);
4822     //       Rm = *++Pm;
4823     //       Rn = *--Pn;
4824     //     }
4825 
4826     //     Pm_base[i-len] = t0;
4827     //     t0 = t1; t1 = t2; t2 = 0;
4828     //   }
4829 
4830     //   while (t0)
4831     //     t0 = sub(Pm_base, Pn_base, t0, len);
4832     // }
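         // MACC(A, B, T0, T1, T2) is not spelled out in the pseudocode
         // above.  A plausible reading, matching what step() and acc()
         // generate, is a 64x64->128-bit multiply accumulated into the
         // triple-precision accumulator (T0, T1, T2), e.g.:
         //
         //   #define MACC(A, B, T0, T1, T2) do {                    \
         //     unsigned __int128 p = (unsigned __int128)(A) * (B);  \
         //     unsigned long lo = (unsigned long)p;                 \
         //     unsigned long hi = (unsigned long)(p >> 64);         \
         //     unsigned long c;                                     \
         //     T0 += lo; c  = (T0 < lo);                            \
         //     T1 += c;  c  = (T1 < c);                             \
         //     T1 += hi; c += (T1 < hi);                            \
         //     T2 += c;                                             \
         //   } while (0)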
4833 
4834     /**
4835      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
4836      * multiplies than Montgomery multiplication so it should be up to
4837      * 25% faster.  However, its loop control is more complex and it
4838      * may actually run slower on some machines.
4839      *
4840      * Arguments:
4841      *
4842      * Inputs:
4843      *   c_rarg0   - int array elements a
4844      *   c_rarg1   - int array elements n (the modulus)
4845      *   c_rarg2   - int length
4846      *   c_rarg3   - int inv
4847      *   c_rarg4   - int array elements m (the result)
4848      *
4849      */
4850     address generate_square() {
4851       Label argh;
4852       bind(argh);
4853       stop("MontgomeryMultiply total_allocation must be <= 8192");
4854 
4855       align(CodeEntryAlignment);
4856       address entry = pc();
4857 
4858       enter();
4859 
4860       // Make room.
4861       cmpw(Rlen, 512);
4862       br(Assembler::HI, argh);
4863       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4864       andr(sp, Ra, -2 * wordSize);
4865 
4866       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4867 
4868       {
4869         // Copy input args, reversing as we go.  We use Ra as a
4870         // temporary variable.
4871         reverse(Ra, Pa_base, Rlen, t0, t1);
4872         reverse(Ra, Pn_base, Rlen, t0, t1);
4873       }
4874 
4875       // Push all call-saved registers and also Pm_base which we'll need
4876       // at the end.
4877       save_regs();
4878 
4879       mov(Pm_base, Ra);
4880 
4881       mov(t0, zr);
4882       mov(t1, zr);
4883       mov(t2, zr);
4884 
4885       block_comment("for (int i = 0; i < len; i++) {");
4886       mov(Ri, zr); {
4887         Label loop, end;
4888         bind(loop);
4889         cmp(Ri, Rlen);
4890         br(Assembler::GE, end);
4891 
4892         pre1(Ri);
4893 
4894         block_comment("for (j = (i+1)/2; j; j--) {"); {
4895           add(Rj, Ri, 1);
4896           lsr(Rj, Rj, 1);
4897           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4898         } block_comment("  } // j");
4899 
4900         last_squaring(Ri);
4901 
4902         block_comment("  for (j = i/2; j; j--) {"); {
4903           lsr(Rj, Ri, 1);
4904           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4905         } block_comment("  } // j");
4906 
4907         post1_squaring();
4908         add(Ri, Ri, 1);
4909         cmp(Ri, Rlen);
4910         br(Assembler::LT, loop);
4911 
4912         bind(end);
4913         block_comment("} // i");
4914       }
4915 
4916       block_comment("for (int i = len; i < 2*len; i++) {");
4917       mov(Ri, Rlen); {
4918         Label loop, end;
4919         bind(loop);
4920         cmp(Ri, Rlen, Assembler::LSL, 1);
4921         br(Assembler::GE, end);
4922 
4923         pre2(Ri, Rlen);
4924 
4925         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4926           lsl(Rj, Rlen, 1);
4927           sub(Rj, Rj, Ri);
4928           sub(Rj, Rj, 1);
4929           lsr(Rj, Rj, 1);
4930           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4931         } block_comment("  } // j");
4932 
4933         last_squaring(Ri);
4934 
4935         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4936           lsl(Rj, Rlen, 1);
4937           sub(Rj, Rj, Ri);
4938           lsr(Rj, Rj, 1);
4939           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4940         } block_comment("  } // j");
4941 
4942         post2(Ri, Rlen);
4943         add(Ri, Ri, 1);
4944         cmp(Ri, Rlen, Assembler::LSL, 1);
4945 
4946         br(Assembler::LT, loop);
4947         bind(end);
4948         block_comment("} // i");
4949       }
4950 
4951       normalize(Rlen);
4952 
4953       mov(Ra, Pm_base);  // Save Pm_base in Ra
4954       restore_regs();  // Restore caller's Pm_base
4955 
4956       // Copy our result into caller's Pm_base
4957       reverse(Pm_base, Ra, Rlen, t0, t1);
4958 
4959       leave();
4960       ret(lr);
4961 
4962       return entry;
4963     }
4964     // In C, approximately:
4965 
4966     // void
4967     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4968     //                   unsigned long Pm_base[], unsigned long inv, int len) {
4969     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4970     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4971     //   unsigned long Ra, Rb, Rn, Rm;
4972 
4973     //   int i;
4974 
4975     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4976 
4977     //   for (i = 0; i < len; i++) {
4978     //     int j;
4979 
4980     //     Pa = Pa_base;
4981     //     Pb = Pa_base + i;
4982     //     Pm = Pm_base;
4983     //     Pn = Pn_base + i;
4984 
4985     //     Ra = *Pa;
4986     //     Rb = *Pb;
4987     //     Rm = *Pm;
4988     //     Rn = *Pn;
4989 
4990     //     int iters = (i+1)/2;
4991     //     for (j = 0; iters--; j++) {
4992     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4993     //       MACC2(Ra, Rb, t0, t1, t2);
4994     //       Ra = *++Pa;
4995     //       Rb = *--Pb;
4996     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4997     //       MACC(Rm, Rn, t0, t1, t2);
4998     //       Rm = *++Pm;
4999     //       Rn = *--Pn;
5000     //     }
5001     //     if ((i & 1) == 0) {
5002     //       assert(Ra == Pa_base[j], "must be");
5003     //       MACC(Ra, Ra, t0, t1, t2);
5004     //     }
5005     //     iters = i/2;
5006     //     assert(iters == i-j, "must be");
5007     //     for (; iters--; j++) {
5008     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5009     //       MACC(Rm, Rn, t0, t1, t2);
5010     //       Rm = *++Pm;
5011     //       Rn = *--Pn;
5012     //     }
5013 
5014     //     *Pm = Rm = t0 * inv;
5015     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5016     //     MACC(Rm, Rn, t0, t1, t2);
5017 
5018     //     assert(t0 == 0, "broken Montgomery multiply");
5019 
5020     //     t0 = t1; t1 = t2; t2 = 0;
5021     //   }
5022 
5023     //   for (i = len; i < 2*len; i++) {
5024     //     int start = i-len+1;
5025     //     int end = start + (len - start)/2;
5026     //     int j;
5027 
5028     //     Pa = Pa_base + i-len;
5029     //     Pb = Pa_base + len;
5030     //     Pm = Pm_base + i-len;
5031     //     Pn = Pn_base + len;
5032 
5033     //     Ra = *++Pa;
5034     //     Rb = *--Pb;
5035     //     Rm = *++Pm;
5036     //     Rn = *--Pn;
5037 
5038     //     int iters = (2*len-i-1)/2;
5039     //     assert(iters == end-start, "must be");
5040     //     for (j = start; iters--; j++) {
5041     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5042     //       MACC2(Ra, Rb, t0, t1, t2);
5043     //       Ra = *++Pa;
5044     //       Rb = *--Pb;
5045     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5046     //       MACC(Rm, Rn, t0, t1, t2);
5047     //       Rm = *++Pm;
5048     //       Rn = *--Pn;
5049     //     }
5050     //     if ((i & 1) == 0) {
5051     //       assert(Ra == Pa_base[j], "must be");
5052     //       MACC(Ra, Ra, t0, t1, t2);
5053     //     }
5054     //     iters =  (2*len-i)/2;
5055     //     assert(iters == len-j, "must be");
5056     //     for (; iters--; j++) {
5057     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5058     //       MACC(Rm, Rn, t0, t1, t2);
5059     //       Rm = *++Pm;
5060     //       Rn = *--Pn;
5061     //     }
5062     //     Pm_base[i-len] = t0;
5063     //     t0 = t1; t1 = t2; t2 = 0;
5064     //   }
5065 
5066     //   while (t0)
5067     //     t0 = sub(Pm_base, Pn_base, t0, len);
5068     // }
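         // MACC2 above is the same multiply-accumulate as MACC (see the
         // sketch after montgomery_multiply) with the product added twice,
         // which is what step_squaring() does with its extra acc().
         // Halving the a-side multiplies this way is where the roughly 25%
         // saving quoted above comes from: about 1.5*len^2 multiplies
         // instead of 2*len^2.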
5069   };
5070 
5071 
5072   // Initialization
5073   void generate_initial() {
5074     // Generates the initial stubs and initializes the entry points
5075 
5076     // Entry points that exist on all platforms.  Note: This is code
5077     // that could be shared among different platforms; however, the
5078     // benefit seems to be smaller than the disadvantage of having a
5079     // much more complicated generator structure.  See also the comment
5080     // in stubRoutines.hpp.
5081 
5082     StubRoutines::_forward_exception_entry = generate_forward_exception();
5083 
5084     StubRoutines::_call_stub_entry =
5085       generate_call_stub(StubRoutines::_call_stub_return_address);
5086 
5087     // is referenced by megamorphic call
5088     StubRoutines::_catch_exception_entry = generate_catch_exception();
5089 
5090     // Build this early so it's available for the interpreter.
5091     StubRoutines::_throw_StackOverflowError_entry =
5092       generate_throw_exception("StackOverflowError throw_exception",
5093                                CAST_FROM_FN_PTR(address,
5094                                                 SharedRuntime::throw_StackOverflowError));
5095     StubRoutines::_throw_delayed_StackOverflowError_entry =
5096       generate_throw_exception("delayed StackOverflowError throw_exception",
5097                                CAST_FROM_FN_PTR(address,
5098                                                 SharedRuntime::throw_delayed_StackOverflowError));
5099     if (UseCRC32Intrinsics) {
5100       // Set the CRC table address before generating the stub that uses it
5101       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5102       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5103     }
5104 
5105     if (UseCRC32CIntrinsics) {
5106       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5107     }
5108   }
5109 
5110   void generate_all() {
5111     // support for verify_oop (must happen after universe_init)
5112     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
5113     StubRoutines::_throw_AbstractMethodError_entry =
5114       generate_throw_exception("AbstractMethodError throw_exception",
5115                                CAST_FROM_FN_PTR(address,
5116                                                 SharedRuntime::
5117                                                 throw_AbstractMethodError));
5118 
5119     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5120       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5121                                CAST_FROM_FN_PTR(address,
5122                                                 SharedRuntime::
5123                                                 throw_IncompatibleClassChangeError));
5124 
5125     StubRoutines::_throw_NullPointerException_at_call_entry =
5126       generate_throw_exception("NullPointerException at call throw_exception",
5127                                CAST_FROM_FN_PTR(address,
5128                                                 SharedRuntime::
5129                                                 throw_NullPointerException_at_call));
5130 
5131     // arraycopy stubs used by compilers
5132     generate_arraycopy_stubs();
5133 
5134     // has negatives stub for large arrays.
5135     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5136 
5137     // array equals stub for large arrays.
5138     if (!UseSimpleArrayEquals) {
5139       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5140     }
5141 
5142     if (UseMultiplyToLenIntrinsic) {
5143       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5144     }
5145 
5146     if (UseSquareToLenIntrinsic) {
5147       StubRoutines::_squareToLen = generate_squareToLen();
5148     }
5149 
5150     if (UseMulAddIntrinsic) {
5151       StubRoutines::_mulAdd = generate_mulAdd();
5152     }
5153 
5154     if (UseMontgomeryMultiplyIntrinsic) {
5155       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5156       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5157       StubRoutines::_montgomeryMultiply = g.generate_multiply();
5158     }
5159 
5160     if (UseMontgomerySquareIntrinsic) {
5161       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5162       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5163       // We use generate_multiply() rather than generate_square()
5164       // because it's faster for the sizes of modulus we care about.
5165       StubRoutines::_montgomerySquare = g.generate_multiply();
5166     }
5167 
5168 #if INCLUDE_SHENANDOAHGC
5169     if (UseShenandoahGC && (ShenandoahWriteBarrier || ShenandoahStoreValEnqueueBarrier)) {
5170       StubRoutines::aarch64::_shenandoah_wb = generate_shenandoah_wb(false, true);
5171       StubRoutines::_shenandoah_wb_C = generate_shenandoah_wb(true, !ShenandoahWriteBarrierCsetTestInIR);
5172     }
5173 #endif
5174 
5175 #ifndef BUILTIN_SIM
5176     // generate GHASH intrinsics code
5177     if (UseGHASHIntrinsics) {
5178       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5179     }
5180 
5181     if (UseAESIntrinsics) {
5182       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5183       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5184       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5185       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
5186     }
5187 
5188     if (UseSHA1Intrinsics) {
5189       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5190       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5191     }
5192     if (UseSHA256Intrinsics) {
5193       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5194       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5195     }
5196 
5197     // generate Adler32 intrinsics code
5198     if (UseAdler32Intrinsics) {
5199       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
5200     }
5201 
5202     // Safefetch stubs.
5203     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5204                                                        &StubRoutines::_safefetch32_fault_pc,
5205                                                        &StubRoutines::_safefetch32_continuation_pc);
5206     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5207                                                        &StubRoutines::_safefetchN_fault_pc,
5208                                                        &StubRoutines::_safefetchN_continuation_pc);
5209 #endif
5210     StubRoutines::aarch64::set_completed();
5211   }
5212 
5213  public:
5214   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5215     if (all) {
5216       generate_all();
5217     } else {
5218       generate_initial();
5219     }
5220   }
5221 }; // end class declaration
5222 
5223 void StubGenerator_generate(CodeBuffer* code, bool all) {
5224   StubGenerator g(code, all);
5225 }