1 /*
   2  * Copyright (c) 2013, Red Hat Inc.
   3  * Copyright (c) 2003, 2011, Oracle and/or its affiliates.
   4  * All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "precompiled.hpp"
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "gc_implementation/shenandoah/shenandoahBarrierSet.hpp"
  31 #include "gc_implementation/shenandoah/shenandoahBrooksPointer.hpp"
  32 #include "gc_implementation/shenandoah/shenandoahHeap.hpp"
  33 #include "gc_implementation/shenandoah/shenandoahHeapRegion.hpp"
  34 #include "interpreter/interpreter.hpp"
  35 #include "nativeInst_aarch64.hpp"
  36 #include "oops/instanceOop.hpp"
  37 #include "oops/method.hpp"
  38 #include "oops/objArrayKlass.hpp"
  39 #include "oops/oop.inline.hpp"
  40 #include "prims/methodHandles.hpp"
  41 #include "runtime/frame.inline.hpp"
  42 #include "runtime/handles.inline.hpp"
  43 #include "runtime/sharedRuntime.hpp"
  44 #include "runtime/stubCodeGenerator.hpp"
  45 #include "runtime/stubRoutines.hpp"
  46 #include "runtime/thread.inline.hpp"
  47 #include "utilities/top.hpp"
  48 
  49 #include "stubRoutines_aarch64.hpp"
  50 
  51 #ifdef COMPILER2
  52 #include "opto/runtime.hpp"
  53 #endif
  54 
  55 #ifdef BUILTIN_SIM
  56 #include "../../../../../../simulator/simulator.hpp"
  57 #endif
  58 
  59 // Declaration and definition of StubGenerator (no .hpp file).
  60 // For a more detailed description of the stub routine structure
  61 // see the comment in stubRoutines.hpp
  62 
  63 #undef __
  64 #define __ _masm->
  65 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  66 
  67 #ifdef PRODUCT
  68 #define BLOCK_COMMENT(str) /* nothing */
  69 #else
  70 #define BLOCK_COMMENT(str) __ block_comment(str)
  71 #endif
  72 
  73 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  74 
  75 // Stub Code definitions
  76 
  77 class StubGenerator: public StubCodeGenerator {
  78  private:
  79 
  80 #ifdef PRODUCT
  81 #define inc_counter_np(counter) ((void)0)
  82 #else
  83   void inc_counter_np_(int& counter) {
  84     __ lea(rscratch2, ExternalAddress((address)&counter));
  85     __ ldrw(rscratch1, Address(rscratch2));
  86     __ addw(rscratch1, rscratch1, 1);
  87     __ strw(rscratch1, Address(rscratch2));
  88   }
  89 #define inc_counter_np(counter) \
  90   BLOCK_COMMENT("inc_counter " #counter); \
  91   inc_counter_np_(counter);
  92 #endif
  93 
  94   // Call stubs are used to call Java from C
  95   //
  96   // Arguments:
  97   //    c_rarg0:   call wrapper address                   address
  98   //    c_rarg1:   result                                 address
  99   //    c_rarg2:   result type                            BasicType
 100   //    c_rarg3:   method                                 Method*
 101   //    c_rarg4:   (interpreter) entry point              address
 102   //    c_rarg5:   parameters                             intptr_t*
 103   //    c_rarg6:   parameter size (in words)              int
 104   //    c_rarg7:   thread                                 Thread*
 105   //
 106   // There is no return from the stub itself as any Java result
 107   // is written to result
 108   //
 109   // we save r30 (lr) as the return PC at the base of the frame and
 110   // link r29 (fp) below it as the frame pointer installing sp (r31)
 111   // into fp.
 112   //
 113   // we save r0-r7, which accounts for all the c arguments.
 114   //
 115   // TODO: strictly do we need to save them all? they are treated as
 116   // volatile by C so could we omit saving the ones we are going to
 117   // place in global registers (thread? method?) or those we only use
 118   // during setup of the Java call?
 119   //
 120   // we don't need to save r8 which C uses as an indirect result location
 121   // return register.
 122   //
 123   // we don't need to save r9-r15 which both C and Java treat as
 124   // volatile
 125   //
  // we don't need to save r16-r18 because Java does not use them
 127   //
 128   // we save r19-r28 which Java uses as scratch registers and C
 129   // expects to be callee-save
 130   //
 131   // we save the bottom 64 bits of each value stored in v8-v15; it is
 132   // the responsibility of the caller to preserve larger values.
 133   //
 134   // so the stub frame looks like this when we enter Java code
 135   //
 136   //     [ return_from_Java     ] <--- sp
 137   //     [ argument word n      ]
 138   //      ...
 139   // -27 [ argument word 1      ]
 140   // -26 [ saved v15            ] <--- sp_after_call
 141   // -25 [ saved v14            ]
 142   // -24 [ saved v13            ]
 143   // -23 [ saved v12            ]
 144   // -22 [ saved v11            ]
 145   // -21 [ saved v10            ]
 146   // -20 [ saved v9             ]
 147   // -19 [ saved v8             ]
 148   // -18 [ saved r28            ]
 149   // -17 [ saved r27            ]
 150   // -16 [ saved r26            ]
 151   // -15 [ saved r25            ]
 152   // -14 [ saved r24            ]
 153   // -13 [ saved r23            ]
 154   // -12 [ saved r22            ]
 155   // -11 [ saved r21            ]
 156   // -10 [ saved r20            ]
 157   //  -9 [ saved r19            ]
 158   //  -8 [ call wrapper    (r0) ]
 159   //  -7 [ result          (r1) ]
 160   //  -6 [ result type     (r2) ]
 161   //  -5 [ method          (r3) ]
 162   //  -4 [ entry point     (r4) ]
 163   //  -3 [ parameters      (r5) ]
 164   //  -2 [ parameter size  (r6) ]
 165   //  -1 [ thread (r7)          ]
 166   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 167   //   1 [ saved lr       (r30) ]
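  //
  // For reference, the C++ caller sees this stub through a function
  // pointer whose signature is (roughly) the CallStub typedef from
  // stubRoutines.hpp; its eight arguments land in c_rarg0..c_rarg7 above:
  //
  //   typedef void (*CallStub)(address   link,            // JavaCallWrapper
  //                            intptr_t* result,
  //                            BasicType result_type,
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            TRAPS);                     // Thread*
  //
  // JavaCalls::call_helper invokes StubRoutines::call_stub() with exactly
  // these arguments, so every Java call made from the VM funnels through
  // the frame drawn above.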
 168 
 169   // Call stub stack layout word offsets from fp
 170   enum call_stub_layout {
 171     sp_after_call_off = -26,
 172 
 173     d15_off            = -26,
 174     d13_off            = -24,
 175     d11_off            = -22,
 176     d9_off             = -20,
 177 
 178     r28_off            = -18,
 179     r26_off            = -16,
 180     r24_off            = -14,
 181     r22_off            = -12,
 182     r20_off            = -10,
 183     call_wrapper_off   =  -8,
 184     result_off         =  -7,
 185     result_type_off    =  -6,
 186     method_off         =  -5,
 187     entry_point_off    =  -4,
 188     parameter_size_off =  -2,
 189     thread_off         =  -1,
 190     fp_f               =   0,
 191     retaddr_off        =   1,
 192   };
 193 
 194   address generate_call_stub(address& return_address) {
 195     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 196            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 197            "adjust this code");
 198 
 199     StubCodeMark mark(this, "StubRoutines", "call_stub");
 200     address start = __ pc();
 201 
 202     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 203 
 204     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 205     const Address result        (rfp, result_off         * wordSize);
 206     const Address result_type   (rfp, result_type_off    * wordSize);
 207     const Address method        (rfp, method_off         * wordSize);
 208     const Address entry_point   (rfp, entry_point_off    * wordSize);
 209     const Address parameter_size(rfp, parameter_size_off * wordSize);
 210 
 211     const Address thread        (rfp, thread_off         * wordSize);
 212 
 213     const Address d15_save      (rfp, d15_off * wordSize);
 214     const Address d13_save      (rfp, d13_off * wordSize);
 215     const Address d11_save      (rfp, d11_off * wordSize);
 216     const Address d9_save       (rfp, d9_off * wordSize);
 217 
 218     const Address r28_save      (rfp, r28_off * wordSize);
 219     const Address r26_save      (rfp, r26_off * wordSize);
 220     const Address r24_save      (rfp, r24_off * wordSize);
 221     const Address r22_save      (rfp, r22_off * wordSize);
 222     const Address r20_save      (rfp, r20_off * wordSize);
 223 
 224     // stub code
 225 
 226     // we need a C prolog to bootstrap the x86 caller into the sim
 227     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 228 
 229     address aarch64_entry = __ pc();
 230 
 231 #ifdef BUILTIN_SIM
 232     // Save sender's SP for stack traces.
 233     __ mov(rscratch1, sp);
 234     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 235 #endif
 236     // set up frame and move sp to end of save area
 237     __ enter();
 238     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 239 
 240     // save register parameters and Java scratch/global registers
 241     // n.b. we save thread even though it gets installed in
 242     // rthread because we want to sanity check rthread later
 243     __ str(c_rarg7,  thread);
 244     __ strw(c_rarg6, parameter_size);
 245     __ stp(c_rarg4, c_rarg5,  entry_point);
 246     __ stp(c_rarg2, c_rarg3,  result_type);
 247     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 248 
 249     __ stp(r20, r19,   r20_save);
 250     __ stp(r22, r21,   r22_save);
 251     __ stp(r24, r23,   r24_save);
 252     __ stp(r26, r25,   r26_save);
 253     __ stp(r28, r27,   r28_save);
 254 
 255     __ stpd(v9,  v8,   d9_save);
 256     __ stpd(v11, v10,  d11_save);
 257     __ stpd(v13, v12,  d13_save);
 258     __ stpd(v15, v14,  d15_save);
 259 
 260     // install Java thread in global register now we have saved
 261     // whatever value it held
 262     __ mov(rthread, c_rarg7);
 263     // And method
 264     __ mov(rmethod, c_rarg3);
 265 
 266     // set up the heapbase register
 267     __ reinit_heapbase();
 268 
 269 #ifdef ASSERT
 270     // make sure we have no pending exceptions
 271     {
 272       Label L;
 273       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 274       __ cmp(rscratch1, (unsigned)NULL_WORD);
 275       __ br(Assembler::EQ, L);
 276       __ stop("StubRoutines::call_stub: entered with pending exception");
 277       __ BIND(L);
 278     }
 279 #endif
 280     // pass parameters if any
 281     __ mov(esp, sp);
 282     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 283     __ andr(sp, rscratch1, -2 * wordSize);
 284 
 285     BLOCK_COMMENT("pass parameters if any");
 286     Label parameters_done;
 287     // parameter count is still in c_rarg6
 288     // and parameter pointer identifying param 1 is in c_rarg5
 289     __ cbzw(c_rarg6, parameters_done);
 290 
 291     address loop = __ pc();
 292     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 293     __ subsw(c_rarg6, c_rarg6, 1);
 294     __ push(rscratch1);
 295     __ br(Assembler::GT, loop);
 296 
 297     __ BIND(parameters_done);
 298 
    // call Java entry -- passing the Method* and current sp
 300     //      rmethod: Method*
 301     //      r13: sender sp
 302     BLOCK_COMMENT("call Java function");
 303     __ mov(r13, sp);
 304     __ blr(c_rarg4);
 305 
 306     // tell the simulator we have returned to the stub
 307 
 308     // we do this here because the notify will already have been done
 309     // if we get to the next instruction via an exception
 310     //
 311     // n.b. adding this instruction here affects the calculation of
 312     // whether or not a routine returns to the call stub (used when
 313     // doing stack walks) since the normal test is to check the return
 314     // pc against the address saved below. so we may need to allow for
 315     // this extra instruction in the check.
 316 
 317     if (NotifySimulator) {
 318       __ notify(Assembler::method_reentry);
 319     }
 320     // save current address for use by exception handling code
 321 
 322     return_address = __ pc();
 323 
 324     // store result depending on type (everything that is not
 325     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 326     // n.b. this assumes Java returns an integral result in r0
 327     // and a floating result in j_farg0
 328     __ ldr(j_rarg2, result);
 329     Label is_long, is_float, is_double, exit;
 330     __ ldr(j_rarg1, result_type);
 331     __ cmp(j_rarg1, T_OBJECT);
 332     __ br(Assembler::EQ, is_long);
 333     __ cmp(j_rarg1, T_LONG);
 334     __ br(Assembler::EQ, is_long);
 335     __ cmp(j_rarg1, T_FLOAT);
 336     __ br(Assembler::EQ, is_float);
 337     __ cmp(j_rarg1, T_DOUBLE);
 338     __ br(Assembler::EQ, is_double);
 339 
 340     // handle T_INT case
 341     __ strw(r0, Address(j_rarg2));
 342 
 343     __ BIND(exit);
 344 
 345     // pop parameters
 346     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 347 
 348 #ifdef ASSERT
 349     // verify that threads correspond
 350     {
 351       Label L, S;
 352       __ ldr(rscratch1, thread);
 353       __ cmp(rthread, rscratch1);
 354       __ br(Assembler::NE, S);
 355       __ get_thread(rscratch1);
 356       __ cmp(rthread, rscratch1);
 357       __ br(Assembler::EQ, L);
 358       __ BIND(S);
 359       __ stop("StubRoutines::call_stub: threads must correspond");
 360       __ BIND(L);
 361     }
 362 #endif
 363 
 364     // restore callee-save registers
 365     __ ldpd(v15, v14,  d15_save);
 366     __ ldpd(v13, v12,  d13_save);
 367     __ ldpd(v11, v10,  d11_save);
 368     __ ldpd(v9,  v8,   d9_save);
 369 
 370     __ ldp(r28, r27,   r28_save);
 371     __ ldp(r26, r25,   r26_save);
 372     __ ldp(r24, r23,   r24_save);
 373     __ ldp(r22, r21,   r22_save);
 374     __ ldp(r20, r19,   r20_save);
 375 
 376     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 377     __ ldrw(c_rarg2, result_type);
 378     __ ldr(c_rarg3,  method);
 379     __ ldp(c_rarg4, c_rarg5,  entry_point);
 380     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 381 
 382 #ifndef PRODUCT
 383     // tell the simulator we are about to end Java execution
 384     if (NotifySimulator) {
 385       __ notify(Assembler::method_exit);
 386     }
 387 #endif
 388     // leave frame and return to caller
 389     __ leave();
 390     __ ret(lr);
 391 
 392     // handle return types different from T_INT
 393 
 394     __ BIND(is_long);
 395     __ str(r0, Address(j_rarg2, 0));
 396     __ br(Assembler::AL, exit);
 397 
 398     __ BIND(is_float);
 399     __ strs(j_farg0, Address(j_rarg2, 0));
 400     __ br(Assembler::AL, exit);
 401 
 402     __ BIND(is_double);
 403     __ strd(j_farg0, Address(j_rarg2, 0));
 404     __ br(Assembler::AL, exit);
 405 
 406     return start;
 407   }
 408 
 409   // Return point for a Java call if there's an exception thrown in
 410   // Java code.  The exception is caught and transformed into a
 411   // pending exception stored in JavaThread that can be tested from
 412   // within the VM.
 413   //
 414   // Note: Usually the parameters are removed by the callee. In case
 415   // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up
  // the sp.
 418   //
 419   // r0: exception oop
 420 
 421   // NOTE: this is used as a target from the signal handler so it
 422   // needs an x86 prolog which returns into the current simulator
 423   // executing the generated catch_exception code. so the prolog
 424   // needs to install rax in a sim register and adjust the sim's
 425   // restart pc to enter the generated code at the start position
 426   // then return from native to simulated execution.
 427 
 428   address generate_catch_exception() {
 429     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 430     address start = __ pc();
 431 
 432     // same as in generate_call_stub():
 433     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 434     const Address thread        (rfp, thread_off         * wordSize);
 435 
 436 #ifdef ASSERT
 437     // verify that threads correspond
 438     {
 439       Label L, S;
 440       __ ldr(rscratch1, thread);
 441       __ cmp(rthread, rscratch1);
 442       __ br(Assembler::NE, S);
 443       __ get_thread(rscratch1);
 444       __ cmp(rthread, rscratch1);
 445       __ br(Assembler::EQ, L);
 446       __ bind(S);
 447       __ stop("StubRoutines::catch_exception: threads must correspond");
 448       __ bind(L);
 449     }
 450 #endif
 451 
 452     // set pending exception
 453     __ verify_oop(r0);
 454 
 455     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 456     __ mov(rscratch1, (address)__FILE__);
 457     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 458     __ movw(rscratch1, (int)__LINE__);
 459     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 460 
 461     // complete return to VM
 462     assert(StubRoutines::_call_stub_return_address != NULL,
 463            "_call_stub_return_address must have been generated before");
 464     __ b(StubRoutines::_call_stub_return_address);
 465 
 466     return start;
 467   }
 468 
 469   // Continuation point for runtime calls returning with a pending
 470   // exception.  The pending exception check happened in the runtime
 471   // or native call stub.  The pending exception in Thread is
 472   // converted into a Java-level exception.
 473   //
 474   // Contract with Java-level exception handlers:
 475   // r0: exception
 476   // r3: throwing pc
 477   //
 478   // NOTE: At entry of this stub, exception-pc must be in LR !!
 479 
 480   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog
 482 
 483   address generate_forward_exception() {
 484     StubCodeMark mark(this, "StubRoutines", "forward exception");
 485     address start = __ pc();
 486 
 487     // Upon entry, LR points to the return address returning into
 488     // Java (interpreted or compiled) code; i.e., the return address
 489     // becomes the throwing pc.
 490     //
 491     // Arguments pushed before the runtime call are still on the stack
 492     // but the exception handler will reset the stack pointer ->
 493     // ignore them.  A potential result in registers can be ignored as
 494     // well.
 495 
 496 #ifdef ASSERT
 497     // make sure this code is only executed if there is a pending exception
 498     {
 499       Label L;
 500       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 501       __ cbnz(rscratch1, L);
 502       __ stop("StubRoutines::forward exception: no pending exception (1)");
 503       __ bind(L);
 504     }
 505 #endif
 506 
 507     // compute exception handler into r19
 508 
 509     // call the VM to find the handler address associated with the
 510     // caller address. pass thread in r0 and caller pc (ret address)
 511     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 512     // the stack.
 513     __ mov(c_rarg1, lr);
 514     // lr will be trashed by the VM call so we move it to R19
 515     // (callee-saved) because we also need to pass it to the handler
 516     // returned by this call.
 517     __ mov(r19, lr);
 518     BLOCK_COMMENT("call exception_handler_for_return_address");
 519     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 520                          SharedRuntime::exception_handler_for_return_address),
 521                     rthread, c_rarg1);
 522     // we should not really care that lr is no longer the callee
 523     // address. we saved the value the handler needs in r19 so we can
 524     // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM and the VM code asserts that
 526     // the PC for the frame above the handler belongs to a compiled
 527     // Java method. So, we restore lr here to satisfy that assert.
 528     __ mov(lr, r19);
 529     // setup r0 & r3 & clear pending exception
 530     __ mov(r3, r19);
 531     __ mov(r19, r0);
 532     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 533     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 534 
 535 #ifdef ASSERT
 536     // make sure exception is set
 537     {
 538       Label L;
 539       __ cbnz(r0, L);
 540       __ stop("StubRoutines::forward exception: no pending exception (2)");
 541       __ bind(L);
 542     }
 543 #endif
 544 
 545     // continue at exception handler
 546     // r0: exception
 547     // r3: throwing pc
 548     // r19: exception handler
 549     __ verify_oop(r0);
 550     __ br(r19);
 551 
 552     return start;
 553   }
 554 
 555   // Shenandoah write barrier.
 556   //
 557   // Input:
 558   //   r0: OOP to evacuate.  Not null.
 559   //
 560   // Output:
 561   //   r0: Pointer to evacuated OOP.
 562   //
 563   // Trash rscratch1, rscratch2.  Preserve everything else.
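  //
  // A minimal sketch of the slow path this stub ends up calling (assuming
  // the usual Shenandoah runtime entry; the exact declaration lives in
  // shenandoahBarrierSet.hpp):
  //
  //   oopDesc* ShenandoahBarrierSet::write_barrier_JRT(oopDesc* src);
  //
  // It evacuates src (or finds the existing to-space copy) and returns the
  // new location, which the stub leaves in r0.  The optional cset fast test
  // below skips the call entirely for objects outside the collection set.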
 564 
 565   address generate_shenandoah_wb(bool c_abi, bool do_cset_test) {
 566     StubCodeMark mark(this, "StubRoutines", "shenandoah_wb");
 567 
 568     __ align(6);
 569     address start = __ pc();
 570 
 571     if (do_cset_test) {
 572       Label work;
 573       __ mov(rscratch2, ShenandoahHeap::in_cset_fast_test_addr());
 574       __ lsr(rscratch1, r0, ShenandoahHeapRegion::region_size_bytes_shift_jint());
 575       __ ldrb(rscratch2, Address(rscratch2, rscratch1));
 576       __ tbnz(rscratch2, 0, work);
 577       __ ret(lr);
 578       __ bind(work);
 579     }
 580 
 581     Register obj = r0;
 582 
 583     __ enter(); // required for proper stackwalking of RuntimeStub frame
 584 
 585     if (!c_abi) {
 586       __ push_call_clobbered_registers();
 587     } else {
 588       __ push_call_clobbered_fp_registers();
 589     }
 590 
 591     __ mov(lr, CAST_FROM_FN_PTR(address, ShenandoahBarrierSet::write_barrier_JRT));
 592     __ blrt(lr, 1, 0, MacroAssembler::ret_type_integral);
 593     if (!c_abi) {
 594       __ mov(rscratch1, obj);
 595       __ pop_call_clobbered_registers();
 596       __ mov(obj, rscratch1);
 597     } else {
 598       __ pop_call_clobbered_fp_registers();
 599     }
 600 
 601     __ leave(); // required for proper stackwalking of RuntimeStub frame
 602     __ ret(lr);
 603 
 604     return start;
 605   }
 606 
 607   // Non-destructive plausibility checks for oops
 608   //
 609   // Arguments:
 610   //    r0: oop to verify
 611   //    rscratch1: error message
 612   //
 613   // Stack after saving c_rarg3:
 614   //    [tos + 0]: saved c_rarg3
 615   //    [tos + 1]: saved c_rarg2
 616   //    [tos + 2]: saved lr
 617   //    [tos + 3]: saved rscratch2
 618   //    [tos + 4]: saved r0
 619   //    [tos + 5]: saved rscratch1
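  //
  // This stub is not called directly from C++; MacroAssembler::verify_oop()
  // emits the calling sequence (oop in r0, message address in rscratch1)
  // wherever +VerifyOops requests a check, and the non-error path restores
  // everything it saved so the check is side-effect free.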
 620   address generate_verify_oop() {
 621 
 622     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 623     address start = __ pc();
 624 
 625     Label exit, error;
 626 
 627     // save c_rarg2 and c_rarg3
 628     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 629 
 630     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 631     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 632     __ ldr(c_rarg3, Address(c_rarg2));
 633     __ add(c_rarg3, c_rarg3, 1);
 634     __ str(c_rarg3, Address(c_rarg2));
 635 
 636     // object is in r0
 637     // make sure object is 'reasonable'
 638     __ cbz(r0, exit); // if obj is NULL it is OK
 639 
 640     // Check if the oop is in the right area of memory
 641     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 642     __ andr(c_rarg2, r0, c_rarg3);
 643     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 644 
 645     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 646     // instruction here because the flags register is live.
 647     __ eor(c_rarg2, c_rarg2, c_rarg3);
 648     __ cbnz(c_rarg2, error);
 649 
    // make sure klass is 'reasonable', i.e. not NULL.
 651     __ load_klass(r0, r0);  // get klass
 652     __ cbz(r0, error);      // if klass is NULL it is broken
 653 
 654     // return if everything seems ok
 655     __ bind(exit);
 656 
 657     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 658     __ ret(lr);
 659 
 660     // handle errors
 661     __ bind(error);
 662     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 663 
 664     __ push(RegSet::range(r0, r29), sp);
 665     // debug(char* msg, int64_t pc, int64_t regs[])
 666     __ mov(c_rarg0, rscratch1);      // pass address of error message
 667     __ mov(c_rarg1, lr);             // pass return address
 668     __ mov(c_rarg2, sp);             // pass address of regs on stack
 669 #ifndef PRODUCT
 670     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 671 #endif
 672     BLOCK_COMMENT("call MacroAssembler::debug");
 673     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 674     __ blrt(rscratch1, 3, 0, 1);
 675 
 676     return start;
 677   }
 678 
 679   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 680 
 681   // Generate code for an array write pre barrier
 682   //
 683   //     addr    -  starting address
 684   //     count   -  element count
 685   //     tmp     - scratch register
 686   //
 687   //     Destroy no registers except rscratch1 and rscratch2
 688   //
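  //     For G1 and Shenandoah this reduces to a leaf call into the runtime,
  //     roughly
  //
  //       static void BarrierSet::static_write_ref_array_pre(HeapWord* start,
  //                                                           size_t count);
  //
  //     which SATB-enqueues the oops about to be overwritten; that is why
  //     the call can be skipped when the destination is known to be
  //     uninitialized.
  //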
 689   void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
 690     BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCT:
    case BarrierSet::G1SATBCTLogging:
    case BarrierSet::ShenandoahBarrierSet:
      // Don't generate the call if we statically know that the target is uninitialized
      if (!dest_uninitialized) {
        __ push_call_clobbered_registers();
        if (count == c_rarg0) {
          if (addr == c_rarg1) {
            // exactly backwards!!
            __ mov(rscratch1, c_rarg0);
            __ mov(c_rarg0, c_rarg1);
            __ mov(c_rarg1, rscratch1);
          } else {
            __ mov(c_rarg1, count);
            __ mov(c_rarg0, addr);
          }
        } else {
          __ mov(c_rarg0, addr);
          __ mov(c_rarg1, count);
        }
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
        __ pop_call_clobbered_registers();
      }
      break;
    case BarrierSet::CardTableModRef:
    case BarrierSet::CardTableExtension:
    case BarrierSet::ModRef:
      break;
    default:
      ShouldNotReachHere();
    }
 724   }
 725 
 726   //
 727   // Generate code for an array write post barrier
 728   //
 729   //  Input:
 730   //     start    - register containing starting address of destination array
 731   //     end      - register containing ending address of destination array
 732   //     scratch  - scratch register
 733   //
 734   //  The input registers are overwritten.
 735   //  The ending address is inclusive.
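  //
  //  Worked example for the card table case below: with the usual 512-byte
  //  cards (card_shift == 9), copying into [start, end] must dirty the cards
  //  at indices start >> 9 through end >> 9.  The loop stores a zero byte at
  //  byte_map_base + index for each of them, counting down to 0 inclusive.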
 736   void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
 737     assert_different_registers(start, end, scratch);
 738     BarrierSet* bs = Universe::heap()->barrier_set();
 739     switch (bs->kind()) {
 740       case BarrierSet::G1SATBCT:
 741       case BarrierSet::G1SATBCTLogging:
 742       case BarrierSet::ShenandoahBarrierSet:
 743 
 744         {
 745           __ push_call_clobbered_registers();
 746           // must compute element count unless barrier set interface is changed (other platforms supply count)
 747           assert_different_registers(start, end, scratch);
 748           __ lea(scratch, Address(end, BytesPerHeapOop));
 749           __ sub(scratch, scratch, start);               // subtract start to get #bytes
 750           __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
 751           __ mov(c_rarg0, start);
 752           __ mov(c_rarg1, scratch);
 753           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
 754           __ pop_call_clobbered_registers();
 755         }
 756         break;
 757       case BarrierSet::CardTableModRef:
 758       case BarrierSet::CardTableExtension:
 759         {
 760           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
 761           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
 762 
 763           Label L_loop;
 764 
 765            __ lsr(start, start, CardTableModRefBS::card_shift);
 766            __ lsr(end, end, CardTableModRefBS::card_shift);
 767            __ sub(end, end, start); // number of bytes to copy
 768 
 769           const Register count = end; // 'end' register contains bytes count now
 770           __ load_byte_map_base(scratch);
 771           __ add(start, start, scratch);
 772           __ BIND(L_loop);
 773           __ strb(zr, Address(start, count));
 774           __ subs(count, count, 1);
 775           __ br(Assembler::GE, L_loop);
 776         }
 777         break;
 778       default:
 779         ShouldNotReachHere();
 780 
 781     }
 782   }
 783 
 784   address generate_zero_longs(Register base, Register cnt) {
 785     Register tmp = rscratch1;
 786     Register tmp2 = rscratch2;
 787     int zva_length = VM_Version::zva_length();
 788     Label initial_table_end, loop_zva;
 789     Label fini;
 790 
 791     __ align(CodeEntryAlignment);
 792     StubCodeMark mark(this, "StubRoutines", "zero_longs");
 793     address start = __ pc();
 794 
 795     // Base must be 16 byte aligned. If not just return and let caller handle it
 796     __ tst(base, 0x0f);
 797     __ br(Assembler::NE, fini);
 798     // Align base with ZVA length.
 799     __ neg(tmp, base);
 800     __ andr(tmp, tmp, zva_length - 1);
 801 
 802     // tmp: the number of bytes to be filled to align the base with ZVA length.
 803     __ add(base, base, tmp);
 804     __ sub(cnt, cnt, tmp, Assembler::ASR, 3);
 805     __ adr(tmp2, initial_table_end);
 806     __ sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
 807     __ br(tmp2);
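    // tmp holds the number of bytes still needed to reach ZVA alignment.
    // Each stp below zeroes 16 bytes and occupies 4 bytes of code, so
    // branching back tmp >> 2 bytes from initial_table_end executes exactly
    // tmp / 16 of them before falling through into the DC ZVA loop.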
 808 
 809     for (int i = -zva_length + 16; i < 0; i += 16)
 810       __ stp(zr, zr, Address(base, i));
 811     __ bind(initial_table_end);
 812 
 813     __ sub(cnt, cnt, zva_length >> 3);
 814     __ bind(loop_zva);
 815     __ dc(Assembler::ZVA, base);
 816     __ subs(cnt, cnt, zva_length >> 3);
 817     __ add(base, base, zva_length);
 818     __ br(Assembler::GE, loop_zva);
 819     __ add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
 820     __ bind(fini);
 821     __ ret(lr);
 822 
 823     return start;
 824   }
 825 
 826   typedef enum {
 827     copy_forwards = 1,
 828     copy_backwards = -1
 829   } copy_direction;
 830 
 831   // Bulk copy of blocks of 8 words.
 832   //
 833   // count is a count of words.
 834   //
 835   // Precondition: count >= 8
 836   //
 837   // Postconditions:
 838   //
 839   // The least significant bit of count contains the remaining count
 840   // of words to copy.  The rest of count is trash.
 841   //
 842   // s and d are adjusted to point to the remaining words to copy
 843   //
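  // Worked example of the tail handling: on exit only the low bits of count
  // matter -- bit 2 selects the extra 4-word copy, bit 1 the 2-word copy,
  // and bit 0 (at most one odd word) is left for the caller, see
  // copy_memory() below.
  //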
 844   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 845                            copy_direction direction) {
 846     int unit = wordSize * direction;
 847     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 848 
 849     int offset;
 850     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 851       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 852     const Register stride = r13;
 853 
 854     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 855     assert_different_registers(s, d, count, rscratch1);
 856 
 857     Label again, drain;
 858     const char *stub_name;
 859     if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
 861     else
 862       stub_name = "backward_copy_longs";
 863     StubCodeMark mark(this, "StubRoutines", stub_name);
 864     __ align(CodeEntryAlignment);
 865     __ bind(start);
 866 
 867     Label unaligned_copy_long;
 868     if (AvoidUnalignedAccesses) {
 869       __ tbnz(d, 3, unaligned_copy_long);
 870     }
 871 
 872     if (direction == copy_forwards) {
 873       __ sub(s, s, bias);
 874       __ sub(d, d, bias);
 875     }
 876 
 877 #ifdef ASSERT
 878     // Make sure we are never given < 8 words
 879     {
 880       Label L;
 881       __ cmp(count, 8);
 882       __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
 884       __ bind(L);
 885     }
 886 #endif
 887 
 888     // Fill 8 registers
 889     if (UseSIMDForMemoryOps) {
 890       __ ldpq(v0, v1, Address(s, 4 * unit));
 891       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 892     } else {
 893       __ ldp(t0, t1, Address(s, 2 * unit));
 894       __ ldp(t2, t3, Address(s, 4 * unit));
 895       __ ldp(t4, t5, Address(s, 6 * unit));
 896       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 897     }
 898 
 899     __ subs(count, count, 16);
 900     __ br(Assembler::LO, drain);
 901 
 902     int prefetch = PrefetchCopyIntervalInBytes;
 903     bool use_stride = false;
 904     if (direction == copy_backwards) {
 905        use_stride = prefetch > 256;
 906        prefetch = -prefetch;
 907        if (use_stride) __ mov(stride, prefetch);
 908     }
 909 
 910     __ bind(again);
 911 
 912     if (PrefetchCopyIntervalInBytes > 0)
 913       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 914 
 915     if (UseSIMDForMemoryOps) {
 916       __ stpq(v0, v1, Address(d, 4 * unit));
 917       __ ldpq(v0, v1, Address(s, 4 * unit));
 918       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 919       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 920     } else {
 921       __ stp(t0, t1, Address(d, 2 * unit));
 922       __ ldp(t0, t1, Address(s, 2 * unit));
 923       __ stp(t2, t3, Address(d, 4 * unit));
 924       __ ldp(t2, t3, Address(s, 4 * unit));
 925       __ stp(t4, t5, Address(d, 6 * unit));
 926       __ ldp(t4, t5, Address(s, 6 * unit));
 927       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 928       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 929     }
 930 
 931     __ subs(count, count, 8);
 932     __ br(Assembler::HS, again);
 933 
 934     // Drain
 935     __ bind(drain);
 936     if (UseSIMDForMemoryOps) {
 937       __ stpq(v0, v1, Address(d, 4 * unit));
 938       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 939     } else {
 940       __ stp(t0, t1, Address(d, 2 * unit));
 941       __ stp(t2, t3, Address(d, 4 * unit));
 942       __ stp(t4, t5, Address(d, 6 * unit));
 943       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 944     }
 945 
 946     {
 947       Label L1, L2;
 948       __ tbz(count, exact_log2(4), L1);
 949       if (UseSIMDForMemoryOps) {
 950         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 951         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 952       } else {
 953         __ ldp(t0, t1, Address(s, 2 * unit));
 954         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 955         __ stp(t0, t1, Address(d, 2 * unit));
 956         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 957       }
 958       __ bind(L1);
 959 
 960       if (direction == copy_forwards) {
 961         __ add(s, s, bias);
 962         __ add(d, d, bias);
 963       }
 964 
 965       __ tbz(count, 1, L2);
 966       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 967       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 968       __ bind(L2);
 969     }
 970 
 971     __ ret(lr);
 972 
 973     if (AvoidUnalignedAccesses) {
 974       Label drain, again;
 975       // Register order for storing. Order is different for backward copy.
 976 
 977       __ bind(unaligned_copy_long);
 978 
 979       // source address is even aligned, target odd aligned
 980       //
 981       // when forward copying word pairs we read long pairs at offsets
 982       // {0, 2, 4, 6} (in long words). when backwards copying we read
 983       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 984       // address by -2 in the forwards case so we can compute the
 985       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 986       // or -1.
 987       //
 988       // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
 994       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 995       // offsets {1, 3, 5, 7, 8} * unit.
 996 
 997       if (direction == copy_forwards) {
 998         __ sub(s, s, 16);
 999         __ sub(d, d, 8);
1000       }
1001 
1002       // Fill 8 registers
1003       //
1004       // for forwards copy s was offset by -16 from the original input
1005       // value of s so the register contents are at these offsets
1006       // relative to the 64 bit block addressed by that original input
1007       // and so on for each successive 64 byte block when s is updated
1008       //
1009       // t0 at offset 0,  t1 at offset 8
1010       // t2 at offset 16, t3 at offset 24
1011       // t4 at offset 32, t5 at offset 40
1012       // t6 at offset 48, t7 at offset 56
1013 
1014       // for backwards copy s was not offset so the register contents
1015       // are at these offsets into the preceding 64 byte block
1016       // relative to that original input and so on for each successive
1017       // preceding 64 byte block when s is updated. this explains the
1018       // slightly counter-intuitive looking pattern of register usage
1019       // in the stp instructions for backwards copy.
1020       //
1021       // t0 at offset -16, t1 at offset -8
1022       // t2 at offset -32, t3 at offset -24
1023       // t4 at offset -48, t5 at offset -40
1024       // t6 at offset -64, t7 at offset -56
1025 
1026       __ ldp(t0, t1, Address(s, 2 * unit));
1027       __ ldp(t2, t3, Address(s, 4 * unit));
1028       __ ldp(t4, t5, Address(s, 6 * unit));
1029       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1030 
1031       __ subs(count, count, 16);
1032       __ br(Assembler::LO, drain);
1033 
1034       int prefetch = PrefetchCopyIntervalInBytes;
1035       bool use_stride = false;
1036       if (direction == copy_backwards) {
1037          use_stride = prefetch > 256;
1038          prefetch = -prefetch;
1039          if (use_stride) __ mov(stride, prefetch);
1040       }
1041 
1042       __ bind(again);
1043 
1044       if (PrefetchCopyIntervalInBytes > 0)
1045         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1046 
1047       if (direction == copy_forwards) {
1048        // allowing for the offset of -8 the store instructions place
1049        // registers into the target 64 bit block at the following
1050        // offsets
1051        //
1052        // t0 at offset 0
1053        // t1 at offset 8,  t2 at offset 16
1054        // t3 at offset 24, t4 at offset 32
1055        // t5 at offset 40, t6 at offset 48
1056        // t7 at offset 56
1057 
1058         __ str(t0, Address(d, 1 * unit));
1059         __ stp(t1, t2, Address(d, 2 * unit));
1060         __ ldp(t0, t1, Address(s, 2 * unit));
1061         __ stp(t3, t4, Address(d, 4 * unit));
1062         __ ldp(t2, t3, Address(s, 4 * unit));
1063         __ stp(t5, t6, Address(d, 6 * unit));
1064         __ ldp(t4, t5, Address(s, 6 * unit));
1065         __ str(t7, Address(__ pre(d, 8 * unit)));
1066         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1067       } else {
1068        // d was not offset when we started so the registers are
1069        // written into the 64 bit block preceding d with the following
1070        // offsets
1071        //
1072        // t1 at offset -8
1073        // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
1075        // t7 at offset -56, t4 at offset -48
1076        //                   t6 at offset -64
1077        //
1078        // note that this matches the offsets previously noted for the
1079        // loads
1080 
1081         __ str(t1, Address(d, 1 * unit));
1082         __ stp(t3, t0, Address(d, 3 * unit));
1083         __ ldp(t0, t1, Address(s, 2 * unit));
1084         __ stp(t5, t2, Address(d, 5 * unit));
1085         __ ldp(t2, t3, Address(s, 4 * unit));
1086         __ stp(t7, t4, Address(d, 7 * unit));
1087         __ ldp(t4, t5, Address(s, 6 * unit));
1088         __ str(t6, Address(__ pre(d, 8 * unit)));
1089         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1090       }
1091 
1092       __ subs(count, count, 8);
1093       __ br(Assembler::HS, again);
1094 
1095       // Drain
1096       //
1097       // this uses the same pattern of offsets and register arguments
1098       // as above
1099       __ bind(drain);
1100       if (direction == copy_forwards) {
1101         __ str(t0, Address(d, 1 * unit));
1102         __ stp(t1, t2, Address(d, 2 * unit));
1103         __ stp(t3, t4, Address(d, 4 * unit));
1104         __ stp(t5, t6, Address(d, 6 * unit));
1105         __ str(t7, Address(__ pre(d, 8 * unit)));
1106       } else {
1107         __ str(t1, Address(d, 1 * unit));
1108         __ stp(t3, t0, Address(d, 3 * unit));
1109         __ stp(t5, t2, Address(d, 5 * unit));
1110         __ stp(t7, t4, Address(d, 7 * unit));
1111         __ str(t6, Address(__ pre(d, 8 * unit)));
1112       }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
1117       {
1118         Label L1, L2;
1119         __ tbz(count, exact_log2(4), L1);
1120        // this is the same as above but copying only 4 longs hence
       // with only one intervening stp between the str instructions
1122        // but note that the offsets and registers still follow the
1123        // same pattern
1124         __ ldp(t0, t1, Address(s, 2 * unit));
1125         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1126         if (direction == copy_forwards) {
1127           __ str(t0, Address(d, 1 * unit));
1128           __ stp(t1, t2, Address(d, 2 * unit));
1129           __ str(t3, Address(__ pre(d, 4 * unit)));
1130         } else {
1131           __ str(t1, Address(d, 1 * unit));
1132           __ stp(t3, t0, Address(d, 3 * unit));
1133           __ str(t2, Address(__ pre(d, 4 * unit)));
1134         }
1135         __ bind(L1);
1136 
1137         __ tbz(count, 1, L2);
1138        // this is the same as above but copying only 2 longs hence
1139        // there is no intervening stp between the str instructions
1140        // but note that the offset and register patterns are still
1141        // the same
1142         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1143         if (direction == copy_forwards) {
1144           __ str(t0, Address(d, 1 * unit));
1145           __ str(t1, Address(__ pre(d, 2 * unit)));
1146         } else {
1147           __ str(t1, Address(d, 1 * unit));
1148           __ str(t0, Address(__ pre(d, 2 * unit)));
1149         }
1150         __ bind(L2);
1151 
       // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written
1154 
1155        if (direction == copy_forwards) {
1156          __ add(s, s, 16);
1157          __ add(d, d, 8);
1158        }
1159 
1160       }
1161 
1162       __ ret(lr);
1163       }
1164   }
1165 
1166   // Small copy: less than 16 bytes.
1167   //
1168   // NB: Ignores all of the bits of count which represent more than 15
1169   // bytes, so a caller doesn't have to mask them.
1170 
1171   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1172     bool is_backwards = step < 0;
1173     size_t granularity = uabs(step);
1174     int direction = is_backwards ? -1 : 1;
1175     int unit = wordSize * direction;
1176 
1177     Label Lpair, Lword, Lint, Lshort, Lbyte;
1178 
1179     assert(granularity
1180            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1181 
1182     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1183 
1184     // ??? I don't know if this bit-test-and-branch is the right thing
1185     // to do.  It does a lot of jumping, resulting in several
1186     // mispredicted branches.  It might make more sense to do this
1187     // with something like Duff's device with a single computed branch.
1188 
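    // Worked example of the bit tests below for a byte copy (granularity 1):
    // bit 3 of count selects an 8-byte move, bit 2 a 4-byte move, bit 1 a
    // 2-byte move and bit 0 a single byte.  For an int copy (granularity 4)
    // only bit 1 (one 8-byte word) and bit 0 (one int) remain to be tested.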
1189     __ tbz(count, 3 - exact_log2(granularity), Lword);
1190     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1191     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1192     __ bind(Lword);
1193 
1194     if (granularity <= sizeof (jint)) {
1195       __ tbz(count, 2 - exact_log2(granularity), Lint);
1196       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1197       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1198       __ bind(Lint);
1199     }
1200 
1201     if (granularity <= sizeof (jshort)) {
1202       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1203       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1204       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1205       __ bind(Lshort);
1206     }
1207 
1208     if (granularity <= sizeof (jbyte)) {
1209       __ tbz(count, 0, Lbyte);
1210       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1211       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1212       __ bind(Lbyte);
1213     }
1214   }
1215 
1216   Label copy_f, copy_b;
1217 
1218   // All-singing all-dancing memory copy.
1219   //
1220   // Copy count units of memory from s to d.  The size of a unit is
1221   // step, which can be positive or negative depending on the direction
1222   // of copy.  If is_aligned is false, we align the source address.
1223   //
1224 
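  // Rough shape of the code below: copies of at most 80 bytes (96 with
  // SIMD) are done inline, loading everything before storing anything so
  // the direction never matters; larger copies align s on a 16-byte
  // boundary, bulk-copy 8-word blocks via copy_f/copy_b and hand the
  // remaining tail to copy_memory_small().
  //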
1225   void copy_memory(bool is_aligned, Register s, Register d,
1226                    Register count, Register tmp, int step) {
1227     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1228     bool is_backwards = step < 0;
1229     int granularity = uabs(step);
1230     const Register t0 = r3, t1 = r4;
1231 
1232     // <= 96 bytes do inline. Direction doesn't matter because we always
1233     // load all the data before writing anything
1234     Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
1235     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1236     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1237     const Register send = r17, dend = r18;
1238 
1239     if (PrefetchCopyIntervalInBytes > 0)
1240       __ prfm(Address(s, 0), PLDL1KEEP);
1241     __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
1242     __ br(Assembler::HI, copy_big);
1243 
1244     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1245     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1246 
1247     __ cmp(count, 16/granularity);
1248     __ br(Assembler::LS, copy16);
1249 
1250     __ cmp(count, 64/granularity);
1251     __ br(Assembler::HI, copy80);
1252 
1253     __ cmp(count, 32/granularity);
1254     __ br(Assembler::LS, copy32);
1255 
1256     // 33..64 bytes
1257     if (UseSIMDForMemoryOps) {
1258       __ ldpq(v0, v1, Address(s, 0));
1259       __ ldpq(v2, v3, Address(send, -32));
1260       __ stpq(v0, v1, Address(d, 0));
1261       __ stpq(v2, v3, Address(dend, -32));
1262     } else {
1263       __ ldp(t0, t1, Address(s, 0));
1264       __ ldp(t2, t3, Address(s, 16));
1265       __ ldp(t4, t5, Address(send, -32));
1266       __ ldp(t6, t7, Address(send, -16));
1267 
1268       __ stp(t0, t1, Address(d, 0));
1269       __ stp(t2, t3, Address(d, 16));
1270       __ stp(t4, t5, Address(dend, -32));
1271       __ stp(t6, t7, Address(dend, -16));
1272     }
1273     __ b(finish);
1274 
1275     // 17..32 bytes
1276     __ bind(copy32);
1277     __ ldp(t0, t1, Address(s, 0));
1278     __ ldp(t2, t3, Address(send, -16));
1279     __ stp(t0, t1, Address(d, 0));
1280     __ stp(t2, t3, Address(dend, -16));
1281     __ b(finish);
1282 
1283     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
1285     __ bind(copy80);
1286     if (UseSIMDForMemoryOps) {
1287       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1288       __ ldpq(v4, v5, Address(send, -32));
1289       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1290       __ stpq(v4, v5, Address(dend, -32));
1291     } else {
1292       __ ldp(t0, t1, Address(s, 0));
1293       __ ldp(t2, t3, Address(s, 16));
1294       __ ldp(t4, t5, Address(s, 32));
1295       __ ldp(t6, t7, Address(s, 48));
1296       __ ldp(t8, t9, Address(send, -16));
1297   
1298       __ stp(t0, t1, Address(d, 0));
1299       __ stp(t2, t3, Address(d, 16));
1300       __ stp(t4, t5, Address(d, 32));
1301       __ stp(t6, t7, Address(d, 48));
1302       __ stp(t8, t9, Address(dend, -16));
1303     }
1304     __ b(finish);
1305 
1306     // 0..16 bytes
1307     __ bind(copy16);
1308     __ cmp(count, 8/granularity);
1309     __ br(Assembler::LO, copy8);
1310 
1311     // 8..16 bytes
1312     __ ldr(t0, Address(s, 0));
1313     __ ldr(t1, Address(send, -8));
1314     __ str(t0, Address(d, 0));
1315     __ str(t1, Address(dend, -8));
1316     __ b(finish);
1317 
1318     if (granularity < 8) {
1319       // 4..7 bytes
1320       __ bind(copy8);
1321       __ tbz(count, 2 - exact_log2(granularity), copy4);
1322       __ ldrw(t0, Address(s, 0));
1323       __ ldrw(t1, Address(send, -4));
1324       __ strw(t0, Address(d, 0));
1325       __ strw(t1, Address(dend, -4));
1326       __ b(finish);
1327       if (granularity < 4) {
1328         // 0..3 bytes
1329         __ bind(copy4);
1330         __ cbz(count, finish); // get rid of 0 case
1331         if (granularity == 2) {
1332           __ ldrh(t0, Address(s, 0));
1333           __ strh(t0, Address(d, 0));
1334         } else { // granularity == 1
1335           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1336           // the first and last byte.
1337           // Handle the 3 byte case by loading and storing base + count/2
1338           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
1340           // byte 3 times.
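          // e.g. count == 3: count/2 == 1, so we copy s[0]->d[0],
          // s[2]->d[2] (via send/dend - 1) and s[1]->d[1]; with
          // count == 1 all three load/store pairs hit byte 0.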
1341           __ lsr(count, count, 1);
1342           __ ldrb(t0, Address(s, 0));
1343           __ ldrb(t1, Address(send, -1));
1344           __ ldrb(t2, Address(s, count));
1345           __ strb(t0, Address(d, 0));
1346           __ strb(t1, Address(dend, -1));
1347           __ strb(t2, Address(d, count));
1348         }
1349         __ b(finish);
1350       }
1351     }
1352 
1353     __ bind(copy_big);
1354     if (is_backwards) {
1355       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1356       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1357     }
1358 
1359     // Now we've got the small case out of the way we can align the
1360     // source address on a 2-word boundary.
1361 
1362     Label aligned;
1363 
1364     if (is_aligned) {
1365       // We may have to adjust by 1 word to get s 2-word-aligned.
1366       __ tbz(s, exact_log2(wordSize), aligned);
1367       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1368       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1369       __ sub(count, count, wordSize/granularity);
1370     } else {
1371       if (is_backwards) {
1372         __ andr(rscratch2, s, 2 * wordSize - 1);
1373       } else {
1374         __ neg(rscratch2, s);
1375         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1376       }
1377       // rscratch2 is the byte adjustment needed to align s.
1378       __ cbz(rscratch2, aligned);
1379       int shift = exact_log2(granularity);
1380       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1381       __ sub(count, count, rscratch2);
1382 
1383 #if 0
1384       // ?? This code is only correct for a disjoint copy.  It may or
1385       // may not make sense to use it in that case.
1386 
1387       // Copy the first pair; s and d may not be aligned.
1388       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1389       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1390 
1391       // Align s and d, adjust count
1392       if (is_backwards) {
1393         __ sub(s, s, rscratch2);
1394         __ sub(d, d, rscratch2);
1395       } else {
1396         __ add(s, s, rscratch2);
1397         __ add(d, d, rscratch2);
1398       }
1399 #else
1400       copy_memory_small(s, d, rscratch2, rscratch1, step);
1401 #endif
1402     }
1403 
1404     __ bind(aligned);
1405 
1406     // s is now 2-word-aligned.
1407 
1408     // We have a count of units and some trailing bytes.  Adjust the
1409     // count and do a bulk copy of words.
1410     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1411     if (direction == copy_forwards)
1412       __ bl(copy_f);
1413     else
1414       __ bl(copy_b);
1415 
1416     // And the tail.
1417     copy_memory_small(s, d, count, tmp, step);
1418 
1419     if (granularity >= 8) __ bind(copy8);
1420     if (granularity >= 4) __ bind(copy4);
1421     __ bind(finish);
1422   }
1423 
1424 
1425   void clobber_registers() {
1426 #ifdef ASSERT
1427     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1428     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1429     for (Register r = r3; r <= r18; r++)
1430       if (r != rscratch1) __ mov(r, rscratch1);
1431 #endif
1432   }
1433 
1434   // Scan over array at a for count oops, verifying each one.
1435   // Preserves a and count, clobbers rscratch1 and rscratch2.
1436   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1437     Label loop, end;
1438     __ mov(rscratch1, a);
1439     __ mov(rscratch2, zr);
1440     __ bind(loop);
1441     __ cmp(rscratch2, count);
1442     __ br(Assembler::HS, end);
1443     if (size == (size_t)wordSize) {
1444       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1445       __ verify_oop(temp);
1446     } else {
1447       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1448       __ decode_heap_oop(temp); // calls verify_oop
1449     }
1450     __ add(rscratch2, rscratch2, size);
1451     __ b(loop);
1452     __ bind(end);
1453   }
1454 
1455   // Arguments:
1456   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1457   //             ignored
1458   //   is_oop  - true => oop array, so generate store check code
1459   //   name    - stub name string
1460   //
1461   // Inputs:
1462   //   c_rarg0   - source array address
1463   //   c_rarg1   - destination array address
1464   //   c_rarg2   - element count, treated as ssize_t, can be zero
1465   //
1466   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1467   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1469   //
1470   // Side Effects:
1471   //   disjoint_int_copy_entry is set to the no-overlap entry point
1472   //   used by generate_conjoint_int_oop_copy().
1473   //
1474   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1475                                   const char *name, bool dest_uninitialized = false) {
1476     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1477     __ align(CodeEntryAlignment);
1478     StubCodeMark mark(this, "StubRoutines", name);
1479     address start = __ pc();
1480     __ enter();
1481 
1482     if (entry != NULL) {
1483       *entry = __ pc();
1484       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1485       BLOCK_COMMENT("Entry:");
1486     }
1487 
1488     if (is_oop) {
1489       __ push(RegSet::of(d, count), sp);
1490       // no registers are destroyed by this call
1491       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1492     }
1493     copy_memory(aligned, s, d, count, rscratch1, size);
1494     if (is_oop) {
1495       __ pop(RegSet::of(d, count), sp);
1496       if (VerifyOops)
1497         verify_oop_array(size, d, count, r16);
1498       __ sub(count, count, 1); // make an inclusive end pointer
1499       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1500       gen_write_ref_array_post_barrier(d, count, rscratch1);
1501     }
1502     __ leave();
1503     __ mov(r0, zr); // return 0
1504     __ ret(lr);
1505 #ifdef BUILTIN_SIM
1506     {
1507       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1508       sim->notifyCompile(const_cast<char*>(name), start);
1509     }
1510 #endif
1511     return start;
1512   }
1513 
1514   // Arguments:
1515   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1516   //             ignored
1517   //   is_oop  - true => oop array, so generate store check code
1518   //   name    - stub name string
1519   //
1520   // Inputs:
1521   //   c_rarg0   - source array address
1522   //   c_rarg1   - destination array address
1523   //   c_rarg2   - element count, treated as ssize_t, can be zero
1524   //
1525   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1526   // the hardware handle it.  The two dwords within qwords that span
1527   // cache line boundaries will still be loaded and stored atomically.
1528   //
1529   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1530                                  address *entry, const char *name,
1531                                  bool dest_uninitialized = false) {
1532     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1533 
1534     StubCodeMark mark(this, "StubRoutines", name);
1535     address start = __ pc();
1536 
1537     __ enter();
1538 
1539     if (entry != NULL) {
1540       *entry = __ pc();
1541       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1542       BLOCK_COMMENT("Entry:");
1543     }
1544 
1545     // use fwd copy when (d-s) above_equal (count*size)
1546     __ sub(rscratch1, d, s);
1547     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1548     __ br(Assembler::HS, nooverlap_target);
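    // Illustrative C sketch of the test above: the pointer difference is
    // treated as unsigned, so d < s also selects the forward copy:
    //   if ((uintptr_t)(d - s) >= ((uintptr_t)count << log2(size)))
    //     goto nooverlap_target;   // no destructive overlap, forward copy is safe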
1549 
1550     if (is_oop) {
1551       __ push(RegSet::of(d, count), sp);
1552       // no registers are destroyed by this call
1553       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1554     }
1555     copy_memory(aligned, s, d, count, rscratch1, -size);
1556     if (is_oop) {
1557       __ pop(RegSet::of(d, count), sp);
1558       if (VerifyOops)
1559         verify_oop_array(size, d, count, r16);
1560       __ sub(count, count, 1); // make an inclusive end pointer
1561       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1562       gen_write_ref_array_post_barrier(d, count, rscratch1);
1563     }
1564     __ leave();
1565     __ mov(r0, zr); // return 0
1566     __ ret(lr);
1567 #ifdef BUILTIN_SIM
1568     {
1569       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1570       sim->notifyCompile(const_cast<char*>(name), start);
1571     }
1572 #endif
1573     return start;
1574   }
1575 
1576   // Arguments:
1577   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1578   //             ignored
1579   //   name    - stub name string
1580   //
1581   // Inputs:
1582   //   c_rarg0   - source array address
1583   //   c_rarg1   - destination array address
1584   //   c_rarg2   - element count, treated as ssize_t, can be zero
1585   //
1586   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1587   // we let the hardware handle it.  The one to eight bytes within words,
1588   // dwords or qwords that span cache line boundaries will still be loaded
1589   // and stored atomically.
1590   //
1591   // Side Effects:
1599   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1600   //   used by generate_conjoint_byte_copy().
1601   //
1602   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1603     const bool not_oop = false;
1604     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1605   }
1606 
1607   // Arguments:
1608   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1609   //             ignored
1610   //   name    - stub name string
1611   //
1612   // Inputs:
1613   //   c_rarg0   - source array address
1614   //   c_rarg1   - destination array address
1615   //   c_rarg2   - element count, treated as ssize_t, can be zero
1616   //
1617   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1618   // we let the hardware handle it.  The one to eight bytes within words,
1619   // dwords or qwords that span cache line boundaries will still be loaded
1620   // and stored atomically.
1621   //
1622   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1623                                       address* entry, const char *name) {
1624     const bool not_oop = false;
1625     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1626   }
1627 
1628   // Arguments:
1629   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1630   //             ignored
1631   //   name    - stub name string
1632   //
1633   // Inputs:
1634   //   c_rarg0   - source array address
1635   //   c_rarg1   - destination array address
1636   //   c_rarg2   - element count, treated as ssize_t, can be zero
1637   //
1638   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1639   // let the hardware handle it.  The two or four words within dwords
1640   // or qwords that span cache line boundaries will still be loaded
1641   // and stored atomically.
1642   //
1643   // Side Effects:
1644   //   disjoint_short_copy_entry is set to the no-overlap entry point
1645   //   used by generate_conjoint_short_copy().
1646   //
1647   address generate_disjoint_short_copy(bool aligned,
1648                                        address* entry, const char *name) {
1649     const bool not_oop = false;
1650     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1651   }
1652 
1653   // Arguments:
1654   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1655   //             ignored
1656   //   name    - stub name string
1657   //
1658   // Inputs:
1659   //   c_rarg0   - source array address
1660   //   c_rarg1   - destination array address
1661   //   c_rarg2   - element count, treated as ssize_t, can be zero
1662   //
1663   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1664   // let the hardware handle it.  The two or four words within dwords
1665   // or qwords that span cache line boundaries will still be loaded
1666   // and stored atomically.
1667   //
1668   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1669                                        address *entry, const char *name) {
1670     const bool not_oop = false;
1671     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1672   }
1673 
1674   // Arguments:
1675   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1676   //             ignored
1677   //   name    - stub name string
1678   //
1679   // Inputs:
1680   //   c_rarg0   - source array address
1681   //   c_rarg1   - destination array address
1682   //   c_rarg2   - element count, treated as ssize_t, can be zero
1683   //
1684   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1685   // the hardware handle it.  The two dwords within qwords that span
1686   // cache line boundaries will still be loaded and stored atomically.
1687   //
1688   // Side Effects:
1689   //   disjoint_int_copy_entry is set to the no-overlap entry point
1690   //   used by generate_conjoint_int_oop_copy().
1691   //
1692   address generate_disjoint_int_copy(bool aligned, address *entry,
1693                                         const char *name) {
1694     const bool not_oop = false;
1695     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1696   }
1697 
1698   // Arguments:
1699   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1700   //             ignored
1701   //   name    - stub name string
1702   //
1703   // Inputs:
1704   //   c_rarg0   - source array address
1705   //   c_rarg1   - destination array address
1706   //   c_rarg2   - element count, treated as ssize_t, can be zero
1707   //
1708   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1709   // the hardware handle it.  The two dwords within qwords that span
1710   // cache line boundaries will still be loaded and stored atomically.
1711   //
1712   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1713                                      address *entry, const char *name,
1714                                      bool dest_uninitialized = false) {
1715     const bool not_oop = false;
1716     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1717   }
1718 
1719 
1720   // Arguments:
1721   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1722   //             ignored
1723   //   name    - stub name string
1724   //
1725   // Inputs:
1726   //   c_rarg0   - source array address
1727   //   c_rarg1   - destination array address
1728   //   c_rarg2   - element count, treated as size_t, can be zero
1729   //
1730   // Side Effects:
1731   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1732   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1733   //
1734   address generate_disjoint_long_copy(bool aligned, address *entry,
1735                                           const char *name, bool dest_uninitialized = false) {
1736     const bool not_oop = false;
1737     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1738   }
1739 
1740   // Arguments:
1741   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1742   //             ignored
1743   //   name    - stub name string
1744   //
1745   // Inputs:
1746   //   c_rarg0   - source array address
1747   //   c_rarg1   - destination array address
1748   //   c_rarg2   - element count, treated as size_t, can be zero
1749   //
1750   address generate_conjoint_long_copy(bool aligned,
1751                                       address nooverlap_target, address *entry,
1752                                       const char *name, bool dest_uninitialized = false) {
1753     const bool not_oop = false;
1754     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1755   }
1756 
1757   // Arguments:
1758   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1759   //             ignored
1760   //   name    - stub name string
1761   //
1762   // Inputs:
1763   //   c_rarg0   - source array address
1764   //   c_rarg1   - destination array address
1765   //   c_rarg2   - element count, treated as size_t, can be zero
1766   //
1767   // Side Effects:
1768   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1769   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1770   //
1771   address generate_disjoint_oop_copy(bool aligned, address *entry,
1772                                      const char *name, bool dest_uninitialized) {
1773     const bool is_oop = true;
1774     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1775     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1776   }
1777 
1778   // Arguments:
1779   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1780   //             ignored
1781   //   name    - stub name string
1782   //
1783   // Inputs:
1784   //   c_rarg0   - source array address
1785   //   c_rarg1   - destination array address
1786   //   c_rarg2   - element count, treated as size_t, can be zero
1787   //
1788   address generate_conjoint_oop_copy(bool aligned,
1789                                      address nooverlap_target, address *entry,
1790                                      const char *name, bool dest_uninitialized) {
1791     const bool is_oop = true;
1792     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1793     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1794                                   name, dest_uninitialized);
1795   }
1796 
1797 
1798   // Helper for generating a dynamic type check.
1799   // Smashes rscratch1.
1800   void generate_type_check(Register sub_klass,
1801                            Register super_check_offset,
1802                            Register super_klass,
1803                            Label& L_success) {
1804     assert_different_registers(sub_klass, super_check_offset, super_klass);
1805 
1806     BLOCK_COMMENT("type_check:");
1807 
1808     Label L_miss;
1809 
1810     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1811                                      super_check_offset);
1812     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1813 
1814     // Fall through on failure!
1815     __ BIND(L_miss);
1816   }
1817 
1818   //
1819   //  Generate checkcasting array copy stub
1820   //
1821   //  Input:
1822   //    c_rarg0   - source array address
1823   //    c_rarg1   - destination array address
1824   //    c_rarg2   - element count, treated as ssize_t, can be zero
1825   //    c_rarg3   - size_t ckoff (super_check_offset)
1826   //    c_rarg4   - oop ckval (super_klass)
1827   //
1828   //  Output:
1829   //    r0 ==  0  -  success
1830   //    r0 == -1^K - failure, where K is partial transfer count
1831   //
1832   address generate_checkcast_copy(const char *name, address *entry,
1833                                   bool dest_uninitialized = false) {
1834 
1835     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1836 
1837     // Input registers (after setup_arg_regs)
1838     const Register from        = c_rarg0;   // source array address
1839     const Register to          = c_rarg1;   // destination array address
1840     const Register count       = c_rarg2;   // elements count
1841     const Register ckoff       = c_rarg3;   // super_check_offset
1842     const Register ckval       = c_rarg4;   // super_klass
1843 
1844     // Registers used as temps (r18, r19, r20 are save-on-entry)
1845     const Register count_save  = r21;       // orig elements count
1846     const Register start_to    = r20;       // destination array start address
1847     const Register copied_oop  = r18;       // actual oop copied
1848     const Register r19_klass   = r19;       // oop._klass
1849 
1850     //---------------------------------------------------------------
1851     // Assembler stub will be used for this call to arraycopy
1852     // if the two arrays are subtypes of Object[] but the
1853     // destination array type is not equal to or a supertype
1854     // of the source type.  Each element must be separately
1855     // checked.
1856 
1857     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1858                                copied_oop, r19_klass, count_save);
1859 
1860     __ align(CodeEntryAlignment);
1861     StubCodeMark mark(this, "StubRoutines", name);
1862     address start = __ pc();
1863 
1864     __ enter(); // required for proper stackwalking of RuntimeStub frame
1865 
1866 #ifdef ASSERT
1867     // caller guarantees that the arrays really are different
1868     // otherwise, we would have to make conjoint checks
1869     { Label L;
1870       array_overlap_test(L, TIMES_OOP);
1871       __ stop("checkcast_copy within a single array");
1872       __ bind(L);
1873     }
1874 #endif //ASSERT
1875 
1876     // Caller of this entry point must set up the argument registers.
1877     if (entry != NULL) {
1878       *entry = __ pc();
1879       BLOCK_COMMENT("Entry:");
1880     }
1881 
1882      // Empty array:  Nothing to do.
1883     __ cbz(count, L_done);
1884 
1885     __ push(RegSet::of(r18, r19, r20, r21), sp);
1886 
1887 #ifdef ASSERT
1888     BLOCK_COMMENT("assert consistent ckoff/ckval");
1889     // The ckoff and ckval must be mutually consistent,
1890     // even though caller generates both.
1891     { Label L;
1892       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1893       __ ldrw(start_to, Address(ckval, sco_offset));
1894       __ cmpw(ckoff, start_to);
1895       __ br(Assembler::EQ, L);
1896       __ stop("super_check_offset inconsistent");
1897       __ bind(L);
1898     }
1899 #endif //ASSERT
1900 
1901     gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1902 
1903     // save the original count
1904     __ mov(count_save, count);
1905 
1906     // Copy from low to high addresses
1907     __ mov(start_to, to);              // Save destination array start address
1908     __ b(L_load_element);
1909 
1910     // ======== begin loop ========
1911     // (Loop is rotated; its entry is L_load_element.)
1912     // Loop control:
1913     //   for (; count != 0; count--) {
1914     //     copied_oop = load_heap_oop(from++);
1915     //     ... generate_type_check ...;
1916     //     store_heap_oop(to++, copied_oop);
1917     //   }
1918     __ align(OptoLoopAlignment);
1919 
1920     __ BIND(L_store_element);
1921     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1922     __ sub(count, count, 1);
1923     __ cbz(count, L_do_card_marks);
1924 
1925     // ======== loop entry is here ========
1926     __ BIND(L_load_element);
1927     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1928     __ cbz(copied_oop, L_store_element);
1929 
1930     __ load_klass(r19_klass, copied_oop);// query the object klass
1931     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1932     // ======== end loop ========
1933 
1934     // It was a real error; we must depend on the caller to finish the job.
1935     // Register count = remaining oops, count_orig = total oops.
1936     // Emit GC store barriers for the oops we have copied and report
1937     // their number to the caller.
1938 
1939     __ subs(count, count_save, count);     // K = partially copied oop count
1940     __ eon(count, count, zr);                   // report (-1^K) to caller
1941     __ br(Assembler::EQ, L_done_pop);
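    // Worked example of the return convention: if 3 oops were copied before a
    // failing type check then K == 3 and r0 is returned as -1 ^ 3 == ~3 == -4;
    // the caller recovers the partial transfer count as ~r0.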
1942 
1943     __ BIND(L_do_card_marks);
1944     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1945     gen_write_ref_array_post_barrier(start_to, to, rscratch1);
1946 
1947     __ bind(L_done_pop);
1948     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1949     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1950 
1951     __ bind(L_done);
1952     __ mov(r0, count);
1953     __ leave();
1954     __ ret(lr);
1955 
1956     return start;
1957   }
1958 
1959   // Perform range checks on the proposed arraycopy.
1960   // Kills temp, but nothing else.
1961   // Also, clean the sign bits of src_pos and dst_pos.
1962   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1963                               Register src_pos, // source position (c_rarg1)
1964                               Register dst,     // destination array oop (c_rarg2)
1965                               Register dst_pos, // destination position (c_rarg3)
1966                               Register length,
1967                               Register temp,
1968                               Label& L_failed) {
1969     BLOCK_COMMENT("arraycopy_range_checks:");
1970 
1971     assert_different_registers(rscratch1, temp);
1972 
1973     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1974     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1975     __ addw(temp, length, src_pos);
1976     __ cmpw(temp, rscratch1);
1977     __ br(Assembler::HI, L_failed);
1978 
1979     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1980     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1981     __ addw(temp, length, dst_pos);
1982     __ cmpw(temp, rscratch1);
1983     __ br(Assembler::HI, L_failed);
1984 
1985     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1986     __ movw(src_pos, src_pos);
1987     __ movw(dst_pos, dst_pos);
1988 
1989     BLOCK_COMMENT("arraycopy_range_checks done");
1990   }
1991 
1992   // These stubs get called from some dumb test routine.
1993   // I'll write them properly when they're called from
1994   // something that's actually doing something.
1995   static void fake_arraycopy_stub(address src, address dst, int count) {
1996     assert(count == 0, "huh?");
1997   }
1998 
1999 
2000   //
2001   // Generate stub for array fill. If "aligned" is true, the
2002   // "to" address is assumed to be heapword aligned.
2003   //
2004   // Arguments for generated stub:
2005   //   to:    c_rarg0
2006   //   value: c_rarg1
2007   //   count: c_rarg2 treated as signed
2008   //
2009   address generate_fill(BasicType t, bool aligned, const char *name) {
2010     __ align(CodeEntryAlignment);
2011     StubCodeMark mark(this, "StubRoutines", name);
2012     address start = __ pc();
2013 
2014     BLOCK_COMMENT("Entry:");
2015 
2016     const Register to        = c_rarg0;  // source array address
2017     const Register value     = c_rarg1;  // value
2018     const Register count     = c_rarg2;  // elements count
2019 
2020     const Register bz_base = r10;        // base for block_zero routine
2021     const Register cnt_words = r11;      // temp register
2022 
2023     __ enter();
2024 
2025     Label L_fill_elements, L_exit1;
2026 
2027     int shift = -1;
2028     switch (t) {
2029       case T_BYTE:
2030         shift = 0;
2031         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2032         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2033         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2034         __ br(Assembler::LO, L_fill_elements);
2035         break;
2036       case T_SHORT:
2037         shift = 1;
2038         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2039         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2040         __ br(Assembler::LO, L_fill_elements);
2041         break;
2042       case T_INT:
2043         shift = 2;
2044         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2045         __ br(Assembler::LO, L_fill_elements);
2046         break;
2047       default: ShouldNotReachHere();
2048     }
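    // Illustrative example (not generated code): for T_BYTE with value 0x5A the
    // two bfi instructions above replicate the byte into 0x5A5A5A5A; the later
    // bfi(value, value, 32, 32) widens this to 0x5A5A5A5A5A5A5A5A so that whole
    // 64-bit words can be stored.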
2049 
2050     // Align source address at 8 bytes address boundary.
2051     Label L_skip_align1, L_skip_align2, L_skip_align4;
2052     if (!aligned) {
2053       switch (t) {
2054         case T_BYTE:
2055           // One byte misalignment happens only for byte arrays.
2056           __ tbz(to, 0, L_skip_align1);
2057           __ strb(value, Address(__ post(to, 1)));
2058           __ subw(count, count, 1);
2059           __ bind(L_skip_align1);
2060           // Fallthrough
2061         case T_SHORT:
2062           // Two bytes misalignment happens only for byte and short (char) arrays.
2063           __ tbz(to, 1, L_skip_align2);
2064           __ strh(value, Address(__ post(to, 2)));
2065           __ subw(count, count, 2 >> shift);
2066           __ bind(L_skip_align2);
2067           // Fallthrough
2068         case T_INT:
2069           // Align to 8 bytes, we know we are 4 byte aligned to start.
2070           __ tbz(to, 2, L_skip_align4);
2071           __ strw(value, Address(__ post(to, 4)));
2072           __ subw(count, count, 4 >> shift);
2073           __ bind(L_skip_align4);
2074           break;
2075         default: ShouldNotReachHere();
2076       }
2077     }
2078 
2079     //
2080     //  Fill large chunks
2081     //
2082     __ lsrw(cnt_words, count, 3 - shift); // number of words
2083     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2084     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
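    // In effect (illustrative):
    //   cnt_words = count >> (3 - shift);       // number of 8-byte words to fill
    //   count    -= cnt_words << (3 - shift);   // elements left over (< one word)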
2085     if (UseBlockZeroing) {
2086       Label non_block_zeroing, rest;
2087       // count >= BlockZeroingLowLimit && value == 0
2088       __ subs(rscratch1, cnt_words, BlockZeroingLowLimit >> 3);
2089       __ ccmp(value, 0 /* comparing value */, 0 /* NZCV */, Assembler::GE);
2090       __ br(Assembler::NE, non_block_zeroing);
2091       __ mov(bz_base, to);
2092       __ block_zero(bz_base, cnt_words, true);
2093       __ mov(to, bz_base);
2094       __ b(rest);
2095       __ bind(non_block_zeroing);
2096       __ fill_words(to, cnt_words, value);
2097       __ bind(rest);
2098     }
2099     else {
2100       __ fill_words(to, cnt_words, value);
2101     }
2102 
2103     // Remaining count is less than 8 bytes. Fill it by a single store.
2104     // Note that the total length is no less than 8 bytes.
2105     if (t == T_BYTE || t == T_SHORT) {
2106       Label L_exit1;
2107       __ cbzw(count, L_exit1);
2108       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2109       __ str(value, Address(to, -8));    // overwrite some elements
2110       __ bind(L_exit1);
2111       __ leave();
2112       __ ret(lr);
2113     }
2114 
2115     // Handle copies less than 8 bytes.
2116     Label L_fill_2, L_fill_4, L_exit2;
2117     __ bind(L_fill_elements);
2118     switch (t) {
2119       case T_BYTE:
2120         __ tbz(count, 0, L_fill_2);
2121         __ strb(value, Address(__ post(to, 1)));
2122         __ bind(L_fill_2);
2123         __ tbz(count, 1, L_fill_4);
2124         __ strh(value, Address(__ post(to, 2)));
2125         __ bind(L_fill_4);
2126         __ tbz(count, 2, L_exit2);
2127         __ strw(value, Address(to));
2128         break;
2129       case T_SHORT:
2130         __ tbz(count, 0, L_fill_4);
2131         __ strh(value, Address(__ post(to, 2)));
2132         __ bind(L_fill_4);
2133         __ tbz(count, 1, L_exit2);
2134         __ strw(value, Address(to));
2135         break;
2136       case T_INT:
2137         __ cbzw(count, L_exit2);
2138         __ strw(value, Address(to));
2139         break;
2140       default: ShouldNotReachHere();
2141     }
2142     __ bind(L_exit2);
2143     __ leave();
2144     __ ret(lr);
2145     return start;
2146   }
2147 
2148   //
2149   //  Generate 'unsafe' array copy stub
2150   //  Though just as safe as the other stubs, it takes an unscaled
2151   //  size_t argument instead of an element count.
2152   //
2153   //  Input:
2154   //    c_rarg0   - source array address
2155   //    c_rarg1   - destination array address
2156   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2157   //
2158   // Examines the alignment of the operands and dispatches
2159   // to a long, int, short, or byte copy loop.
2160   //
2161   address generate_unsafe_copy(const char *name,
2162                                address byte_copy_entry,
2163                                address short_copy_entry,
2164                                address int_copy_entry,
2165                                address long_copy_entry) {
2166     Label L_long_aligned, L_int_aligned, L_short_aligned;
2167     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2168 
2169     __ align(CodeEntryAlignment);
2170     StubCodeMark mark(this, "StubRoutines", name);
2171     address start = __ pc();
2172     __ enter(); // required for proper stackwalking of RuntimeStub frame
2173 
2174     // bump this on entry, not on exit:
2175     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2176 
2177     __ orr(rscratch1, s, d);
2178     __ orr(rscratch1, rscratch1, count);
2179 
2180     __ andr(rscratch1, rscratch1, BytesPerLong-1);
2181     __ cbz(rscratch1, L_long_aligned);
2182     __ andr(rscratch1, rscratch1, BytesPerInt-1);
2183     __ cbz(rscratch1, L_int_aligned);
2184     __ tbz(rscratch1, 0, L_short_aligned);
2185     __ b(RuntimeAddress(byte_copy_entry));
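    // Dispatch above, roughly, as an illustrative C sketch:
    //   bits = s | d | count;
    //   if ((bits & 7) == 0) goto L_long_aligned;
    //   if ((bits & 3) == 0) goto L_int_aligned;
    //   if ((bits & 1) == 0) goto L_short_aligned;
    //   goto byte_copy_entry;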
2186 
2187     __ BIND(L_short_aligned);
2188     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2189     __ b(RuntimeAddress(short_copy_entry));
2190     __ BIND(L_int_aligned);
2191     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2192     __ b(RuntimeAddress(int_copy_entry));
2193     __ BIND(L_long_aligned);
2194     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2195     __ b(RuntimeAddress(long_copy_entry));
2196 
2197     return start;
2198   }
2199 
2200   //
2201   //  Generate generic array copy stubs
2202   //
2203   //  Input:
2204   //    c_rarg0    -  src oop
2205   //    c_rarg1    -  src_pos (32-bits)
2206   //    c_rarg2    -  dst oop
2207   //    c_rarg3    -  dst_pos (32-bits)
2208   //    c_rarg4    -  element count (32-bits)
2209   //
2210   //  Output:
2211   //    r0 ==  0  -  success
2212   //    r0 == -1^K - failure, where K is partial transfer count
2213   //
2214   address generate_generic_copy(const char *name,
2215                                 address byte_copy_entry, address short_copy_entry,
2216                                 address int_copy_entry, address oop_copy_entry,
2217                                 address long_copy_entry, address checkcast_copy_entry) {
2218 
2219     Label L_failed, L_failed_0, L_objArray;
2220     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2221 
2222     // Input registers
2223     const Register src        = c_rarg0;  // source array oop
2224     const Register src_pos    = c_rarg1;  // source position
2225     const Register dst        = c_rarg2;  // destination array oop
2226     const Register dst_pos    = c_rarg3;  // destination position
2227     const Register length     = c_rarg4;
2228 
2229     StubCodeMark mark(this, "StubRoutines", name);
2230 
2231     __ align(CodeEntryAlignment);
2232     address start = __ pc();
2233 
2234     __ enter(); // required for proper stackwalking of RuntimeStub frame
2235 
2236     // bump this on entry, not on exit:
2237     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2238 
2239     //-----------------------------------------------------------------------
2240     // Assembler stub will be used for this call to arraycopy
2241     // if the following conditions are met:
2242     //
2243     // (1) src and dst must not be null.
2244     // (2) src_pos must not be negative.
2245     // (3) dst_pos must not be negative.
2246     // (4) length  must not be negative.
2247     // (5) src klass and dst klass should be the same and not NULL.
2248     // (6) src and dst should be arrays.
2249     // (7) src_pos + length must not exceed length of src.
2250     // (8) dst_pos + length must not exceed length of dst.
2251     //
2252 
2253     //  if (src == NULL) return -1;
2254     __ cbz(src, L_failed);
2255 
2256     //  if (src_pos < 0) return -1;
2257     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2258 
2259     //  if (dst == NULL) return -1;
2260     __ cbz(dst, L_failed);
2261 
2262     //  if (dst_pos < 0) return -1;
2263     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2264 
2265     // registers used as temp
2266     const Register scratch_length    = r16; // elements count to copy
2267     const Register scratch_src_klass = r17; // array klass
2268     const Register lh                = r18; // layout helper
2269 
2270     //  if (length < 0) return -1;
2271     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2272     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2273 
2274     __ load_klass(scratch_src_klass, src);
2275 #ifdef ASSERT
2276     //  assert(src->klass() != NULL);
2277     {
2278       BLOCK_COMMENT("assert klasses not null {");
2279       Label L1, L2;
2280       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2281       __ bind(L1);
2282       __ stop("broken null klass");
2283       __ bind(L2);
2284       __ load_klass(rscratch1, dst);
2285       __ cbz(rscratch1, L1);     // this would be broken also
2286       BLOCK_COMMENT("} assert klasses not null done");
2287     }
2288 #endif
2289 
2290     // Load layout helper (32-bits)
2291     //
2292     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2293     // 32        30    24            16              8     2                 0
2294     //
2295     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2296     //
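    // For example, an int[] is a typeArray (array_tag 0x3) with element_type
    // T_INT and log2_element_size 2; the low bits of lh therefore supply the
    // address shift used to form src_addr/dst_addr below.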
2297 
2298     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2299 
2300     // Handle objArrays completely differently...
2301     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2302     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2303     __ movw(rscratch1, objArray_lh);
2304     __ eorw(rscratch2, lh, rscratch1);
2305     __ cbzw(rscratch2, L_objArray);
2306 
2307     //  if (src->klass() != dst->klass()) return -1;
2308     __ load_klass(rscratch2, dst);
2309     __ eor(rscratch2, rscratch2, scratch_src_klass);
2310     __ cbnz(rscratch2, L_failed);
2311 
2312     //  if (!src->is_Array()) return -1;
2313     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2314 
2315     // At this point, it is known to be a typeArray (array_tag 0x3).
2316 #ifdef ASSERT
2317     {
2318       BLOCK_COMMENT("assert primitive array {");
2319       Label L;
2320       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2321       __ cmpw(lh, rscratch2);
2322       __ br(Assembler::GE, L);
2323       __ stop("must be a primitive array");
2324       __ bind(L);
2325       BLOCK_COMMENT("} assert primitive array done");
2326     }
2327 #endif
2328 
2329     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2330                            rscratch2, L_failed);
2331 
2332     // TypeArrayKlass
2333     //
2334     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2335     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2336     //
2337 
2338     const Register rscratch1_offset = rscratch1;    // array offset
2339     const Register r18_elsize = lh; // element size
2340 
2341     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2342            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2343     __ add(src, src, rscratch1_offset);           // src array offset
2344     __ add(dst, dst, rscratch1_offset);           // dst array offset
2345     BLOCK_COMMENT("choose copy loop based on element size");
2346 
2347     // next registers should be set before the jump to corresponding stub
2348     const Register from     = c_rarg0;  // source array address
2349     const Register to       = c_rarg1;  // destination array address
2350     const Register count    = c_rarg2;  // elements count
2351 
2352     // 'from', 'to', 'count' registers should be set in such order
2353     // since they are the same as 'src', 'src_pos', 'dst'.
2354 
2355     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2356 
2357     // The possible values of elsize are 0-3, i.e. exact_log2(element
2358     // size in bytes).  We do a simple bitwise binary search.
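    // Equivalent dispatch, roughly (illustrative):
    //   switch (elsize & 3) {      // low two bits of r18_elsize
    //     case 0: byte copy;  case 1: short copy;
    //     case 2: int copy;   case 3: long copy;
    //   }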
2359   __ BIND(L_copy_bytes);
2360     __ tbnz(r18_elsize, 1, L_copy_ints);
2361     __ tbnz(r18_elsize, 0, L_copy_shorts);
2362     __ lea(from, Address(src, src_pos));// src_addr
2363     __ lea(to,   Address(dst, dst_pos));// dst_addr
2364     __ movw(count, scratch_length); // length
2365     __ b(RuntimeAddress(byte_copy_entry));
2366 
2367   __ BIND(L_copy_shorts);
2368     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2369     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2370     __ movw(count, scratch_length); // length
2371     __ b(RuntimeAddress(short_copy_entry));
2372 
2373   __ BIND(L_copy_ints);
2374     __ tbnz(r18_elsize, 0, L_copy_longs);
2375     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2376     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2377     __ movw(count, scratch_length); // length
2378     __ b(RuntimeAddress(int_copy_entry));
2379 
2380   __ BIND(L_copy_longs);
2381 #ifdef ASSERT
2382     {
2383       BLOCK_COMMENT("assert long copy {");
2384       Label L;
2385       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2386       __ cmpw(r18_elsize, LogBytesPerLong);
2387       __ br(Assembler::EQ, L);
2388       __ stop("must be long copy, but elsize is wrong");
2389       __ bind(L);
2390       BLOCK_COMMENT("} assert long copy done");
2391     }
2392 #endif
2393     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2394     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2395     __ movw(count, scratch_length); // length
2396     __ b(RuntimeAddress(long_copy_entry));
2397 
2398     // ObjArrayKlass
2399   __ BIND(L_objArray);
2400     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2401 
2402     Label L_plain_copy, L_checkcast_copy;
2403     //  test array classes for subtyping
2404     __ load_klass(r18, dst);
2405     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2406     __ br(Assembler::NE, L_checkcast_copy);
2407 
2408     // Identically typed arrays can be copied without element-wise checks.
2409     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2410                            rscratch2, L_failed);
2411 
2412     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2413     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2414     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2415     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2416     __ movw(count, scratch_length); // length
2417   __ BIND(L_plain_copy);
2418     __ b(RuntimeAddress(oop_copy_entry));
2419 
2420   __ BIND(L_checkcast_copy);
2421     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2422     {
2423       // Before looking at dst.length, make sure dst is also an objArray.
2424       __ ldrw(rscratch1, Address(r18, lh_offset));
2425       __ movw(rscratch2, objArray_lh);
2426       __ eorw(rscratch1, rscratch1, rscratch2);
2427       __ cbnzw(rscratch1, L_failed);
2428 
2429       // It is safe to examine both src.length and dst.length.
2430       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2431                              r18, L_failed);
2432 
2433       const Register rscratch2_dst_klass = rscratch2;
2434       __ load_klass(rscratch2_dst_klass, dst); // reload
2435 
2436       // Marshal the base address arguments now, freeing registers.
2437       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2438       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2439       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2440       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2441       __ movw(count, length);           // length (reloaded)
2442       Register sco_temp = c_rarg3;      // this register is free now
2443       assert_different_registers(from, to, count, sco_temp,
2444                                  rscratch2_dst_klass, scratch_src_klass);
2445       // assert_clean_int(count, sco_temp);
2446 
2447       // Generate the type check.
2448       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2449       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2450       // assert_clean_int(sco_temp, r18);
2451       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2452 
2453       // Fetch destination element klass from the ObjArrayKlass header.
2454       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2455       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2456       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2457 
2458       // the checkcast_copy loop needs two extra arguments:
2459       assert(c_rarg3 == sco_temp, "#3 already in place");
2460       // Set up arguments for checkcast_copy_entry.
2461       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2462       __ b(RuntimeAddress(checkcast_copy_entry));
2463     }
2464 
2465   __ BIND(L_failed);
2466     __ mov(r0, -1);
2467     __ leave();   // required for proper stackwalking of RuntimeStub frame
2468     __ ret(lr);
2469 
2470     return start;
2471   }
2472 
2473   void generate_arraycopy_stubs() {
2474     address entry;
2475     address entry_jbyte_arraycopy;
2476     address entry_jshort_arraycopy;
2477     address entry_jint_arraycopy;
2478     address entry_oop_arraycopy;
2479     address entry_jlong_arraycopy;
2480     address entry_checkcast_arraycopy;
2481 
2482     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2483     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2484 
2485     StubRoutines::aarch64::_zero_longs = generate_zero_longs(r10, r11);
2486 
2487     //*** jbyte
2488     // Always need aligned and unaligned versions
2489     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2490                                                                                   "jbyte_disjoint_arraycopy");
2491     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2492                                                                                   &entry_jbyte_arraycopy,
2493                                                                                   "jbyte_arraycopy");
2494     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2495                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2496     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2497                                                                                   "arrayof_jbyte_arraycopy");
2498 
2499     //*** jshort
2500     // Always need aligned and unaligned versions
2501     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2502                                                                                     "jshort_disjoint_arraycopy");
2503     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2504                                                                                     &entry_jshort_arraycopy,
2505                                                                                     "jshort_arraycopy");
2506     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2507                                                                                     "arrayof_jshort_disjoint_arraycopy");
2508     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2509                                                                                     "arrayof_jshort_arraycopy");
2510 
2511     //*** jint
2512     // Aligned versions
2513     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2514                                                                                 "arrayof_jint_disjoint_arraycopy");
2515     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2516                                                                                 "arrayof_jint_arraycopy");
2517     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2518     // entry_jint_arraycopy always points to the unaligned version
2519     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2520                                                                                 "jint_disjoint_arraycopy");
2521     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2522                                                                                 &entry_jint_arraycopy,
2523                                                                                 "jint_arraycopy");
2524 
2525     //*** jlong
2526     // It is always aligned
2527     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2528                                                                                   "arrayof_jlong_disjoint_arraycopy");
2529     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2530                                                                                   "arrayof_jlong_arraycopy");
2531     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2532     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2533 
2534     //*** oops
2535     {
2536       // With compressed oops we need unaligned versions; notice that
2537       // we overwrite entry_oop_arraycopy.
2538       bool aligned = !UseCompressedOops;
2539 
2540       StubRoutines::_arrayof_oop_disjoint_arraycopy
2541         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2542                                      /*dest_uninitialized*/false);
2543       StubRoutines::_arrayof_oop_arraycopy
2544         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2545                                      /*dest_uninitialized*/false);
2546       // Aligned versions without pre-barriers
2547       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2548         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2549                                      /*dest_uninitialized*/true);
2550       StubRoutines::_arrayof_oop_arraycopy_uninit
2551         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2552                                      /*dest_uninitialized*/true);
2553     }
2554 
2555     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2556     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2557     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2558     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2559 
2560     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2561     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2562                                                                         /*dest_uninitialized*/true);
2563 
2564     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2565                                                               entry_jbyte_arraycopy,
2566                                                               entry_jshort_arraycopy,
2567                                                               entry_jint_arraycopy,
2568                                                               entry_jlong_arraycopy);
2569 
2570     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2571                                                                entry_jbyte_arraycopy,
2572                                                                entry_jshort_arraycopy,
2573                                                                entry_jint_arraycopy,
2574                                                                entry_oop_arraycopy,
2575                                                                entry_jlong_arraycopy,
2576                                                                entry_checkcast_arraycopy);
2577 
2578     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2579     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2580     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2581     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2582     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2583     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2584   }
2585 
2586   // Arguments:
2587   //
2588   // Inputs:
2589   //   c_rarg0   - source byte array address
2590   //   c_rarg1   - destination byte array address
2591   //   c_rarg2   - K (key) in little endian int array
2592   //
2593   address generate_aescrypt_encryptBlock() {
2594     __ align(CodeEntryAlignment);
2595     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2596 
2597     Label L_doLast;
2598 
2599     const Register from        = c_rarg0;  // source array address
2600     const Register to          = c_rarg1;  // destination array address
2601     const Register key         = c_rarg2;  // key array address
2602     const Register keylen      = rscratch1;
2603 
2604     address start = __ pc();
2605     __ enter();
2606 
2607     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
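    // keylen is the expanded key length in 32-bit words, 4 * (rounds + 1):
    // 44 for AES-128, 52 for AES-192 and 60 for AES-256.  The keylen
    // comparisons below skip the extra rounds for the shorter keys.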
2608 
2609     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2610 
2611     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2612     __ rev32(v1, __ T16B, v1);
2613     __ rev32(v2, __ T16B, v2);
2614     __ rev32(v3, __ T16B, v3);
2615     __ rev32(v4, __ T16B, v4);
2616     __ aese(v0, v1);
2617     __ aesmc(v0, v0);
2618     __ aese(v0, v2);
2619     __ aesmc(v0, v0);
2620     __ aese(v0, v3);
2621     __ aesmc(v0, v0);
2622     __ aese(v0, v4);
2623     __ aesmc(v0, v0);
2624 
2625     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2626     __ rev32(v1, __ T16B, v1);
2627     __ rev32(v2, __ T16B, v2);
2628     __ rev32(v3, __ T16B, v3);
2629     __ rev32(v4, __ T16B, v4);
2630     __ aese(v0, v1);
2631     __ aesmc(v0, v0);
2632     __ aese(v0, v2);
2633     __ aesmc(v0, v0);
2634     __ aese(v0, v3);
2635     __ aesmc(v0, v0);
2636     __ aese(v0, v4);
2637     __ aesmc(v0, v0);
2638 
2639     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2640     __ rev32(v1, __ T16B, v1);
2641     __ rev32(v2, __ T16B, v2);
2642 
2643     __ cmpw(keylen, 44);
2644     __ br(Assembler::EQ, L_doLast);
2645 
2646     __ aese(v0, v1);
2647     __ aesmc(v0, v0);
2648     __ aese(v0, v2);
2649     __ aesmc(v0, v0);
2650 
2651     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2652     __ rev32(v1, __ T16B, v1);
2653     __ rev32(v2, __ T16B, v2);
2654 
2655     __ cmpw(keylen, 52);
2656     __ br(Assembler::EQ, L_doLast);
2657 
2658     __ aese(v0, v1);
2659     __ aesmc(v0, v0);
2660     __ aese(v0, v2);
2661     __ aesmc(v0, v0);
2662 
2663     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2664     __ rev32(v1, __ T16B, v1);
2665     __ rev32(v2, __ T16B, v2);
2666 
2667     __ BIND(L_doLast);
2668 
2669     __ aese(v0, v1);
2670     __ aesmc(v0, v0);
2671     __ aese(v0, v2);
2672 
2673     __ ld1(v1, __ T16B, key);
2674     __ rev32(v1, __ T16B, v1);
2675     __ eor(v0, __ T16B, v0, v1);
2676 
2677     __ st1(v0, __ T16B, to);
2678 
2679     __ mov(r0, 0);
2680 
2681     __ leave();
2682     __ ret(lr);
2683 
2684     return start;
2685   }
2686 
2687   // Arguments:
2688   //
2689   // Inputs:
2690   //   c_rarg0   - source byte array address
2691   //   c_rarg1   - destination byte array address
2692   //   c_rarg2   - K (key) in little endian int array
2693   //
2694   address generate_aescrypt_decryptBlock() {
2695     assert(UseAES, "need AES instructions");
2696     __ align(CodeEntryAlignment);
2697     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2698     Label L_doLast;
2699 
2700     const Register from        = c_rarg0;  // source array address
2701     const Register to          = c_rarg1;  // destination array address
2702     const Register key         = c_rarg2;  // key array address
2703     const Register keylen      = rscratch1;
2704 
2705     address start = __ pc();
2706     __ enter(); // required for proper stackwalking of RuntimeStub frame
2707 
2708     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2709 
2710     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2711 
2712     __ ld1(v5, __ T16B, __ post(key, 16));
2713     __ rev32(v5, __ T16B, v5);
2714 
2715     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2716     __ rev32(v1, __ T16B, v1);
2717     __ rev32(v2, __ T16B, v2);
2718     __ rev32(v3, __ T16B, v3);
2719     __ rev32(v4, __ T16B, v4);
2720     __ aesd(v0, v1);
2721     __ aesimc(v0, v0);
2722     __ aesd(v0, v2);
2723     __ aesimc(v0, v0);
2724     __ aesd(v0, v3);
2725     __ aesimc(v0, v0);
2726     __ aesd(v0, v4);
2727     __ aesimc(v0, v0);
2728 
2729     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2730     __ rev32(v1, __ T16B, v1);
2731     __ rev32(v2, __ T16B, v2);
2732     __ rev32(v3, __ T16B, v3);
2733     __ rev32(v4, __ T16B, v4);
2734     __ aesd(v0, v1);
2735     __ aesimc(v0, v0);
2736     __ aesd(v0, v2);
2737     __ aesimc(v0, v0);
2738     __ aesd(v0, v3);
2739     __ aesimc(v0, v0);
2740     __ aesd(v0, v4);
2741     __ aesimc(v0, v0);
2742 
2743     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2744     __ rev32(v1, __ T16B, v1);
2745     __ rev32(v2, __ T16B, v2);
2746 
2747     __ cmpw(keylen, 44);
2748     __ br(Assembler::EQ, L_doLast);
2749 
2750     __ aesd(v0, v1);
2751     __ aesimc(v0, v0);
2752     __ aesd(v0, v2);
2753     __ aesimc(v0, v0);
2754 
2755     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2756     __ rev32(v1, __ T16B, v1);
2757     __ rev32(v2, __ T16B, v2);
2758 
2759     __ cmpw(keylen, 52);
2760     __ br(Assembler::EQ, L_doLast);
2761 
2762     __ aesd(v0, v1);
2763     __ aesimc(v0, v0);
2764     __ aesd(v0, v2);
2765     __ aesimc(v0, v0);
2766 
2767     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2768     __ rev32(v1, __ T16B, v1);
2769     __ rev32(v2, __ T16B, v2);
2770 
2771     __ BIND(L_doLast);
2772 
2773     __ aesd(v0, v1);
2774     __ aesimc(v0, v0);
2775     __ aesd(v0, v2);
2776 
2777     __ eor(v0, __ T16B, v0, v5);
2778 
2779     __ st1(v0, __ T16B, to);
2780 
2781     __ mov(r0, 0);
2782 
2783     __ leave();
2784     __ ret(lr);
2785 
2786     return start;
2787   }
2788 
2789   // Arguments:
2790   //
2791   // Inputs:
2792   //   c_rarg0   - source byte array address
2793   //   c_rarg1   - destination byte array address
2794   //   c_rarg2   - K (key) in little endian int array
2795   //   c_rarg3   - r vector byte array address
2796   //   c_rarg4   - input length
2797   //
2798   // Output:
2799   //   r0        - input length
2800   //
2801   address generate_cipherBlockChaining_encryptAESCrypt() {
2802     assert(UseAES, "need AES cryptographic extension support");
2803     __ align(CodeEntryAlignment);
2804     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2805 
2806     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52, _L_finish;
2807 
2808     const Register from        = c_rarg0;  // source array address
2809     const Register to          = c_rarg1;  // destination array address
2810     const Register key         = c_rarg2;  // key array address
2811     const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector (IV) array address,
2812                                            // and left holding the last encrypted block on exit
2813     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2814     const Register keylen      = rscratch1;
2815 
2816     address start = __ pc();
2817 
2818       __ enter();
2819 
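      // Copy len into rscratch2 (returned at _L_finish) while also setting
      // the flags for the len <= 0 check below.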
2820       __ subsw(rscratch2, len_reg, zr);
2821       __ br(Assembler::LE, _L_finish);
2822 
2823       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2824 
2825       __ ld1(v0, __ T16B, rvec);
2826 
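      // Classify the key by its expanded length in ints: 44 (AES-128),
      // 52 (AES-192) or 60 (AES-256), and load only the round keys that are
      // actually needed.  Nothing inside the encryption loop below modifies
      // the flags, so this comparison also selects the round count at
      // L_rounds_44/L_rounds_52 on every iteration.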
2827       __ cmpw(keylen, 52);
2828       __ br(Assembler::CC, L_loadkeys_44);
2829       __ br(Assembler::EQ, L_loadkeys_52);
2830 
2831       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2832       __ rev32(v17, __ T16B, v17);
2833       __ rev32(v18, __ T16B, v18);
2834     __ BIND(L_loadkeys_52);
2835       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2836       __ rev32(v19, __ T16B, v19);
2837       __ rev32(v20, __ T16B, v20);
2838     __ BIND(L_loadkeys_44);
2839       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2840       __ rev32(v21, __ T16B, v21);
2841       __ rev32(v22, __ T16B, v22);
2842       __ rev32(v23, __ T16B, v23);
2843       __ rev32(v24, __ T16B, v24);
2844       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2845       __ rev32(v25, __ T16B, v25);
2846       __ rev32(v26, __ T16B, v26);
2847       __ rev32(v27, __ T16B, v27);
2848       __ rev32(v28, __ T16B, v28);
2849       __ ld1(v29, v30, v31, __ T16B, key);
2850       __ rev32(v29, __ T16B, v29);
2851       __ rev32(v30, __ T16B, v30);
2852       __ rev32(v31, __ T16B, v31);
2853 
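      // The per-block loop, as a C-like sketch (load16/store16/aes_encrypt
      // are illustrative names, not real helpers):
      //
      //   for (; len > 0; len -= 16) {
      //     v0 ^= load16(from);  from += 16;   // chain with previous ciphertext (or IV)
      //     v0  = aes_encrypt(v0, round_keys);
      //     store16(to, v0);     to   += 16;
      //   }
      //   store16(rvec, v0);                   // last ciphertext block becomes the new IV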
2854     __ BIND(L_aes_loop);
2855       __ ld1(v1, __ T16B, __ post(from, 16));
2856       __ eor(v0, __ T16B, v0, v1);
2857 
2858       __ br(Assembler::CC, L_rounds_44);
2859       __ br(Assembler::EQ, L_rounds_52);
2860 
2861       __ aese(v0, v17); __ aesmc(v0, v0);
2862       __ aese(v0, v18); __ aesmc(v0, v0);
2863     __ BIND(L_rounds_52);
2864       __ aese(v0, v19); __ aesmc(v0, v0);
2865       __ aese(v0, v20); __ aesmc(v0, v0);
2866     __ BIND(L_rounds_44);
2867       __ aese(v0, v21); __ aesmc(v0, v0);
2868       __ aese(v0, v22); __ aesmc(v0, v0);
2869       __ aese(v0, v23); __ aesmc(v0, v0);
2870       __ aese(v0, v24); __ aesmc(v0, v0);
2871       __ aese(v0, v25); __ aesmc(v0, v0);
2872       __ aese(v0, v26); __ aesmc(v0, v0);
2873       __ aese(v0, v27); __ aesmc(v0, v0);
2874       __ aese(v0, v28); __ aesmc(v0, v0);
2875       __ aese(v0, v29); __ aesmc(v0, v0);
2876       __ aese(v0, v30);
2877       __ eor(v0, __ T16B, v0, v31);
2878 
2879       __ st1(v0, __ T16B, __ post(to, 16));
2880 
2881       __ subw(len_reg, len_reg, 16);
2882       __ cbnzw(len_reg, L_aes_loop);
2883 
2884       __ st1(v0, __ T16B, rvec);
2885 
2886     __ BIND(_L_finish);
2887       __ mov(r0, rscratch2);
2888 
2889       __ leave();
2890       __ ret(lr);
2891 
2892       return start;
2893   }
2894 
2895   // Arguments:
2896   //
2897   // Inputs:
2898   //   c_rarg0   - source byte array address
2899   //   c_rarg1   - destination byte array address
2900   //   c_rarg2   - K (key) in little endian int array
2901   //   c_rarg3   - r vector byte array address
2902   //   c_rarg4   - input length
2903   //
2904   // Output:
2905   //   r0       - input length
2906   //
2907   address generate_cipherBlockChaining_decryptAESCrypt() {
2908     assert(UseAES, "need AES cryptographic extension support");
2909     __ align(CodeEntryAlignment);
2910     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2911 
2912     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52, _L_finish;
2913 
2914     const Register from        = c_rarg0;  // source array address
2915     const Register to          = c_rarg1;  // destination array address
2916     const Register key         = c_rarg2;  // key array address
2917     const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector (IV) array address,
2918                                            // and left holding the last input ciphertext block on exit (the IV for a subsequent call)
2919     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2920     const Register keylen      = rscratch1;
2921 
2922     address start = __ pc();
2923 
2924       __ enter();
2925 
2926       __ subsw(rscratch2, len_reg, zr);
2927       __ br(Assembler::LE, _L_finish);
2928 
2929       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2930 
2931       __ ld1(v2, __ T16B, rvec);
2932 
2933       __ ld1(v31, __ T16B, __ post(key, 16));
2934       __ rev32(v31, __ T16B, v31);
2935 
2936       __ cmpw(keylen, 52);
2937       __ br(Assembler::CC, L_loadkeys_44);
2938       __ br(Assembler::EQ, L_loadkeys_52);
2939 
2940       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2941       __ rev32(v17, __ T16B, v17);
2942       __ rev32(v18, __ T16B, v18);
2943     __ BIND(L_loadkeys_52);
2944       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2945       __ rev32(v19, __ T16B, v19);
2946       __ rev32(v20, __ T16B, v20);
2947     __ BIND(L_loadkeys_44);
2948       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2949       __ rev32(v21, __ T16B, v21);
2950       __ rev32(v22, __ T16B, v22);
2951       __ rev32(v23, __ T16B, v23);
2952       __ rev32(v24, __ T16B, v24);
2953       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2954       __ rev32(v25, __ T16B, v25);
2955       __ rev32(v26, __ T16B, v26);
2956       __ rev32(v27, __ T16B, v27);
2957       __ rev32(v28, __ T16B, v28);
2958       __ ld1(v29, v30, __ T16B, key);
2959       __ rev32(v29, __ T16B, v29);
2960       __ rev32(v30, __ T16B, v30);
2961 
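      // The per-block loop, as a C-like sketch (load16/store16/aes_decrypt
      // are illustrative names, not real helpers):
      //
      //   for (; len > 0; len -= 16) {
      //     c  = load16(from);  from += 16;
      //     p  = aes_decrypt(c, round_keys) ^ iv;   // v2 holds iv, v1 keeps a copy of c
      //     store16(to, p);     to   += 16;
      //     iv = c;
      //   }
      //   store16(rvec, iv);                        // last ciphertext block becomes the next IV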
2962     __ BIND(L_aes_loop);
2963       __ ld1(v0, __ T16B, __ post(from, 16));
2964       __ orr(v1, __ T16B, v0, v0);
2965 
2966       __ br(Assembler::CC, L_rounds_44);
2967       __ br(Assembler::EQ, L_rounds_52);
2968 
2969       __ aesd(v0, v17); __ aesimc(v0, v0);
2970       __ aesd(v0, v18); __ aesimc(v0, v0);
2971     __ BIND(L_rounds_52);
2972       __ aesd(v0, v19); __ aesimc(v0, v0);
2973       __ aesd(v0, v20); __ aesimc(v0, v0);
2974     __ BIND(L_rounds_44);
2975       __ aesd(v0, v21); __ aesimc(v0, v0);
2976       __ aesd(v0, v22); __ aesimc(v0, v0);
2977       __ aesd(v0, v23); __ aesimc(v0, v0);
2978       __ aesd(v0, v24); __ aesimc(v0, v0);
2979       __ aesd(v0, v25); __ aesimc(v0, v0);
2980       __ aesd(v0, v26); __ aesimc(v0, v0);
2981       __ aesd(v0, v27); __ aesimc(v0, v0);
2982       __ aesd(v0, v28); __ aesimc(v0, v0);
2983       __ aesd(v0, v29); __ aesimc(v0, v0);
2984       __ aesd(v0, v30);
2985       __ eor(v0, __ T16B, v0, v31);
2986       __ eor(v0, __ T16B, v0, v2);
2987 
2988       __ st1(v0, __ T16B, __ post(to, 16));
2989       __ orr(v2, __ T16B, v1, v1);
2990 
2991       __ subw(len_reg, len_reg, 16);
2992       __ cbnzw(len_reg, L_aes_loop);
2993 
2994       __ st1(v2, __ T16B, rvec);
2995 
2996     __ BIND(_L_finish);
2997       __ mov(r0, rscratch2);
2998 
2999       __ leave();
3000       __ ret(lr);
3001 
3002     return start;
3003   }
3004 
3005   // Arguments:
3006   //
3007   // Inputs:
3008   //   c_rarg0   - byte[]  source+offset
3009   //   c_rarg1   - int[]   SHA.state
3010   //   c_rarg2   - int     offset
3011   //   c_rarg3   - int     limit
3012   //
3013   address generate_sha1_implCompress(bool multi_block, const char *name) {
3014     __ align(CodeEntryAlignment);
3015     StubCodeMark mark(this, "StubRoutines", name);
3016     address start = __ pc();
3017 
3018     Register buf   = c_rarg0;
3019     Register state = c_rarg1;
3020     Register ofs   = c_rarg2;
3021     Register limit = c_rarg3;
3022 
3023     Label keys;
3024     Label sha1_loop;
3025 
3026     // load the keys into v0..v3
3027     __ adr(rscratch1, keys);
3028     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3029     // load the 5-word SHA-1 state: four words into v6, the fifth into v7
3030     __ ldrq(v6, Address(state, 0));
3031     __ ldrs(v7, Address(state, 16));
3032 
3033 
3034     __ BIND(sha1_loop);
3035     // load 64 bytes of data into v16..v19
3036     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3037     __ rev32(v16, __ T16B, v16);
3038     __ rev32(v17, __ T16B, v17);
3039     __ rev32(v18, __ T16B, v18);
3040     __ rev32(v19, __ T16B, v19);
3041 
3042     // do the sha1
3043     __ addv(v4, __ T4S, v16, v0);
3044     __ orr(v20, __ T16B, v6, v6);
3045 
3046     FloatRegister d0 = v16;
3047     FloatRegister d1 = v17;
3048     FloatRegister d2 = v18;
3049     FloatRegister d3 = v19;
3050 
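    // Each iteration of this loop performs four of the 80 SHA-1 rounds, so
    // 20 iterations cover the whole block.  The round function follows the
    // standard schedule: Ch (sha1c) for rounds 0-19, Parity (sha1p) for
    // 20-39 and 60-79, Maj (sha1m) for 40-59, which is what the
    // round < 5 / (round < 10 || round >= 15) selection below encodes.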
3051     for (int round = 0; round < 20; round++) {
3052       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3053       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3054       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3055       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3056       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3057 
3058       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3059       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3060       __ sha1h(tmp2, __ T4S, v20);
3061       if (round < 5)
3062         __ sha1c(v20, __ T4S, tmp3, tmp4);
3063       else if (round < 10 || round >= 15)
3064         __ sha1p(v20, __ T4S, tmp3, tmp4);
3065       else
3066         __ sha1m(v20, __ T4S, tmp3, tmp4);
3067       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3068 
3069       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3070     }
3071 
3072     __ addv(v7, __ T2S, v7, v21);
3073     __ addv(v6, __ T4S, v6, v20);
3074 
3075     if (multi_block) {
3076       __ add(ofs, ofs, 64);
3077       __ cmp(ofs, limit);
3078       __ br(Assembler::LE, sha1_loop);
3079       __ mov(c_rarg0, ofs); // return ofs
3080     }
3081 
3082     __ strq(v6, Address(state, 0));
3083     __ strs(v7, Address(state, 16));
3084 
3085     __ ret(lr);
3086 
3087     __ bind(keys);
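    // The four SHA-1 round constants K0..K3, one per 20-round group.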
3088     __ emit_int32(0x5a827999);
3089     __ emit_int32(0x6ed9eba1);
3090     __ emit_int32(0x8f1bbcdc);
3091     __ emit_int32(0xca62c1d6);
3092 
3093     return start;
3094   }
3095 
3096 
3097   // Arguments:
3098   //
3099   // Inputs:
3100   //   c_rarg0   - byte[]  source+offset
3101   //   c_rarg1   - int[]   SHA.state
3102   //   c_rarg2   - int     offset
3103   //   c_rarg3   - int     limit
3104   //
3105   address generate_sha256_implCompress(bool multi_block, const char *name) {
3106     static const uint32_t round_consts[64] = {
3107       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3108       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3109       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3110       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3111       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3112       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3113       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3114       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3115       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3116       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3117       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3118       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3119       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3120       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3121       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3122       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3123     };
3124     __ align(CodeEntryAlignment);
3125     StubCodeMark mark(this, "StubRoutines", name);
3126     address start = __ pc();
3127 
3128     Register buf   = c_rarg0;
3129     Register state = c_rarg1;
3130     Register ofs   = c_rarg2;
3131     Register limit = c_rarg3;
3132 
3133     Label sha1_loop;
3134 
3135     __ stpd(v8, v9, __ pre(sp, -32));
3136     __ stpd(v10, v11, Address(sp, 16));
3137 
3138 // dga == v0
3139 // dgb == v1
3140 // dg0 == v2
3141 // dg1 == v3
3142 // dg2 == v4
3143 // t0 == v6
3144 // t1 == v7
3145 
3146     // load the 64 round constants into v16..v31, four per register
3147     __ lea(rscratch1, ExternalAddress((address)round_consts));
3148     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3149     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3150     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3151     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3152 
3153     // load 8 words (256 bits) state
3154     __ ldpq(v0, v1, state);
3155 
3156     __ BIND(sha1_loop);
3157     // load 64 bytes of data into v8..v11
3158     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3159     __ rev32(v8, __ T16B, v8);
3160     __ rev32(v9, __ T16B, v9);
3161     __ rev32(v10, __ T16B, v10);
3162     __ rev32(v11, __ T16B, v11);
3163 
3164     __ addv(v6, __ T4S, v8, v16);
3165     __ orr(v2, __ T16B, v0, v0);
3166     __ orr(v3, __ T16B, v1, v1);
3167 
3168     FloatRegister d0 = v8;
3169     FloatRegister d1 = v9;
3170     FloatRegister d2 = v10;
3171     FloatRegister d3 = v11;
3172 
3173 
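    // Each iteration performs four of the 64 SHA-256 rounds via
    // sha256h/sha256h2, so the loop runs 16 times.  Message schedule
    // expansion (sha256su0/sha256su1) is only needed for the first 12
    // iterations.  The round constants were preloaded into v16..v31 above;
    // v16 was consumed by the addv before the loop, and
    // as_FloatRegister(round + 17) picks v17..v31 for the remaining groups.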
3174     for (int round = 0; round < 16; round++) {
3175       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3176       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3177       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3178       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3179 
3180       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3181        __ orr(v4, __ T16B, v2, v2);
3182       if (round < 15)
3183         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3184       __ sha256h(v2, __ T4S, v3, tmp2);
3185       __ sha256h2(v3, __ T4S, v4, tmp2);
3186       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3187 
3188       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3189     }
3190 
3191     __ addv(v0, __ T4S, v0, v2);
3192     __ addv(v1, __ T4S, v1, v3);
3193 
3194     if (multi_block) {
3195       __ add(ofs, ofs, 64);
3196       __ cmp(ofs, limit);
3197       __ br(Assembler::LE, sha1_loop);
3198       __ mov(c_rarg0, ofs); // return ofs
3199     }
3200 
3201     __ ldpd(v10, v11, Address(sp, 16));
3202     __ ldpd(v8, v9, __ post(sp, 32));
3203 
3204     __ stpq(v0, v1, state);
3205 
3206     __ ret(lr);
3207 
3208     return start;
3209   }
3210 
3211 #ifndef BUILTIN_SIM
3212   // Safefetch stubs.
3213   void generate_safefetch(const char* name, int size, address* entry,
3214                           address* fault_pc, address* continuation_pc) {
3215     // safefetch signatures:
3216     //   int      SafeFetch32(int*      adr, int      errValue);
3217     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3218     //
3219     // arguments:
3220     //   c_rarg0 = adr
3221     //   c_rarg1 = errValue
3222     //
3223     // result:
3224     //   r0       = *adr or errValue
3225 
3226     StubCodeMark mark(this, "StubRoutines", name);
3227 
3228     // Entry point, pc or function descriptor.
3229     *entry = __ pc();
3230 
3231     // Load *adr into c_rarg1, may fault.
3232     *fault_pc = __ pc();
3233     switch (size) {
3234       case 4:
3235         // int32_t
3236         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3237         break;
3238       case 8:
3239         // int64_t
3240         __ ldr(c_rarg1, Address(c_rarg0, 0));
3241         break;
3242       default:
3243         ShouldNotReachHere();
3244     }
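    // If the load above faults, the signal handler (set up elsewhere in the
    // VM) recognises *fault_pc and resumes execution at *continuation_pc
    // with c_rarg1 still holding errValue; otherwise c_rarg1 now holds *adr.
    // Either way we just copy it into the return register.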
3245 
3246     // return errValue or *adr
3247     *continuation_pc = __ pc();
3248     __ mov(r0, c_rarg1);
3249     __ ret(lr);
3250   }
3251 #endif
3252 
3253   /**
3254    *  Arguments:
3255    *
3256    * Inputs:
3257    *   c_rarg0   - int crc
3258    *   c_rarg1   - byte* buf
3259    *   c_rarg2   - int length
3260    *
3261    * Output:
3262    *       r0   - int crc result
3263    *
3264    * Preserves:
3265    *       r13
3266    *
3267    */
3268   address generate_updateBytesCRC32() {
3269     assert(UseCRC32Intrinsics, "what are we doing here?");
3270 
3271     __ align(CodeEntryAlignment);
3272     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3273 
3274     address start = __ pc();
3275 
3276     const Register crc   = c_rarg0;  // crc
3277     const Register buf   = c_rarg1;  // source java byte array address
3278     const Register len   = c_rarg2;  // length
3279     const Register table0 = c_rarg3; // crc_table address
3280     const Register table1 = c_rarg4;
3281     const Register table2 = c_rarg5;
3282     const Register table3 = c_rarg6;
3283     const Register tmp3 = c_rarg7;
3284 
3285     BLOCK_COMMENT("Entry:");
3286     __ enter(); // required for proper stackwalking of RuntimeStub frame
3287 
3288     __ kernel_crc32(crc, buf, len,
3289               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3290 
3291     __ leave(); // required for proper stackwalking of RuntimeStub frame
3292     __ ret(lr);
3293 
3294     return start;
3295   }
3296 
3297   /**
3298    *  Arguments:
3299    *
3300    *  Input:
3301    *    c_rarg0   - x address
3302    *    c_rarg1   - x length
3303    *    c_rarg2   - y address
3304    *    c_rarg3   - y length
3305    *    c_rarg4   - z address
3306    *    c_rarg5   - z length
3307    */
3308   address generate_multiplyToLen() {
3309     __ align(CodeEntryAlignment);
3310     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3311 
3312     address start = __ pc();
3313     const Register x     = r0;
3314     const Register xlen  = r1;
3315     const Register y     = r2;
3316     const Register ylen  = r3;
3317     const Register z     = r4;
3318     const Register zlen  = r5;
3319 
3320     const Register tmp1  = r10;
3321     const Register tmp2  = r11;
3322     const Register tmp3  = r12;
3323     const Register tmp4  = r13;
3324     const Register tmp5  = r14;
3325     const Register tmp6  = r15;
3326     const Register tmp7  = r16;
3327 
3328     BLOCK_COMMENT("Entry:");
3329     __ enter(); // required for proper stackwalking of RuntimeStub frame
3330     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3331     __ leave(); // required for proper stackwalking of RuntimeStub frame
3332     __ ret(lr);
3333 
3334     return start;
3335   }
3336 
3337   // Continuation point for throwing of implicit exceptions that are
3338   // not handled in the current activation. Fabricates an exception
3339   // oop and initiates normal exception dispatching in this
3340   // frame. Since we need to preserve callee-saved values (currently
3341   // only for C2, but done for C1 as well) we need a callee-saved oop
3342   // map and therefore have to make these stubs into RuntimeStubs
3343   // rather than BufferBlobs.  If the compiler needs all registers to
3344   // be preserved between the fault point and the exception handler
3345   // then it must assume responsibility for that in
3346   // AbstractCompiler::continuation_for_implicit_null_exception or
3347   // continuation_for_implicit_division_by_zero_exception. All other
3348   // implicit exceptions (e.g., NullPointerException or
3349   // AbstractMethodError on entry) are either at call sites or
3350   // otherwise assume that stack unwinding will be initiated, so
3351   // caller saved registers were assumed volatile in the compiler.
3352 
3353 #undef __
3354 #define __ masm->
3355 
3356   address generate_throw_exception(const char* name,
3357                                    address runtime_entry,
3358                                    Register arg1 = noreg,
3359                                    Register arg2 = noreg) {
3360     // Information about frame layout at time of blocking runtime call.
3361     // Note that we only have to preserve callee-saved registers since
3362     // the compilers are responsible for supplying a continuation point
3363     // if they expect all registers to be preserved.
3364     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
3365     enum layout {
3366       rfp_off = 0,
3367       rfp_off2,
3368       return_off,
3369       return_off2,
3370       framesize // inclusive of return address
3371     };
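    // Each *_off above is a 32-bit slot, so the saved rfp and the return
    // address each take two slots and framesize comes to four slots (16 bytes).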
3372 
3373     int insts_size = 512;
3374     int locs_size  = 64;
3375 
3376     CodeBuffer code(name, insts_size, locs_size);
3377     OopMapSet* oop_maps  = new OopMapSet();
3378     MacroAssembler* masm = new MacroAssembler(&code);
3379 
3380     address start = __ pc();
3381 
3382     // This is an inlined and slightly modified version of call_VM
3383     // which has the ability to fetch the return PC out of
3384     // thread-local storage and also sets up last_Java_sp slightly
3385     // differently than the real call_VM
3386 
3387     __ enter(); // Save FP and LR before call
3388 
3389     assert(is_even(framesize/2), "sp not 16-byte aligned");
3390 
3391     // lr and fp are already in place
3392     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
3393 
3394     int frame_complete = __ pc() - start;
3395 
3396     // Set up last_Java_sp and last_Java_fp
3397     address the_pc = __ pc();
3398     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
3399 
3400     // Call runtime
3401     if (arg1 != noreg) {
3402       assert(arg2 != c_rarg1, "clobbered");
3403       __ mov(c_rarg1, arg1);
3404     }
3405     if (arg2 != noreg) {
3406       __ mov(c_rarg2, arg2);
3407     }
3408     __ mov(c_rarg0, rthread);
3409     BLOCK_COMMENT("call runtime_entry");
3410     __ mov(rscratch1, runtime_entry);
3411     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
3412 
3413     // Generate oop map
3414     OopMap* map = new OopMap(framesize, 0);
3415 
3416     oop_maps->add_gc_map(the_pc - start, map);
3417 
3418     __ reset_last_Java_frame(true);
3419     __ maybe_isb();
3420 
3421     __ leave();
3422 
3423     // check for pending exceptions
3424 #ifdef ASSERT
3425     Label L;
3426     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
3427     __ cbnz(rscratch1, L);
3428     __ should_not_reach_here();
3429     __ bind(L);
3430 #endif // ASSERT
3431     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3432 
3433 
3434     // codeBlob framesize is in words (not VMRegImpl::slot_size)
3435     RuntimeStub* stub =
3436       RuntimeStub::new_runtime_stub(name,
3437                                     &code,
3438                                     frame_complete,
3439                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3440                                     oop_maps, false);
3441     return stub->entry_point();
3442   }
3443 
3444   class MontgomeryMultiplyGenerator : public MacroAssembler {
3445 
3446     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3447       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
3448 
3449     RegSet _toSave;
3450     bool _squaring;
3451 
3452   public:
3453     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3454       : MacroAssembler(as->code()), _squaring(squaring) {
3455 
3456       // Register allocation
3457 
3458       Register reg = c_rarg0;
3459       Pa_base = reg;       // Argument registers
3460       if (squaring)
3461         Pb_base = Pa_base;
3462       else
3463         Pb_base = ++reg;
3464       Pn_base = ++reg;
3465       Rlen= ++reg;
3466       inv = ++reg;
3467       Pm_base = ++reg;
3468 
3469                           // Working registers:
3470       Ra =  ++reg;        // The current digit of a, b, n, and m.
3471       Rb =  ++reg;
3472       Rm =  ++reg;
3473       Rn =  ++reg;
3474 
3475       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
3476       Pb =  ++reg;
3477       Pm =  ++reg;
3478       Pn =  ++reg;
3479 
3480       t0 =  ++reg;        // Three registers which form a
3481       t1 =  ++reg;        // triple-precision accumulator.
3482       t2 =  ++reg;
3483 
3484       Ri =  ++reg;        // Inner and outer loop indexes.
3485       Rj =  ++reg;
3486 
3487       Rhi_ab = ++reg;     // Product registers: low and high parts
3488       Rlo_ab = ++reg;     // of a*b and m*n.
3489       Rhi_mn = ++reg;
3490       Rlo_mn = ++reg;
3491 
3492       // r19 and up are callee-saved.
3493       _toSave = RegSet::range(r19, reg) + Pm_base;
3494     }
3495 
3496   private:
3497     void save_regs() {
3498       push(_toSave, sp);
3499     }
3500 
3501     void restore_regs() {
3502       pop(_toSave, sp);
3503     }
3504 
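    // Emit the loop body twice per unrolled iteration; an odd count enters
    // at the second copy (via the tbnz on the low bit), and a zero count
    // skips the loop entirely.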
3505     template <typename T>
3506     void unroll_2(Register count, T block) {
3507       Label loop, end, odd;
3508       tbnz(count, 0, odd);
3509       cbz(count, end);
3510       align(16);
3511       bind(loop);
3512       (this->*block)();
3513       bind(odd);
3514       (this->*block)();
3515       subs(count, count, 2);
3516       br(Assembler::GT, loop);
3517       bind(end);
3518     }
3519 
3520     template <typename T>
3521     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3522       Label loop, end, odd;
3523       tbnz(count, 0, odd);
3524       cbz(count, end);
3525       align(16);
3526       bind(loop);
3527       (this->*block)(d, s, tmp);
3528       bind(odd);
3529       (this->*block)(d, s, tmp);
3530       subs(count, count, 2);
3531       br(Assembler::GT, loop);
3532       bind(end);
3533     }
3534 
3535     void pre1(RegisterOrConstant i) {
3536       block_comment("pre1");
3537       // Pa = Pa_base;
3538       // Pb = Pb_base + i;
3539       // Pm = Pm_base;
3540       // Pn = Pn_base + i;
3541       // Ra = *Pa;
3542       // Rb = *Pb;
3543       // Rm = *Pm;
3544       // Rn = *Pn;
3545       ldr(Ra, Address(Pa_base));
3546       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3547       ldr(Rm, Address(Pm_base));
3548       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3549       lea(Pa, Address(Pa_base));
3550       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3551       lea(Pm, Address(Pm_base));
3552       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3553 
3554       // Zero the m*n result.
3555       mov(Rhi_mn, zr);
3556       mov(Rlo_mn, zr);
3557     }
3558 
3559     // The core multiply-accumulate step of a Montgomery
3560     // multiplication.  The idea is to schedule operations as a
3561     // pipeline so that instructions with long latencies (loads and
3562     // multiplies) have time to complete before their results are
3563     // used.  This most benefits in-order implementations of the
3564     // architecture but out-of-order ones also benefit.
3565     void step() {
3566       block_comment("step");
3567       // MACC(Ra, Rb, t0, t1, t2);
3568       // Ra = *++Pa;
3569       // Rb = *--Pb;
3570       umulh(Rhi_ab, Ra, Rb);
3571       mul(Rlo_ab, Ra, Rb);
3572       ldr(Ra, pre(Pa, wordSize));
3573       ldr(Rb, pre(Pb, -wordSize));
3574       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
3575                                        // previous iteration.
3576       // MACC(Rm, Rn, t0, t1, t2);
3577       // Rm = *++Pm;
3578       // Rn = *--Pn;
3579       umulh(Rhi_mn, Rm, Rn);
3580       mul(Rlo_mn, Rm, Rn);
3581       ldr(Rm, pre(Pm, wordSize));
3582       ldr(Rn, pre(Pn, -wordSize));
3583       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3584     }
3585 
3586     void post1() {
3587       block_comment("post1");
3588 
3589       // MACC(Ra, Rb, t0, t1, t2);
3590       // Ra = *++Pa;
3591       // Rb = *--Pb;
3592       umulh(Rhi_ab, Ra, Rb);
3593       mul(Rlo_ab, Ra, Rb);
3594       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3595       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3596 
3597       // *Pm = Rm = t0 * inv;
3598       mul(Rm, t0, inv);
3599       str(Rm, Address(Pm));
3600 
3601       // MACC(Rm, Rn, t0, t1, t2);
3602       // t0 = t1; t1 = t2; t2 = 0;
3603       umulh(Rhi_mn, Rm, Rn);
3604 
3605 #ifndef PRODUCT
3606       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
3607       {
3608         mul(Rlo_mn, Rm, Rn);
3609         add(Rlo_mn, t0, Rlo_mn);
3610         Label ok;
3611         cbz(Rlo_mn, ok); {
3612           stop("broken Montgomery multiply");
3613         } bind(ok);
3614       }
3615 #endif
3616       // We have very carefully set things up so that
3617       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
3618       // the lower half of Rm * Rn because we know the result already:
3619       // it must be -t0.  t0 + (-t0) must generate a carry iff
3620       // t0 != 0.  So, rather than do a mul and an adds we just set
3621       // the carry flag iff t0 is nonzero.
3622       //
3623       // mul(Rlo_mn, Rm, Rn);
3624       // adds(zr, t0, Rlo_mn);
3625       subs(zr, t0, 1); // Set carry iff t0 is nonzero
3626       adcs(t0, t1, Rhi_mn);
3627       adc(t1, t2, zr);
3628       mov(t2, zr);
3629     }
3630 
3631     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
3632       block_comment("pre2");
3633       // Pa = Pa_base + i-len;
3634       // Pb = Pb_base + len;
3635       // Pm = Pm_base + i-len;
3636       // Pn = Pn_base + len;
3637 
3638       if (i.is_register()) {
3639         sub(Rj, i.as_register(), len);
3640       } else {
3641         mov(Rj, i.as_constant());
3642         sub(Rj, Rj, len);
3643       }
3644       // Rj == i-len
3645 
3646       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
3647       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
3648       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
3649       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
3650 
3651       // Ra = *++Pa;
3652       // Rb = *--Pb;
3653       // Rm = *++Pm;
3654       // Rn = *--Pn;
3655       ldr(Ra, pre(Pa, wordSize));
3656       ldr(Rb, pre(Pb, -wordSize));
3657       ldr(Rm, pre(Pm, wordSize));
3658       ldr(Rn, pre(Pn, -wordSize));
3659 
3660       mov(Rhi_mn, zr);
3661       mov(Rlo_mn, zr);
3662     }
3663 
3664     void post2(RegisterOrConstant i, RegisterOrConstant len) {
3665       block_comment("post2");
3666       if (i.is_constant()) {
3667         mov(Rj, i.as_constant()-len.as_constant());
3668       } else {
3669         sub(Rj, i.as_register(), len);
3670       }
3671 
3672       adds(t0, t0, Rlo_mn); // The pending m*n, low part
3673 
3674       // As soon as we know the least significant digit of our result,
3675       // store it.
3676       // Pm_base[i-len] = t0;
3677       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
3678 
3679       // t0 = t1; t1 = t2; t2 = 0;
3680       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
3681       adc(t1, t2, zr);
3682       mov(t2, zr);
3683     }
3684 
3685     // A carry in t0 after Montgomery multiplication means that we
3686     // should subtract multiples of n from our result in m.  We'll
3687     // keep doing that until there is no carry.
3688     void normalize(RegisterOrConstant len) {
3689       block_comment("normalize");
3690       // while (t0)
3691       //   t0 = sub(Pm_base, Pn_base, t0, len);
3692       Label loop, post, again;
3693       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
3694       cbz(t0, post); {
3695         bind(again); {
3696           mov(i, zr);
3697           mov(cnt, len);
3698           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3699           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3700           subs(zr, zr, zr); // set carry flag, i.e. no borrow
3701           align(16);
3702           bind(loop); {
3703             sbcs(Rm, Rm, Rn);
3704             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3705             add(i, i, 1);
3706             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3707             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3708             sub(cnt, cnt, 1);
3709           } cbnz(cnt, loop);
3710           sbc(t0, t0, zr);
3711         } cbnz(t0, again);
3712       } bind(post);
3713     }
3714 
3715     // Move memory at s to d, reversing words.
3716     //    Increments d to end of copied memory
3717     //    Destroys tmp1, tmp2
3718     //    Preserves len
3719     //    Leaves s pointing to the address which was in d at start
3720     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
3721       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
3722 
3723       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
3724       mov(tmp1, len);
3725       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
3726       sub(s, d, len, ext::uxtw, LogBytesPerWord);
3727     }
3728     // where reverse1 is:
3729     void reverse1(Register d, Register s, Register tmp) {
3730       ldr(tmp, pre(s, -wordSize));
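      // ror by 32 swaps the two 32-bit halves of each 64-bit word as it is
      // copied, converting between the caller's int ordering and the
      // longword ordering used internally.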
3731       ror(tmp, tmp, 32);
3732       str(tmp, post(d, wordSize));
3733     }
3734 
3735     void step_squaring() {
3736       // An extra ACC
3737       step();
3738       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3739     }
3740 
3741     void last_squaring(RegisterOrConstant i) {
3742       Label dont;
3743       // if ((i & 1) == 0) {
3744       tbnz(i.as_register(), 0, dont); {
3745         // MACC(Ra, Rb, t0, t1, t2);
3746         // Ra = *++Pa;
3747         // Rb = *--Pb;
3748         umulh(Rhi_ab, Ra, Rb);
3749         mul(Rlo_ab, Ra, Rb);
3750         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3751       } bind(dont);
3752     }
3753 
3754     void extra_step_squaring() {
3755       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3756 
3757       // MACC(Rm, Rn, t0, t1, t2);
3758       // Rm = *++Pm;
3759       // Rn = *--Pn;
3760       umulh(Rhi_mn, Rm, Rn);
3761       mul(Rlo_mn, Rm, Rn);
3762       ldr(Rm, pre(Pm, wordSize));
3763       ldr(Rn, pre(Pn, -wordSize));
3764     }
3765 
3766     void post1_squaring() {
3767       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3768 
3769       // *Pm = Rm = t0 * inv;
3770       mul(Rm, t0, inv);
3771       str(Rm, Address(Pm));
3772 
3773       // MACC(Rm, Rn, t0, t1, t2);
3774       // t0 = t1; t1 = t2; t2 = 0;
3775       umulh(Rhi_mn, Rm, Rn);
3776 
3777 #ifndef PRODUCT
3778       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
3779       {
3780         mul(Rlo_mn, Rm, Rn);
3781         add(Rlo_mn, t0, Rlo_mn);
3782         Label ok;
3783         cbz(Rlo_mn, ok); {
3784           stop("broken Montgomery multiply");
3785         } bind(ok);
3786       }
3787 #endif
3788       // We have very carefully set things up so that
3789       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
3790       // the lower half of Rm * Rn because we know the result already:
3791       // it must be -t0.  t0 + (-t0) must generate a carry iff
3792       // t0 != 0.  So, rather than do a mul and an adds we just set
3793       // the carry flag iff t0 is nonzero.
3794       //
3795       // mul(Rlo_mn, Rm, Rn);
3796       // adds(zr, t0, Rlo_mn);
3797       subs(zr, t0, 1); // Set carry iff t0 is nonzero
3798       adcs(t0, t1, Rhi_mn);
3799       adc(t1, t2, zr);
3800       mov(t2, zr);
3801     }
3802 
3803     void acc(Register Rhi, Register Rlo,
3804              Register t0, Register t1, Register t2) {
3805       adds(t0, t0, Rlo);
3806       adcs(t1, t1, Rhi);
3807       adc(t2, t2, zr);
3808     }
3809 
3810   public:
3811     /**
3812      * Fast Montgomery multiplication.  The derivation of the
3813      * algorithm is in A Cryptographic Library for the Motorola
3814      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3815      *
3816      * Arguments:
3817      *
3818      * Inputs for multiplication:
3819      *   c_rarg0   - int array elements a
3820      *   c_rarg1   - int array elements b
3821      *   c_rarg2   - int array elements n (the modulus)
3822      *   c_rarg3   - int length
3823      *   c_rarg4   - int inv
3824      *   c_rarg5   - int array elements m (the result)
3825      *
3826      * Inputs for squaring:
3827      *   c_rarg0   - int array elements a
3828      *   c_rarg1   - int array elements n (the modulus)
3829      *   c_rarg2   - int length
3830      *   c_rarg3   - int inv
3831      *   c_rarg4   - int array elements m (the result)
3832      *
3833      */
3834     address generate_multiply() {
3835       Label argh, nothing;
3836       bind(argh);
3837       stop("MontgomeryMultiply total_allocation must be <= 8192");
3838 
3839       align(CodeEntryAlignment);
3840       address entry = pc();
3841 
3842       cbzw(Rlen, nothing);
3843 
3844       enter();
3845 
3846       // Make room.
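      // Rlen is the length in ints; the scratch area carved out below sp is
      // Rlen * 16 bytes, so capping Rlen at 512 keeps the total allocation
      // within the 8192 bytes promised by the stop() message above.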
3847       cmpw(Rlen, 512);
3848       br(Assembler::HI, argh);
3849       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
3850       andr(sp, Ra, -2 * wordSize);
3851 
3852       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
3853 
3854       {
3855         // Copy input args, reversing as we go.  We use Ra as a
3856         // temporary variable.
3857         reverse(Ra, Pa_base, Rlen, t0, t1);
3858         if (!_squaring)
3859           reverse(Ra, Pb_base, Rlen, t0, t1);
3860         reverse(Ra, Pn_base, Rlen, t0, t1);
3861       }
3862 
3863       // Push all call-saved registers and also Pm_base which we'll need
3864       // at the end.
3865       save_regs();
3866 
3867 #ifndef PRODUCT
3868       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3869       {
3870         ldr(Rn, Address(Pn_base, 0));
3871         mul(Rlo_mn, Rn, inv);
3872         cmp(Rlo_mn, -1);
3873         Label ok;
3874         br(EQ, ok); {
3875           stop("broken inverse in Montgomery multiply");
3876         } bind(ok);
3877       }
3878 #endif
3879 
3880       mov(Pm_base, Ra);
3881 
3882       mov(t0, zr);
3883       mov(t1, zr);
3884       mov(t2, zr);
3885 
3886       block_comment("for (int i = 0; i < len; i++) {");
3887       mov(Ri, zr); {
3888         Label loop, end;
3889         cmpw(Ri, Rlen);
3890         br(Assembler::GE, end);
3891 
3892         bind(loop);
3893         pre1(Ri);
3894 
3895         block_comment("  for (j = i; j; j--) {"); {
3896           movw(Rj, Ri);
3897           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3898         } block_comment("  } // j");
3899 
3900         post1();
3901         addw(Ri, Ri, 1);
3902         cmpw(Ri, Rlen);
3903         br(Assembler::LT, loop);
3904         bind(end);
3905         block_comment("} // i");
3906       }
3907 
3908       block_comment("for (int i = len; i < 2*len; i++) {");
3909       mov(Ri, Rlen); {
3910         Label loop, end;
3911         cmpw(Ri, Rlen, Assembler::LSL, 1);
3912         br(Assembler::GE, end);
3913 
3914         bind(loop);
3915         pre2(Ri, Rlen);
3916 
3917         block_comment("  for (j = len*2-i-1; j; j--) {"); {
3918           lslw(Rj, Rlen, 1);
3919           subw(Rj, Rj, Ri);
3920           subw(Rj, Rj, 1);
3921           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3922         } block_comment("  } // j");
3923 
3924         post2(Ri, Rlen);
3925         addw(Ri, Ri, 1);
3926         cmpw(Ri, Rlen, Assembler::LSL, 1);
3927         br(Assembler::LT, loop);
3928         bind(end);
3929       }
3930       block_comment("} // i");
3931 
3932       normalize(Rlen);
3933 
3934       mov(Ra, Pm_base);  // Save Pm_base in Ra
3935       restore_regs();  // Restore caller's Pm_base
3936 
3937       // Copy our result into caller's Pm_base
3938       reverse(Pm_base, Ra, Rlen, t0, t1);
3939 
3940       leave();
3941       bind(nothing);
3942       ret(lr);
3943 
3944       return entry;
3945     }
3946     // In C, approximately:
3947 
3948     // void
3949     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
3950     //                     unsigned long Pn_base[], unsigned long Pm_base[],
3951     //                     unsigned long inv, int len) {
3952     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3953     //   unsigned long *Pa, *Pb, *Pn, *Pm;
3954     //   unsigned long Ra, Rb, Rn, Rm;
3955 
3956     //   int i;
3957 
3958     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
3959 
3960     //   for (i = 0; i < len; i++) {
3961     //     int j;
3962 
3963     //     Pa = Pa_base;
3964     //     Pb = Pb_base + i;
3965     //     Pm = Pm_base;
3966     //     Pn = Pn_base + i;
3967 
3968     //     Ra = *Pa;
3969     //     Rb = *Pb;
3970     //     Rm = *Pm;
3971     //     Rn = *Pn;
3972 
3973     //     int iters = i;
3974     //     for (j = 0; iters--; j++) {
3975     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
3976     //       MACC(Ra, Rb, t0, t1, t2);
3977     //       Ra = *++Pa;
3978     //       Rb = *--Pb;
3979     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
3980     //       MACC(Rm, Rn, t0, t1, t2);
3981     //       Rm = *++Pm;
3982     //       Rn = *--Pn;
3983     //     }
3984 
3985     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
3986     //     MACC(Ra, Rb, t0, t1, t2);
3987     //     *Pm = Rm = t0 * inv;
3988     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
3989     //     MACC(Rm, Rn, t0, t1, t2);
3990 
3991     //     assert(t0 == 0, "broken Montgomery multiply");
3992 
3993     //     t0 = t1; t1 = t2; t2 = 0;
3994     //   }
3995 
3996     //   for (i = len; i < 2*len; i++) {
3997     //     int j;
3998 
3999     //     Pa = Pa_base + i-len;
4000     //     Pb = Pb_base + len;
4001     //     Pm = Pm_base + i-len;
4002     //     Pn = Pn_base + len;
4003 
4004     //     Ra = *++Pa;
4005     //     Rb = *--Pb;
4006     //     Rm = *++Pm;
4007     //     Rn = *--Pn;
4008 
4009     //     int iters = len*2-i-1;
4010     //     for (j = i-len+1; iters--; j++) {
4011     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4012     //       MACC(Ra, Rb, t0, t1, t2);
4013     //       Ra = *++Pa;
4014     //       Rb = *--Pb;
4015     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4016     //       MACC(Rm, Rn, t0, t1, t2);
4017     //       Rm = *++Pm;
4018     //       Rn = *--Pn;
4019     //     }
4020 
4021     //     Pm_base[i-len] = t0;
4022     //     t0 = t1; t1 = t2; t2 = 0;
4023     //   }
4024 
4025     //   while (t0)
4026     //     t0 = sub(Pm_base, Pn_base, t0, len);
4027     // }
4028 
4029     /**
4030      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
4031      * multiplies than Montgomery multiplication so it should be up to
4032      * 25% faster.  However, its loop control is more complex and it
4033      * may actually run slower on some machines.
4034      *
4035      * Arguments:
4036      *
4037      * Inputs:
4038      *   c_rarg0   - int array elements a
4039      *   c_rarg1   - int array elements n (the modulus)
4040      *   c_rarg2   - int length
4041      *   c_rarg3   - int inv
4042      *   c_rarg4   - int array elements m (the result)
4043      *
4044      */
4045     address generate_square() {
4046       Label argh;
4047       bind(argh);
4048       stop("MontgomeryMultiply total_allocation must be <= 8192");
4049 
4050       align(CodeEntryAlignment);
4051       address entry = pc();
4052 
4053       enter();
4054 
4055       // Make room.
4056       cmpw(Rlen, 512);
4057       br(Assembler::HI, argh);
4058       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4059       andr(sp, Ra, -2 * wordSize);
4060 
4061       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4062 
4063       {
4064         // Copy input args, reversing as we go.  We use Ra as a
4065         // temporary variable.
4066         reverse(Ra, Pa_base, Rlen, t0, t1);
4067         reverse(Ra, Pn_base, Rlen, t0, t1);
4068       }
4069 
4070       // Push all call-saved registers and also Pm_base which we'll need
4071       // at the end.
4072       save_regs();
4073 
4074       mov(Pm_base, Ra);
4075 
4076       mov(t0, zr);
4077       mov(t1, zr);
4078       mov(t2, zr);
4079 
4080       block_comment("for (int i = 0; i < len; i++) {");
4081       mov(Ri, zr); {
4082         Label loop, end;
4083         bind(loop);
4084         cmp(Ri, Rlen);
4085         br(Assembler::GE, end);
4086 
4087         pre1(Ri);
4088 
4089         block_comment("for (j = (i+1)/2; j; j--) {"); {
4090           add(Rj, Ri, 1);
4091           lsr(Rj, Rj, 1);
4092           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4093         } block_comment("  } // j");
4094 
4095         last_squaring(Ri);
4096 
4097         block_comment("  for (j = i/2; j; j--) {"); {
4098           lsr(Rj, Ri, 1);
4099           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4100         } block_comment("  } // j");
4101 
4102         post1_squaring();
4103         add(Ri, Ri, 1);
4104         cmp(Ri, Rlen);
4105         br(Assembler::LT, loop);
4106 
4107         bind(end);
4108         block_comment("} // i");
4109       }
4110 
4111       block_comment("for (int i = len; i < 2*len; i++) {");
4112       mov(Ri, Rlen); {
4113         Label loop, end;
4114         bind(loop);
4115         cmp(Ri, Rlen, Assembler::LSL, 1);
4116         br(Assembler::GE, end);
4117 
4118         pre2(Ri, Rlen);
4119 
4120         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4121           lsl(Rj, Rlen, 1);
4122           sub(Rj, Rj, Ri);
4123           sub(Rj, Rj, 1);
4124           lsr(Rj, Rj, 1);
4125           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4126         } block_comment("  } // j");
4127 
4128         last_squaring(Ri);
4129 
4130         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4131           lsl(Rj, Rlen, 1);
4132           sub(Rj, Rj, Ri);
4133           lsr(Rj, Rj, 1);
4134           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4135         } block_comment("  } // j");
4136 
4137         post2(Ri, Rlen);
4138         add(Ri, Ri, 1);
4139         cmp(Ri, Rlen, Assembler::LSL, 1);
4140 
4141         br(Assembler::LT, loop);
4142         bind(end);
4143         block_comment("} // i");
4144       }
4145 
4146       normalize(Rlen);
4147 
4148       mov(Ra, Pm_base);  // Save Pm_base in Ra
4149       restore_regs();  // Restore caller's Pm_base
4150 
4151       // Copy our result into caller's Pm_base
4152       reverse(Pm_base, Ra, Rlen, t0, t1);
4153 
4154       leave();
4155       ret(lr);
4156 
4157       return entry;
4158     }
4159     // In C, approximately:
4160 
4161     // void
4162     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4163     //                   unsigned long Pm_base[], unsigned long inv, int len) {
4164     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4165     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4166     //   unsigned long Ra, Rb, Rn, Rm;
4167 
4168     //   int i;
4169 
4170     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4171 
4172     //   for (i = 0; i < len; i++) {
4173     //     int j;
4174 
4175     //     Pa = Pa_base;
4176     //     Pb = Pa_base + i;
4177     //     Pm = Pm_base;
4178     //     Pn = Pn_base + i;
4179 
4180     //     Ra = *Pa;
4181     //     Rb = *Pb;
4182     //     Rm = *Pm;
4183     //     Rn = *Pn;
4184 
4185     //     int iters = (i+1)/2;
4186     //     for (j = 0; iters--; j++) {
4187     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4188     //       MACC2(Ra, Rb, t0, t1, t2);
4189     //       Ra = *++Pa;
4190     //       Rb = *--Pb;
4191     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4192     //       MACC(Rm, Rn, t0, t1, t2);
4193     //       Rm = *++Pm;
4194     //       Rn = *--Pn;
4195     //     }
4196     //     if ((i & 1) == 0) {
4197     //       assert(Ra == Pa_base[j], "must be");
4198     //       MACC(Ra, Ra, t0, t1, t2);
4199     //     }
4200     //     iters = i/2;
4201     //     assert(iters == i-j, "must be");
4202     //     for (; iters--; j++) {
4203     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4204     //       MACC(Rm, Rn, t0, t1, t2);
4205     //       Rm = *++Pm;
4206     //       Rn = *--Pn;
4207     //     }
4208 
4209     //     *Pm = Rm = t0 * inv;
4210     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4211     //     MACC(Rm, Rn, t0, t1, t2);
4212 
4213     //     assert(t0 == 0, "broken Montgomery multiply");
4214 
4215     //     t0 = t1; t1 = t2; t2 = 0;
4216     //   }
4217 
4218     //   for (i = len; i < 2*len; i++) {
4219     //     int start = i-len+1;
4220     //     int end = start + (len - start)/2;
4221     //     int j;
4222 
4223     //     Pa = Pa_base + i-len;
4224     //     Pb = Pa_base + len;
4225     //     Pm = Pm_base + i-len;
4226     //     Pn = Pn_base + len;
4227 
4228     //     Ra = *++Pa;
4229     //     Rb = *--Pb;
4230     //     Rm = *++Pm;
4231     //     Rn = *--Pn;
4232 
4233     //     int iters = (2*len-i-1)/2;
4234     //     assert(iters == end-start, "must be");
4235     //     for (j = start; iters--; j++) {
4236     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4237     //       MACC2(Ra, Rb, t0, t1, t2);
4238     //       Ra = *++Pa;
4239     //       Rb = *--Pb;
4240     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4241     //       MACC(Rm, Rn, t0, t1, t2);
4242     //       Rm = *++Pm;
4243     //       Rn = *--Pn;
4244     //     }
4245     //     if ((i & 1) == 0) {
4246     //       assert(Ra == Pa_base[j], "must be");
4247     //       MACC(Ra, Ra, t0, t1, t2);
4248     //     }
4249     //     iters =  (2*len-i)/2;
4250     //     assert(iters == len-j, "must be");
4251     //     for (; iters--; j++) {
4252     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4253     //       MACC(Rm, Rn, t0, t1, t2);
4254     //       Rm = *++Pm;
4255     //       Rn = *--Pn;
4256     //     }
4257     //     Pm_base[i-len] = t0;
4258     //     t0 = t1; t1 = t2; t2 = 0;
4259     //   }
4260 
4261     //   while (t0)
4262     //     t0 = sub(Pm_base, Pn_base, t0, len);
4263     // }
4264   };
4265 
4266   // Initialization
4267   void generate_initial() {
4268     // Generate the initial stubs and initialize the entry points
4269 
4270     // entry points that exist on all platforms.  Note: this is code
4271     // that could be shared among different platforms - however the
4272     // benefit seems to be smaller than the disadvantage of having a
4273     // much more complicated generator structure. See also comment in
4274     // stubRoutines.hpp.
4275 
4276     StubRoutines::_forward_exception_entry = generate_forward_exception();
4277 
4278     StubRoutines::_call_stub_entry =
4279       generate_call_stub(StubRoutines::_call_stub_return_address);
4280 
4281     // is referenced by megamorphic call
4282     StubRoutines::_catch_exception_entry = generate_catch_exception();
4283 
4284     // Build this early so it's available for the interpreter.
4285     StubRoutines::_throw_StackOverflowError_entry =
4286       generate_throw_exception("StackOverflowError throw_exception",
4287                                CAST_FROM_FN_PTR(address,
4288                                                 SharedRuntime::
4289                                                 throw_StackOverflowError));
4290     if (UseCRC32Intrinsics) {
4291       // set the table address before generating the stubs that use it
4292       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4293       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4294     }
4295   }
4296 
4297   void generate_all() {
4298     // support for verify_oop (must happen after universe_init)
4299     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
4300     StubRoutines::_throw_AbstractMethodError_entry =
4301       generate_throw_exception("AbstractMethodError throw_exception",
4302                                CAST_FROM_FN_PTR(address,
4303                                                 SharedRuntime::
4304                                                 throw_AbstractMethodError));
4305 
4306     StubRoutines::_throw_IncompatibleClassChangeError_entry =
4307       generate_throw_exception("IncompatibleClassChangeError throw_exception",
4308                                CAST_FROM_FN_PTR(address,
4309                                                 SharedRuntime::
4310                                                 throw_IncompatibleClassChangeError));
4311 
4312     StubRoutines::_throw_NullPointerException_at_call_entry =
4313       generate_throw_exception("NullPointerException at call throw_exception",
4314                                CAST_FROM_FN_PTR(address,
4315                                                 SharedRuntime::
4316                                                 throw_NullPointerException_at_call));
4317 
4318     // arraycopy stubs used by compilers
4319     generate_arraycopy_stubs();
4320 
4321     if (UseMultiplyToLenIntrinsic) {
4322       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4323     }
4324 
4325     if (UseMontgomeryMultiplyIntrinsic) {
4326       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4327       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4328       StubRoutines::_montgomeryMultiply = g.generate_multiply();
4329     }
4330 
4331     if (UseMontgomerySquareIntrinsic) {
4332       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4333       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4334       // We use generate_multiply() rather than generate_square()
4335       // because it's faster for the sizes of modulus we care about.
4336       StubRoutines::_montgomerySquare = g.generate_multiply();
4337     }
4338 
4339     if (UseShenandoahGC && ShenandoahWriteBarrier) {
4340       StubRoutines::aarch64::_shenandoah_wb = generate_shenandoah_wb(false, true);
4341       StubRoutines::_shenandoah_wb_C = generate_shenandoah_wb(true, false);
4342     }
4343 
4344 #ifndef BUILTIN_SIM
4345     if (UseAESIntrinsics) {
4346       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4347       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4348       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4349       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
4350     }
4351 
4352     if (UseSHA1Intrinsics) {
4353       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
4354       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
4355     }
4356     if (UseSHA256Intrinsics) {
4357       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
4358       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
4359     }
4360 
4361     // Safefetch stubs.
4362     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
4363                                                        &StubRoutines::_safefetch32_fault_pc,
4364                                                        &StubRoutines::_safefetch32_continuation_pc);
4365     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
4366                                                        &StubRoutines::_safefetchN_fault_pc,
4367                                                        &StubRoutines::_safefetchN_continuation_pc);
4368 #endif
4369   }
4370 
4371  public:
4372   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
4373     if (all) {
4374       generate_all();
4375     } else {
4376       generate_initial();
4377     }
4378   }
4379 }; // end class declaration
4380 
4381 void StubGenerator_generate(CodeBuffer* code, bool all) {
4382   StubGenerator g(code, all);
4383 }