1 /*
   2  * Copyright (c) 2013, Red Hat Inc.
   3  * Copyright (c) 2003, 2011, Oracle and/or its affiliates.
   4  * All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "precompiled.hpp"
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "interpreter/interpreter.hpp"
  31 #include "nativeInst_aarch64.hpp"
  32 #include "oops/instanceOop.hpp"
  33 #include "oops/method.hpp"
  34 #include "oops/objArrayKlass.hpp"
  35 #include "oops/oop.inline.hpp"
  36 #include "prims/methodHandles.hpp"
  37 #include "runtime/frame.inline.hpp"
  38 #include "runtime/handles.inline.hpp"
  39 #include "runtime/sharedRuntime.hpp"
  40 #include "runtime/stubCodeGenerator.hpp"
  41 #include "runtime/stubRoutines.hpp"
  42 #include "runtime/thread.inline.hpp"
  43 #include "utilities/top.hpp"
  44 #ifdef COMPILER2
  45 #include "opto/runtime.hpp"
  46 #endif
  47 
  48 // Declaration and definition of StubGenerator (no .hpp file).
  49 // For a more detailed description of the stub routine structure
  50 // see the comment in stubRoutines.hpp
  51 
  52 #undef __
  53 #define __ _masm->
  54 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
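// TIMES_OOP scales an index register by the in-heap oop element size: a
// 4-byte narrowOop when UseCompressedOops is set (shift of 2), otherwise an
// 8-byte oop (shift of 3).  Illustrative use (register names assumed):
//
//   Address(base, index, TIMES_OOP)  // == base + sign-extended index * oop size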
  55 
  56 #ifdef PRODUCT
  57 #define BLOCK_COMMENT(str) /* nothing */
  58 #else
  59 #define BLOCK_COMMENT(str) __ block_comment(str)
  60 #endif
  61 
  62 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  63 
  64 // Stub Code definitions
  65 
  66 class StubGenerator: public StubCodeGenerator {
  67  private:
  68 
  69 #ifdef PRODUCT
  70 #define inc_counter_np(counter) ((void)0)
  71 #else
  72   void inc_counter_np_(int& counter) {
  73     __ lea(rscratch2, ExternalAddress((address)&counter));
  74     __ ldrw(rscratch1, Address(rscratch2));
  75     __ addw(rscratch1, rscratch1, 1);
  76     __ strw(rscratch1, Address(rscratch2));
  77   }
  78 #define inc_counter_np(counter) \
  79   BLOCK_COMMENT("inc_counter " #counter); \
  80   inc_counter_np_(counter);
  81 #endif
  82 
  83   // Call stubs are used to call Java from C
  84   //
  85   // Arguments:
  86   //    c_rarg0:   call wrapper address                   address
  87   //    c_rarg1:   result                                 address
  88   //    c_rarg2:   result type                            BasicType
  89   //    c_rarg3:   method                                 Method*
  90   //    c_rarg4:   (interpreter) entry point              address
  91   //    c_rarg5:   parameters                             intptr_t*
  92   //    c_rarg6:   parameter size (in words)              int
  93   //    c_rarg7:   thread                                 Thread*
  94   //
  95   // There is no return from the stub itself as any Java result
  96   // is written to result
  97   //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer, then install sp (r31)
  // into fp.
 101   //
 102   // we save r0-r7, which accounts for all the c arguments.
 103   //
 104   // TODO: strictly do we need to save them all? they are treated as
 105   // volatile by C so could we omit saving the ones we are going to
 106   // place in global registers (thread? method?) or those we only use
 107   // during setup of the Java call?
 108   //
 109   // we don't need to save r8 which C uses as an indirect result location
 110   // return register.
 111   //
 112   // we don't need to save r9-r15 which both C and Java treat as
 113   // volatile
 114   //
 115   // we don't need to save r16-18 because Java does not use them
 116   //
 117   // we save r19-r28 which Java uses as scratch registers and C
 118   // expects to be callee-save
 119   //
 120   // we save the bottom 64 bits of each value stored in v8-v15; it is
 121   // the responsibility of the caller to preserve larger values.
 122   //
 123   // so the stub frame looks like this when we enter Java code
 124   //
 125   //     [ return_from_Java     ] <--- sp
 126   //     [ argument word n      ]
 127   //      ...
 128   // -27 [ argument word 1      ]
 129   // -26 [ saved v15            ] <--- sp_after_call
 130   // -25 [ saved v14            ]
 131   // -24 [ saved v13            ]
 132   // -23 [ saved v12            ]
 133   // -22 [ saved v11            ]
 134   // -21 [ saved v10            ]
 135   // -20 [ saved v9             ]
 136   // -19 [ saved v8             ]
 137   // -18 [ saved r28            ]
 138   // -17 [ saved r27            ]
 139   // -16 [ saved r26            ]
 140   // -15 [ saved r25            ]
 141   // -14 [ saved r24            ]
 142   // -13 [ saved r23            ]
 143   // -12 [ saved r22            ]
 144   // -11 [ saved r21            ]
 145   // -10 [ saved r20            ]
 146   //  -9 [ saved r19            ]
 147   //  -8 [ call wrapper    (r0) ]
 148   //  -7 [ result          (r1) ]
 149   //  -6 [ result type     (r2) ]
 150   //  -5 [ method          (r3) ]
 151   //  -4 [ entry point     (r4) ]
 152   //  -3 [ parameters      (r5) ]
 153   //  -2 [ parameter size  (r6) ]
 154   //  -1 [ thread (r7)          ]
 155   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 156   //   1 [ saved lr       (r30) ]
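  //
  // Caller-side sketch (illustrative; see JavaCalls::call_helper in
  // javaCalls.cpp): the VM reaches this stub through the function pointer
  // returned by StubRoutines::call_stub(), passing the eight arguments
  // listed above in c_rarg0..c_rarg7, roughly
  //
  //   StubRoutines::call_stub()((address)&link, result_value, result_type,
  //                             method(), entry_point, parameters,
  //                             parameter_size, thread);
  //
  // where the exact argument expressions belong to the caller, not this stub.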
 157 
 158   // Call stub stack layout word offsets from fp
 159   enum call_stub_layout {
 160     sp_after_call_off = -26,
 161 
 162     d15_off            = -26,
 163     d13_off            = -24,
 164     d11_off            = -22,
 165     d9_off             = -20,
 166 
 167     r28_off            = -18,
 168     r26_off            = -16,
 169     r24_off            = -14,
 170     r22_off            = -12,
 171     r20_off            = -10,
 172     call_wrapper_off   =  -8,
 173     result_off         =  -7,
 174     result_type_off    =  -6,
 175     method_off         =  -5,
 176     entry_point_off    =  -4,
 177     parameter_size_off =  -2,
 178     thread_off         =  -1,
 179     fp_f               =   0,
 180     retaddr_off        =   1,
 181   };
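  // Illustrative arithmetic: with wordSize == 8 the save area spans
  // rfp - 26 * 8 == rfp - 208 (the saved v15 slot that sp_after_call points
  // at) up to rfp + 8 (the saved lr), matching the diagram above.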
 182 
 183   address generate_call_stub(address& return_address) {
 184     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 185            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 186            "adjust this code");
 187 
 188     StubCodeMark mark(this, "StubRoutines", "call_stub");
 189     address start = __ pc();
 190 
 191     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 192 
 193     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 194     const Address result        (rfp, result_off         * wordSize);
 195     const Address result_type   (rfp, result_type_off    * wordSize);
 196     const Address method        (rfp, method_off         * wordSize);
 197     const Address entry_point   (rfp, entry_point_off    * wordSize);
 198     const Address parameter_size(rfp, parameter_size_off * wordSize);
 199 
 200     const Address thread        (rfp, thread_off         * wordSize);
 201 
 202     const Address d15_save      (rfp, d15_off * wordSize);
 203     const Address d13_save      (rfp, d13_off * wordSize);
 204     const Address d11_save      (rfp, d11_off * wordSize);
 205     const Address d9_save       (rfp, d9_off * wordSize);
 206 
 207     const Address r28_save      (rfp, r28_off * wordSize);
 208     const Address r26_save      (rfp, r26_off * wordSize);
 209     const Address r24_save      (rfp, r24_off * wordSize);
 210     const Address r22_save      (rfp, r22_off * wordSize);
 211     const Address r20_save      (rfp, r20_off * wordSize);
 212 
 213     // stub code
 214 
 215     address aarch64_entry = __ pc();
 216 
 217     // set up frame and move sp to end of save area
 218     __ enter();
 219     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 220 
 221     // save register parameters and Java scratch/global registers
 222     // n.b. we save thread even though it gets installed in
 223     // rthread because we want to sanity check rthread later
 224     __ str(c_rarg7,  thread);
 225     __ strw(c_rarg6, parameter_size);
 226     __ stp(c_rarg4, c_rarg5,  entry_point);
 227     __ stp(c_rarg2, c_rarg3,  result_type);
 228     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 229 
 230     __ stp(r20, r19,   r20_save);
 231     __ stp(r22, r21,   r22_save);
 232     __ stp(r24, r23,   r24_save);
 233     __ stp(r26, r25,   r26_save);
 234     __ stp(r28, r27,   r28_save);
 235 
 236     __ stpd(v9,  v8,   d9_save);
 237     __ stpd(v11, v10,  d11_save);
 238     __ stpd(v13, v12,  d13_save);
 239     __ stpd(v15, v14,  d15_save);
 240 
 241     // install Java thread in global register now we have saved
 242     // whatever value it held
 243     __ mov(rthread, c_rarg7);
 244     // And method
 245     __ mov(rmethod, c_rarg3);
 246 
 247     // set up the heapbase register
 248     __ reinit_heapbase();
 249 
 250 #ifdef ASSERT
 251     // make sure we have no pending exceptions
 252     {
 253       Label L;
 254       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 255       __ cmp(rscratch1, (unsigned)NULL_WORD);
 256       __ br(Assembler::EQ, L);
 257       __ stop("StubRoutines::call_stub: entered with pending exception");
 258       __ BIND(L);
 259     }
 260 #endif
 261     // pass parameters if any
 262     __ mov(esp, sp);
 263     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 264     __ andr(sp, rscratch1, -2 * wordSize);
 265 
 266     BLOCK_COMMENT("pass parameters if any");
 267     Label parameters_done;
 268     // parameter count is still in c_rarg6
 269     // and parameter pointer identifying param 1 is in c_rarg5
 270     __ cbzw(c_rarg6, parameters_done);
 271 
 272     address loop = __ pc();
 273     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 274     __ subsw(c_rarg6, c_rarg6, 1);
 275     __ push(rscratch1);
 276     __ br(Assembler::GT, loop);
 277 
 278     __ BIND(parameters_done);
 279 
    // call Java entry -- passing the Method* and the current sp
 281     //      rmethod: Method*
 282     //      r13: sender sp
 283     BLOCK_COMMENT("call Java function");
 284     __ mov(r13, sp);
 285     __ blr(c_rarg4);
 286 
 287     // we do this here because the notify will already have been done
 288     // if we get to the next instruction via an exception
 289     //
 290     // n.b. adding this instruction here affects the calculation of
 291     // whether or not a routine returns to the call stub (used when
 292     // doing stack walks) since the normal test is to check the return
 293     // pc against the address saved below. so we may need to allow for
 294     // this extra instruction in the check.
 295 
 296     // save current address for use by exception handling code
 297 
 298     return_address = __ pc();
 299 
 300     // store result depending on type (everything that is not
 301     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 302     // n.b. this assumes Java returns an integral result in r0
 303     // and a floating result in j_farg0
 304     __ ldr(j_rarg2, result);
 305     Label is_long, is_float, is_double, exit;
 306     __ ldr(j_rarg1, result_type);
 307     __ cmp(j_rarg1, T_OBJECT);
 308     __ br(Assembler::EQ, is_long);
 309     __ cmp(j_rarg1, T_LONG);
 310     __ br(Assembler::EQ, is_long);
 311     __ cmp(j_rarg1, T_FLOAT);
 312     __ br(Assembler::EQ, is_float);
 313     __ cmp(j_rarg1, T_DOUBLE);
 314     __ br(Assembler::EQ, is_double);
 315 
 316     // handle T_INT case
 317     __ strw(r0, Address(j_rarg2));
 318 
 319     __ BIND(exit);
 320 
 321     // pop parameters
 322     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 323 
 324 #ifdef ASSERT
 325     // verify that threads correspond
 326     {
 327       Label L, S;
 328       __ ldr(rscratch1, thread);
 329       __ cmp(rthread, rscratch1);
 330       __ br(Assembler::NE, S);
 331       __ get_thread(rscratch1);
 332       __ cmp(rthread, rscratch1);
 333       __ br(Assembler::EQ, L);
 334       __ BIND(S);
 335       __ stop("StubRoutines::call_stub: threads must correspond");
 336       __ BIND(L);
 337     }
 338 #endif
 339 
 340     // restore callee-save registers
 341     __ ldpd(v15, v14,  d15_save);
 342     __ ldpd(v13, v12,  d13_save);
 343     __ ldpd(v11, v10,  d11_save);
 344     __ ldpd(v9,  v8,   d9_save);
 345 
 346     __ ldp(r28, r27,   r28_save);
 347     __ ldp(r26, r25,   r26_save);
 348     __ ldp(r24, r23,   r24_save);
 349     __ ldp(r22, r21,   r22_save);
 350     __ ldp(r20, r19,   r20_save);
 351 
 352     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 353     __ ldrw(c_rarg2, result_type);
 354     __ ldr(c_rarg3,  method);
 355     __ ldp(c_rarg4, c_rarg5,  entry_point);
 356     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 357 
 358     // leave frame and return to caller
 359     __ leave();
 360     __ ret(lr);
 361 
 362     // handle return types different from T_INT
 363 
 364     __ BIND(is_long);
 365     __ str(r0, Address(j_rarg2, 0));
 366     __ br(Assembler::AL, exit);
 367 
 368     __ BIND(is_float);
 369     __ strs(j_farg0, Address(j_rarg2, 0));
 370     __ br(Assembler::AL, exit);
 371 
 372     __ BIND(is_double);
 373     __ strd(j_farg0, Address(j_rarg2, 0));
 374     __ br(Assembler::AL, exit);
 375 
 376     return start;
 377   }
 378 
 379   // Return point for a Java call if there's an exception thrown in
 380   // Java code.  The exception is caught and transformed into a
 381   // pending exception stored in JavaThread that can be tested from
 382   // within the VM.
 383   //
 384   // Note: Usually the parameters are removed by the callee. In case
 385   // of an exception crossing an activation frame boundary, that is
 386   // not the case if the callee is compiled code => need to setup the
 387   // rsp.
 388   //
 389   // r0: exception oop
 390 
 391   address generate_catch_exception() {
 392     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 393     address start = __ pc();
 394 
 395     // same as in generate_call_stub():
 396     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 397     const Address thread        (rfp, thread_off         * wordSize);
 398 
 399 #ifdef ASSERT
 400     // verify that threads correspond
 401     {
 402       Label L, S;
 403       __ ldr(rscratch1, thread);
 404       __ cmp(rthread, rscratch1);
 405       __ br(Assembler::NE, S);
 406       __ get_thread(rscratch1);
 407       __ cmp(rthread, rscratch1);
 408       __ br(Assembler::EQ, L);
 409       __ bind(S);
 410       __ stop("StubRoutines::catch_exception: threads must correspond");
 411       __ bind(L);
 412     }
 413 #endif
 414 
 415     // set pending exception
 416     __ verify_oop(r0);
 417 
 418     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 419     __ mov(rscratch1, (address)__FILE__);
 420     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 421     __ movw(rscratch1, (int)__LINE__);
 422     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 423 
 424     // complete return to VM
 425     assert(StubRoutines::_call_stub_return_address != NULL,
 426            "_call_stub_return_address must have been generated before");
 427     __ b(StubRoutines::_call_stub_return_address);
 428 
 429     return start;
 430   }
 431 
 432   // Continuation point for runtime calls returning with a pending
 433   // exception.  The pending exception check happened in the runtime
 434   // or native call stub.  The pending exception in Thread is
 435   // converted into a Java-level exception.
 436   //
 437   // Contract with Java-level exception handlers:
 438   // r0: exception
 439   // r3: throwing pc
 440   //
 441   // NOTE: At entry of this stub, exception-pc must be in LR !!
 442 
  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prologue
 445 
 446   address generate_forward_exception() {
 447     StubCodeMark mark(this, "StubRoutines", "forward exception");
 448     address start = __ pc();
 449 
 450     // Upon entry, LR points to the return address returning into
 451     // Java (interpreted or compiled) code; i.e., the return address
 452     // becomes the throwing pc.
 453     //
 454     // Arguments pushed before the runtime call are still on the stack
 455     // but the exception handler will reset the stack pointer ->
 456     // ignore them.  A potential result in registers can be ignored as
 457     // well.
 458 
 459 #ifdef ASSERT
 460     // make sure this code is only executed if there is a pending exception
 461     {
 462       Label L;
 463       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 464       __ cbnz(rscratch1, L);
 465       __ stop("StubRoutines::forward exception: no pending exception (1)");
 466       __ bind(L);
 467     }
 468 #endif
 469 
 470     // compute exception handler into r19
 471 
 472     // call the VM to find the handler address associated with the
 473     // caller address. pass thread in r0 and caller pc (ret address)
 474     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 475     // the stack.
 476     __ mov(c_rarg1, lr);
 477     // lr will be trashed by the VM call so we move it to R19
 478     // (callee-saved) because we also need to pass it to the handler
 479     // returned by this call.
 480     __ mov(r19, lr);
 481     BLOCK_COMMENT("call exception_handler_for_return_address");
 482     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 483                          SharedRuntime::exception_handler_for_return_address),
 484                     rthread, c_rarg1);
 485     // we should not really care that lr is no longer the callee
 486     // address. we saved the value the handler needs in r19 so we can
 487     // just copy it to r3. however, the C2 handler will push its own
 488     // frame and then calls into the VM and the VM code asserts that
 489     // the PC for the frame above the handler belongs to a compiled
 490     // Java method. So, we restore lr here to satisfy that assert.
 491     __ mov(lr, r19);
 492     // setup r0 & r3 & clear pending exception
 493     __ mov(r3, r19);
 494     __ mov(r19, r0);
 495     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 496     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 497 
 498 #ifdef ASSERT
 499     // make sure exception is set
 500     {
 501       Label L;
 502       __ cbnz(r0, L);
 503       __ stop("StubRoutines::forward exception: no pending exception (2)");
 504       __ bind(L);
 505     }
 506 #endif
 507 
 508     // continue at exception handler
 509     // r0: exception
 510     // r3: throwing pc
 511     // r19: exception handler
 512     __ verify_oop(r0);
 513     __ br(r19);
 514 
 515     return start;
 516   }
 517 
 518   // Non-destructive plausibility checks for oops
 519   //
 520   // Arguments:
 521   //    r0: oop to verify
 522   //    rscratch1: error message
 523   //
 524   // Stack after saving c_rarg3:
 525   //    [tos + 0]: saved c_rarg3
 526   //    [tos + 1]: saved c_rarg2
 527   //    [tos + 2]: saved lr
 528   //    [tos + 3]: saved rscratch2
 529   //    [tos + 4]: saved r0
 530   //    [tos + 5]: saved rscratch1
 531   address generate_verify_oop() {
 532 
 533     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 534     address start = __ pc();
 535 
 536     Label exit, error;
 537 
 538     // save c_rarg2 and c_rarg3
 539     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 540 
 541     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 542     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 543     __ ldr(c_rarg3, Address(c_rarg2));
 544     __ add(c_rarg3, c_rarg3, 1);
 545     __ str(c_rarg3, Address(c_rarg2));
 546 
 547     // object is in r0
 548     // make sure object is 'reasonable'
 549     __ cbz(r0, exit); // if obj is NULL it is OK
 550 
 551     // Check if the oop is in the right area of memory
 552     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 553     __ andr(c_rarg2, r0, c_rarg3);
 554     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 555 
 556     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 557     // instruction here because the flags register is live.
 558     __ eor(c_rarg2, c_rarg2, c_rarg3);
 559     __ cbnz(c_rarg2, error);
 560 
 561     // make sure klass is 'reasonable', which is not zero.
 562     __ load_klass(r0, r0);  // get klass
 563     __ cbz(r0, error);      // if klass is NULL it is broken
 564 
 565     // return if everything seems ok
 566     __ bind(exit);
 567 
 568     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 569     __ ret(lr);
 570 
 571     // handle errors
 572     __ bind(error);
 573     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 574 
 575     __ push(RegSet::range(r0, r29), sp);
 576     // debug(char* msg, int64_t pc, int64_t regs[])
 577     __ mov(c_rarg0, rscratch1);      // pass address of error message
 578     __ mov(c_rarg1, lr);             // pass return address
 579     __ mov(c_rarg2, sp);             // pass address of regs on stack
 580 #ifndef PRODUCT
 581     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 582 #endif
 583     BLOCK_COMMENT("call MacroAssembler::debug");
 584     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 585     __ blr(rscratch1);
 586 
 587     return start;
 588   }
 589 
 590   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 591 
 592   // Generate code for an array write pre barrier
 593   //
 594   //     addr    -  starting address
 595   //     count   -  element count
 596   //     tmp     - scratch register
 597   //
 598   //     Destroy no registers except rscratch1 and rscratch2
 599   //
 600   void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
 601     BarrierSet* bs = Universe::heap()->barrier_set();
 602     switch (bs->kind()) {
 603     case BarrierSet::G1SATBCT:
 604     case BarrierSet::G1SATBCTLogging:
      // With G1, don't generate the call if we statically know that the target is uninitialized
 606       if (!dest_uninitialized) {
 607         __ push_call_clobbered_registers();
 608         if (count == c_rarg0) {
 609           if (addr == c_rarg1) {
 610             // exactly backwards!!
 611             __ mov(rscratch1, c_rarg0);
 612             __ mov(c_rarg0, c_rarg1);
 613             __ mov(c_rarg1, rscratch1);
 614           } else {
 615             __ mov(c_rarg1, count);
 616             __ mov(c_rarg0, addr);
 617           }
 618         } else {
 619           __ mov(c_rarg0, addr);
 620           __ mov(c_rarg1, count);
 621         }
 622         __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
 623         __ pop_call_clobbered_registers();
      }
      break;
    case BarrierSet::CardTableModRef:
    case BarrierSet::CardTableExtension:
    case BarrierSet::ModRef:
      break;
    default:
      ShouldNotReachHere();

 633     }
 634   }
 635 
 636   //
 637   // Generate code for an array write post barrier
 638   //
 639   //  Input:
 640   //     start    - register containing starting address of destination array
 641   //     end      - register containing ending address of destination array
 642   //     scratch  - scratch register
 643   //
 644   //  The input registers are overwritten.
 645   //  The ending address is inclusive.
 646   void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
 647     assert_different_registers(start, end, scratch);
 648     Label L_done;
 649 
 650     // "end" is inclusive end pointer == start + (count - 1) * array_element_size
 651     // If count == 0, "end" is less than "start" and we need to skip card marking.
 652     __ cmp(end, start);
 653     __ br(__ LO, L_done);
 654 
 655     BarrierSet* bs = Universe::heap()->barrier_set();
 656     switch (bs->kind()) {
 657       case BarrierSet::G1SATBCT:
 658       case BarrierSet::G1SATBCTLogging:
 659 
 660         {
 661           __ push_call_clobbered_registers();
 662           // must compute element count unless barrier set interface is changed (other platforms supply count)
 663           assert_different_registers(start, end, scratch);
 664           __ lea(scratch, Address(end, BytesPerHeapOop));
 665           __ sub(scratch, scratch, start);               // subtract start to get #bytes
 666           __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
 667           __ mov(c_rarg0, start);
 668           __ mov(c_rarg1, scratch);
 669           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
 670           __ pop_call_clobbered_registers();
 671         }
 672         break;
 673       case BarrierSet::CardTableModRef:
 674       case BarrierSet::CardTableExtension:
 675         {
 676           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
 677           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
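          // Card-marking sketch (illustrative): CardTableModRefBS uses
          // 512-byte cards (card_shift == 9), so the loop below behaves like
          //
          //   for (uintptr_t p = start; p <= end; p += 512)
          //     byte_map_base[p >> 9] = 0;   // 0 == dirty card value
          //
          // with (end - start), expressed in cards, as the loop counter.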
 678 
 679           Label L_loop;
 680 
 681            __ lsr(start, start, CardTableModRefBS::card_shift);
 682            __ lsr(end, end, CardTableModRefBS::card_shift);
 683            __ sub(end, end, start); // number of bytes to copy
 684 
 685           const Register count = end; // 'end' register contains bytes count now
 686           __ load_byte_map_base(scratch);
 687           __ add(start, start, scratch);
 688           if (UseConcMarkSweepGC) {
 689             __ membar(__ StoreStore);
 690           }
 691           __ BIND(L_loop);
 692           __ strb(zr, Address(start, count));
 693           __ subs(count, count, 1);
 694           __ br(Assembler::GE, L_loop);
 695         }
 696         break;
 697       default:
 698         ShouldNotReachHere();
 699 
 700     }
 701     __ bind(L_done);
 702   }
 703 
 704   address generate_zero_longs(Register base, Register cnt) {
 705     Register tmp = rscratch1;
 706     Register tmp2 = rscratch2;
 707     int zva_length = VM_Version::zva_length();
 708     Label initial_table_end, loop_zva;
 709     Label fini;
 710 
 711     __ align(CodeEntryAlignment);
 712     StubCodeMark mark(this, "StubRoutines", "zero_longs");
 713     address start = __ pc();
 714 
 715     // Base must be 16 byte aligned. If not just return and let caller handle it
 716     __ tst(base, 0x0f);
 717     __ br(Assembler::NE, fini);
 718     // Align base with ZVA length.
 719     __ neg(tmp, base);
 720     __ andr(tmp, tmp, zva_length - 1);
 721 
 722     // tmp: the number of bytes to be filled to align the base with ZVA length.
 723     __ add(base, base, tmp);
 724     __ sub(cnt, cnt, tmp, Assembler::ASR, 3);
 725     __ adr(tmp2, initial_table_end);
 726     __ sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
 727     __ br(tmp2);
 728 
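    // Illustrative arithmetic for the computed branch above: each stp in the
    // table below zeroes 16 bytes and encodes as a single 4-byte instruction,
    // so to pre-fill tmp bytes we enter the table (tmp / 16) instructions,
    // i.e. tmp / 4 bytes, before initial_table_end.  E.g. with
    // zva_length == 64 and base ending in 0x30, tmp == 16 and only the last
    // stp executes.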
 729     for (int i = -zva_length + 16; i < 0; i += 16)
 730       __ stp(zr, zr, Address(base, i));
 731     __ bind(initial_table_end);
 732 
 733     __ sub(cnt, cnt, zva_length >> 3);
 734     __ bind(loop_zva);
 735     __ dc(Assembler::ZVA, base);
 736     __ subs(cnt, cnt, zva_length >> 3);
 737     __ add(base, base, zva_length);
 738     __ br(Assembler::GE, loop_zva);
 739     __ add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
 740     __ bind(fini);
 741     __ ret(lr);
 742 
 743     return start;
 744   }
 745 
 746   typedef enum {
 747     copy_forwards = 1,
 748     copy_backwards = -1
 749   } copy_direction;
 750 
 751   // Bulk copy of blocks of 8 words.
 752   //
 753   // count is a count of words.
 754   //
 755   // Precondition: count >= 8
 756   //
 757   // Postconditions:
 758   //
 759   // The least significant bit of count contains the remaining count
 760   // of words to copy.  The rest of count is trash.
 761   //
 762   // s and d are adjusted to point to the remaining words to copy
 763   //
 764   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 765                            copy_direction direction) {
 766     int unit = wordSize * direction;
 767     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
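    // Note (illustrative): for a forwards copy s and d are pre-biased
    // downwards by one load/store pair (16 bytes, or 32 with SIMD) so that
    // the same {2, 4, 6, 8} * unit addressing pattern works in both
    // directions; the first access then lands exactly at the original s/d.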
 768 
 769     int offset;
 770     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 771       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 772     const Register stride = r13;
 773 
 774     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 775     assert_different_registers(s, d, count, rscratch1);
 776 
 777     Label again, drain;
 778     const char *stub_name;
 779     if (direction == copy_forwards)
 780       stub_name = "foward_copy_longs";
 781     else
 782       stub_name = "backward_copy_longs";
 783 
 784     __ align(CodeEntryAlignment);
 785 
 786     StubCodeMark mark(this, "StubRoutines", stub_name);
 787 
 788     __ bind(start);
 789 
 790     Label unaligned_copy_long;
 791     if (AvoidUnalignedAccesses) {
 792       __ tbnz(d, 3, unaligned_copy_long);
 793     }
 794 
 795     if (direction == copy_forwards) {
 796       __ sub(s, s, bias);
 797       __ sub(d, d, bias);
 798     }
 799 
 800 #ifdef ASSERT
 801     // Make sure we are never given < 8 words
 802     {
 803       Label L;
 804       __ cmp(count, 8);
 805       __ br(Assembler::GE, L);
 806       __ stop("genrate_copy_longs called with < 8 words");
 807       __ bind(L);
 808     }
 809 #endif
 810 
 811     // Fill 8 registers
 812     if (UseSIMDForMemoryOps) {
 813       __ ldpq(v0, v1, Address(s, 4 * unit));
 814       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 815     } else {
 816       __ ldp(t0, t1, Address(s, 2 * unit));
 817       __ ldp(t2, t3, Address(s, 4 * unit));
 818       __ ldp(t4, t5, Address(s, 6 * unit));
 819       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 820     }
 821 
 822     __ subs(count, count, 16);
 823     __ br(Assembler::LO, drain);
 824 
 825     int prefetch = PrefetchCopyIntervalInBytes;
 826     bool use_stride = false;
 827     if (direction == copy_backwards) {
 828        use_stride = prefetch > 256;
 829        prefetch = -prefetch;
 830        if (use_stride) __ mov(stride, prefetch);
 831     }
 832 
 833     __ bind(again);
 834 
 835     if (PrefetchCopyIntervalInBytes > 0)
 836       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 837 
 838     if (UseSIMDForMemoryOps) {
 839       __ stpq(v0, v1, Address(d, 4 * unit));
 840       __ ldpq(v0, v1, Address(s, 4 * unit));
 841       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 842       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 843     } else {
 844       __ stp(t0, t1, Address(d, 2 * unit));
 845       __ ldp(t0, t1, Address(s, 2 * unit));
 846       __ stp(t2, t3, Address(d, 4 * unit));
 847       __ ldp(t2, t3, Address(s, 4 * unit));
 848       __ stp(t4, t5, Address(d, 6 * unit));
 849       __ ldp(t4, t5, Address(s, 6 * unit));
 850       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 851       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 852     }
 853 
 854     __ subs(count, count, 8);
 855     __ br(Assembler::HS, again);
 856 
 857     // Drain
 858     __ bind(drain);
 859     if (UseSIMDForMemoryOps) {
 860       __ stpq(v0, v1, Address(d, 4 * unit));
 861       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 862     } else {
 863       __ stp(t0, t1, Address(d, 2 * unit));
 864       __ stp(t2, t3, Address(d, 4 * unit));
 865       __ stp(t4, t5, Address(d, 6 * unit));
 866       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 867     }
 868 
 869     {
 870       Label L1, L2;
 871       __ tbz(count, exact_log2(4), L1);
 872       if (UseSIMDForMemoryOps) {
 873         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 874         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 875       } else {
 876         __ ldp(t0, t1, Address(s, 2 * unit));
 877         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 878         __ stp(t0, t1, Address(d, 2 * unit));
 879         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 880       }
 881       __ bind(L1);
 882 
 883       if (direction == copy_forwards) {
 884         __ add(s, s, bias);
 885         __ add(d, d, bias);
 886       }
 887 
 888       __ tbz(count, 1, L2);
 889       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 890       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 891       __ bind(L2);
 892     }
 893 
 894     __ ret(lr);
 895 
 896     if (AvoidUnalignedAccesses) {
 897       Label drain, again;
 898       // Register order for storing. Order is different for backward copy.
 899 
 900       __ bind(unaligned_copy_long);
 901 
 902       // source address is even aligned, target odd aligned
 903       //
 904       // when forward copying word pairs we read long pairs at offsets
 905       // {0, 2, 4, 6} (in long words). when backwards copying we read
 906       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 907       // address by -2 in the forwards case so we can compute the
 908       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 909       // or -1.
 910       //
 911       // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 word, which means
      // we have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
 917       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 918       // offsets {1, 3, 5, 7, 8} * unit.
 919 
 920       if (direction == copy_forwards) {
 921         __ sub(s, s, 16);
 922         __ sub(d, d, 8);
 923       }
 924 
 925       // Fill 8 registers
 926       //
 927       // for forwards copy s was offset by -16 from the original input
 928       // value of s so the register contents are at these offsets
 929       // relative to the 64 bit block addressed by that original input
 930       // and so on for each successive 64 byte block when s is updated
 931       //
 932       // t0 at offset 0,  t1 at offset 8
 933       // t2 at offset 16, t3 at offset 24
 934       // t4 at offset 32, t5 at offset 40
 935       // t6 at offset 48, t7 at offset 56
 936 
 937       // for backwards copy s was not offset so the register contents
 938       // are at these offsets into the preceding 64 byte block
 939       // relative to that original input and so on for each successive
 940       // preceding 64 byte block when s is updated. this explains the
 941       // slightly counter-intuitive looking pattern of register usage
 942       // in the stp instructions for backwards copy.
 943       //
 944       // t0 at offset -16, t1 at offset -8
 945       // t2 at offset -32, t3 at offset -24
 946       // t4 at offset -48, t5 at offset -40
 947       // t6 at offset -64, t7 at offset -56
 948 
 949       __ ldp(t0, t1, Address(s, 2 * unit));
 950       __ ldp(t2, t3, Address(s, 4 * unit));
 951       __ ldp(t4, t5, Address(s, 6 * unit));
 952       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 953 
 954       __ subs(count, count, 16);
 955       __ br(Assembler::LO, drain);
 956 
 957       int prefetch = PrefetchCopyIntervalInBytes;
 958       bool use_stride = false;
 959       if (direction == copy_backwards) {
 960          use_stride = prefetch > 256;
 961          prefetch = -prefetch;
 962          if (use_stride) __ mov(stride, prefetch);
 963       }
 964 
 965       __ bind(again);
 966 
 967       if (PrefetchCopyIntervalInBytes > 0)
 968         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 969 
 970       if (direction == copy_forwards) {
 971        // allowing for the offset of -8 the store instructions place
 972        // registers into the target 64 bit block at the following
 973        // offsets
 974        //
 975        // t0 at offset 0
 976        // t1 at offset 8,  t2 at offset 16
 977        // t3 at offset 24, t4 at offset 32
 978        // t5 at offset 40, t6 at offset 48
 979        // t7 at offset 56
 980 
 981         __ str(t0, Address(d, 1 * unit));
 982         __ stp(t1, t2, Address(d, 2 * unit));
 983         __ ldp(t0, t1, Address(s, 2 * unit));
 984         __ stp(t3, t4, Address(d, 4 * unit));
 985         __ ldp(t2, t3, Address(s, 4 * unit));
 986         __ stp(t5, t6, Address(d, 6 * unit));
 987         __ ldp(t4, t5, Address(s, 6 * unit));
 988         __ str(t7, Address(__ pre(d, 8 * unit)));
 989         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 990       } else {
 991        // d was not offset when we started so the registers are
 992        // written into the 64 bit block preceding d with the following
 993        // offsets
 994        //
 995        // t1 at offset -8
 996        // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
 998        // t7 at offset -56, t4 at offset -48
 999        //                   t6 at offset -64
1000        //
1001        // note that this matches the offsets previously noted for the
1002        // loads
1003 
1004         __ str(t1, Address(d, 1 * unit));
1005         __ stp(t3, t0, Address(d, 3 * unit));
1006         __ ldp(t0, t1, Address(s, 2 * unit));
1007         __ stp(t5, t2, Address(d, 5 * unit));
1008         __ ldp(t2, t3, Address(s, 4 * unit));
1009         __ stp(t7, t4, Address(d, 7 * unit));
1010         __ ldp(t4, t5, Address(s, 6 * unit));
1011         __ str(t6, Address(__ pre(d, 8 * unit)));
1012         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1013       }
1014 
1015       __ subs(count, count, 8);
1016       __ br(Assembler::HS, again);
1017 
1018       // Drain
1019       //
1020       // this uses the same pattern of offsets and register arguments
1021       // as above
1022       __ bind(drain);
1023       if (direction == copy_forwards) {
1024         __ str(t0, Address(d, 1 * unit));
1025         __ stp(t1, t2, Address(d, 2 * unit));
1026         __ stp(t3, t4, Address(d, 4 * unit));
1027         __ stp(t5, t6, Address(d, 6 * unit));
1028         __ str(t7, Address(__ pre(d, 8 * unit)));
1029       } else {
1030         __ str(t1, Address(d, 1 * unit));
1031         __ stp(t3, t0, Address(d, 3 * unit));
1032         __ stp(t5, t2, Address(d, 5 * unit));
1033         __ stp(t7, t4, Address(d, 7 * unit));
1034         __ str(t6, Address(__ pre(d, 8 * unit)));
1035       }
1036       // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
1039       // have each such subblock
1040       {
1041         Label L1, L2;
1042         __ tbz(count, exact_log2(4), L1);
1043        // this is the same as above but copying only 4 longs hence
       // with only one intervening stp between the str instructions
1045        // but note that the offsets and registers still follow the
1046        // same pattern
1047         __ ldp(t0, t1, Address(s, 2 * unit));
1048         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1049         if (direction == copy_forwards) {
1050           __ str(t0, Address(d, 1 * unit));
1051           __ stp(t1, t2, Address(d, 2 * unit));
1052           __ str(t3, Address(__ pre(d, 4 * unit)));
1053         } else {
1054           __ str(t1, Address(d, 1 * unit));
1055           __ stp(t3, t0, Address(d, 3 * unit));
1056           __ str(t2, Address(__ pre(d, 4 * unit)));
1057         }
1058         __ bind(L1);
1059 
1060         __ tbz(count, 1, L2);
1061        // this is the same as above but copying only 2 longs hence
1062        // there is no intervening stp between the str instructions
1063        // but note that the offset and register patterns are still
1064        // the same
1065         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1066         if (direction == copy_forwards) {
1067           __ str(t0, Address(d, 1 * unit));
1068           __ str(t1, Address(__ pre(d, 2 * unit)));
1069         } else {
1070           __ str(t1, Address(d, 1 * unit));
1071           __ str(t0, Address(__ pre(d, 2 * unit)));
1072         }
1073         __ bind(L2);
1074 
1075        // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written
1077 
1078        if (direction == copy_forwards) {
1079          __ add(s, s, 16);
1080          __ add(d, d, 8);
1081        }
1082 
1083       }
1084 
1085       __ ret(lr);
1086       }
1087   }
1088 
1089   // Small copy: less than 16 bytes.
1090   //
1091   // NB: Ignores all of the bits of count which represent more than 15
1092   // bytes, so a caller doesn't have to mask them.
1093 
1094   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1095     bool is_backwards = step < 0;
1096     size_t granularity = uabs(step);
1097     int direction = is_backwards ? -1 : 1;
1098     int unit = wordSize * direction;
1099 
1100     Label Lpair, Lword, Lint, Lshort, Lbyte;
1101 
1102     assert(granularity
1103            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1104 
1105     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1106 
1107     // ??? I don't know if this bit-test-and-branch is the right thing
1108     // to do.  It does a lot of jumping, resulting in several
1109     // mispredicted branches.  It might make more sense to do this
1110     // with something like Duff's device with a single computed branch.
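    //
    // Illustrative example: for a byte copy (granularity == 1) with
    // count == 13 == 0b1101 the tests below move 8 bytes (bit 3 set),
    // 4 bytes (bit 2 set), skip the 2-byte move (bit 1 clear) and finish
    // with a single byte (bit 0 set): 8 + 4 + 1 == 13.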
1111 
1112     __ tbz(count, 3 - exact_log2(granularity), Lword);
1113     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1114     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1115     __ bind(Lword);
1116 
1117     if (granularity <= sizeof (jint)) {
1118       __ tbz(count, 2 - exact_log2(granularity), Lint);
1119       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1120       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1121       __ bind(Lint);
1122     }
1123 
1124     if (granularity <= sizeof (jshort)) {
1125       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1126       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1127       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1128       __ bind(Lshort);
1129     }
1130 
1131     if (granularity <= sizeof (jbyte)) {
1132       __ tbz(count, 0, Lbyte);
1133       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1134       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1135       __ bind(Lbyte);
1136     }
1137   }
1138 
1139   Label copy_f, copy_b;
1140 
1141   // All-singing all-dancing memory copy.
1142   //
1143   // Copy count units of memory from s to d.  The size of a unit is
1144   // step, which can be positive or negative depending on the direction
1145   // of copy.  If is_aligned is false, we align the source address.
1146   //
1147 
1148   void copy_memory(bool is_aligned, Register s, Register d,
1149                    Register count, Register tmp, int step) {
1150     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1151     bool is_backwards = step < 0;
1152     int granularity = uabs(step);
1153     const Register t0 = r3, t1 = r4;
1154 
1155     // <= 96 bytes do inline. Direction doesn't matter because we always
1156     // load all the data before writing anything
1157     Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
1158     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1159     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1160     const Register send = r17, dend = r18;
1161 
1162     if (PrefetchCopyIntervalInBytes > 0)
1163       __ prfm(Address(s, 0), PLDL1KEEP);
1164     __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
1165     __ br(Assembler::HI, copy_big);
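    // e.g. (illustrative) an int copy has granularity == 4, so up to
    // 80 / 4 == 20 elements (96 / 4 == 24 with SIMD) are handled by the
    // inline cases below; anything larger takes the copy_big path.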
1166 
1167     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1168     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1169 
1170     __ cmp(count, 16/granularity);
1171     __ br(Assembler::LS, copy16);
1172 
1173     __ cmp(count, 64/granularity);
1174     __ br(Assembler::HI, copy80);
1175 
1176     __ cmp(count, 32/granularity);
1177     __ br(Assembler::LS, copy32);
1178 
1179     // 33..64 bytes
1180     if (UseSIMDForMemoryOps) {
1181       __ ldpq(v0, v1, Address(s, 0));
1182       __ ldpq(v2, v3, Address(send, -32));
1183       __ stpq(v0, v1, Address(d, 0));
1184       __ stpq(v2, v3, Address(dend, -32));
1185     } else {
1186       __ ldp(t0, t1, Address(s, 0));
1187       __ ldp(t2, t3, Address(s, 16));
1188       __ ldp(t4, t5, Address(send, -32));
1189       __ ldp(t6, t7, Address(send, -16));
1190 
1191       __ stp(t0, t1, Address(d, 0));
1192       __ stp(t2, t3, Address(d, 16));
1193       __ stp(t4, t5, Address(dend, -32));
1194       __ stp(t6, t7, Address(dend, -16));
1195     }
1196     __ b(finish);
1197 
1198     // 17..32 bytes
1199     __ bind(copy32);
1200     __ ldp(t0, t1, Address(s, 0));
1201     __ ldp(t2, t3, Address(send, -16));
1202     __ stp(t0, t1, Address(d, 0));
1203     __ stp(t2, t3, Address(dend, -16));
1204     __ b(finish);
1205 
1206     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
1208     __ bind(copy80);
1209     if (UseSIMDForMemoryOps) {
1210       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1211       __ ldpq(v4, v5, Address(send, -32));
1212       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1213       __ stpq(v4, v5, Address(dend, -32));
1214     } else {
1215       __ ldp(t0, t1, Address(s, 0));
1216       __ ldp(t2, t3, Address(s, 16));
1217       __ ldp(t4, t5, Address(s, 32));
1218       __ ldp(t6, t7, Address(s, 48));
1219       __ ldp(t8, t9, Address(send, -16));
1220   
1221       __ stp(t0, t1, Address(d, 0));
1222       __ stp(t2, t3, Address(d, 16));
1223       __ stp(t4, t5, Address(d, 32));
1224       __ stp(t6, t7, Address(d, 48));
1225       __ stp(t8, t9, Address(dend, -16));
1226     }
1227     __ b(finish);
1228 
1229     // 0..16 bytes
1230     __ bind(copy16);
1231     __ cmp(count, 8/granularity);
1232     __ br(Assembler::LO, copy8);
1233 
1234     // 8..16 bytes
1235     __ ldr(t0, Address(s, 0));
1236     __ ldr(t1, Address(send, -8));
1237     __ str(t0, Address(d, 0));
1238     __ str(t1, Address(dend, -8));
1239     __ b(finish);
1240 
1241     if (granularity < 8) {
1242       // 4..7 bytes
1243       __ bind(copy8);
1244       __ tbz(count, 2 - exact_log2(granularity), copy4);
1245       __ ldrw(t0, Address(s, 0));
1246       __ ldrw(t1, Address(send, -4));
1247       __ strw(t0, Address(d, 0));
1248       __ strw(t1, Address(dend, -4));
1249       __ b(finish);
1250       if (granularity < 4) {
1251         // 0..3 bytes
1252         __ bind(copy4);
1253         __ cbz(count, finish); // get rid of 0 case
1254         if (granularity == 2) {
1255           __ ldrh(t0, Address(s, 0));
1256           __ strh(t0, Address(d, 0));
1257         } else { // granularity == 1
1258           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1259           // the first and last byte.
1260           // Handle the 3 byte case by loading and storing base + count/2
1261           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean that in the 1 byte case we load/store the same
1263           // byte 3 times.
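          // e.g. (illustrative) count == 3: bytes 0 and 2 come from the
          // first/last loads and byte 1 from the base + count/2 == base + 1
          // load; count == 1 copies s[0] three times over.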
1264           __ lsr(count, count, 1);
1265           __ ldrb(t0, Address(s, 0));
1266           __ ldrb(t1, Address(send, -1));
1267           __ ldrb(t2, Address(s, count));
1268           __ strb(t0, Address(d, 0));
1269           __ strb(t1, Address(dend, -1));
1270           __ strb(t2, Address(d, count));
1271         }
1272         __ b(finish);
1273       }
1274     }
1275 
1276     __ bind(copy_big);
1277     if (is_backwards) {
1278       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1279       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1280     }
1281 
1282     // Now we've got the small case out of the way we can align the
1283     // source address on a 2-word boundary.
1284 
1285     Label aligned;
1286 
1287     if (is_aligned) {
1288       // We may have to adjust by 1 word to get s 2-word-aligned.
1289       __ tbz(s, exact_log2(wordSize), aligned);
1290       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1291       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1292       __ sub(count, count, wordSize/granularity);
1293     } else {
1294       if (is_backwards) {
1295         __ andr(rscratch2, s, 2 * wordSize - 1);
1296       } else {
1297         __ neg(rscratch2, s);
1298         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1299       }
1300       // rscratch2 is the byte adjustment needed to align s.
1301       __ cbz(rscratch2, aligned);
1302       int shift = exact_log2(granularity);
1303       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1304       __ sub(count, count, rscratch2);
1305 
1306 #if 0
1307       // ?? This code is only correct for a disjoint copy.  It may or
1308       // may not make sense to use it in that case.
1309 
1310       // Copy the first pair; s and d may not be aligned.
1311       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1312       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1313 
1314       // Align s and d, adjust count
1315       if (is_backwards) {
1316         __ sub(s, s, rscratch2);
1317         __ sub(d, d, rscratch2);
1318       } else {
1319         __ add(s, s, rscratch2);
1320         __ add(d, d, rscratch2);
1321       }
1322 #else
1323       copy_memory_small(s, d, rscratch2, rscratch1, step);
1324 #endif
1325     }
1326 
1327     __ bind(aligned);
1328 
1329     // s is now 2-word-aligned.
1330 
1331     // We have a count of units and some trailing bytes.  Adjust the
1332     // count and do a bulk copy of words.
1333     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1334     if (direction == copy_forwards)
1335       __ bl(copy_f);
1336     else
1337       __ bl(copy_b);
1338 
1339     // And the tail.
1340     copy_memory_small(s, d, count, tmp, step);
1341 
1342     if (granularity >= 8) __ bind(copy8);
1343     if (granularity >= 4) __ bind(copy4);
1344     __ bind(finish);
1345   }
1346 
1347 
1348   void clobber_registers() {
1349 #ifdef ASSERT
1350     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1351     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1352     for (Register r = r3; r <= r18; r++)
1353       if (r != rscratch1) __ mov(r, rscratch1);
1354 #endif
1355   }
1356 
1357   // Scan over array at a for count oops, verifying each one.
1358   // Preserves a and count, clobbers rscratch1 and rscratch2.
1359   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1360     Label loop, end;
1361     __ mov(rscratch1, a);
1362     __ mov(rscratch2, zr);
1363     __ bind(loop);
1364     __ cmp(rscratch2, count);
1365     __ br(Assembler::HS, end);
1366     if (size == (size_t)wordSize) {
1367       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1368       __ verify_oop(temp);
1369     } else {
1370       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1371       __ decode_heap_oop(temp); // calls verify_oop
1372     }
1373     __ add(rscratch2, rscratch2, size);
1374     __ b(loop);
1375     __ bind(end);
1376   }
1377 
1378   // Arguments:
1379   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1380   //             ignored
1381   //   is_oop  - true => oop array, so generate store check code
1382   //   name    - stub name string
1383   //
1384   // Inputs:
1385   //   c_rarg0   - source array address
1386   //   c_rarg1   - destination array address
1387   //   c_rarg2   - element count, treated as ssize_t, can be zero
1388   //
1389   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1390   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1392   //
1393   // Side Effects:
1394   //   disjoint_int_copy_entry is set to the no-overlap entry point
1395   //   used by generate_conjoint_int_oop_copy().
1396   //
1397   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1398                                   const char *name, bool dest_uninitialized = false) {
1399     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1400     __ align(CodeEntryAlignment);
1401     StubCodeMark mark(this, "StubRoutines", name);
1402     address start = __ pc();
1403     __ enter();
1404 
1405     if (entry != NULL) {
1406       *entry = __ pc();
1407       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1408       BLOCK_COMMENT("Entry:");
1409     }
1410 
1411     if (is_oop) {
1412       __ push(RegSet::of(d, count), sp);
1413       // no registers are destroyed by this call
1414       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1415     }
1416     copy_memory(aligned, s, d, count, rscratch1, size);
1417     if (is_oop) {
1418       __ pop(RegSet::of(d, count), sp);
1419       if (VerifyOops)
1420         verify_oop_array(size, d, count, r16);
1421       __ sub(count, count, 1); // make an inclusive end pointer
1422       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1423       gen_write_ref_array_post_barrier(d, count, rscratch1);
1424     }
1425     __ leave();
1426     __ mov(r0, zr); // return 0
1427     __ ret(lr);
1428     return start;
1429   }
1430 
1431   // Arguments:
1432   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1433   //             ignored
1434   //   is_oop  - true => oop array, so generate store check code
1435   //   name    - stub name string
1436   //
1437   // Inputs:
1438   //   c_rarg0   - source array address
1439   //   c_rarg1   - destination array address
1440   //   c_rarg2   - element count, treated as ssize_t, can be zero
1441   //
1442   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1443   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1445   //
1446   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1447                                  address *entry, const char *name,
1448                                  bool dest_uninitialized = false) {
1449     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1450 
1451     StubCodeMark mark(this, "StubRoutines", name);
1452     address start = __ pc();
1453 
1454     __ enter();
1455 
1456     if (entry != NULL) {
1457       *entry = __ pc();
1458       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1459       BLOCK_COMMENT("Entry:");
1460     }
1461 
1462     // use fwd copy when (d-s) above_equal (count*size)
1463     __ sub(rscratch1, d, s);
1464     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1465     __ br(Assembler::HS, nooverlap_target);
1466 
1467     if (is_oop) {
1468       __ push(RegSet::of(d, count), sp);
1469       // no registers are destroyed by this call
1470       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1471     }
1472     copy_memory(aligned, s, d, count, rscratch1, -size);
1473     if (is_oop) {
1474       __ pop(RegSet::of(d, count), sp);
1475       if (VerifyOops)
1476         verify_oop_array(size, d, count, r16);
1477       __ sub(count, count, 1); // make an inclusive end pointer
1478       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1479       gen_write_ref_array_post_barrier(d, count, rscratch1);
1480     }
1481     __ leave();
1482     __ mov(r0, zr); // return 0
1483     __ ret(lr);
1484     return start;
1485   }
1486 
1487   // Arguments:
1488   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1489   //             ignored
1490   //   name    - stub name string
1491   //
1492   // Inputs:
1493   //   c_rarg0   - source array address
1494   //   c_rarg1   - destination array address
1495   //   c_rarg2   - element count, treated as ssize_t, can be zero
1496   //
1497   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1498   // we let the hardware handle it.  The one to eight bytes within words,
1499   // dwords or qwords that span cache line boundaries will still be loaded
1500   // and stored atomically.
1501   //
1509   // Side Effects:
1510   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1511   //   used by generate_conjoint_byte_copy().
1512   //
1513   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1514     const bool not_oop = false;
1515     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1516   }
1517 
1518   // Arguments:
1519   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1520   //             ignored
1521   //   name    - stub name string
1522   //
1523   // Inputs:
1524   //   c_rarg0   - source array address
1525   //   c_rarg1   - destination array address
1526   //   c_rarg2   - element count, treated as ssize_t, can be zero
1527   //
1528   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1529   // we let the hardware handle it.  The one to eight bytes within words,
1530   // dwords or qwords that span cache line boundaries will still be loaded
1531   // and stored atomically.
1532   //
1533   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1534                                       address* entry, const char *name) {
1535     const bool not_oop = false;
1536     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1537   }
1538 
1539   // Arguments:
1540   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1541   //             ignored
1542   //   name    - stub name string
1543   //
1544   // Inputs:
1545   //   c_rarg0   - source array address
1546   //   c_rarg1   - destination array address
1547   //   c_rarg2   - element count, treated as ssize_t, can be zero
1548   //
1549   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1550   // let the hardware handle it.  The two or four words within dwords
1551   // or qwords that span cache line boundaries will still be loaded
1552   // and stored atomically.
1553   //
1554   // Side Effects:
1555   //   disjoint_short_copy_entry is set to the no-overlap entry point
1556   //   used by generate_conjoint_short_copy().
1557   //
1558   address generate_disjoint_short_copy(bool aligned,
1559                                        address* entry, const char *name) {
1560     const bool not_oop = false;
1561     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1562   }
1563 
1564   // Arguments:
1565   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1566   //             ignored
1567   //   name    - stub name string
1568   //
1569   // Inputs:
1570   //   c_rarg0   - source array address
1571   //   c_rarg1   - destination array address
1572   //   c_rarg2   - element count, treated as ssize_t, can be zero
1573   //
1574   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1575   // let the hardware handle it.  The two or four words within dwords
1576   // or qwords that span cache line boundaries will still be loaded
1577   // and stored atomically.
1578   //
1579   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1580                                        address *entry, const char *name) {
1581     const bool not_oop = false;
1582     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1583   }
1584 
1585   // Arguments:
1586   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1587   //             ignored
1588   //   name    - stub name string
1589   //
1590   // Inputs:
1591   //   c_rarg0   - source array address
1592   //   c_rarg1   - destination array address
1593   //   c_rarg2   - element count, treated as ssize_t, can be zero
1594   //
1595   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1596   // the hardware handle it.  The two dwords within qwords that span
1597   // cache line boundaries will still be loaded and stored atomically.
1598   //
1599   // Side Effects:
1600   //   disjoint_int_copy_entry is set to the no-overlap entry point
1601   //   used by generate_conjoint_int_copy().
1602   //
1603   address generate_disjoint_int_copy(bool aligned, address *entry,
1604                                          const char *name, bool dest_uninitialized = false) {
1605     const bool not_oop = false;
1606     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1607   }
1608 
1609   // Arguments:
1610   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1611   //             ignored
1612   //   name    - stub name string
1613   //
1614   // Inputs:
1615   //   c_rarg0   - source array address
1616   //   c_rarg1   - destination array address
1617   //   c_rarg2   - element count, treated as ssize_t, can be zero
1618   //
1619   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1620   // the hardware handle it.  The two dwords within qwords that span
1621   // cache line boundaries will still be loaded and stored atomically.
1622   //
1623   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1624                                      address *entry, const char *name,
1625                                      bool dest_uninitialized = false) {
1626     const bool not_oop = false;
1627     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1628   }
1629 
1630 
1631   // Arguments:
1632   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1633   //             ignored
1634   //   name    - stub name string
1635   //
1636   // Inputs:
1637   //   c_rarg0   - source array address
1638   //   c_rarg1   - destination array address
1639   //   c_rarg2   - element count, treated as size_t, can be zero
1640   //
1641   // Side Effects:
1642   //   disjoint_long_copy_entry is set to the no-overlap entry point
1643   //   used by generate_conjoint_long_copy().
1644   //
1645   address generate_disjoint_long_copy(bool aligned, address *entry,
1646                                           const char *name, bool dest_uninitialized = false) {
1647     const bool not_oop = false;
1648     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1649   }
1650 
1651   // Arguments:
1652   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1653   //             ignored
1654   //   name    - stub name string
1655   //
1656   // Inputs:
1657   //   c_rarg0   - source array address
1658   //   c_rarg1   - destination array address
1659   //   c_rarg2   - element count, treated as size_t, can be zero
1660   //
1661   address generate_conjoint_long_copy(bool aligned,
1662                                       address nooverlap_target, address *entry,
1663                                       const char *name, bool dest_uninitialized = false) {
1664     const bool not_oop = false;
1665     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1666   }
1667 
1668   // Arguments:
1669   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1670   //             ignored
1671   //   name    - stub name string
1672   //
1673   // Inputs:
1674   //   c_rarg0   - source array address
1675   //   c_rarg1   - destination array address
1676   //   c_rarg2   - element count, treated as size_t, can be zero
1677   //
1678   // Side Effects:
1679   //   disjoint_oop_copy_entry is set to the no-overlap entry point
1680   //   used by generate_conjoint_oop_copy().
1681   //
1682   address generate_disjoint_oop_copy(bool aligned, address *entry,
1683                                      const char *name, bool dest_uninitialized) {
1684     const bool is_oop = true;
1685     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1686     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1687   }
1688 
1689   // Arguments:
1690   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1691   //             ignored
1692   //   name    - stub name string
1693   //
1694   // Inputs:
1695   //   c_rarg0   - source array address
1696   //   c_rarg1   - destination array address
1697   //   c_rarg2   - element count, treated as size_t, can be zero
1698   //
1699   address generate_conjoint_oop_copy(bool aligned,
1700                                      address nooverlap_target, address *entry,
1701                                      const char *name, bool dest_uninitialized) {
1702     const bool is_oop = true;
1703     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1704     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1705                                   name, dest_uninitialized);
1706   }
1707 
1708 
1709   // Helper for generating a dynamic type check.
1710   // Smashes rscratch1.
1711   void generate_type_check(Register sub_klass,
1712                            Register super_check_offset,
1713                            Register super_klass,
1714                            Label& L_success) {
1715     assert_different_registers(sub_klass, super_check_offset, super_klass);
1716 
1717     BLOCK_COMMENT("type_check:");
1718 
1719     Label L_miss;
1720 
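         // The fast path checks the superclass display/cache slot selected by
         // super_check_offset; the slow path scans the secondary supers array.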
1721     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1722                                      super_check_offset);
1723     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1724 
1725     // Fall through on failure!
1726     __ BIND(L_miss);
1727   }
1728 
1729   //
1730   //  Generate checkcasting array copy stub
1731   //
1732   //  Input:
1733   //    c_rarg0   - source array address
1734   //    c_rarg1   - destination array address
1735   //    c_rarg2   - element count, treated as ssize_t, can be zero
1736   //    c_rarg3   - size_t ckoff (super_check_offset)
1737   //    c_rarg4   - oop ckval (super_klass)
1738   //
1739   //  Output:
1740   //    r0 ==  0  -  success
1741   //    r0 == -1^K - failure, where K is partial transfer count
1742   //
1743   address generate_checkcast_copy(const char *name, address *entry,
1744                                   bool dest_uninitialized = false) {
1745 
1746     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1747 
1748     // Input registers (after setup_arg_regs)
1749     const Register from        = c_rarg0;   // source array address
1750     const Register to          = c_rarg1;   // destination array address
1751     const Register count       = c_rarg2;   // elements count
1752     const Register ckoff       = c_rarg3;   // super_check_offset
1753     const Register ckval       = c_rarg4;   // super_klass
1754 
1755     // Registers used as temps (r18, r19, r20 are save-on-entry)
1756     const Register count_save  = r21;       // orig elements count
1757     const Register start_to    = r20;       // destination array start address
1758     const Register copied_oop  = r18;       // actual oop copied
1759     const Register r19_klass   = r19;       // oop._klass
1760 
1761     //---------------------------------------------------------------
1762     // Assembler stub will be used for this call to arraycopy
1763     // if the two arrays are subtypes of Object[] but the
1764     // destination array type is not equal to or a supertype
1765     // of the source type.  Each element must be separately
1766     // checked.
1767 
1768     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1769                                copied_oop, r19_klass, count_save);
1770 
1771     __ align(CodeEntryAlignment);
1772     StubCodeMark mark(this, "StubRoutines", name);
1773     address start = __ pc();
1774 
1775     __ enter(); // required for proper stackwalking of RuntimeStub frame
1776 
1777 #ifdef ASSERT
1778     // caller guarantees that the arrays really are different
1779     // otherwise, we would have to make conjoint checks
1780     { Label L;
1781       array_overlap_test(L, TIMES_OOP);
1782       __ stop("checkcast_copy within a single array");
1783       __ bind(L);
1784     }
1785 #endif //ASSERT
1786 
1787     // Caller of this entry point must set up the argument registers.
1788     if (entry != NULL) {
1789       *entry = __ pc();
1790       BLOCK_COMMENT("Entry:");
1791     }
1792 
1793      // Empty array:  Nothing to do.
1794     __ cbz(count, L_done);
1795 
1796     __ push(RegSet::of(r18, r19, r20, r21), sp);
1797 
1798 #ifdef ASSERT
1799     BLOCK_COMMENT("assert consistent ckoff/ckval");
1800     // The ckoff and ckval must be mutually consistent,
1801     // even though caller generates both.
1802     { Label L;
1803       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1804       __ ldrw(start_to, Address(ckval, sco_offset));
1805       __ cmpw(ckoff, start_to);
1806       __ br(Assembler::EQ, L);
1807       __ stop("super_check_offset inconsistent");
1808       __ bind(L);
1809     }
1810 #endif //ASSERT
1811 
1812     gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1813 
1814     // save the original count
1815     __ mov(count_save, count);
1816 
1817     // Copy from low to high addresses
1818     __ mov(start_to, to);              // Save destination array start address
1819     __ b(L_load_element);
1820 
1821     // ======== begin loop ========
1822     // (Loop is rotated; its entry is L_load_element.)
1823     // Loop control:
1824     //   for (; count != 0; count--) {
1825     //     copied_oop = load_heap_oop(from++);
1826     //     ... generate_type_check ...;
1827     //     store_heap_oop(to++, copied_oop);
1828     //   }
1829     __ align(OptoLoopAlignment);
1830 
1831     __ BIND(L_store_element);
1832     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1833     __ sub(count, count, 1);
1834     __ cbz(count, L_do_card_marks);
1835 
1836     // ======== loop entry is here ========
1837     __ BIND(L_load_element);
1838     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1839     __ cbz(copied_oop, L_store_element);
1840 
1841     __ load_klass(r19_klass, copied_oop);// query the object klass
1842     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1843     // ======== end loop ========
1844 
1845     // It was a real error; we must depend on the caller to finish the job.
1846     // Register count = remaining oops, count_orig = total oops.
1847     // Emit GC store barriers for the oops we have copied and report
1848     // their number to the caller.
1849 
1850     __ subs(count, count_save, count);     // K = partially copied oop count
1851     __ eon(count, count, zr);                   // report (-1^K) to caller
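         // eon with zr is a bitwise NOT, so count now holds -1 ^ K, matching the
         // "r0 == -1^K" failure contract documented above (callers recover K as ~r0)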
1852     __ br(Assembler::EQ, L_done_pop);
1853 
1854     __ BIND(L_do_card_marks);
1855     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1856     gen_write_ref_array_post_barrier(start_to, to, rscratch1);
1857 
1858     __ bind(L_done_pop);
1859     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1860     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1861 
1862     __ bind(L_done);
1863     __ mov(r0, count);
1864     __ leave();
1865     __ ret(lr);
1866 
1867     return start;
1868   }
1869 
1870   // Perform range checks on the proposed arraycopy.
1871   // Kills temp, but nothing else.
1872   // Also, clean the sign bits of src_pos and dst_pos.
1873   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1874                               Register src_pos, // source position (c_rarg1)
1875                               Register dst,     // destination array oop (c_rarg2)
1876                               Register dst_pos, // destination position (c_rarg3)
1877                               Register length,
1878                               Register temp,
1879                               Label& L_failed) {
1880     BLOCK_COMMENT("arraycopy_range_checks:");
1881 
1882     assert_different_registers(rscratch1, temp);
1883 
1884     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1885     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1886     __ addw(temp, length, src_pos);
1887     __ cmpw(temp, rscratch1);
1888     __ br(Assembler::HI, L_failed);
1889 
1890     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1891     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1892     __ addw(temp, length, dst_pos);
1893     __ cmpw(temp, rscratch1);
1894     __ br(Assembler::HI, L_failed);
1895 
1896     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1897     __ movw(src_pos, src_pos);
1898     __ movw(dst_pos, dst_pos);
1899 
1900     BLOCK_COMMENT("arraycopy_range_checks done");
1901   }
1902 
1903   // These stubs get called from some dumb test routine.
1904   // I'll write them properly when they're called from
1905   // something that's actually doing something.
1906   static void fake_arraycopy_stub(address src, address dst, int count) {
1907     assert(count == 0, "huh?");
1908   }
1909 
1910 
1911   //
1912   // Generate stub for array fill. If "aligned" is true, the
1913   // "to" address is assumed to be heapword aligned.
1914   //
1915   // Arguments for generated stub:
1916   //   to:    c_rarg0
1917   //   value: c_rarg1
1918   //   count: c_rarg2 treated as signed
1919   //
1920   address generate_fill(BasicType t, bool aligned, const char *name) {
1921     __ align(CodeEntryAlignment);
1922     StubCodeMark mark(this, "StubRoutines", name);
1923     address start = __ pc();
1924 
1925     BLOCK_COMMENT("Entry:");
1926 
1927     const Register to        = c_rarg0;  // source array address
1928     const Register value     = c_rarg1;  // value
1929     const Register count     = c_rarg2;  // elements count
1930 
1931     const Register bz_base = r10;        // base for block_zero routine
1932     const Register cnt_words = r11;      // temp register
1933 
1934     __ enter();
1935 
1936     Label L_fill_elements, L_exit1;
1937 
1938     int shift = -1;
1939     switch (t) {
1940       case T_BYTE:
1941         shift = 0;
1942         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
1943         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
1944         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
1945         __ br(Assembler::LO, L_fill_elements);
1946         break;
1947       case T_SHORT:
1948         shift = 1;
1949         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
1950         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
1951         __ br(Assembler::LO, L_fill_elements);
1952         break;
1953       case T_INT:
1954         shift = 2;
1955         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
1956         __ br(Assembler::LO, L_fill_elements);
1957         break;
1958       default: ShouldNotReachHere();
1959     }
1960 
1961     // Align source address at 8 bytes address boundary.
1962     Label L_skip_align1, L_skip_align2, L_skip_align4;
1963     if (!aligned) {
1964       switch (t) {
1965         case T_BYTE:
1966           // One byte misalignment happens only for byte arrays.
1967           __ tbz(to, 0, L_skip_align1);
1968           __ strb(value, Address(__ post(to, 1)));
1969           __ subw(count, count, 1);
1970           __ bind(L_skip_align1);
1971           // Fallthrough
1972         case T_SHORT:
1973           // Two bytes misalignment happens only for byte and short (char) arrays.
1974           __ tbz(to, 1, L_skip_align2);
1975           __ strh(value, Address(__ post(to, 2)));
1976           __ subw(count, count, 2 >> shift);
1977           __ bind(L_skip_align2);
1978           // Fallthrough
1979         case T_INT:
1980           // Align to 8 bytes, we know we are 4 byte aligned to start.
1981           __ tbz(to, 2, L_skip_align4);
1982           __ strw(value, Address(__ post(to, 4)));
1983           __ subw(count, count, 4 >> shift);
1984           __ bind(L_skip_align4);
1985           break;
1986         default: ShouldNotReachHere();
1987       }
1988     }
1989 
1990     //
1991     //  Fill large chunks
1992     //
1993     __ lsrw(cnt_words, count, 3 - shift); // number of words
1994     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
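         // value has now been replicated across all 64 bits (e.g. a byte fill
         // value of 0xAB became 0xABABABABABABABAB), so whole words can be stored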
1995     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
1996     if (UseBlockZeroing) {
1997       Label non_block_zeroing, rest;
1998       // count >= BlockZeroingLowLimit && value == 0
1999       __ subs(rscratch1, cnt_words, BlockZeroingLowLimit >> 3);
2000       __ ccmp(value, 0 /* comparing value */, 0 /* NZCV */, Assembler::GE);
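           // subs set the flags for cnt_words >= BlockZeroingLowLimit/8; ccmp compares
           // value with zero only when that held, otherwise it forces NE, so the
           // branch below falls back to fill_words unless both conditions are satisfied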
2001       __ br(Assembler::NE, non_block_zeroing);
2002       __ mov(bz_base, to);
2003       __ block_zero(bz_base, cnt_words, true);
2004       __ mov(to, bz_base);
2005       __ b(rest);
2006       __ bind(non_block_zeroing);
2007       __ fill_words(to, cnt_words, value);
2008       __ bind(rest);
2009     }
2010     else {
2011       __ fill_words(to, cnt_words, value);
2012     }
2013 
2014     // Remaining count is less than 8 bytes. Fill it by a single store.
2015     // Note that the total length is no less than 8 bytes.
2016     if (t == T_BYTE || t == T_SHORT) {
2017       Label L_exit1;
2018       __ cbzw(count, L_exit1);
2019       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2020       __ str(value, Address(to, -8));    // overwrite some elements
2021       __ bind(L_exit1);
2022       __ leave();
2023       __ ret(lr);
2024     }
2025 
2026     // Handle fills of less than 8 bytes, element by element.
2027     Label L_fill_2, L_fill_4, L_exit2;
2028     __ bind(L_fill_elements);
2029     switch (t) {
2030       case T_BYTE:
2031         __ tbz(count, 0, L_fill_2);
2032         __ strb(value, Address(__ post(to, 1)));
2033         __ bind(L_fill_2);
2034         __ tbz(count, 1, L_fill_4);
2035         __ strh(value, Address(__ post(to, 2)));
2036         __ bind(L_fill_4);
2037         __ tbz(count, 2, L_exit2);
2038         __ strw(value, Address(to));
2039         break;
2040       case T_SHORT:
2041         __ tbz(count, 0, L_fill_4);
2042         __ strh(value, Address(__ post(to, 2)));
2043         __ bind(L_fill_4);
2044         __ tbz(count, 1, L_exit2);
2045         __ strw(value, Address(to));
2046         break;
2047       case T_INT:
2048         __ cbzw(count, L_exit2);
2049         __ strw(value, Address(to));
2050         break;
2051       default: ShouldNotReachHere();
2052     }
2053     __ bind(L_exit2);
2054     __ leave();
2055     __ ret(lr);
2056     return start;
2057   }
2058 
2059   //
2060   //  Generate 'unsafe' array copy stub
2061   //  Though just as safe as the other stubs, it takes an unscaled
2062   //  size_t argument instead of an element count.
2063   //
2064   //  Input:
2065   //    c_rarg0   - source array address
2066   //    c_rarg1   - destination array address
2067   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2068   //
2069   // Examines the alignment of the operands and dispatches
2070   // to a long, int, short, or byte copy loop.
2071   //
2072   address generate_unsafe_copy(const char *name,
2073                                address byte_copy_entry,
2074                                address short_copy_entry,
2075                                address int_copy_entry,
2076                                address long_copy_entry) {
2077     Label L_long_aligned, L_int_aligned, L_short_aligned;
2078     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2079 
2080     __ align(CodeEntryAlignment);
2081     StubCodeMark mark(this, "StubRoutines", name);
2082     address start = __ pc();
2083     __ enter(); // required for proper stackwalking of RuntimeStub frame
2084 
2085     // bump this on entry, not on exit:
2086     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2087 
2088     __ orr(rscratch1, s, d);
2089     __ orr(rscratch1, rscratch1, count);
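         // the low bits of (s | d | count) are zero only if source, destination
         // and byte count are all aligned to the granularity tested below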
2090 
2091     __ andr(rscratch1, rscratch1, BytesPerLong-1);
2092     __ cbz(rscratch1, L_long_aligned);
2093     __ andr(rscratch1, rscratch1, BytesPerInt-1);
2094     __ cbz(rscratch1, L_int_aligned);
2095     __ tbz(rscratch1, 0, L_short_aligned);
2096     __ b(RuntimeAddress(byte_copy_entry));
2097 
2098     __ BIND(L_short_aligned);
2099     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2100     __ b(RuntimeAddress(short_copy_entry));
2101     __ BIND(L_int_aligned);
2102     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2103     __ b(RuntimeAddress(int_copy_entry));
2104     __ BIND(L_long_aligned);
2105     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2106     __ b(RuntimeAddress(long_copy_entry));
2107 
2108     return start;
2109   }
2110 
2111   //
2112   //  Generate generic array copy stubs
2113   //
2114   //  Input:
2115   //    c_rarg0    -  src oop
2116   //    c_rarg1    -  src_pos (32-bits)
2117   //    c_rarg2    -  dst oop
2118   //    c_rarg3    -  dst_pos (32-bits)
2119   //    c_rarg4    -  element count (32-bits)
2120   //
2121   //  Output:
2122   //    r0 ==  0  -  success
2123   //    r0 == -1^K - failure, where K is partial transfer count
2124   //
2125   address generate_generic_copy(const char *name,
2126                                 address byte_copy_entry, address short_copy_entry,
2127                                 address int_copy_entry, address oop_copy_entry,
2128                                 address long_copy_entry, address checkcast_copy_entry) {
2129 
2130     Label L_failed, L_failed_0, L_objArray;
2131     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2132 
2133     // Input registers
2134     const Register src        = c_rarg0;  // source array oop
2135     const Register src_pos    = c_rarg1;  // source position
2136     const Register dst        = c_rarg2;  // destination array oop
2137     const Register dst_pos    = c_rarg3;  // destination position
2138     const Register length     = c_rarg4;
2139 
2140     __ align(CodeEntryAlignment);
2141 
2142     StubCodeMark mark(this, "StubRoutines", name);
2143 
2144     address start = __ pc();
2145 
2146     __ enter(); // required for proper stackwalking of RuntimeStub frame
2147 
2148     // bump this on entry, not on exit:
2149     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2150 
2151     //-----------------------------------------------------------------------
2152     // Assembler stub will be used for this call to arraycopy
2153     // if the following conditions are met:
2154     //
2155     // (1) src and dst must not be null.
2156     // (2) src_pos must not be negative.
2157     // (3) dst_pos must not be negative.
2158     // (4) length  must not be negative.
2159     // (5) src klass and dst klass should be the same and not NULL.
2160     // (6) src and dst should be arrays.
2161     // (7) src_pos + length must not exceed length of src.
2162     // (8) dst_pos + length must not exceed length of dst.
2163     //
2164 
2165     //  if (src == NULL) return -1;
2166     __ cbz(src, L_failed);
2167 
2168     //  if (src_pos < 0) return -1;
2169     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2170 
2171     //  if (dst == NULL) return -1;
2172     __ cbz(dst, L_failed);
2173 
2174     //  if (dst_pos < 0) return -1;
2175     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2176 
2177     // registers used as temp
2178     const Register scratch_length    = r16; // elements count to copy
2179     const Register scratch_src_klass = r17; // array klass
2180     const Register lh                = r18; // layout helper
2181 
2182     //  if (length < 0) return -1;
2183     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2184     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2185 
2186     __ load_klass(scratch_src_klass, src);
2187 #ifdef ASSERT
2188     //  assert(src->klass() != NULL);
2189     {
2190       BLOCK_COMMENT("assert klasses not null {");
2191       Label L1, L2;
2192       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2193       __ bind(L1);
2194       __ stop("broken null klass");
2195       __ bind(L2);
2196       __ load_klass(rscratch1, dst);
2197       __ cbz(rscratch1, L1);     // this would be broken also
2198       BLOCK_COMMENT("} assert klasses not null done");
2199     }
2200 #endif
2201 
2202     // Load layout helper (32-bits)
2203     //
2204     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2205     // 32        30    24            16              8     2                 0
2206     //
2207     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2208     //
2209 
2210     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2211 
2212     // Handle objArrays completely differently...
2213     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2214     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2215     __ movw(rscratch1, objArray_lh);
2216     __ eorw(rscratch2, lh, rscratch1);
2217     __ cbzw(rscratch2, L_objArray);
2218 
2219     //  if (src->klass() != dst->klass()) return -1;
2220     __ load_klass(rscratch2, dst);
2221     __ eor(rscratch2, rscratch2, scratch_src_klass);
2222     __ cbnz(rscratch2, L_failed);
2223 
2224     //  if (!src->is_Array()) return -1;
2225     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2226 
2227     // At this point, it is known to be a typeArray (array_tag 0x3).
2228 #ifdef ASSERT
2229     {
2230       BLOCK_COMMENT("assert primitive array {");
2231       Label L;
2232       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2233       __ cmpw(lh, rscratch2);
2234       __ br(Assembler::GE, L);
2235       __ stop("must be a primitive array");
2236       __ bind(L);
2237       BLOCK_COMMENT("} assert primitive array done");
2238     }
2239 #endif
2240 
2241     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2242                            rscratch2, L_failed);
2243 
2244     // TypeArrayKlass
2245     //
2246     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2247     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2248     //
2249 
2250     const Register rscratch1_offset = rscratch1;    // array offset
2251     const Register r18_elsize = lh; // element size
2252 
2253     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2254            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2255     __ add(src, src, rscratch1_offset);           // src array offset
2256     __ add(dst, dst, rscratch1_offset);           // dst array offset
2257     BLOCK_COMMENT("choose copy loop based on element size");
2258 
2259     // next registers should be set before the jump to corresponding stub
2260     const Register from     = c_rarg0;  // source array address
2261     const Register to       = c_rarg1;  // destination array address
2262     const Register count    = c_rarg2;  // elements count
2263 
2264     // 'from', 'to' and 'count' must be set in this order, since they
2265     // alias 'src', 'src_pos' and 'dst' (c_rarg0, c_rarg1, c_rarg2).
2266 
2267     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2268 
2269     // The possible values of elsize are 0-3, i.e. exact_log2(element
2270     // size in bytes).  We do a simple bitwise binary search.
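         // elsize: 0 = byte, 1 = short, 2 = int, 3 = long.  Bit 1 separates the
         // {byte,short} pair from {int,long}; bit 0 then picks within each pair.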
2271   __ BIND(L_copy_bytes);
2272     __ tbnz(r18_elsize, 1, L_copy_ints);
2273     __ tbnz(r18_elsize, 0, L_copy_shorts);
2274     __ lea(from, Address(src, src_pos));// src_addr
2275     __ lea(to,   Address(dst, dst_pos));// dst_addr
2276     __ movw(count, scratch_length); // length
2277     __ b(RuntimeAddress(byte_copy_entry));
2278 
2279   __ BIND(L_copy_shorts);
2280     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2281     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2282     __ movw(count, scratch_length); // length
2283     __ b(RuntimeAddress(short_copy_entry));
2284 
2285   __ BIND(L_copy_ints);
2286     __ tbnz(r18_elsize, 0, L_copy_longs);
2287     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2288     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2289     __ movw(count, scratch_length); // length
2290     __ b(RuntimeAddress(int_copy_entry));
2291 
2292   __ BIND(L_copy_longs);
2293 #ifdef ASSERT
2294     {
2295       BLOCK_COMMENT("assert long copy {");
2296       Label L;
2297       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2298       __ cmpw(r18_elsize, LogBytesPerLong);
2299       __ br(Assembler::EQ, L);
2300       __ stop("must be long copy, but elsize is wrong");
2301       __ bind(L);
2302       BLOCK_COMMENT("} assert long copy done");
2303     }
2304 #endif
2305     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2306     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2307     __ movw(count, scratch_length); // length
2308     __ b(RuntimeAddress(long_copy_entry));
2309 
2310     // ObjArrayKlass
2311   __ BIND(L_objArray);
2312     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2313 
2314     Label L_plain_copy, L_checkcast_copy;
2315     //  test array classes for subtyping
2316     __ load_klass(r18, dst);
2317     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2318     __ br(Assembler::NE, L_checkcast_copy);
2319 
2320     // Identically typed arrays can be copied without element-wise checks.
2321     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2322                            rscratch2, L_failed);
2323 
2324     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2325     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2326     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2327     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2328     __ movw(count, scratch_length); // length
2329   __ BIND(L_plain_copy);
2330     __ b(RuntimeAddress(oop_copy_entry));
2331 
2332   __ BIND(L_checkcast_copy);
2333     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2334     {
2335       // Before looking at dst.length, make sure dst is also an objArray.
2336       __ ldrw(rscratch1, Address(r18, lh_offset));
2337       __ movw(rscratch2, objArray_lh);
2338       __ eorw(rscratch1, rscratch1, rscratch2);
2339       __ cbnzw(rscratch1, L_failed);
2340 
2341       // It is safe to examine both src.length and dst.length.
2342       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2343                              r18, L_failed);
2344 
2345       const Register rscratch2_dst_klass = rscratch2;
2346       __ load_klass(rscratch2_dst_klass, dst); // reload
2347 
2348       // Marshal the base address arguments now, freeing registers.
2349       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2350       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2351       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2352       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2353       __ movw(count, length);           // length (reloaded)
2354       Register sco_temp = c_rarg3;      // this register is free now
2355       assert_different_registers(from, to, count, sco_temp,
2356                                  rscratch2_dst_klass, scratch_src_klass);
2357       // assert_clean_int(count, sco_temp);
2358 
2359       // Generate the type check.
2360       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2361       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2362       // assert_clean_int(sco_temp, r18);
2363       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2364 
2365       // Fetch destination element klass from the ObjArrayKlass header.
2366       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2367       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2368       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2369 
2370       // the checkcast_copy loop needs two extra arguments:
2371       assert(c_rarg3 == sco_temp, "#3 already in place");
2372       // Set up arguments for checkcast_copy_entry.
2373       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2374       __ b(RuntimeAddress(checkcast_copy_entry));
2375     }
2376 
2377   __ BIND(L_failed);
2378     __ mov(r0, -1);
2379     __ leave();   // required for proper stackwalking of RuntimeStub frame
2380     __ ret(lr);
2381 
2382     return start;
2383   }
2384 
2385   void generate_arraycopy_stubs() {
2386     address entry;
2387     address entry_jbyte_arraycopy;
2388     address entry_jshort_arraycopy;
2389     address entry_jint_arraycopy;
2390     address entry_oop_arraycopy;
2391     address entry_jlong_arraycopy;
2392     address entry_checkcast_arraycopy;
2393 
2394     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2395     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2396 
2397     StubRoutines::aarch64::_zero_longs = generate_zero_longs(r10, r11);
2398 
2399     //*** jbyte
2400     // Always need aligned and unaligned versions
2401     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2402                                                                                   "jbyte_disjoint_arraycopy");
2403     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2404                                                                                   &entry_jbyte_arraycopy,
2405                                                                                   "jbyte_arraycopy");
2406     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2407                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2408     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2409                                                                                   "arrayof_jbyte_arraycopy");
2410 
2411     //*** jshort
2412     // Always need aligned and unaligned versions
2413     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2414                                                                                     "jshort_disjoint_arraycopy");
2415     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2416                                                                                     &entry_jshort_arraycopy,
2417                                                                                     "jshort_arraycopy");
2418     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2419                                                                                     "arrayof_jshort_disjoint_arraycopy");
2420     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2421                                                                                     "arrayof_jshort_arraycopy");
2422 
2423     //*** jint
2424     // Aligned versions
2425     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2426                                                                                 "arrayof_jint_disjoint_arraycopy");
2427     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2428                                                                                 "arrayof_jint_arraycopy");
2429     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2430     // entry_jint_arraycopy always points to the unaligned version
2431     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2432                                                                                 "jint_disjoint_arraycopy");
2433     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2434                                                                                 &entry_jint_arraycopy,
2435                                                                                 "jint_arraycopy");
2436 
2437     //*** jlong
2438     // It is always aligned
2439     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2440                                                                                   "arrayof_jlong_disjoint_arraycopy");
2441     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2442                                                                                   "arrayof_jlong_arraycopy");
2443     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2444     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2445 
2446     //*** oops
2447     {
2448       // With compressed oops we need unaligned versions; notice that
2449       // we overwrite entry_oop_arraycopy.
2450       bool aligned = !UseCompressedOops;
2451 
2452       StubRoutines::_arrayof_oop_disjoint_arraycopy
2453         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2454                                      /*dest_uninitialized*/false);
2455       StubRoutines::_arrayof_oop_arraycopy
2456         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2457                                      /*dest_uninitialized*/false);
2458       // Uninitialized-destination versions, which need no pre-barrier
2459       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2460         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2461                                      /*dest_uninitialized*/true);
2462       StubRoutines::_arrayof_oop_arraycopy_uninit
2463         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2464                                      /*dest_uninitialized*/true);
2465     }
2466 
2467     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2468     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2469     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2470     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2471 
2472     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2473     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2474                                                                         /*dest_uninitialized*/true);
2475 
2476     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2477                                                               entry_jbyte_arraycopy,
2478                                                               entry_jshort_arraycopy,
2479                                                               entry_jint_arraycopy,
2480                                                               entry_jlong_arraycopy);
2481 
2482     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2483                                                                entry_jbyte_arraycopy,
2484                                                                entry_jshort_arraycopy,
2485                                                                entry_jint_arraycopy,
2486                                                                entry_oop_arraycopy,
2487                                                                entry_jlong_arraycopy,
2488                                                                entry_checkcast_arraycopy);
2489 
2490     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2491     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2492     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2493     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2494     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2495     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2496   }
2497 
2498   // Arguments:
2499   //
2500   // Inputs:
2501   //   c_rarg0   - source byte array address
2502   //   c_rarg1   - destination byte array address
2503   //   c_rarg2   - K (key) in little endian int array
2504   //
2505   address generate_aescrypt_encryptBlock() {
2506     __ align(CodeEntryAlignment);
2507     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2508 
2509     Label L_doLast;
2510 
2511     const Register from        = c_rarg0;  // source array address
2512     const Register to          = c_rarg1;  // destination array address
2513     const Register key         = c_rarg2;  // key array address
2514     const Register keylen      = rscratch1;
2515 
2516     address start = __ pc();
2517     __ enter();
2518 
2519     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
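         // keylen is the length of the expanded key array in 32-bit words:
         // 44, 52 or 60 for AES-128, AES-192 and AES-256, which is what the
         // cmpw(keylen, 44/52) checks below distinguish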
2520 
2521     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2522 
2523     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2524     __ rev32(v1, __ T16B, v1);
2525     __ rev32(v2, __ T16B, v2);
2526     __ rev32(v3, __ T16B, v3);
2527     __ rev32(v4, __ T16B, v4);
2528     __ aese(v0, v1);
2529     __ aesmc(v0, v0);
2530     __ aese(v0, v2);
2531     __ aesmc(v0, v0);
2532     __ aese(v0, v3);
2533     __ aesmc(v0, v0);
2534     __ aese(v0, v4);
2535     __ aesmc(v0, v0);
2536 
2537     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2538     __ rev32(v1, __ T16B, v1);
2539     __ rev32(v2, __ T16B, v2);
2540     __ rev32(v3, __ T16B, v3);
2541     __ rev32(v4, __ T16B, v4);
2542     __ aese(v0, v1);
2543     __ aesmc(v0, v0);
2544     __ aese(v0, v2);
2545     __ aesmc(v0, v0);
2546     __ aese(v0, v3);
2547     __ aesmc(v0, v0);
2548     __ aese(v0, v4);
2549     __ aesmc(v0, v0);
2550 
2551     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2552     __ rev32(v1, __ T16B, v1);
2553     __ rev32(v2, __ T16B, v2);
2554 
2555     __ cmpw(keylen, 44);
2556     __ br(Assembler::EQ, L_doLast);
2557 
2558     __ aese(v0, v1);
2559     __ aesmc(v0, v0);
2560     __ aese(v0, v2);
2561     __ aesmc(v0, v0);
2562 
2563     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2564     __ rev32(v1, __ T16B, v1);
2565     __ rev32(v2, __ T16B, v2);
2566 
2567     __ cmpw(keylen, 52);
2568     __ br(Assembler::EQ, L_doLast);
2569 
2570     __ aese(v0, v1);
2571     __ aesmc(v0, v0);
2572     __ aese(v0, v2);
2573     __ aesmc(v0, v0);
2574 
2575     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2576     __ rev32(v1, __ T16B, v1);
2577     __ rev32(v2, __ T16B, v2);
2578 
2579     __ BIND(L_doLast);
2580 
2581     __ aese(v0, v1);
2582     __ aesmc(v0, v0);
2583     __ aese(v0, v2);
2584 
2585     __ ld1(v1, __ T16B, key);
2586     __ rev32(v1, __ T16B, v1);
2587     __ eor(v0, __ T16B, v0, v1);
2588 
2589     __ st1(v0, __ T16B, to);
2590 
2591     __ mov(r0, 0);
2592 
2593     __ leave();
2594     __ ret(lr);
2595 
2596     return start;
2597   }
2598 
2599   // Arguments:
2600   //
2601   // Inputs:
2602   //   c_rarg0   - source byte array address
2603   //   c_rarg1   - destination byte array address
2604   //   c_rarg2   - K (key) in little endian int array
2605   //
2606   address generate_aescrypt_decryptBlock() {
2607     assert(UseAES, "need AES instructions and misaligned SSE support");
2608     __ align(CodeEntryAlignment);
2609     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2610     Label L_doLast;
2611 
2612     const Register from        = c_rarg0;  // source array address
2613     const Register to          = c_rarg1;  // destination array address
2614     const Register key         = c_rarg2;  // key array address
2615     const Register keylen      = rscratch1;
2616 
2617     address start = __ pc();
2618     __ enter(); // required for proper stackwalking of RuntimeStub frame
2619 
2620     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2621 
2622     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2623 
2624     __ ld1(v5, __ T16B, __ post(key, 16));
2625     __ rev32(v5, __ T16B, v5);
2626 
2627     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2628     __ rev32(v1, __ T16B, v1);
2629     __ rev32(v2, __ T16B, v2);
2630     __ rev32(v3, __ T16B, v3);
2631     __ rev32(v4, __ T16B, v4);
2632     __ aesd(v0, v1);
2633     __ aesimc(v0, v0);
2634     __ aesd(v0, v2);
2635     __ aesimc(v0, v0);
2636     __ aesd(v0, v3);
2637     __ aesimc(v0, v0);
2638     __ aesd(v0, v4);
2639     __ aesimc(v0, v0);
2640 
2641     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2642     __ rev32(v1, __ T16B, v1);
2643     __ rev32(v2, __ T16B, v2);
2644     __ rev32(v3, __ T16B, v3);
2645     __ rev32(v4, __ T16B, v4);
2646     __ aesd(v0, v1);
2647     __ aesimc(v0, v0);
2648     __ aesd(v0, v2);
2649     __ aesimc(v0, v0);
2650     __ aesd(v0, v3);
2651     __ aesimc(v0, v0);
2652     __ aesd(v0, v4);
2653     __ aesimc(v0, v0);
2654 
2655     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2656     __ rev32(v1, __ T16B, v1);
2657     __ rev32(v2, __ T16B, v2);
2658 
2659     __ cmpw(keylen, 44);
2660     __ br(Assembler::EQ, L_doLast);
2661 
2662     __ aesd(v0, v1);
2663     __ aesimc(v0, v0);
2664     __ aesd(v0, v2);
2665     __ aesimc(v0, v0);
2666 
2667     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2668     __ rev32(v1, __ T16B, v1);
2669     __ rev32(v2, __ T16B, v2);
2670 
2671     __ cmpw(keylen, 52);
2672     __ br(Assembler::EQ, L_doLast);
2673 
2674     __ aesd(v0, v1);
2675     __ aesimc(v0, v0);
2676     __ aesd(v0, v2);
2677     __ aesimc(v0, v0);
2678 
2679     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2680     __ rev32(v1, __ T16B, v1);
2681     __ rev32(v2, __ T16B, v2);
2682 
2683     __ BIND(L_doLast);
2684 
2685     __ aesd(v0, v1);
2686     __ aesimc(v0, v0);
2687     __ aesd(v0, v2);
2688 
2689     __ eor(v0, __ T16B, v0, v5);
2690 
2691     __ st1(v0, __ T16B, to);
2692 
2693     __ mov(r0, 0);
2694 
2695     __ leave();
2696     __ ret(lr);
2697 
2698     return start;
2699   }
2700 
2701   // Arguments:
2702   //
2703   // Inputs:
2704   //   c_rarg0   - source byte array address
2705   //   c_rarg1   - destination byte array address
2706   //   c_rarg2   - K (key) in little endian int array
2707   //   c_rarg3   - r vector byte array address
2708   //   c_rarg4   - input length
2709   //
2710   // Output:
2711   //   x0        - input length
2712   //
2713   address generate_cipherBlockChaining_encryptAESCrypt() {
2714     assert(UseAES, "need AES instructions and misaligned SSE support");
2715     __ align(CodeEntryAlignment);
2716     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2717 
2718     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52, _L_finish;
2719 
2720     const Register from        = c_rarg0;  // source array address
2721     const Register to          = c_rarg1;  // destination array address
2722     const Register key         = c_rarg2;  // key array address
2723     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2724                                            // and left with the results of the last encryption block
2725     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2726     const Register keylen      = rscratch1;
2727 
2728     address start = __ pc();
2729 
2730       __ enter();
2731 
2732       __ subsw(rscratch2, len_reg, zr);
2733       __ br(Assembler::LE, _L_finish);
2734 
2735       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2736 
2737       __ ld1(v0, __ T16B, rvec);
2738 
2739       __ cmpw(keylen, 52);
2740       __ br(Assembler::CC, L_loadkeys_44);
2741       __ br(Assembler::EQ, L_loadkeys_52);
2742 
2743       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2744       __ rev32(v17, __ T16B, v17);
2745       __ rev32(v18, __ T16B, v18);
2746     __ BIND(L_loadkeys_52);
2747       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2748       __ rev32(v19, __ T16B, v19);
2749       __ rev32(v20, __ T16B, v20);
2750     __ BIND(L_loadkeys_44);
2751       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2752       __ rev32(v21, __ T16B, v21);
2753       __ rev32(v22, __ T16B, v22);
2754       __ rev32(v23, __ T16B, v23);
2755       __ rev32(v24, __ T16B, v24);
2756       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2757       __ rev32(v25, __ T16B, v25);
2758       __ rev32(v26, __ T16B, v26);
2759       __ rev32(v27, __ T16B, v27);
2760       __ rev32(v28, __ T16B, v28);
2761       __ ld1(v29, v30, v31, __ T16B, key);
2762       __ rev32(v29, __ T16B, v29);
2763       __ rev32(v30, __ T16B, v30);
2764       __ rev32(v31, __ T16B, v31);
2765 
2766     __ BIND(L_aes_loop);
2767       __ ld1(v1, __ T16B, __ post(from, 16));
2768       __ eor(v0, __ T16B, v0, v1);
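           // CBC chaining: v0 holds the previous ciphertext block (the IV loaded
           // from rvec on the first iteration); XOR with the plaintext block just
           // loaded forms the input to this block's encryption rounds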
2769 
2770       __ br(Assembler::CC, L_rounds_44);
2771       __ br(Assembler::EQ, L_rounds_52);
2772 
2773       __ aese(v0, v17); __ aesmc(v0, v0);
2774       __ aese(v0, v18); __ aesmc(v0, v0);
2775     __ BIND(L_rounds_52);
2776       __ aese(v0, v19); __ aesmc(v0, v0);
2777       __ aese(v0, v20); __ aesmc(v0, v0);
2778     __ BIND(L_rounds_44);
2779       __ aese(v0, v21); __ aesmc(v0, v0);
2780       __ aese(v0, v22); __ aesmc(v0, v0);
2781       __ aese(v0, v23); __ aesmc(v0, v0);
2782       __ aese(v0, v24); __ aesmc(v0, v0);
2783       __ aese(v0, v25); __ aesmc(v0, v0);
2784       __ aese(v0, v26); __ aesmc(v0, v0);
2785       __ aese(v0, v27); __ aesmc(v0, v0);
2786       __ aese(v0, v28); __ aesmc(v0, v0);
2787       __ aese(v0, v29); __ aesmc(v0, v0);
2788       __ aese(v0, v30);
2789       __ eor(v0, __ T16B, v0, v31);
2790 
2791       __ st1(v0, __ T16B, __ post(to, 16));
2792 
2793       __ subw(len_reg, len_reg, 16);
2794       __ cbnzw(len_reg, L_aes_loop);
2795 
2796       __ st1(v0, __ T16B, rvec);
2797 
2798     __ BIND(_L_finish);
2799       __ mov(r0, rscratch2);
2800 
2801       __ leave();
2802       __ ret(lr);
2803 
2804       return start;
2805   }
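
       // In C, approximately (a hedged sketch of the loop above, not the stub
       // itself; aes_encrypt_block() is a stand-in for the AESE/AESMC round
       // sequence keyed by v17..v31 and is not defined anywhere in this file):
       //
       //   int cbc_encrypt(const unsigned char* from, unsigned char* to,
       //                   const unsigned int* key, unsigned char* rvec, int len) {
       //     unsigned char block[16];
       //     for (int done = 0; done < len; done += 16) {
       //       for (int i = 0; i < 16; i++)
       //         block[i] = from[done + i] ^ rvec[i];  // chain with IV/previous ciphertext
       //       aes_encrypt_block(block, key);          // rounds keyed by the expanded key K
       //       for (int i = 0; i < 16; i++) {
       //         to[done + i] = block[i];
       //         rvec[i]      = block[i];              // becomes the IV for the next block
       //       }
       //     }
       //     return len;                               // the stub returns the input length in r0
       //   }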
2806 
2807   // Arguments:
2808   //
2809   // Inputs:
2810   //   c_rarg0   - source byte array address
2811   //   c_rarg1   - destination byte array address
2812   //   c_rarg2   - K (key) in little endian int array
2813   //   c_rarg3   - r vector byte array address
2814   //   c_rarg4   - input length
2815   //
2816   // Output:
2817   //   r0       - input length
2818   //
2819   address generate_cipherBlockChaining_decryptAESCrypt() {
2820     assert(UseAES, "need AES instructions");
2821     __ align(CodeEntryAlignment);
2822     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2823 
2824     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52, _L_finish;
2825 
2826     const Register from        = c_rarg0;  // source array address
2827     const Register to          = c_rarg1;  // destination array address
2828     const Register key         = c_rarg2;  // key array address
2829     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector (IV) address
2830                                            // and left holding the last ciphertext block consumed
2831     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2832     const Register keylen      = rscratch1;
2833 
2834     address start = __ pc();
2835 
2836       __ enter();
2837 
2838       __ subsw(rscratch2, len_reg, zr);
2839       __ br(Assembler::LE, _L_finish);
2840 
2841       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2842 
2843       __ ld1(v2, __ T16B, rvec);
2844 
2845       __ ld1(v31, __ T16B, __ post(key, 16));
2846       __ rev32(v31, __ T16B, v31);
2847 
2848       __ cmpw(keylen, 52);
2849       __ br(Assembler::CC, L_loadkeys_44);
2850       __ br(Assembler::EQ, L_loadkeys_52);
2851 
2852       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2853       __ rev32(v17, __ T16B, v17);
2854       __ rev32(v18, __ T16B, v18);
2855     __ BIND(L_loadkeys_52);
2856       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2857       __ rev32(v19, __ T16B, v19);
2858       __ rev32(v20, __ T16B, v20);
2859     __ BIND(L_loadkeys_44);
2860       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2861       __ rev32(v21, __ T16B, v21);
2862       __ rev32(v22, __ T16B, v22);
2863       __ rev32(v23, __ T16B, v23);
2864       __ rev32(v24, __ T16B, v24);
2865       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2866       __ rev32(v25, __ T16B, v25);
2867       __ rev32(v26, __ T16B, v26);
2868       __ rev32(v27, __ T16B, v27);
2869       __ rev32(v28, __ T16B, v28);
2870       __ ld1(v29, v30, __ T16B, key);
2871       __ rev32(v29, __ T16B, v29);
2872       __ rev32(v30, __ T16B, v30);
2873 
2874     __ BIND(L_aes_loop);
2875       __ ld1(v0, __ T16B, __ post(from, 16));
2876       __ orr(v1, __ T16B, v0, v0);
2877 
2878       __ br(Assembler::CC, L_rounds_44);
2879       __ br(Assembler::EQ, L_rounds_52);
2880 
2881       __ aesd(v0, v17); __ aesimc(v0, v0);
2882       __ aesd(v0, v18); __ aesimc(v0, v0);
2883     __ BIND(L_rounds_52);
2884       __ aesd(v0, v19); __ aesimc(v0, v0);
2885       __ aesd(v0, v20); __ aesimc(v0, v0);
2886     __ BIND(L_rounds_44);
2887       __ aesd(v0, v21); __ aesimc(v0, v0);
2888       __ aesd(v0, v22); __ aesimc(v0, v0);
2889       __ aesd(v0, v23); __ aesimc(v0, v0);
2890       __ aesd(v0, v24); __ aesimc(v0, v0);
2891       __ aesd(v0, v25); __ aesimc(v0, v0);
2892       __ aesd(v0, v26); __ aesimc(v0, v0);
2893       __ aesd(v0, v27); __ aesimc(v0, v0);
2894       __ aesd(v0, v28); __ aesimc(v0, v0);
2895       __ aesd(v0, v29); __ aesimc(v0, v0);
2896       __ aesd(v0, v30);
2897       __ eor(v0, __ T16B, v0, v31);
2898       __ eor(v0, __ T16B, v0, v2);
2899 
2900       __ st1(v0, __ T16B, __ post(to, 16));
2901       __ orr(v2, __ T16B, v1, v1);
2902 
2903       __ subw(len_reg, len_reg, 16);
2904       __ cbnzw(len_reg, L_aes_loop);
2905 
2906       __ st1(v2, __ T16B, rvec);
2907 
2908     __ BIND(_L_finish);
2909       __ mov(r0, rscratch2);
2910 
2911       __ leave();
2912       __ ret(lr);
2913 
2914     return start;
2915   }
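
       // In C, approximately (a hedged sketch; aes_decrypt_block() stands in for
       // the AESD/AESIMC round sequence above and is not defined anywhere in this
       // file):
       //
       //   int cbc_decrypt(const unsigned char* from, unsigned char* to,
       //                   const unsigned int* key, unsigned char* rvec, int len) {
       //     unsigned char cipher[16], plain[16];
       //     for (int done = 0; done < len; done += 16) {
       //       for (int i = 0; i < 16; i++) {
       //         cipher[i] = from[done + i];           // keep a copy (v1/v2 above)
       //         plain[i]  = cipher[i];
       //       }
       //       aes_decrypt_block(plain, key);
       //       for (int i = 0; i < 16; i++)
       //         to[done + i] = plain[i] ^ rvec[i];    // un-chain with IV/previous ciphertext
       //       for (int i = 0; i < 16; i++)
       //         rvec[i] = cipher[i];
       //     }
       //     return len;
       //   }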
2916 
2917   // Arguments:
2918   //
2919   // Inputs:
2920   //   c_rarg0   - byte[]  source+offset
2921   //   c_rarg1   - int[]   SHA.state
2922   //   c_rarg2   - int     offset
2923   //   c_rarg3   - int     limit
2924   //
2925   address generate_sha1_implCompress(bool multi_block, const char *name) {
2926     __ align(CodeEntryAlignment);
2927     StubCodeMark mark(this, "StubRoutines", name);
2928     address start = __ pc();
2929 
2930     Register buf   = c_rarg0;
2931     Register state = c_rarg1;
2932     Register ofs   = c_rarg2;
2933     Register limit = c_rarg3;
2934 
2935     Label keys;
2936     Label sha1_loop;
2937 
2938     // load the keys into v0..v3
2939     __ adr(rscratch1, keys);
2940     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2941     // load 5 words state into v6, v7
2942     __ ldrq(v6, Address(state, 0));
2943     __ ldrs(v7, Address(state, 16));
2944 
2945 
2946     __ BIND(sha1_loop);
2947     // load 64 bytes of data into v16..v19
2948     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2949     __ rev32(v16, __ T16B, v16);
2950     __ rev32(v17, __ T16B, v17);
2951     __ rev32(v18, __ T16B, v18);
2952     __ rev32(v19, __ T16B, v19);
2953 
2954     // do the sha1
2955     __ addv(v4, __ T4S, v16, v0);
2956     __ orr(v20, __ T16B, v6, v6);
2957 
2958     FloatRegister d0 = v16;
2959     FloatRegister d1 = v17;
2960     FloatRegister d2 = v18;
2961     FloatRegister d3 = v19;
2962 
2963     for (int round = 0; round < 20; round++) {
2964       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2965       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2966       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2967       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2968       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2969 
2970       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2971       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2972       __ sha1h(tmp2, __ T4S, v20);
2973       if (round < 5)
2974         __ sha1c(v20, __ T4S, tmp3, tmp4);
2975       else if (round < 10 || round >= 15)
2976         __ sha1p(v20, __ T4S, tmp3, tmp4);
2977       else
2978         __ sha1m(v20, __ T4S, tmp3, tmp4);
2979       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2980 
2981       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2982     }
2983 
2984     __ addv(v7, __ T2S, v7, v21);
2985     __ addv(v6, __ T4S, v6, v20);
2986 
2987     if (multi_block) {
2988       __ add(ofs, ofs, 64);
2989       __ cmp(ofs, limit);
2990       __ br(Assembler::LE, sha1_loop);
2991       __ mov(c_rarg0, ofs); // return ofs
2992     }
2993 
2994     __ strq(v6, Address(state, 0));
2995     __ strs(v7, Address(state, 16));
2996 
2997     __ ret(lr);
2998 
2999     __ bind(keys);
3000     __ emit_int32(0x5a827999);
3001     __ emit_int32(0x6ed9eba1);
3002     __ emit_int32(0x8f1bbcdc);
3003     __ emit_int32(0xca62c1d6);
3004 
3005     return start;
3006   }
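
       // In C, approximately (a hedged, textbook rendering of one SHA-1 block
       // compression; the stub above interleaves the same work through the
       // SHA1C/SHA1P/SHA1M instructions, four rounds per iteration of its loop):
       //
       //   static unsigned int rotl(unsigned int x, int n) { return (x << n) | (x >> (32 - n)); }
       //
       //   void sha1_block(unsigned int state[5], const unsigned char block[64]) {
       //     unsigned int w[80];
       //     for (int i = 0; i < 16; i++)
       //       w[i] = (unsigned int)block[4*i] << 24 | block[4*i+1] << 16
       //            | block[4*i+2] << 8 | block[4*i+3];
       //     for (int i = 16; i < 80; i++)
       //       w[i] = rotl(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
       //     unsigned int a = state[0], b = state[1], c = state[2], d = state[3], e = state[4];
       //     for (int i = 0; i < 80; i++) {
       //       unsigned int f, k;
       //       if (i < 20)      { f = (b & c) | (~b & d);          k = 0x5a827999; }
       //       else if (i < 40) { f = b ^ c ^ d;                   k = 0x6ed9eba1; }
       //       else if (i < 60) { f = (b & c) | (b & d) | (c & d); k = 0x8f1bbcdc; }
       //       else             { f = b ^ c ^ d;                   k = 0xca62c1d6; }
       //       unsigned int t = rotl(a, 5) + f + e + k + w[i];
       //       e = d; d = c; c = b; b = rotl(b, 30); a = t;
       //     }
       //     state[0] += a; state[1] += b; state[2] += c; state[3] += d; state[4] += e;
       //   }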
3007 
3008 
3009   // Arguments:
3010   //
3011   // Inputs:
3012   //   c_rarg0   - byte[]  source+offset
3013   //   c_rarg1   - int[]   SHA.state
3014   //   c_rarg2   - int     offset
3015   //   c_rarg3   - int     limit
3016   //
3017   address generate_sha256_implCompress(bool multi_block, const char *name) {
3018     static const uint32_t round_consts[64] = {
3019       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3020       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3021       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3022       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3023       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3024       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3025       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3026       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3027       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3028       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3029       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3030       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3031       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3032       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3033       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3034       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3035     };
3036     __ align(CodeEntryAlignment);
3037     StubCodeMark mark(this, "StubRoutines", name);
3038     address start = __ pc();
3039 
3040     Register buf   = c_rarg0;
3041     Register state = c_rarg1;
3042     Register ofs   = c_rarg2;
3043     Register limit = c_rarg3;
3044 
3045     Label sha256_loop;
3046 
3047     __ stpd(v8, v9, __ pre(sp, -32));
3048     __ stpd(v10, v11, Address(sp, 16));
3049 
3050 // dga == v0
3051 // dgb == v1
3052 // dg0 == v2
3053 // dg1 == v3
3054 // dg2 == v4
3055 // t0 == v6
3056 // t1 == v7
3057 
3058     // load 16 keys to v16..v31
3059     __ lea(rscratch1, ExternalAddress((address)round_consts));
3060     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3061     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3062     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3063     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3064 
3065     // load 8 words (256 bits) state
3066     __ ldpq(v0, v1, state);
3067 
3068     __ BIND(sha256_loop);
3069     // load 64 bytes of data into v8..v11
3070     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3071     __ rev32(v8, __ T16B, v8);
3072     __ rev32(v9, __ T16B, v9);
3073     __ rev32(v10, __ T16B, v10);
3074     __ rev32(v11, __ T16B, v11);
3075 
3076     __ addv(v6, __ T4S, v8, v16);
3077     __ orr(v2, __ T16B, v0, v0);
3078     __ orr(v3, __ T16B, v1, v1);
3079 
3080     FloatRegister d0 = v8;
3081     FloatRegister d1 = v9;
3082     FloatRegister d2 = v10;
3083     FloatRegister d3 = v11;
3084 
3085 
3086     for (int round = 0; round < 16; round++) {
3087       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3088       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3089       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3090       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3091 
3092       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3093        __ orr(v4, __ T16B, v2, v2);
3094       if (round < 15)
3095         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3096       __ sha256h(v2, __ T4S, v3, tmp2);
3097       __ sha256h2(v3, __ T4S, v4, tmp2);
3098       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3099 
3100       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3101     }
3102 
3103     __ addv(v0, __ T4S, v0, v2);
3104     __ addv(v1, __ T4S, v1, v3);
3105 
3106     if (multi_block) {
3107       __ add(ofs, ofs, 64);
3108       __ cmp(ofs, limit);
3109       __ br(Assembler::LE, sha256_loop);
3110       __ mov(c_rarg0, ofs); // return ofs
3111     }
3112 
3113     __ ldpd(v10, v11, Address(sp, 16));
3114     __ ldpd(v8, v9, __ post(sp, 32));
3115 
3116     __ stpq(v0, v1, state);
3117 
3118     __ ret(lr);
3119 
3120     return start;
3121   }
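
       // In C, approximately (a hedged, textbook rendering of one SHA-256 block
       // compression using the round_consts[] table above; the stub folds the
       // same work into the SHA256H/SHA256H2 instructions, four rounds per
       // iteration of its loop):
       //
       //   static unsigned int ror(unsigned int x, int n) { return (x >> n) | (x << (32 - n)); }
       //
       //   void sha256_block(unsigned int state[8], const unsigned char block[64]) {
       //     unsigned int w[64];
       //     for (int i = 0; i < 16; i++)
       //       w[i] = (unsigned int)block[4*i] << 24 | block[4*i+1] << 16
       //            | block[4*i+2] << 8 | block[4*i+3];
       //     for (int i = 16; i < 64; i++) {
       //       unsigned int s0 = ror(w[i-15], 7) ^ ror(w[i-15], 18) ^ (w[i-15] >> 3);
       //       unsigned int s1 = ror(w[i-2], 17) ^ ror(w[i-2], 19)  ^ (w[i-2] >> 10);
       //       w[i] = w[i-16] + s0 + w[i-7] + s1;
       //     }
       //     unsigned int a = state[0], b = state[1], c = state[2], d = state[3];
       //     unsigned int e = state[4], f = state[5], g = state[6], h = state[7];
       //     for (int i = 0; i < 64; i++) {
       //       unsigned int S1  = ror(e, 6) ^ ror(e, 11) ^ ror(e, 25);
       //       unsigned int ch  = (e & f) ^ (~e & g);
       //       unsigned int t1  = h + S1 + ch + round_consts[i] + w[i];
       //       unsigned int S0  = ror(a, 2) ^ ror(a, 13) ^ ror(a, 22);
       //       unsigned int maj = (a & b) ^ (a & c) ^ (b & c);
       //       h = g; g = f; f = e; e = d + t1;
       //       d = c; c = b; b = a; a = t1 + S0 + maj;
       //     }
       //     state[0] += a; state[1] += b; state[2] += c; state[3] += d;
       //     state[4] += e; state[5] += f; state[6] += g; state[7] += h;
       //   }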
3122 
3123   // Safefetch stubs.
3124   void generate_safefetch(const char* name, int size, address* entry,
3125                           address* fault_pc, address* continuation_pc) {
3126     // safefetch signatures:
3127     //   int      SafeFetch32(int*      adr, int      errValue);
3128     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3129     //
3130     // arguments:
3131     //   c_rarg0 = adr
3132     //   c_rarg1 = errValue
3133     //
3134     // result:
3135     //   r0      = *adr or errValue
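         //
         // If the load at *fault_pc faults, the signal handler resumes execution
         // at *continuation_pc; c_rarg1 still holds errValue at that point, so
         // the stub falls through and returns it.  In C, approximately:
         //   int SafeFetch32(int* adr, int errValue) {
         //     return *adr;               // or errValue, if the load faults
         //   }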
3136 
3137     StubCodeMark mark(this, "StubRoutines", name);
3138 
3139     // Entry point, pc or function descriptor.
3140     *entry = __ pc();
3141 
3142     // Load *adr into c_rarg1, may fault.
3143     *fault_pc = __ pc();
3144     switch (size) {
3145       case 4:
3146         // int32_t
3147         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3148         break;
3149       case 8:
3150         // int64_t
3151         __ ldr(c_rarg1, Address(c_rarg0, 0));
3152         break;
3153       default:
3154         ShouldNotReachHere();
3155     }
3156 
3157     // return errValue or *adr
3158     *continuation_pc = __ pc();
3159     __ mov(r0, c_rarg1);
3160     __ ret(lr);
3161   }
3162 
3163   /**
3164    *  Arguments:
3165    *
3166    * Inputs:
3167    *   c_rarg0   - int crc
3168    *   c_rarg1   - byte* buf
3169    *   c_rarg2   - int length
3170    *
3171    * Output:
3172    *       r0   - int crc result
3173    *
3174    * Preserves:
3175    *       r13
3176    *
3177    */
3178   address generate_updateBytesCRC32() {
3179     assert(UseCRC32Intrinsics, "what are we doing here?");
3180 
3181     __ align(CodeEntryAlignment);
3182     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3183 
3184     address start = __ pc();
3185 
3186     const Register crc   = c_rarg0;  // crc
3187     const Register buf   = c_rarg1;  // source java byte array address
3188     const Register len   = c_rarg2;  // length
3189     const Register table0 = c_rarg3; // crc_table address
3190     const Register table1 = c_rarg4;
3191     const Register table2 = c_rarg5;
3192     const Register table3 = c_rarg6;
3193     const Register tmp3 = c_rarg7;
3194 
3195     BLOCK_COMMENT("Entry:");
3196     __ enter(); // required for proper stackwalking of RuntimeStub frame
3197 
3198     __ kernel_crc32(crc, buf, len,
3199               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3200 
3201     __ leave(); // required for proper stackwalking of RuntimeStub frame
3202     __ ret(lr);
3203 
3204     return start;
3205   }
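
       // In C, approximately (a hedged, single-table, byte-at-a-time sketch of
       // the same CRC-32, reflected polynomial 0xedb88320 as used by
       // java.util.zip.CRC32; kernel_crc32 above computes the same function with
       // a wider, optimized kernel, but the result is identical):
       //
       //   unsigned int crc32_update(unsigned int crc, const unsigned char* buf, int len) {
       //     static unsigned int table[256];
       //     if (table[1] == 0) {                      // build the table once
       //       for (unsigned int n = 0; n < 256; n++) {
       //         unsigned int c = n;
       //         for (int k = 0; k < 8; k++)
       //           c = (c & 1) ? 0xedb88320u ^ (c >> 1) : (c >> 1);
       //         table[n] = c;
       //       }
       //     }
       //     crc = ~crc;
       //     while (len-- > 0)
       //       crc = table[(crc ^ *buf++) & 0xff] ^ (crc >> 8);
       //     return ~crc;
       //   }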
3206 
3207   /**
3208    *  Arguments:
3209    *
3210    *  Input:
3211    *    c_rarg0   - x address
3212    *    c_rarg1   - x length
3213    *    c_rarg2   - y address
3214    *    c_rarg3   - y length
3215    *    c_rarg4   - z address
3216    *    c_rarg5   - z length
3217    */
3218   address generate_multiplyToLen() {
3219     __ align(CodeEntryAlignment);
3220     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3221 
3222     address start = __ pc();
3223     const Register x     = r0;
3224     const Register xlen  = r1;
3225     const Register y     = r2;
3226     const Register ylen  = r3;
3227     const Register z     = r4;
3228     const Register zlen  = r5;
3229 
3230     const Register tmp1  = r10;
3231     const Register tmp2  = r11;
3232     const Register tmp3  = r12;
3233     const Register tmp4  = r13;
3234     const Register tmp5  = r14;
3235     const Register tmp6  = r15;
3236     const Register tmp7  = r16;
3237 
3238     BLOCK_COMMENT("Entry:");
3239     __ enter(); // required for proper stackwalking of RuntimeStub frame
3240     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3241     __ leave(); // required for proper stackwalking of RuntimeStub frame
3242     __ ret(lr);
3243 
3244     return start;
3245   }
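
       // In C, approximately (a hedged schoolbook sketch over 32-bit limbs stored
       // most-significant first, as in BigInteger.multiplyToLen; the
       // multiply_to_len() call above computes the same product with optimized
       // wide arithmetic):
       //
       //   void multiply_to_len(const unsigned int* x, int xlen,
       //                        const unsigned int* y, int ylen, unsigned int* z) {
       //     for (int i = 0; i < xlen + ylen; i++) z[i] = 0;
       //     for (int i = xlen - 1; i >= 0; i--) {
       //       unsigned long long carry = 0;
       //       for (int j = ylen - 1, k = i + j + 1; j >= 0; j--, k--) {
       //         unsigned long long p = (unsigned long long)x[i] * y[j] + z[k] + carry;
       //         z[k] = (unsigned int)p;
       //         carry = p >> 32;
       //       }
       //       z[i] = (unsigned int)carry;
       //     }
       //   }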
3246 
3247   // Continuation point for throwing of implicit exceptions that are
3248   // not handled in the current activation. Fabricates an exception
3249   // oop and initiates normal exception dispatching in this
3250   // frame. Since we need to preserve callee-saved values (currently
3251   // only for C2, but done for C1 as well) we need a callee-saved oop
3252   // map and therefore have to make these stubs into RuntimeStubs
3253   // rather than BufferBlobs.  If the compiler needs all registers to
3254   // be preserved between the fault point and the exception handler
3255   // then it must assume responsibility for that in
3256   // AbstractCompiler::continuation_for_implicit_null_exception or
3257   // continuation_for_implicit_division_by_zero_exception. All other
3258   // implicit exceptions (e.g., NullPointerException or
3259   // AbstractMethodError on entry) are either at call sites or
3260   // otherwise assume that stack unwinding will be initiated, so
3261   // caller saved registers were assumed volatile in the compiler.
3262 
3263 #undef __
3264 #define __ masm->
3265 
3266   address generate_throw_exception(const char* name,
3267                                    address runtime_entry,
3268                                    Register arg1 = noreg,
3269                                    Register arg2 = noreg) {
3270     // Information about frame layout at time of blocking runtime call.
3271     // Note that we only have to preserve callee-saved registers since
3272     // the compilers are responsible for supplying a continuation point
3273     // if they expect all registers to be preserved.
3274     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
3275     enum layout {
3276       rfp_off = 0,
3277       rfp_off2,
3278       return_off,
3279       return_off2,
3280       framesize // inclusive of return address
3281     };
3282 
3283     int insts_size = 512;
3284     int locs_size  = 64;
3285 
3286     CodeBuffer code(name, insts_size, locs_size);
3287     OopMapSet* oop_maps  = new OopMapSet();
3288     MacroAssembler* masm = new MacroAssembler(&code);
3289 
3290     address start = __ pc();
3291 
3292     // This is an inlined and slightly modified version of call_VM
3293     // which has the ability to fetch the return PC out of
3294     // thread-local storage and also sets up last_Java_sp slightly
3295     // differently than the real call_VM
3296 
3297     __ enter(); // Save FP and LR before call
3298 
3299     assert(is_even(framesize/2), "sp not 16-byte aligned");
3300 
3301     // lr and fp are already in place
3302     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
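         // (framesize is counted in 32-bit slots: the saved rfp and return
         //  address occupy four slots, i.e. 16 bytes, all of which enter() has
         //  already pushed, so with the layout above this subtracts zero.)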
3303 
3304     int frame_complete = __ pc() - start;
3305 
3306     // Set up last_Java_sp and last_Java_fp
3307     address the_pc = __ pc();
3308     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
3309 
3310     // Call runtime
3311     if (arg1 != noreg) {
3312       assert(arg2 != c_rarg1, "clobbered");
3313       __ mov(c_rarg1, arg1);
3314     }
3315     if (arg2 != noreg) {
3316       __ mov(c_rarg2, arg2);
3317     }
3318     __ mov(c_rarg0, rthread);
3319     BLOCK_COMMENT("call runtime_entry");
3320     __ mov(rscratch1, runtime_entry);
3321     __ blr(rscratch1);
3322 
3323     // Generate oop map
3324     OopMap* map = new OopMap(framesize, 0);
3325 
3326     oop_maps->add_gc_map(the_pc - start, map);
3327 
3328     __ reset_last_Java_frame(true);
3329     __ maybe_isb();
3330 
3331     __ leave();
3332 
3333     // check for pending exceptions
3334 #ifdef ASSERT
3335     Label L;
3336     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
3337     __ cbnz(rscratch1, L);
3338     __ should_not_reach_here();
3339     __ bind(L);
3340 #endif // ASSERT
3341     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3342 
3343 
3344     // codeBlob framesize is in words (not VMRegImpl::slot_size)
3345     RuntimeStub* stub =
3346       RuntimeStub::new_runtime_stub(name,
3347                                     &code,
3348                                     frame_complete,
3349                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3350                                     oop_maps, false);
3351     return stub->entry_point();
3352   }
3353 
3354   class MontgomeryMultiplyGenerator : public MacroAssembler {
3355 
3356     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3357       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
3358 
3359     RegSet _toSave;
3360     bool _squaring;
3361 
3362   public:
3363     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3364       : MacroAssembler(as->code()), _squaring(squaring) {
3365 
3366       // Register allocation
3367 
3368       Register reg = c_rarg0;
3369       Pa_base = reg;       // Argument registers
3370       if (squaring)
3371         Pb_base = Pa_base;
3372       else
3373         Pb_base = ++reg;
3374       Pn_base = ++reg;
3375       Rlen= ++reg;
3376       inv = ++reg;
3377       Pm_base = ++reg;
3378 
3379                           // Working registers:
3380       Ra =  ++reg;        // The current digit of a, b, n, and m.
3381       Rb =  ++reg;
3382       Rm =  ++reg;
3383       Rn =  ++reg;
3384 
3385       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
3386       Pb =  ++reg;
3387       Pm =  ++reg;
3388       Pn =  ++reg;
3389 
3390       t0 =  ++reg;        // Three registers which form a
3391       t1 =  ++reg;        // triple-precision accumulator.
3392       t2 =  ++reg;
3393 
3394       Ri =  ++reg;        // Inner and outer loop indexes.
3395       Rj =  ++reg;
3396 
3397       Rhi_ab = ++reg;     // Product registers: low and high parts
3398       Rlo_ab = ++reg;     // of a*b and m*n.
3399       Rhi_mn = ++reg;
3400       Rlo_mn = ++reg;
3401 
3402       // r19 and up are callee-saved.
3403       _toSave = RegSet::range(r19, reg) + Pm_base;
3404     }
3405 
3406   private:
3407     void save_regs() {
3408       push(_toSave, sp);
3409     }
3410 
3411     void restore_regs() {
3412       pop(_toSave, sp);
3413     }
3414 
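         // Invoke 'block' exactly 'count' times, two calls per loop iteration; an
         // odd count enters the loop body at 'odd' to pick up the extra call first.
         // In C, approximately (a hedged sketch of the control flow below):
         //   if (count & 1) goto odd;
         //   if (count == 0) goto end;
         //   loop: block();
         //   odd:  block();
         //         if ((count -= 2) > 0) goto loop;
         //   end:  ;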
3415     template <typename T>
3416     void unroll_2(Register count, T block) {
3417       Label loop, end, odd;
3418       tbnz(count, 0, odd);
3419       cbz(count, end);
3420       align(16);
3421       bind(loop);
3422       (this->*block)();
3423       bind(odd);
3424       (this->*block)();
3425       subs(count, count, 2);
3426       br(Assembler::GT, loop);
3427       bind(end);
3428     }
3429 
3430     template <typename T>
3431     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3432       Label loop, end, odd;
3433       tbnz(count, 0, odd);
3434       cbz(count, end);
3435       align(16);
3436       bind(loop);
3437       (this->*block)(d, s, tmp);
3438       bind(odd);
3439       (this->*block)(d, s, tmp);
3440       subs(count, count, 2);
3441       br(Assembler::GT, loop);
3442       bind(end);
3443     }
3444 
3445     void pre1(RegisterOrConstant i) {
3446       block_comment("pre1");
3447       // Pa = Pa_base;
3448       // Pb = Pb_base + i;
3449       // Pm = Pm_base;
3450       // Pn = Pn_base + i;
3451       // Ra = *Pa;
3452       // Rb = *Pb;
3453       // Rm = *Pm;
3454       // Rn = *Pn;
3455       ldr(Ra, Address(Pa_base));
3456       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3457       ldr(Rm, Address(Pm_base));
3458       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3459       lea(Pa, Address(Pa_base));
3460       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3461       lea(Pm, Address(Pm_base));
3462       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3463 
3464       // Zero the m*n result.
3465       mov(Rhi_mn, zr);
3466       mov(Rlo_mn, zr);
3467     }
3468 
3469     // The core multiply-accumulate step of a Montgomery
3470     // multiplication.  The idea is to schedule operations as a
3471     // pipeline so that instructions with long latencies (loads and
3472     // multiplies) have time to complete before their results are
3473     // used.  This most benefits in-order implementations of the
3474     // architecture but out-of-order ones also benefit.
3475     void step() {
3476       block_comment("step");
3477       // MACC(Ra, Rb, t0, t1, t2);
3478       // Ra = *++Pa;
3479       // Rb = *--Pb;
3480       umulh(Rhi_ab, Ra, Rb);
3481       mul(Rlo_ab, Ra, Rb);
3482       ldr(Ra, pre(Pa, wordSize));
3483       ldr(Rb, pre(Pb, -wordSize));
3484       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
3485                                        // previous iteration.
3486       // MACC(Rm, Rn, t0, t1, t2);
3487       // Rm = *++Pm;
3488       // Rn = *--Pn;
3489       umulh(Rhi_mn, Rm, Rn);
3490       mul(Rlo_mn, Rm, Rn);
3491       ldr(Rm, pre(Pm, wordSize));
3492       ldr(Rn, pre(Pn, -wordSize));
3493       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3494     }
3495 
3496     void post1() {
3497       block_comment("post1");
3498 
3499       // MACC(Ra, Rb, t0, t1, t2);
3500       // Ra = *++Pa;
3501       // Rb = *--Pb;
3502       umulh(Rhi_ab, Ra, Rb);
3503       mul(Rlo_ab, Ra, Rb);
3504       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3505       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3506 
3507       // *Pm = Rm = t0 * inv;
3508       mul(Rm, t0, inv);
3509       str(Rm, Address(Pm));
3510 
3511       // MACC(Rm, Rn, t0, t1, t2);
3512       // t0 = t1; t1 = t2; t2 = 0;
3513       umulh(Rhi_mn, Rm, Rn);
3514 
3515 #ifndef PRODUCT
3516       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
3517       {
3518         mul(Rlo_mn, Rm, Rn);
3519         add(Rlo_mn, t0, Rlo_mn);
3520         Label ok;
3521         cbz(Rlo_mn, ok); {
3522           stop("broken Montgomery multiply");
3523         } bind(ok);
3524       }
3525 #endif
3526       // We have very carefully set things up so that
3527       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
3528       // the lower half of Rm * Rn because we know the result already:
3529       // it must be -t0.  t0 + (-t0) must generate a carry iff
3530       // t0 != 0.  So, rather than do a mul and an adds we just set
3531       // the carry flag iff t0 is nonzero.
3532       //
3533       // mul(Rlo_mn, Rm, Rn);
3534       // adds(zr, t0, Rlo_mn);
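           // (Worked through: in 64-bit two's-complement arithmetic -t0 == 2^64 - t0
           // when t0 != 0, so t0 + (-t0) == 2^64, which overflows and sets the carry;
           // when t0 == 0 the sum is 0 and the carry stays clear.  subs(zr, t0, 1)
           // sets the carry exactly when t0 >= 1 unsigned, i.e. iff t0 is nonzero.)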
3535       subs(zr, t0, 1); // Set carry iff t0 is nonzero
3536       adcs(t0, t1, Rhi_mn);
3537       adc(t1, t2, zr);
3538       mov(t2, zr);
3539     }
3540 
3541     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
3542       block_comment("pre2");
3543       // Pa = Pa_base + i-len;
3544       // Pb = Pb_base + len;
3545       // Pm = Pm_base + i-len;
3546       // Pn = Pn_base + len;
3547 
3548       if (i.is_register()) {
3549         sub(Rj, i.as_register(), len);
3550       } else {
3551         mov(Rj, i.as_constant());
3552         sub(Rj, Rj, len);
3553       }
3554       // Rj == i-len
3555 
3556       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
3557       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
3558       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
3559       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
3560 
3561       // Ra = *++Pa;
3562       // Rb = *--Pb;
3563       // Rm = *++Pm;
3564       // Rn = *--Pn;
3565       ldr(Ra, pre(Pa, wordSize));
3566       ldr(Rb, pre(Pb, -wordSize));
3567       ldr(Rm, pre(Pm, wordSize));
3568       ldr(Rn, pre(Pn, -wordSize));
3569 
3570       mov(Rhi_mn, zr);
3571       mov(Rlo_mn, zr);
3572     }
3573 
3574     void post2(RegisterOrConstant i, RegisterOrConstant len) {
3575       block_comment("post2");
3576       if (i.is_constant()) {
3577         mov(Rj, i.as_constant()-len.as_constant());
3578       } else {
3579         sub(Rj, i.as_register(), len);
3580       }
3581 
3582       adds(t0, t0, Rlo_mn); // The pending m*n, low part
3583 
3584       // As soon as we know the least significant digit of our result,
3585       // store it.
3586       // Pm_base[i-len] = t0;
3587       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
3588 
3589       // t0 = t1; t1 = t2; t2 = 0;
3590       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
3591       adc(t1, t2, zr);
3592       mov(t2, zr);
3593     }
3594 
3595     // A carry in t0 after Montgomery multiplication means that we
3596     // should subtract multiples of n from our result in m.  We'll
3597     // keep doing that until there is no carry.
3598     void normalize(RegisterOrConstant len) {
3599       block_comment("normalize");
3600       // while (t0)
3601       //   t0 = sub(Pm_base, Pn_base, t0, len);
3602       Label loop, post, again;
3603       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
3604       cbz(t0, post); {
3605         bind(again); {
3606           mov(i, zr);
3607           mov(cnt, len);
3608           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3609           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3610           subs(zr, zr, zr); // set carry flag, i.e. no borrow
3611           align(16);
3612           bind(loop); {
3613             sbcs(Rm, Rm, Rn);
3614             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3615             add(i, i, 1);
3616             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3617             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3618             sub(cnt, cnt, 1);
3619           } cbnz(cnt, loop);
3620           sbc(t0, t0, zr);
3621         } cbnz(t0, again);
3622       } bind(post);
3623     }
3624 
3625     // Move memory at s to d, reversing words.
3626     //    Increments d to end of copied memory
3627     //    Destroys tmp1, tmp2
3628     //    Preserves len
3629     //    Leaves s pointing to the address which was in d at start
3630     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
3631       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
3632 
3633       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
3634       mov(tmp1, len);
3635       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
3636       sub(s, d, len, ext::uxtw, LogBytesPerWord);
3637     }
3638     // where
3639     void reverse1(Register d, Register s, Register tmp) {
3640       ldr(tmp, pre(s, -wordSize));
3641       ror(tmp, tmp, 32);
3642       str(tmp, post(d, wordSize));
3643     }
3644 
3645     void step_squaring() {
3646       // An extra ACC
3647       step();
3648       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3649     }
3650 
3651     void last_squaring(RegisterOrConstant i) {
3652       Label dont;
3653       // if ((i & 1) == 0) {
3654       tbnz(i.as_register(), 0, dont); {
3655         // MACC(Ra, Rb, t0, t1, t2);
3656         // Ra = *++Pa;
3657         // Rb = *--Pb;
3658         umulh(Rhi_ab, Ra, Rb);
3659         mul(Rlo_ab, Ra, Rb);
3660         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3661       } bind(dont);
3662     }
3663 
3664     void extra_step_squaring() {
3665       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3666 
3667       // MACC(Rm, Rn, t0, t1, t2);
3668       // Rm = *++Pm;
3669       // Rn = *--Pn;
3670       umulh(Rhi_mn, Rm, Rn);
3671       mul(Rlo_mn, Rm, Rn);
3672       ldr(Rm, pre(Pm, wordSize));
3673       ldr(Rn, pre(Pn, -wordSize));
3674     }
3675 
3676     void post1_squaring() {
3677       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3678 
3679       // *Pm = Rm = t0 * inv;
3680       mul(Rm, t0, inv);
3681       str(Rm, Address(Pm));
3682 
3683       // MACC(Rm, Rn, t0, t1, t2);
3684       // t0 = t1; t1 = t2; t2 = 0;
3685       umulh(Rhi_mn, Rm, Rn);
3686 
3687 #ifndef PRODUCT
3688       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
3689       {
3690         mul(Rlo_mn, Rm, Rn);
3691         add(Rlo_mn, t0, Rlo_mn);
3692         Label ok;
3693         cbz(Rlo_mn, ok); {
3694           stop("broken Montgomery multiply");
3695         } bind(ok);
3696       }
3697 #endif
3698       // We have very carefully set things up so that
3699       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
3700       // the lower half of Rm * Rn because we know the result already:
3701       // it must be -t0.  t0 + (-t0) must generate a carry iff
3702       // t0 != 0.  So, rather than do a mul and an adds we just set
3703       // the carry flag iff t0 is nonzero.
3704       //
3705       // mul(Rlo_mn, Rm, Rn);
3706       // adds(zr, t0, Rlo_mn);
3707       subs(zr, t0, 1); // Set carry iff t0 is nonzero
3708       adcs(t0, t1, Rhi_mn);
3709       adc(t1, t2, zr);
3710       mov(t2, zr);
3711     }
3712 
3713     void acc(Register Rhi, Register Rlo,
3714              Register t0, Register t1, Register t2) {
3715       adds(t0, t0, Rlo);
3716       adcs(t1, t1, Rhi);
3717       adc(t2, t2, zr);
3718     }
3719 
3720   public:
3721     /**
3722      * Fast Montgomery multiplication.  The derivation of the
3723      * algorithm is in A Cryptographic Library for the Motorola
3724      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3725      *
3726      * Arguments:
3727      *
3728      * Inputs for multiplication:
3729      *   c_rarg0   - int array elements a
3730      *   c_rarg1   - int array elements b
3731      *   c_rarg2   - int array elements n (the modulus)
3732      *   c_rarg3   - int length
3733      *   c_rarg4   - int inv
3734      *   c_rarg5   - int array elements m (the result)
3735      *
3736      * Inputs for squaring:
3737      *   c_rarg0   - int array elements a
3738      *   c_rarg1   - int array elements n (the modulus)
3739      *   c_rarg2   - int length
3740      *   c_rarg3   - int inv
3741      *   c_rarg4   - int array elements m (the result)
3742      *
3743      */
3744     address generate_multiply() {
3745       Label argh, nothing;
3746       bind(argh);
3747       stop("MontgomeryMultiply total_allocation must be <= 8192");
3748 
3749       align(CodeEntryAlignment);
3750       address entry = pc();
3751 
3752       cbzw(Rlen, nothing);
3753 
3754       enter();
3755 
3756       // Make room.
3757       cmpw(Rlen, 512);
3758       br(Assembler::HI, argh);
3759       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
3760       andr(sp, Ra, -2 * wordSize);
3761 
3762       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
3763 
3764       {
3765         // Copy input args, reversing as we go.  We use Ra as a
3766         // temporary variable.
3767         reverse(Ra, Pa_base, Rlen, t0, t1);
3768         if (!_squaring)
3769           reverse(Ra, Pb_base, Rlen, t0, t1);
3770         reverse(Ra, Pn_base, Rlen, t0, t1);
3771       }
3772 
3773       // Push all call-saved registers and also Pm_base which we'll need
3774       // at the end.
3775       save_regs();
3776 
3777 #ifndef PRODUCT
3778       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3779       {
3780         ldr(Rn, Address(Pn_base, 0));
3781         mul(Rlo_mn, Rn, inv);
3782         cmp(Rlo_mn, -1);
3783         Label ok;
3784         br(EQ, ok); {
3785           stop("broken inverse in Montgomery multiply");
3786         } bind(ok);
3787       }
3788 #endif
3789 
3790       mov(Pm_base, Ra);
3791 
3792       mov(t0, zr);
3793       mov(t1, zr);
3794       mov(t2, zr);
3795 
3796       block_comment("for (int i = 0; i < len; i++) {");
3797       mov(Ri, zr); {
3798         Label loop, end;
3799         cmpw(Ri, Rlen);
3800         br(Assembler::GE, end);
3801 
3802         bind(loop);
3803         pre1(Ri);
3804 
3805         block_comment("  for (j = i; j; j--) {"); {
3806           movw(Rj, Ri);
3807           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3808         } block_comment("  } // j");
3809 
3810         post1();
3811         addw(Ri, Ri, 1);
3812         cmpw(Ri, Rlen);
3813         br(Assembler::LT, loop);
3814         bind(end);
3815         block_comment("} // i");
3816       }
3817 
3818       block_comment("for (int i = len; i < 2*len; i++) {");
3819       mov(Ri, Rlen); {
3820         Label loop, end;
3821         cmpw(Ri, Rlen, Assembler::LSL, 1);
3822         br(Assembler::GE, end);
3823 
3824         bind(loop);
3825         pre2(Ri, Rlen);
3826 
3827         block_comment("  for (j = len*2-i-1; j; j--) {"); {
3828           lslw(Rj, Rlen, 1);
3829           subw(Rj, Rj, Ri);
3830           subw(Rj, Rj, 1);
3831           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3832         } block_comment("  } // j");
3833 
3834         post2(Ri, Rlen);
3835         addw(Ri, Ri, 1);
3836         cmpw(Ri, Rlen, Assembler::LSL, 1);
3837         br(Assembler::LT, loop);
3838         bind(end);
3839       }
3840       block_comment("} // i");
3841 
3842       normalize(Rlen);
3843 
3844       mov(Ra, Pm_base);  // Save Pm_base in Ra
3845       restore_regs();  // Restore caller's Pm_base
3846 
3847       // Copy our result into caller's Pm_base
3848       reverse(Pm_base, Ra, Rlen, t0, t1);
3849 
3850       leave();
3851       bind(nothing);
3852       ret(lr);
3853 
3854       return entry;
3855     }
3856     // In C, approximately:
3857 
3858     // void
3859     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
3860     //                     unsigned long Pn_base[], unsigned long Pm_base[],
3861     //                     unsigned long inv, int len) {
3862     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3863     //   unsigned long *Pa, *Pb, *Pn, *Pm;
3864     //   unsigned long Ra, Rb, Rn, Rm;
3865 
3866     //   int i;
3867 
3868     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
3869 
3870     //   for (i = 0; i < len; i++) {
3871     //     int j;
3872 
3873     //     Pa = Pa_base;
3874     //     Pb = Pb_base + i;
3875     //     Pm = Pm_base;
3876     //     Pn = Pn_base + i;
3877 
3878     //     Ra = *Pa;
3879     //     Rb = *Pb;
3880     //     Rm = *Pm;
3881     //     Rn = *Pn;
3882 
3883     //     int iters = i;
3884     //     for (j = 0; iters--; j++) {
3885     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
3886     //       MACC(Ra, Rb, t0, t1, t2);
3887     //       Ra = *++Pa;
3888     //       Rb = *--Pb;
3889     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
3890     //       MACC(Rm, Rn, t0, t1, t2);
3891     //       Rm = *++Pm;
3892     //       Rn = *--Pn;
3893     //     }
3894 
3895     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
3896     //     MACC(Ra, Rb, t0, t1, t2);
3897     //     *Pm = Rm = t0 * inv;
3898     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
3899     //     MACC(Rm, Rn, t0, t1, t2);
3900 
3901     //     assert(t0 == 0, "broken Montgomery multiply");
3902 
3903     //     t0 = t1; t1 = t2; t2 = 0;
3904     //   }
3905 
3906     //   for (i = len; i < 2*len; i++) {
3907     //     int j;
3908 
3909     //     Pa = Pa_base + i-len;
3910     //     Pb = Pb_base + len;
3911     //     Pm = Pm_base + i-len;
3912     //     Pn = Pn_base + len;
3913 
3914     //     Ra = *++Pa;
3915     //     Rb = *--Pb;
3916     //     Rm = *++Pm;
3917     //     Rn = *--Pn;
3918 
3919     //     int iters = len*2-i-1;
3920     //     for (j = i-len+1; iters--; j++) {
3921     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
3922     //       MACC(Ra, Rb, t0, t1, t2);
3923     //       Ra = *++Pa;
3924     //       Rb = *--Pb;
3925     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
3926     //       MACC(Rm, Rn, t0, t1, t2);
3927     //       Rm = *++Pm;
3928     //       Rn = *--Pn;
3929     //     }
3930 
3931     //     Pm_base[i-len] = t0;
3932     //     t0 = t1; t1 = t2; t2 = 0;
3933     //   }
3934 
3935     //   while (t0)
3936     //     t0 = sub(Pm_base, Pn_base, t0, len);
3937     // }
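
         //   where MACC, MACC2 and sub() are only implied by the code above; one
         //   hedged reading, as C using a 128-bit intermediate (a GCC/Clang
         //   extension, assumed here for brevity):

         //   typedef unsigned __int128 u128;

         //   // Accumulate a*b into the triple-precision accumulator t0:t1:t2.
         //   #define MACC(A, B, T0, T1, T2)                                  \
         //     do { u128 p = (u128)(A) * (B) + (T0);                         \
         //          (T0) = (unsigned long)p;                                 \
         //          p = (p >> 64) + (T1);                                    \
         //          (T1) = (unsigned long)p;                                 \
         //          (T2) += (unsigned long)(p >> 64); } while (0)

         //   // Accumulate 2*a*b (used for the squaring cross terms).
         //   #define MACC2(A, B, T0, T1, T2)                                 \
         //     do { MACC(A, B, T0, T1, T2); MACC(A, B, T0, T1, T2); } while (0)

         //   // Subtract the modulus once from m, propagating the borrow, and
         //   // return t0 minus the final borrow, as in normalize() above.
         //   unsigned long sub(unsigned long Pm_base[], unsigned long Pn_base[],
         //                     unsigned long t0, int len) {
         //     unsigned long borrow = 0;
         //     for (int i = 0; i < len; i++) {
         //       unsigned long m = Pm_base[i], n = Pn_base[i];
         //       unsigned long d = m - n - borrow;
         //       borrow = (m < n) || (m - n < borrow);
         //       Pm_base[i] = d;
         //     }
         //     return t0 - borrow;
         //   }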
3938 
3939     /**
3940      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
3941      * multiplies than Montgomery multiplication so it should be up to
3942      * 25% faster.  However, its loop control is more complex and it
3943      * may actually run slower on some machines.
3944      *
3945      * Arguments:
3946      *
3947      * Inputs:
3948      *   c_rarg0   - int array elements a
3949      *   c_rarg1   - int array elements n (the modulus)
3950      *   c_rarg2   - int length
3951      *   c_rarg3   - int inv
3952      *   c_rarg4   - int array elements m (the result)
3953      *
3954      */
3955     address generate_square() {
3956       Label argh;
3957       bind(argh);
3958       stop("MontgomeryMultiply total_allocation must be <= 8192");
3959 
3960       align(CodeEntryAlignment);
3961       address entry = pc();
3962 
3963       enter();
3964 
3965       // Make room.
3966       cmpw(Rlen, 512);
3967       br(Assembler::HI, argh);
3968       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
3969       andr(sp, Ra, -2 * wordSize);
3970 
3971       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
3972 
3973       {
3974         // Copy input args, reversing as we go.  We use Ra as a
3975         // temporary variable.
3976         reverse(Ra, Pa_base, Rlen, t0, t1);
3977         reverse(Ra, Pn_base, Rlen, t0, t1);
3978       }
3979 
3980       // Push all call-saved registers and also Pm_base which we'll need
3981       // at the end.
3982       save_regs();
3983 
3984       mov(Pm_base, Ra);
3985 
3986       mov(t0, zr);
3987       mov(t1, zr);
3988       mov(t2, zr);
3989 
3990       block_comment("for (int i = 0; i < len; i++) {");
3991       mov(Ri, zr); {
3992         Label loop, end;
3993         bind(loop);
3994         cmp(Ri, Rlen);
3995         br(Assembler::GE, end);
3996 
3997         pre1(Ri);
3998 
3999         block_comment("for (j = (i+1)/2; j; j--) {"); {
4000           add(Rj, Ri, 1);
4001           lsr(Rj, Rj, 1);
4002           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4003         } block_comment("  } // j");
4004 
4005         last_squaring(Ri);
4006 
4007         block_comment("  for (j = i/2; j; j--) {"); {
4008           lsr(Rj, Ri, 1);
4009           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4010         } block_comment("  } // j");
4011 
4012         post1_squaring();
4013         add(Ri, Ri, 1);
4014         cmp(Ri, Rlen);
4015         br(Assembler::LT, loop);
4016 
4017         bind(end);
4018         block_comment("} // i");
4019       }
4020 
4021       block_comment("for (int i = len; i < 2*len; i++) {");
4022       mov(Ri, Rlen); {
4023         Label loop, end;
4024         bind(loop);
4025         cmp(Ri, Rlen, Assembler::LSL, 1);
4026         br(Assembler::GE, end);
4027 
4028         pre2(Ri, Rlen);
4029 
4030         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4031           lsl(Rj, Rlen, 1);
4032           sub(Rj, Rj, Ri);
4033           sub(Rj, Rj, 1);
4034           lsr(Rj, Rj, 1);
4035           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4036         } block_comment("  } // j");
4037 
4038         last_squaring(Ri);
4039 
4040         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4041           lsl(Rj, Rlen, 1);
4042           sub(Rj, Rj, Ri);
4043           lsr(Rj, Rj, 1);
4044           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4045         } block_comment("  } // j");
4046 
4047         post2(Ri, Rlen);
4048         add(Ri, Ri, 1);
4049         cmp(Ri, Rlen, Assembler::LSL, 1);
4050 
4051         br(Assembler::LT, loop);
4052         bind(end);
4053         block_comment("} // i");
4054       }
4055 
4056       normalize(Rlen);
4057 
4058       mov(Ra, Pm_base);  // Save Pm_base in Ra
4059       restore_regs();  // Restore caller's Pm_base
4060 
4061       // Copy our result into caller's Pm_base
4062       reverse(Pm_base, Ra, Rlen, t0, t1);
4063 
4064       leave();
4065       ret(lr);
4066 
4067       return entry;
4068     }
4069     // In C, approximately:
4070 
4071     // void
4072     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4073     //                   unsigned long Pm_base[], unsigned long inv, int len) {
4074     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4075     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4076     //   unsigned long Ra, Rb, Rn, Rm;
4077 
4078     //   int i;
4079 
4080     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4081 
4082     //   for (i = 0; i < len; i++) {
4083     //     int j;
4084 
4085     //     Pa = Pa_base;
4086     //     Pb = Pa_base + i;
4087     //     Pm = Pm_base;
4088     //     Pn = Pn_base + i;
4089 
4090     //     Ra = *Pa;
4091     //     Rb = *Pb;
4092     //     Rm = *Pm;
4093     //     Rn = *Pn;
4094 
4095     //     int iters = (i+1)/2;
4096     //     for (j = 0; iters--; j++) {
4097     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4098     //       MACC2(Ra, Rb, t0, t1, t2);
4099     //       Ra = *++Pa;
4100     //       Rb = *--Pb;
4101     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4102     //       MACC(Rm, Rn, t0, t1, t2);
4103     //       Rm = *++Pm;
4104     //       Rn = *--Pn;
4105     //     }
4106     //     if ((i & 1) == 0) {
4107     //       assert(Ra == Pa_base[j], "must be");
4108     //       MACC(Ra, Ra, t0, t1, t2);
4109     //     }
4110     //     iters = i/2;
4111     //     assert(iters == i-j, "must be");
4112     //     for (; iters--; j++) {
4113     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4114     //       MACC(Rm, Rn, t0, t1, t2);
4115     //       Rm = *++Pm;
4116     //       Rn = *--Pn;
4117     //     }
4118 
4119     //     *Pm = Rm = t0 * inv;
4120     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4121     //     MACC(Rm, Rn, t0, t1, t2);
4122 
4123     //     assert(t0 == 0, "broken Montgomery multiply");
4124 
4125     //     t0 = t1; t1 = t2; t2 = 0;
4126     //   }
4127 
4128     //   for (i = len; i < 2*len; i++) {
4129     //     int start = i-len+1;
4130     //     int end = start + (len - start)/2;
4131     //     int j;
4132 
4133     //     Pa = Pa_base + i-len;
4134     //     Pb = Pa_base + len;
4135     //     Pm = Pm_base + i-len;
4136     //     Pn = Pn_base + len;
4137 
4138     //     Ra = *++Pa;
4139     //     Rb = *--Pb;
4140     //     Rm = *++Pm;
4141     //     Rn = *--Pn;
4142 
4143     //     int iters = (2*len-i-1)/2;
4144     //     assert(iters == end-start, "must be");
4145     //     for (j = start; iters--; j++) {
4146     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4147     //       MACC2(Ra, Rb, t0, t1, t2);
4148     //       Ra = *++Pa;
4149     //       Rb = *--Pb;
4150     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4151     //       MACC(Rm, Rn, t0, t1, t2);
4152     //       Rm = *++Pm;
4153     //       Rn = *--Pn;
4154     //     }
4155     //     if ((i & 1) == 0) {
4156     //       assert(Ra == Pa_base[j], "must be");
4157     //       MACC(Ra, Ra, t0, t1, t2);
4158     //     }
4159     //     iters =  (2*len-i)/2;
4160     //     assert(iters == len-j, "must be");
4161     //     for (; iters--; j++) {
4162     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4163     //       MACC(Rm, Rn, t0, t1, t2);
4164     //       Rm = *++Pm;
4165     //       Rn = *--Pn;
4166     //     }
4167     //     Pm_base[i-len] = t0;
4168     //     t0 = t1; t1 = t2; t2 = 0;
4169     //   }
4170 
4171     //   while (t0)
4172     //     t0 = sub(Pm_base, Pn_base, t0, len);
4173     // }
4174   };
4175 
4176   // Initialization
4177   void generate_initial() {
4178     // Generates the initial stubs and initializes the entry points
4179 
4180     // Entry points that exist on all platforms.  Note: This is code
4181     // that could be shared among different platforms - however the
4182     // benefit seems to be smaller than the disadvantage of having a
4183     // much more complicated generator structure. See also comment in
4184     // stubRoutines.hpp.
4185 
4186     StubRoutines::_forward_exception_entry = generate_forward_exception();
4187 
4188     StubRoutines::_call_stub_entry =
4189       generate_call_stub(StubRoutines::_call_stub_return_address);
4190 
4191     // is referenced by megamorphic call
4192     StubRoutines::_catch_exception_entry = generate_catch_exception();
4193 
4194     // Build this early so it's available for the interpreter.
4195     StubRoutines::_throw_StackOverflowError_entry =
4196       generate_throw_exception("StackOverflowError throw_exception",
4197                                CAST_FROM_FN_PTR(address,
4198                                                 SharedRuntime::
4199                                                 throw_StackOverflowError));
4200     if (UseCRC32Intrinsics) {
4201       // Set the table address before generating the stubs which use it
4202       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4203       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4204     }
4205   }
4206 
4207   void generate_all() {
4208     // support for verify_oop (must happen after universe_init)
4209     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
4210     StubRoutines::_throw_AbstractMethodError_entry =
4211       generate_throw_exception("AbstractMethodError throw_exception",
4212                                CAST_FROM_FN_PTR(address,
4213                                                 SharedRuntime::
4214                                                 throw_AbstractMethodError));
4215 
4216     StubRoutines::_throw_IncompatibleClassChangeError_entry =
4217       generate_throw_exception("IncompatibleClassChangeError throw_exception",
4218                                CAST_FROM_FN_PTR(address,
4219                                                 SharedRuntime::
4220                                                 throw_IncompatibleClassChangeError));
4221 
4222     StubRoutines::_throw_NullPointerException_at_call_entry =
4223       generate_throw_exception("NullPointerException at call throw_exception",
4224                                CAST_FROM_FN_PTR(address,
4225                                                 SharedRuntime::
4226                                                 throw_NullPointerException_at_call));
4227 
4228     // arraycopy stubs used by compilers
4229     generate_arraycopy_stubs();
4230 
4231     if (UseMultiplyToLenIntrinsic) {
4232       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4233     }
4234 
4235     if (UseMontgomeryMultiplyIntrinsic) {
4236       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4237       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4238       StubRoutines::_montgomeryMultiply = g.generate_multiply();
4239     }
4240 
4241     if (UseMontgomerySquareIntrinsic) {
4242       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4243       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4244       // We use generate_multiply() rather than generate_square()
4245       // because it's faster for the sizes of modulus we care about.
4246       StubRoutines::_montgomerySquare = g.generate_multiply();
4247     }
4248 
4249     if (UseAESIntrinsics) {
4250       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4251       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4252       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4253       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
4254     }
4255 
4256     if (UseSHA1Intrinsics) {
4257       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
4258       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
4259     }
4260     if (UseSHA256Intrinsics) {
4261       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
4262       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
4263     }
4264 
4265     // Safefetch stubs.
4266     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
4267                                                        &StubRoutines::_safefetch32_fault_pc,
4268                                                        &StubRoutines::_safefetch32_continuation_pc);
4269     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
4270                                                        &StubRoutines::_safefetchN_fault_pc,
4271                                                        &StubRoutines::_safefetchN_continuation_pc);
4272   }
4273 
4274  public:
4275   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
4276     if (all) {
4277       generate_all();
4278     } else {
4279       generate_initial();
4280     }
4281   }
4282 }; // end class declaration
4283 
4284 void StubGenerator_generate(CodeBuffer* code, bool all) {
4285   StubGenerator g(code, all);
4286 }