1 /*
   2  * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shenandoah/brooksPointer.hpp"
  30 #include "gc/shenandoah/shenandoahBarrierSet.hpp"
  31 #include "gc/shenandoah/shenandoahHeap.hpp"
  32 #include "gc/shenandoah/shenandoahHeapRegion.hpp"
  33 #include "interpreter/interpreter.hpp"
  34 #include "nativeInst_aarch64.hpp"
  35 #include "oops/instanceOop.hpp"
  36 #include "oops/method.hpp"
  37 #include "oops/objArrayKlass.hpp"
  38 #include "oops/oop.inline.hpp"
  39 #include "prims/methodHandles.hpp"
  40 #include "runtime/frame.inline.hpp"
  41 #include "runtime/handles.inline.hpp"
  42 #include "runtime/sharedRuntime.hpp"
  43 #include "runtime/stubCodeGenerator.hpp"
  44 #include "runtime/stubRoutines.hpp"
  45 #include "runtime/thread.inline.hpp"
  46 #include "utilities/align.hpp"
  47 #ifdef COMPILER2
  48 #include "opto/runtime.hpp"
  49 #endif
  50 
  51 #ifdef BUILTIN_SIM
  52 #include "../../../../../../simulator/simulator.hpp"
  53 #endif
  54 
  55 // Declaration and definition of StubGenerator (no .hpp file).
  56 // For a more detailed description of the stub routine structure
  57 // see the comment in stubRoutines.hpp
  58 
  59 #undef __
  60 #define __ _masm->
  61 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  62 
  63 #ifdef PRODUCT
  64 #define BLOCK_COMMENT(str) /* nothing */
  65 #else
  66 #define BLOCK_COMMENT(str) __ block_comment(str)
  67 #endif
  68 
  69 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  70 
  71 // Stub Code definitions
  72 
  73 class StubGenerator: public StubCodeGenerator {
  74  private:
  75 
  76 #ifdef PRODUCT
  77 #define inc_counter_np(counter) ((void)0)
  78 #else
  79   void inc_counter_np_(int& counter) {
  80     __ lea(rscratch2, ExternalAddress((address)&counter));
  81     __ ldrw(rscratch1, Address(rscratch2));
  82     __ addw(rscratch1, rscratch1, 1);
  83     __ strw(rscratch1, Address(rscratch2));
  84   }
  85 #define inc_counter_np(counter) \
  86   BLOCK_COMMENT("inc_counter " #counter); \
  87   inc_counter_np_(counter);
  88 #endif
  89 
  90   // Call stubs are used to call Java from C
  91   //
  92   // Arguments:
  93   //    c_rarg0:   call wrapper address                   address
  94   //    c_rarg1:   result                                 address
  95   //    c_rarg2:   result type                            BasicType
  96   //    c_rarg3:   method                                 Method*
  97   //    c_rarg4:   (interpreter) entry point              address
  98   //    c_rarg5:   parameters                             intptr_t*
  99   //    c_rarg6:   parameter size (in words)              int
 100   //    c_rarg7:   thread                                 Thread*
 101   //
 102   // There is no return from the stub itself as any Java result
 103   // is written to result
 104   //
 105   // we save r30 (lr) as the return PC at the base of the frame and
 106   // link r29 (fp) below it as the frame pointer, installing sp (r31)
 107   // into fp.
 108   //
 109   // we save r0-r7, which accounts for all the c arguments.
 110   //
 111   // TODO: strictly do we need to save them all? they are treated as
 112   // volatile by C so could we omit saving the ones we are going to
 113   // place in global registers (thread? method?) or those we only use
 114   // during setup of the Java call?
 115   //
 116   // we don't need to save r8 which C uses as an indirect result location
 117   // return register.
 118   //
 119   // we don't need to save r9-r15 which both C and Java treat as
 120   // volatile
 121   //
 122   // we don't need to save r16-18 because Java does not use them
 123   //
 124   // we save r19-r28 which Java uses as scratch registers and C
 125   // expects to be callee-save
 126   //
 127   // we save the bottom 64 bits of each value stored in v8-v15; it is
 128   // the responsibility of the caller to preserve larger values.
 129   //
 130   // so the stub frame looks like this when we enter Java code
 131   //
 132   //     [ return_from_Java     ] <--- sp
 133   //     [ argument word n      ]
 134   //      ...
 135   // -27 [ argument word 1      ]
 136   // -26 [ saved v15            ] <--- sp_after_call
 137   // -25 [ saved v14            ]
 138   // -24 [ saved v13            ]
 139   // -23 [ saved v12            ]
 140   // -22 [ saved v11            ]
 141   // -21 [ saved v10            ]
 142   // -20 [ saved v9             ]
 143   // -19 [ saved v8             ]
 144   // -18 [ saved r28            ]
 145   // -17 [ saved r27            ]
 146   // -16 [ saved r26            ]
 147   // -15 [ saved r25            ]
 148   // -14 [ saved r24            ]
 149   // -13 [ saved r23            ]
 150   // -12 [ saved r22            ]
 151   // -11 [ saved r21            ]
 152   // -10 [ saved r20            ]
 153   //  -9 [ saved r19            ]
 154   //  -8 [ call wrapper    (r0) ]
 155   //  -7 [ result          (r1) ]
 156   //  -6 [ result type     (r2) ]
 157   //  -5 [ method          (r3) ]
 158   //  -4 [ entry point     (r4) ]
 159   //  -3 [ parameters      (r5) ]
 160   //  -2 [ parameter size  (r6) ]
 161   //  -1 [ thread (r7)          ]
 162   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 163   //   1 [ saved lr       (r30) ]
 164 
 165   // Call stub stack layout word offsets from fp
 166   enum call_stub_layout {
 167     sp_after_call_off = -26,
 168 
 169     d15_off            = -26,
 170     d13_off            = -24,
 171     d11_off            = -22,
 172     d9_off             = -20,
 173 
 174     r28_off            = -18,
 175     r26_off            = -16,
 176     r24_off            = -14,
 177     r22_off            = -12,
 178     r20_off            = -10,
 179     call_wrapper_off   =  -8,
 180     result_off         =  -7,
 181     result_type_off    =  -6,
 182     method_off         =  -5,
 183     entry_point_off    =  -4,
 184     parameter_size_off =  -2,
 185     thread_off         =  -1,
 186     fp_f               =   0,
 187     retaddr_off        =   1,
 188   };
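       // Worked example (illustrative only, not part of the stub): with
       // wordSize == 8 the enum values above turn into the rfp-relative
       // addresses constructed in generate_call_stub() below, e.g.
       //
       //   sp_after_call = Address(rfp, -26 * 8)   // rfp - 208, slot -26 in the diagram
       //   r20_save      = Address(rfp, -10 * 8)   // rfp -  80, holds the r20/r19 pair
       //   thread        = Address(rfp,  -1 * 8)   // rfp -   8, saved c_rarg7
       //
       // Only every other register slot needs its own enum constant because
       // stp/ldp store and load the registers in pairs.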
 189 
 190   address generate_call_stub(address& return_address) {
 191     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 192            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 193            "adjust this code");
 194 
 195     StubCodeMark mark(this, "StubRoutines", "call_stub");
 196     address start = __ pc();
 197 
 198     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 199 
 200     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 201     const Address result        (rfp, result_off         * wordSize);
 202     const Address result_type   (rfp, result_type_off    * wordSize);
 203     const Address method        (rfp, method_off         * wordSize);
 204     const Address entry_point   (rfp, entry_point_off    * wordSize);
 205     const Address parameter_size(rfp, parameter_size_off * wordSize);
 206 
 207     const Address thread        (rfp, thread_off         * wordSize);
 208 
 209     const Address d15_save      (rfp, d15_off * wordSize);
 210     const Address d13_save      (rfp, d13_off * wordSize);
 211     const Address d11_save      (rfp, d11_off * wordSize);
 212     const Address d9_save       (rfp, d9_off * wordSize);
 213 
 214     const Address r28_save      (rfp, r28_off * wordSize);
 215     const Address r26_save      (rfp, r26_off * wordSize);
 216     const Address r24_save      (rfp, r24_off * wordSize);
 217     const Address r22_save      (rfp, r22_off * wordSize);
 218     const Address r20_save      (rfp, r20_off * wordSize);
 219 
 220     // stub code
 221 
 222     // we need a C prolog to bootstrap the x86 caller into the sim
 223     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 224 
 225     address aarch64_entry = __ pc();
 226 
 227 #ifdef BUILTIN_SIM
 228     // Save sender's SP for stack traces.
 229     __ mov(rscratch1, sp);
 230     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 231 #endif
 232     // set up frame and move sp to end of save area
 233     __ enter();
 234     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 235 
 236     // save register parameters and Java scratch/global registers
 237     // n.b. we save thread even though it gets installed in
 238     // rthread because we want to sanity check rthread later
 239     __ str(c_rarg7,  thread);
 240     __ strw(c_rarg6, parameter_size);
 241     __ stp(c_rarg4, c_rarg5,  entry_point);
 242     __ stp(c_rarg2, c_rarg3,  result_type);
 243     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 244 
 245     __ stp(r20, r19,   r20_save);
 246     __ stp(r22, r21,   r22_save);
 247     __ stp(r24, r23,   r24_save);
 248     __ stp(r26, r25,   r26_save);
 249     __ stp(r28, r27,   r28_save);
 250 
 251     __ stpd(v9,  v8,   d9_save);
 252     __ stpd(v11, v10,  d11_save);
 253     __ stpd(v13, v12,  d13_save);
 254     __ stpd(v15, v14,  d15_save);
 255 
 256     // install Java thread in global register now we have saved
 257     // whatever value it held
 258     __ mov(rthread, c_rarg7);
 259     // And method
 260     __ mov(rmethod, c_rarg3);
 261 
 262     // set up the heapbase register
 263     __ reinit_heapbase();
 264 
 265 #ifdef ASSERT
 266     // make sure we have no pending exceptions
 267     {
 268       Label L;
 269       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 270       __ cmp(rscratch1, (unsigned)NULL_WORD);
 271       __ br(Assembler::EQ, L);
 272       __ stop("StubRoutines::call_stub: entered with pending exception");
 273       __ BIND(L);
 274     }
 275 #endif
 276     // pass parameters if any
 277     __ mov(esp, sp);
 278     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 279     __ andr(sp, rscratch1, -2 * wordSize);
 280 
 281     BLOCK_COMMENT("pass parameters if any");
 282     Label parameters_done;
 283     // parameter count is still in c_rarg6
 284     // and parameter pointer identifying param 1 is in c_rarg5
 285     __ cbzw(c_rarg6, parameters_done);
 286 
 287     address loop = __ pc();
 288     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 289     __ subsw(c_rarg6, c_rarg6, 1);
 290     __ push(rscratch1);
 291     __ br(Assembler::GT, loop);
 292 
 293     __ BIND(parameters_done);
 294 
 295     // call Java entry -- passing Method* and current sp
 296     //      rmethod: Method*
 297     //      r13: sender sp
 298     BLOCK_COMMENT("call Java function");
 299     __ mov(r13, sp);
 300     __ blr(c_rarg4);
 301 
 302     // tell the simulator we have returned to the stub
 303 
 304     // we do this here because the notify will already have been done
 305     // if we get to the next instruction via an exception
 306     //
 307     // n.b. adding this instruction here affects the calculation of
 308     // whether or not a routine returns to the call stub (used when
 309     // doing stack walks) since the normal test is to check the return
 310     // pc against the address saved below. so we may need to allow for
 311     // this extra instruction in the check.
 312 
 313     if (NotifySimulator) {
 314       __ notify(Assembler::method_reentry);
 315     }
 316     // save current address for use by exception handling code
 317 
 318     return_address = __ pc();
 319 
 320     // store result depending on type (everything that is not
 321     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 322     // n.b. this assumes Java returns an integral result in r0
 323     // and a floating result in j_farg0
 324     __ ldr(j_rarg2, result);
 325     Label is_long, is_float, is_double, exit;
 326     __ ldr(j_rarg1, result_type);
 327     __ cmp(j_rarg1, T_OBJECT);
 328     __ br(Assembler::EQ, is_long);
 329     __ cmp(j_rarg1, T_LONG);
 330     __ br(Assembler::EQ, is_long);
 331     __ cmp(j_rarg1, T_FLOAT);
 332     __ br(Assembler::EQ, is_float);
 333     __ cmp(j_rarg1, T_DOUBLE);
 334     __ br(Assembler::EQ, is_double);
 335 
 336     // handle T_INT case
 337     __ strw(r0, Address(j_rarg2));
 338 
 339     __ BIND(exit);
 340 
 341     // pop parameters
 342     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 343 
 344 #ifdef ASSERT
 345     // verify that threads correspond
 346     {
 347       Label L, S;
 348       __ ldr(rscratch1, thread);
 349       __ cmp(rthread, rscratch1);
 350       __ br(Assembler::NE, S);
 351       __ get_thread(rscratch1);
 352       __ cmp(rthread, rscratch1);
 353       __ br(Assembler::EQ, L);
 354       __ BIND(S);
 355       __ stop("StubRoutines::call_stub: threads must correspond");
 356       __ BIND(L);
 357     }
 358 #endif
 359 
 360     // restore callee-save registers
 361     __ ldpd(v15, v14,  d15_save);
 362     __ ldpd(v13, v12,  d13_save);
 363     __ ldpd(v11, v10,  d11_save);
 364     __ ldpd(v9,  v8,   d9_save);
 365 
 366     __ ldp(r28, r27,   r28_save);
 367     __ ldp(r26, r25,   r26_save);
 368     __ ldp(r24, r23,   r24_save);
 369     __ ldp(r22, r21,   r22_save);
 370     __ ldp(r20, r19,   r20_save);
 371 
 372     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 373     __ ldrw(c_rarg2, result_type);
 374     __ ldr(c_rarg3,  method);
 375     __ ldp(c_rarg4, c_rarg5,  entry_point);
 376     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 377 
 378 #ifndef PRODUCT
 379     // tell the simulator we are about to end Java execution
 380     if (NotifySimulator) {
 381       __ notify(Assembler::method_exit);
 382     }
 383 #endif
 384     // leave frame and return to caller
 385     __ leave();
 386     __ ret(lr);
 387 
 388     // handle return types different from T_INT
 389 
 390     __ BIND(is_long);
 391     __ str(r0, Address(j_rarg2, 0));
 392     __ br(Assembler::AL, exit);
 393 
 394     __ BIND(is_float);
 395     __ strs(j_farg0, Address(j_rarg2, 0));
 396     __ br(Assembler::AL, exit);
 397 
 398     __ BIND(is_double);
 399     __ strd(j_farg0, Address(j_rarg2, 0));
 400     __ br(Assembler::AL, exit);
 401 
 402     return start;
 403   }
 404 
 405   // Return point for a Java call if there's an exception thrown in
 406   // Java code.  The exception is caught and transformed into a
 407   // pending exception stored in JavaThread that can be tested from
 408   // within the VM.
 409   //
 410   // Note: Usually the parameters are removed by the callee. In case
 411   // of an exception crossing an activation frame boundary, that is
 412   // not the case if the callee is compiled code => need to setup the
 413   // rsp.
 414   //
 415   // r0: exception oop
 416 
 417   // NOTE: this is used as a target from the signal handler so it
 418   // needs an x86 prolog which returns into the current simulator
 419   // executing the generated catch_exception code. so the prolog
 420   // needs to install rax in a sim register and adjust the sim's
 421   // restart pc to enter the generated code at the start position
 422   // then return from native to simulated execution.
 423 
 424   address generate_catch_exception() {
 425     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 426     address start = __ pc();
 427 
 428     // same as in generate_call_stub():
 429     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 430     const Address thread        (rfp, thread_off         * wordSize);
 431 
 432 #ifdef ASSERT
 433     // verify that threads correspond
 434     {
 435       Label L, S;
 436       __ ldr(rscratch1, thread);
 437       __ cmp(rthread, rscratch1);
 438       __ br(Assembler::NE, S);
 439       __ get_thread(rscratch1);
 440       __ cmp(rthread, rscratch1);
 441       __ br(Assembler::EQ, L);
 442       __ bind(S);
 443       __ stop("StubRoutines::catch_exception: threads must correspond");
 444       __ bind(L);
 445     }
 446 #endif
 447 
 448     // set pending exception
 449     __ verify_oop(r0);
 450 
 451     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 452     __ mov(rscratch1, (address)__FILE__);
 453     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 454     __ movw(rscratch1, (int)__LINE__);
 455     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 456 
 457     // complete return to VM
 458     assert(StubRoutines::_call_stub_return_address != NULL,
 459            "_call_stub_return_address must have been generated before");
 460     __ b(StubRoutines::_call_stub_return_address);
 461 
 462     return start;
 463   }
 464 
 465   // Continuation point for runtime calls returning with a pending
 466   // exception.  The pending exception check happened in the runtime
 467   // or native call stub.  The pending exception in Thread is
 468   // converted into a Java-level exception.
 469   //
 470   // Contract with Java-level exception handlers:
 471   // r0: exception
 472   // r3: throwing pc
 473   //
 474   // NOTE: At entry of this stub, exception-pc must be in LR !!
 475 
 476   // NOTE: this is always used as a jump target within generated code
 477   // so it just needs to be generated code with no x86 prolog
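       //
       // In outline the stub does the following (illustrative sketch only):
       //
       //   handler = SharedRuntime::exception_handler_for_return_address(thread, lr);
       //   r0 = the pending exception, which is then cleared in the thread;
       //   r3 = the original lr (the throwing pc);
       //   branch to handler;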
 478 
 479   address generate_forward_exception() {
 480     StubCodeMark mark(this, "StubRoutines", "forward exception");
 481     address start = __ pc();
 482 
 483     // Upon entry, LR points to the return address returning into
 484     // Java (interpreted or compiled) code; i.e., the return address
 485     // becomes the throwing pc.
 486     //
 487     // Arguments pushed before the runtime call are still on the stack
 488     // but the exception handler will reset the stack pointer ->
 489     // ignore them.  A potential result in registers can be ignored as
 490     // well.
 491 
 492 #ifdef ASSERT
 493     // make sure this code is only executed if there is a pending exception
 494     {
 495       Label L;
 496       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 497       __ cbnz(rscratch1, L);
 498       __ stop("StubRoutines::forward exception: no pending exception (1)");
 499       __ bind(L);
 500     }
 501 #endif
 502 
 503     // compute exception handler into r19
 504 
 505     // call the VM to find the handler address associated with the
 506     // caller address. pass thread in r0 and caller pc (ret address)
 507     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 508     // the stack.
 509     __ mov(c_rarg1, lr);
 510     // lr will be trashed by the VM call so we move it to R19
 511     // (callee-saved) because we also need to pass it to the handler
 512     // returned by this call.
 513     __ mov(r19, lr);
 514     BLOCK_COMMENT("call exception_handler_for_return_address");
 515     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 516                          SharedRuntime::exception_handler_for_return_address),
 517                     rthread, c_rarg1);
 518     // we should not really care that lr is no longer the callee
 519     // address. we saved the value the handler needs in r19 so we can
 520     // just copy it to r3. however, the C2 handler will push its own
 521     // frame and then call into the VM, and the VM code asserts that
 522     // the PC for the frame above the handler belongs to a compiled
 523     // Java method. So, we restore lr here to satisfy that assert.
 524     __ mov(lr, r19);
 525     // setup r0 & r3 & clear pending exception
 526     __ mov(r3, r19);
 527     __ mov(r19, r0);
 528     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 529     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 530 
 531 #ifdef ASSERT
 532     // make sure exception is set
 533     {
 534       Label L;
 535       __ cbnz(r0, L);
 536       __ stop("StubRoutines::forward exception: no pending exception (2)");
 537       __ bind(L);
 538     }
 539 #endif
 540 
 541     // continue at exception handler
 542     // r0: exception
 543     // r3: throwing pc
 544     // r19: exception handler
 545     __ verify_oop(r0);
 546     __ br(r19);
 547 
 548     return start;
 549   }
 550 
 551   // Shenandoah write barrier.
 552   //
 553   // Input:
 554   //   r0: OOP to evacuate.  Not null.
 555   //
 556   // Output:
 557   //   r0: Pointer to evacuated OOP.
 558   //
 559   // Trash rscratch1, rscratch2.  Preserve everything else.
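       //
       // Logically the stub behaves like the following sketch (illustrative
       // pseudo-code only -- the real code is the hand-written assembly below,
       // and the ABI used for the runtime call depends on c_abi):
       //
       //   oop shenandoah_wb(oop obj) {
       //     if (do_cset_test) {
       //       // cheap check against the in-collection-set table, indexed by region
       //       if (!in_cset_fast_test[obj >> region_size_bytes_shift]) {
       //         return obj;                                    // nothing to do
       //       }
       //     }
       //     return ShenandoahBarrierSet::write_barrier_JRT(obj);  // evacuate / resolve
       //   }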
 560 
 561   address generate_shenandoah_wb(bool c_abi, bool do_cset_test) {
 562     StubCodeMark mark(this, "StubRoutines", "shenandoah_wb");
 563 
 564     __ align(6);
 565     address start = __ pc();
 566 
 567     if (do_cset_test) {
 568       Label work;
 569       __ mov(rscratch2, ShenandoahHeap::in_cset_fast_test_addr());
 570       __ lsr(rscratch1, r0, ShenandoahHeapRegion::region_size_bytes_shift_jint());
 571       __ ldrb(rscratch2, Address(rscratch2, rscratch1));
 572       __ tbnz(rscratch2, 0, work);
 573       __ ret(lr);
 574       __ bind(work);
 575     }
 576 
 577     Register obj = r0;
 578 
 579     __ enter(); // required for proper stackwalking of RuntimeStub frame
 580 
 581     if (!c_abi) {
 582       __ push_call_clobbered_registers();
 583     } else {
 584       __ push_call_clobbered_fp_registers();
 585     }
 586 
 587     __ mov(lr, CAST_FROM_FN_PTR(address, ShenandoahBarrierSet::write_barrier_JRT));
 588     __ blrt(lr, 1, 0, MacroAssembler::ret_type_integral);
 589     if (!c_abi) {
 590       __ mov(rscratch1, obj);
 591       __ pop_call_clobbered_registers();
 592       __ mov(obj, rscratch1);
 593     } else {
 594       __ pop_call_clobbered_fp_registers();
 595     }
 596 
 597     __ leave(); // required for proper stackwalking of RuntimeStub frame
 598     __ ret(lr);
 599 
 600     return start;
 601   }
 602 
 603   // Non-destructive plausibility checks for oops
 604   //
 605   // Arguments:
 606   //    r0: oop to verify
 607   //    rscratch1: error message
 608   //
 609   // Stack after saving c_rarg3:
 610   //    [tos + 0]: saved c_rarg3
 611   //    [tos + 1]: saved c_rarg2
 612   //    [tos + 2]: saved lr
 613   //    [tos + 3]: saved rscratch2
 614   //    [tos + 4]: saved r0
 615   //    [tos + 5]: saved rscratch1
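       //
       // The 'is the oop reasonable' test below amounts to (illustrative
       // sketch only):
       //
       //   if ((obj & Universe::verify_oop_mask()) != Universe::verify_oop_bits())
       //     goto error;
       //
       // i.e. the oop must lie in the expected area of memory; a NULL oop is
       // accepted, while a NULL klass is reported as an error.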
 616   address generate_verify_oop() {
 617 
 618     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 619     address start = __ pc();
 620 
 621     Label exit, error;
 622 
 623     // save c_rarg2 and c_rarg3
 624     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 625 
 626     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 627     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 628     __ ldr(c_rarg3, Address(c_rarg2));
 629     __ add(c_rarg3, c_rarg3, 1);
 630     __ str(c_rarg3, Address(c_rarg2));
 631 
 632     // object is in r0
 633     // make sure object is 'reasonable'
 634     __ cbz(r0, exit); // if obj is NULL it is OK
 635 
 636     // Check if the oop is in the right area of memory
 637     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 638     __ andr(c_rarg2, r0, c_rarg3);
 639     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 640 
 641     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 642     // instruction here because the flags register is live.
 643     __ eor(c_rarg2, c_rarg2, c_rarg3);
 644     __ cbnz(c_rarg2, error);
 645 
 646     // make sure klass is 'reasonable', which is not zero.
 647     __ load_klass(r0, r0);  // get klass
 648     __ cbz(r0, error);      // if klass is NULL it is broken
 649 
 650     // return if everything seems ok
 651     __ bind(exit);
 652 
 653     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 654     __ ret(lr);
 655 
 656     // handle errors
 657     __ bind(error);
 658     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 659 
 660     __ push(RegSet::range(r0, r29), sp);
 661     // debug(char* msg, int64_t pc, int64_t regs[])
 662     __ mov(c_rarg0, rscratch1);      // pass address of error message
 663     __ mov(c_rarg1, lr);             // pass return address
 664     __ mov(c_rarg2, sp);             // pass address of regs on stack
 665 #ifndef PRODUCT
 666     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 667 #endif
 668     BLOCK_COMMENT("call MacroAssembler::debug");
 669     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 670     __ blrt(rscratch1, 3, 0, 1);
 671 
 672     return start;
 673   }
 674 
 675   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 676 
 677   // Generate code for an array write pre barrier
 678   //
 679   //     addr       - starting address
 680   //     count      - element count
 681   //     tmp        - scratch register
 682   //     saved_regs - registers to be saved before calling static_write_ref_array_pre
 683   //
 684   //     Callers must specify which registers to preserve in saved_regs.
 685   //     Clobbers: r0-r18, v0-v7, v16-v31, except saved_regs.
 686   //
 687   void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized, RegSet saved_regs) {
 688     BarrierSet* bs = Universe::heap()->barrier_set();
 689     switch (bs->kind()) {
 690     case BarrierSet::G1SATBCTLogging:
 691     case BarrierSet::Shenandoah:
 692       // Don't generate the call if we statically know that the target is uninitialized
 693       if (!dest_uninitialized) {
 694         __ push(saved_regs, sp);
 695         if (count == c_rarg0) {
 696           if (addr == c_rarg1) {
 697             // exactly backwards!!
 698             __ mov(rscratch1, c_rarg0);
 699             __ mov(c_rarg0, c_rarg1);
 700             __ mov(c_rarg1, rscratch1);
 701           } else {
 702             __ mov(c_rarg1, count);
 703             __ mov(c_rarg0, addr);
 704           }
 705         } else {
 706           __ mov(c_rarg0, addr);
 707           __ mov(c_rarg1, count);
 708         }
 709         __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
 710         __ pop(saved_regs, sp);
 711         break;
 712       case BarrierSet::CardTableForRS:
 713       case BarrierSet::CardTableExtension:
 714       case BarrierSet::ModRef:
 715         break;
 716       default:
 717         ShouldNotReachHere();
 718 
 719       }
 720     }
 721   }
 722 
 723   //
 724   // Generate code for an array write post barrier
 725   //
 726   //  Input:
 727   //     start      - register containing starting address of destination array
 728   //     end        - register containing ending address of destination array
 729   //     scratch    - scratch register
 730   //     saved_regs - registers to be saved before calling static_write_ref_array_post
 731   //
 732   //  The input registers are overwritten.
 733   //  The ending address is inclusive.
 734   //  Callers must specify which registers to preserve in saved_regs.
 735   //  Clobbers: r0-r18, v0-v7, v16-v31, except saved_regs.
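       //
       //  Worked example (illustrative only): with uncompressed oops
       //  (BytesPerHeapOop == 8), start == 0x1000 and end == 0x1038 (inclusive),
       //  the code below computes
       //
       //    scratch = (end + BytesPerHeapOop) - start;   // 0x40 bytes
       //    scratch = scratch >> LogBytesPerHeapOop;     // 8 elements
       //
       //  and passes (start, scratch) to static_write_ref_array_post.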
 736   void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch, RegSet saved_regs) {
 737     assert_different_registers(start, end, scratch);
 738     BarrierSet* bs = Universe::heap()->barrier_set();
 739     switch (bs->kind()) {
 740       case BarrierSet::G1SATBCTLogging:
 741       case BarrierSet::Shenandoah:
 742         {
 743           __ push(saved_regs, sp);
 744           // must compute element count unless barrier set interface is changed (other platforms supply count)
 745           assert_different_registers(start, end, scratch);
 746           __ lea(scratch, Address(end, BytesPerHeapOop));
 747           __ sub(scratch, scratch, start);               // subtract start to get #bytes
 748           __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
 749           __ mov(c_rarg0, start);
 750           __ mov(c_rarg1, scratch);
 751           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
 752           __ pop(saved_regs, sp);
 753         }
 754         break;
 755       case BarrierSet::CardTableForRS:
 756       case BarrierSet::CardTableExtension:
 757         {
 758           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
 759           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
 760 
 761           Label L_loop;
 762 
 763            __ lsr(start, start, CardTableModRefBS::card_shift);
 764            __ lsr(end, end, CardTableModRefBS::card_shift);
 765            __ sub(end, end, start); // number of bytes to copy
 766 
 767           const Register count = end; // 'end' register contains bytes count now
 768           __ load_byte_map_base(scratch);
 769           __ add(start, start, scratch);
 770           if (UseConcMarkSweepGC) {
 771             __ membar(__ StoreStore);
 772           }
 773           __ BIND(L_loop);
 774           __ strb(zr, Address(start, count));
 775           __ subs(count, count, 1);
 776           __ br(Assembler::GE, L_loop);
 777         }
 778         break;
 779       default:
 780         ShouldNotReachHere();
 781 
 782     }
 783   }
 784 
 785   // The inner part of zero_words().  This is the bulk operation,
 786   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 787   // caller is responsible for zeroing the last few words.
 788   //
 789   // Inputs:
 790   // r10: the HeapWord-aligned base address of an array to zero.
 791   // r11: the count in HeapWords, r11 > 0.
 792   //
 793   // Returns r10 and r11, adjusted for the caller to clear.
 794   // r10: the base address of the tail of words left to clear.
 795   // r11: the number of words in the tail.
 796   //      r11 < MacroAssembler::zero_words_block_size.
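       //
       // Illustrative sketch of the intended division of labour (not extra
       // generated code): the caller, MacroAssembler::zero_words(), does roughly
       //
       //   zero_blocks(base, cnt);        // bulk, possibly using DC ZVA
       //   while (cnt-- > 0)              // tail, cnt < zero_words_block_size
       //     *base++ = 0;
       //
       // Inside the stub, when UseBlockZeroing is set, one word may be stored
       // first to reach 16-byte alignment, and DC ZVA is only attempted when
       // the remaining byte count is at least MAX2(2 * zva_length,
       // BlockZeroingLowLimit) -- compared in words, hence the '>> 3' below.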
 797 
 798   address generate_zero_blocks() {
 799     Label store_pair, loop_store_pair, done;
 800     Label base_aligned;
 801 
 802     Register base = r10, cnt = r11;
 803 
 804     __ align(CodeEntryAlignment);
 805     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 806     address start = __ pc();
 807 
 808     if (UseBlockZeroing) {
 809       int zva_length = VM_Version::zva_length();
 810 
 811       // Ensure the ZVA length is a multiple of 16. This is required by
 812       // the subsequent operations.
 813       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 814 
 815       __ tbz(base, 3, base_aligned);
 816       __ str(zr, Address(__ post(base, 8)));
 817       __ sub(cnt, cnt, 1);
 818       __ bind(base_aligned);
 819 
 820       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 821       // alignment.
 822       Label small;
 823       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 824       __ subs(rscratch1, cnt, low_limit >> 3);
 825       __ br(Assembler::LT, small);
 826       __ zero_dcache_blocks(base, cnt);
 827       __ bind(small);
 828     }
 829 
 830     {
 831       // Number of stp instructions we'll unroll
 832       const int unroll =
 833         MacroAssembler::zero_words_block_size / 2;
 834       // Clear the remaining blocks.
 835       Label loop;
 836       __ subs(cnt, cnt, unroll * 2);
 837       __ br(Assembler::LT, done);
 838       __ bind(loop);
 839       for (int i = 0; i < unroll; i++)
 840         __ stp(zr, zr, __ post(base, 16));
 841       __ subs(cnt, cnt, unroll * 2);
 842       __ br(Assembler::GE, loop);
 843       __ bind(done);
 844       __ add(cnt, cnt, unroll * 2);
 845     }
 846 
 847     __ ret(lr);
 848 
 849     return start;
 850   }
 851 
 852 
 853   typedef enum {
 854     copy_forwards = 1,
 855     copy_backwards = -1
 856   } copy_direction;
 857 
 858   // Bulk copy of blocks of 8 words.
 859   //
 860   // count is a count of words.
 861   //
 862   // Precondition: count >= 8
 863   //
 864   // Postconditions:
 865   //
 866   // The least significant bit of count contains the remaining count
 867   // of words to copy.  The rest of count is trash.
 868   //
 869   // s and d are adjusted to point to the remaining words to copy
 870   //
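       // Worked example (illustrative only), count == 19 words on entry:
       //   - the 8-word main loop plus the drain store 16 words,
       //   - bit 2 of the running count is clear, so no 4-word subblock,
       //   - bit 1 is set, so a further 2-word subblock is copied,
       //   - bit 0 is set, telling the caller that one word remains.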
 871   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 872                            copy_direction direction) {
 873     int unit = wordSize * direction;
 874     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 875 
 876     int offset;
 877     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 878       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 879     const Register stride = r13;
 880 
 881     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 882     assert_different_registers(s, d, count, rscratch1);
 883 
 884     Label again, drain;
 885     const char *stub_name;
 886     if (direction == copy_forwards)
 887       stub_name = "forward_copy_longs";
 888     else
 889       stub_name = "backward_copy_longs";
 890     StubCodeMark mark(this, "StubRoutines", stub_name);
 891     __ align(CodeEntryAlignment);
 892     __ bind(start);
 893 
 894     Label unaligned_copy_long;
 895     if (AvoidUnalignedAccesses) {
 896       __ tbnz(d, 3, unaligned_copy_long);
 897     }
 898 
 899     if (direction == copy_forwards) {
 900       __ sub(s, s, bias);
 901       __ sub(d, d, bias);
 902     }
 903 
 904 #ifdef ASSERT
 905     // Make sure we are never given < 8 words
 906     {
 907       Label L;
 908       __ cmp(count, 8);
 909       __ br(Assembler::GE, L);
 910       __ stop("generate_copy_longs called with < 8 words");
 911       __ bind(L);
 912     }
 913 #endif
 914 
 915     // Fill 8 registers
 916     if (UseSIMDForMemoryOps) {
 917       __ ldpq(v0, v1, Address(s, 4 * unit));
 918       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 919     } else {
 920       __ ldp(t0, t1, Address(s, 2 * unit));
 921       __ ldp(t2, t3, Address(s, 4 * unit));
 922       __ ldp(t4, t5, Address(s, 6 * unit));
 923       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 924     }
 925 
 926     __ subs(count, count, 16);
 927     __ br(Assembler::LO, drain);
 928 
 929     int prefetch = PrefetchCopyIntervalInBytes;
 930     bool use_stride = false;
 931     if (direction == copy_backwards) {
 932        use_stride = prefetch > 256;
 933        prefetch = -prefetch;
 934        if (use_stride) __ mov(stride, prefetch);
 935     }
 936 
 937     __ bind(again);
 938 
 939     if (PrefetchCopyIntervalInBytes > 0)
 940       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 941 
 942     if (UseSIMDForMemoryOps) {
 943       __ stpq(v0, v1, Address(d, 4 * unit));
 944       __ ldpq(v0, v1, Address(s, 4 * unit));
 945       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 946       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 947     } else {
 948       __ stp(t0, t1, Address(d, 2 * unit));
 949       __ ldp(t0, t1, Address(s, 2 * unit));
 950       __ stp(t2, t3, Address(d, 4 * unit));
 951       __ ldp(t2, t3, Address(s, 4 * unit));
 952       __ stp(t4, t5, Address(d, 6 * unit));
 953       __ ldp(t4, t5, Address(s, 6 * unit));
 954       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 955       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 956     }
 957 
 958     __ subs(count, count, 8);
 959     __ br(Assembler::HS, again);
 960 
 961     // Drain
 962     __ bind(drain);
 963     if (UseSIMDForMemoryOps) {
 964       __ stpq(v0, v1, Address(d, 4 * unit));
 965       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 966     } else {
 967       __ stp(t0, t1, Address(d, 2 * unit));
 968       __ stp(t2, t3, Address(d, 4 * unit));
 969       __ stp(t4, t5, Address(d, 6 * unit));
 970       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 971     }
 972 
 973     {
 974       Label L1, L2;
 975       __ tbz(count, exact_log2(4), L1);
 976       if (UseSIMDForMemoryOps) {
 977         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 978         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 979       } else {
 980         __ ldp(t0, t1, Address(s, 2 * unit));
 981         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 982         __ stp(t0, t1, Address(d, 2 * unit));
 983         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 984       }
 985       __ bind(L1);
 986 
 987       if (direction == copy_forwards) {
 988         __ add(s, s, bias);
 989         __ add(d, d, bias);
 990       }
 991 
 992       __ tbz(count, 1, L2);
 993       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 994       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 995       __ bind(L2);
 996     }
 997 
 998     __ ret(lr);
 999 
1000     if (AvoidUnalignedAccesses) {
1001       Label drain, again;
1002       // Register order for storing. Order is different for backward copy.
1003 
1004       __ bind(unaligned_copy_long);
1005 
1006       // source address is even aligned, target odd aligned
1007       //
1008       // when forward copying word pairs we read long pairs at offsets
1009       // {0, 2, 4, 6} (in long words). when backwards copying we read
1010       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
1011       // address by -2 in the forwards case so we can compute the
1012       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
1013       // or -1.
1014       //
1015       // when forward copying we need to store 1 word, 3 pairs and
1016       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
1017       // zero offset we adjust the destination by -1, which means we
1018       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
1019       //
1020       // When backwards copying we need to store 1 word, 3 pairs and
1021       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
1022       // offsets {1, 3, 5, 7, 8} * unit.
1023 
1024       if (direction == copy_forwards) {
1025         __ sub(s, s, 16);
1026         __ sub(d, d, 8);
1027       }
1028 
1029       // Fill 8 registers
1030       //
1031       // for forwards copy s was offset by -16 from the original input
1032       // value of s so the register contents are at these offsets
1033       // relative to the 64 byte block addressed by that original input
1034       // and so on for each successive 64 byte block when s is updated
1035       //
1036       // t0 at offset 0,  t1 at offset 8
1037       // t2 at offset 16, t3 at offset 24
1038       // t4 at offset 32, t5 at offset 40
1039       // t6 at offset 48, t7 at offset 56
1040 
1041       // for backwards copy s was not offset so the register contents
1042       // are at these offsets into the preceding 64 byte block
1043       // relative to that original input and so on for each successive
1044       // preceding 64 byte block when s is updated. this explains the
1045       // slightly counter-intuitive looking pattern of register usage
1046       // in the stp instructions for backwards copy.
1047       //
1048       // t0 at offset -16, t1 at offset -8
1049       // t2 at offset -32, t3 at offset -24
1050       // t4 at offset -48, t5 at offset -40
1051       // t6 at offset -64, t7 at offset -56
1052 
1053       __ ldp(t0, t1, Address(s, 2 * unit));
1054       __ ldp(t2, t3, Address(s, 4 * unit));
1055       __ ldp(t4, t5, Address(s, 6 * unit));
1056       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1057 
1058       __ subs(count, count, 16);
1059       __ br(Assembler::LO, drain);
1060 
1061       int prefetch = PrefetchCopyIntervalInBytes;
1062       bool use_stride = false;
1063       if (direction == copy_backwards) {
1064          use_stride = prefetch > 256;
1065          prefetch = -prefetch;
1066          if (use_stride) __ mov(stride, prefetch);
1067       }
1068 
1069       __ bind(again);
1070 
1071       if (PrefetchCopyIntervalInBytes > 0)
1072         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1073 
1074       if (direction == copy_forwards) {
1075        // allowing for the offset of -8 the store instructions place
1076        // registers into the target 64 byte block at the following
1077        // offsets
1078        //
1079        // t0 at offset 0
1080        // t1 at offset 8,  t2 at offset 16
1081        // t3 at offset 24, t4 at offset 32
1082        // t5 at offset 40, t6 at offset 48
1083        // t7 at offset 56
1084 
1085         __ str(t0, Address(d, 1 * unit));
1086         __ stp(t1, t2, Address(d, 2 * unit));
1087         __ ldp(t0, t1, Address(s, 2 * unit));
1088         __ stp(t3, t4, Address(d, 4 * unit));
1089         __ ldp(t2, t3, Address(s, 4 * unit));
1090         __ stp(t5, t6, Address(d, 6 * unit));
1091         __ ldp(t4, t5, Address(s, 6 * unit));
1092         __ str(t7, Address(__ pre(d, 8 * unit)));
1093         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1094       } else {
1095        // d was not offset when we started so the registers are
1096        // written into the 64 byte block preceding d with the following
1097        // offsets
1098        //
1099        // t1 at offset -8
1100        // t3 at offset -24, t0 at offset -16
1101        // t5 at offset -40, t2 at offset -32
1102        // t7 at offset -56, t4 at offset -48
1103        //                   t6 at offset -64
1104        //
1105        // note that this matches the offsets previously noted for the
1106        // loads
1107 
1108         __ str(t1, Address(d, 1 * unit));
1109         __ stp(t3, t0, Address(d, 3 * unit));
1110         __ ldp(t0, t1, Address(s, 2 * unit));
1111         __ stp(t5, t2, Address(d, 5 * unit));
1112         __ ldp(t2, t3, Address(s, 4 * unit));
1113         __ stp(t7, t4, Address(d, 7 * unit));
1114         __ ldp(t4, t5, Address(s, 6 * unit));
1115         __ str(t6, Address(__ pre(d, 8 * unit)));
1116         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1117       }
1118 
1119       __ subs(count, count, 8);
1120       __ br(Assembler::HS, again);
1121 
1122       // Drain
1123       //
1124       // this uses the same pattern of offsets and register arguments
1125       // as above
1126       __ bind(drain);
1127       if (direction == copy_forwards) {
1128         __ str(t0, Address(d, 1 * unit));
1129         __ stp(t1, t2, Address(d, 2 * unit));
1130         __ stp(t3, t4, Address(d, 4 * unit));
1131         __ stp(t5, t6, Address(d, 6 * unit));
1132         __ str(t7, Address(__ pre(d, 8 * unit)));
1133       } else {
1134         __ str(t1, Address(d, 1 * unit));
1135         __ stp(t3, t0, Address(d, 3 * unit));
1136         __ stp(t5, t2, Address(d, 5 * unit));
1137         __ stp(t7, t4, Address(d, 7 * unit));
1138         __ str(t6, Address(__ pre(d, 8 * unit)));
1139       }
1140       // now we need to copy any remaining part block which may
1141       // include a 4 word subblock and/or a 2 word subblock.
1142       // bits 2 and 1 in the count are the tell-tale for whether we
1143       // have each such subblock
1144       {
1145         Label L1, L2;
1146         __ tbz(count, exact_log2(4), L1);
1147        // this is the same as above but copying only 4 longs hence
1148        // with only one intervening stp between the str instructions
1149        // but note that the offsets and registers still follow the
1150        // same pattern
1151         __ ldp(t0, t1, Address(s, 2 * unit));
1152         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1153         if (direction == copy_forwards) {
1154           __ str(t0, Address(d, 1 * unit));
1155           __ stp(t1, t2, Address(d, 2 * unit));
1156           __ str(t3, Address(__ pre(d, 4 * unit)));
1157         } else {
1158           __ str(t1, Address(d, 1 * unit));
1159           __ stp(t3, t0, Address(d, 3 * unit));
1160           __ str(t2, Address(__ pre(d, 4 * unit)));
1161         }
1162         __ bind(L1);
1163 
1164         __ tbz(count, 1, L2);
1165        // this is the same as above but copying only 2 longs hence
1166        // there is no intervening stp between the str instructions
1167        // but note that the offset and register patterns are still
1168        // the same
1169         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1170         if (direction == copy_forwards) {
1171           __ str(t0, Address(d, 1 * unit));
1172           __ str(t1, Address(__ pre(d, 2 * unit)));
1173         } else {
1174           __ str(t1, Address(d, 1 * unit));
1175           __ str(t0, Address(__ pre(d, 2 * unit)));
1176         }
1177         __ bind(L2);
1178 
1179        // for forwards copy we need to re-adjust the offsets we
1180        // applied so that s and d follow the last words written
1181 
1182        if (direction == copy_forwards) {
1183          __ add(s, s, 16);
1184          __ add(d, d, 8);
1185        }
1186 
1187       }
1188 
1189       __ ret(lr);
1190       }
1191   }
1192 
1193   // Small copy: less than 16 bytes.
1194   //
1195   // NB: Ignores all of the bits of count which represent more than 15
1196   // bytes, so a caller doesn't have to mask them.
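       //
       // Worked example (illustrative only): a byte copy (granularity == 1)
       // entered with count == 11 (0b1011) tests the low four bits of count:
       //   bit 3 set   -> copy 8 bytes
       //   bit 2 clear -> skip the 4-byte move
       //   bit 1 set   -> copy 2 bytes
       //   bit 0 set   -> copy 1 byte
       // for 11 bytes in total; any higher bits of count are simply ignored.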
1197 
1198   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1199     bool is_backwards = step < 0;
1200     size_t granularity = uabs(step);
1201     int direction = is_backwards ? -1 : 1;
1202     int unit = wordSize * direction;
1203 
1204     Label Lpair, Lword, Lint, Lshort, Lbyte;
1205 
1206     assert(granularity
1207            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1208 
1209     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1210 
1211     // ??? I don't know if this bit-test-and-branch is the right thing
1212     // to do.  It does a lot of jumping, resulting in several
1213     // mispredicted branches.  It might make more sense to do this
1214     // with something like Duff's device with a single computed branch.
1215 
1216     __ tbz(count, 3 - exact_log2(granularity), Lword);
1217     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1218     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1219     __ bind(Lword);
1220 
1221     if (granularity <= sizeof (jint)) {
1222       __ tbz(count, 2 - exact_log2(granularity), Lint);
1223       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1224       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1225       __ bind(Lint);
1226     }
1227 
1228     if (granularity <= sizeof (jshort)) {
1229       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1230       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1231       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1232       __ bind(Lshort);
1233     }
1234 
1235     if (granularity <= sizeof (jbyte)) {
1236       __ tbz(count, 0, Lbyte);
1237       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1238       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1239       __ bind(Lbyte);
1240     }
1241   }
1242 
1243   Label copy_f, copy_b;
1244 
1245   // All-singing all-dancing memory copy.
1246   //
1247   // Copy count units of memory from s to d.  The size of a unit is
1248   // step, which can be positive or negative depending on the direction
1249   // of copy.  If is_aligned is false, we align the source address.
1250   //
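       // Roughly, the code dispatches on the byte count as follows
       // (an illustrative summary of the branches below, not extra code):
       //
       //   count * granularity <= 96 (SIMD) or 80 bytes:
       //       copied inline, loading everything before storing anything,
       //       so the direction of overlap does not matter;
       //   anything larger:
       //       the source is aligned to a 2-word boundary, the bulk is
       //       handed to copy_f/copy_b, and the tail is finished by
       //       copy_memory_small().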
1251 
1252   void copy_memory(bool is_aligned, Register s, Register d,
1253                    Register count, Register tmp, int step) {
1254     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1255     bool is_backwards = step < 0;
1256     int granularity = uabs(step);
1257     const Register t0 = r3, t1 = r4;
1258 
1259     // <= 96 bytes do inline. Direction doesn't matter because we always
1260     // load all the data before writing anything
1261     Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
1262     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1263     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1264     const Register send = r17, dend = r18;
1265 
1266     if (PrefetchCopyIntervalInBytes > 0)
1267       __ prfm(Address(s, 0), PLDL1KEEP);
1268     __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
1269     __ br(Assembler::HI, copy_big);
1270 
1271     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1272     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1273 
1274     __ cmp(count, 16/granularity);
1275     __ br(Assembler::LS, copy16);
1276 
1277     __ cmp(count, 64/granularity);
1278     __ br(Assembler::HI, copy80);
1279 
1280     __ cmp(count, 32/granularity);
1281     __ br(Assembler::LS, copy32);
1282 
1283     // 33..64 bytes
1284     if (UseSIMDForMemoryOps) {
1285       __ ldpq(v0, v1, Address(s, 0));
1286       __ ldpq(v2, v3, Address(send, -32));
1287       __ stpq(v0, v1, Address(d, 0));
1288       __ stpq(v2, v3, Address(dend, -32));
1289     } else {
1290       __ ldp(t0, t1, Address(s, 0));
1291       __ ldp(t2, t3, Address(s, 16));
1292       __ ldp(t4, t5, Address(send, -32));
1293       __ ldp(t6, t7, Address(send, -16));
1294 
1295       __ stp(t0, t1, Address(d, 0));
1296       __ stp(t2, t3, Address(d, 16));
1297       __ stp(t4, t5, Address(dend, -32));
1298       __ stp(t6, t7, Address(dend, -16));
1299     }
1300     __ b(finish);
1301 
1302     // 17..32 bytes
1303     __ bind(copy32);
1304     __ ldp(t0, t1, Address(s, 0));
1305     __ ldp(t2, t3, Address(send, -16));
1306     __ stp(t0, t1, Address(d, 0));
1307     __ stp(t2, t3, Address(dend, -16));
1308     __ b(finish);
1309 
1310     // 65..80/96 bytes
1311     // (96 bytes if SIMD because we do 32 bytes per instruction)
1312     __ bind(copy80);
1313     if (UseSIMDForMemoryOps) {
1314       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1315       __ ldpq(v4, v5, Address(send, -32));
1316       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1317       __ stpq(v4, v5, Address(dend, -32));
1318     } else {
1319       __ ldp(t0, t1, Address(s, 0));
1320       __ ldp(t2, t3, Address(s, 16));
1321       __ ldp(t4, t5, Address(s, 32));
1322       __ ldp(t6, t7, Address(s, 48));
1323       __ ldp(t8, t9, Address(send, -16));
1324 
1325       __ stp(t0, t1, Address(d, 0));
1326       __ stp(t2, t3, Address(d, 16));
1327       __ stp(t4, t5, Address(d, 32));
1328       __ stp(t6, t7, Address(d, 48));
1329       __ stp(t8, t9, Address(dend, -16));
1330     }
1331     __ b(finish);
1332 
1333     // 0..16 bytes
1334     __ bind(copy16);
1335     __ cmp(count, 8/granularity);
1336     __ br(Assembler::LO, copy8);
1337 
1338     // 8..16 bytes
1339     __ ldr(t0, Address(s, 0));
1340     __ ldr(t1, Address(send, -8));
1341     __ str(t0, Address(d, 0));
1342     __ str(t1, Address(dend, -8));
1343     __ b(finish);
1344 
1345     if (granularity < 8) {
1346       // 4..7 bytes
1347       __ bind(copy8);
1348       __ tbz(count, 2 - exact_log2(granularity), copy4);
1349       __ ldrw(t0, Address(s, 0));
1350       __ ldrw(t1, Address(send, -4));
1351       __ strw(t0, Address(d, 0));
1352       __ strw(t1, Address(dend, -4));
1353       __ b(finish);
1354       if (granularity < 4) {
1355         // 0..3 bytes
1356         __ bind(copy4);
1357         __ cbz(count, finish); // get rid of 0 case
1358         if (granularity == 2) {
1359           __ ldrh(t0, Address(s, 0));
1360           __ strh(t0, Address(d, 0));
1361         } else { // granularity == 1
1362           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1363           // the first and last byte.
1364           // Handle the 3 byte case by loading and storing base + count/2
1365           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1366           // This does mean in the 1 byte case we load/store the same
1367           // byte 3 times.
1368           __ lsr(count, count, 1);
1369           __ ldrb(t0, Address(s, 0));
1370           __ ldrb(t1, Address(send, -1));
1371           __ ldrb(t2, Address(s, count));
1372           __ strb(t0, Address(d, 0));
1373           __ strb(t1, Address(dend, -1));
1374           __ strb(t2, Address(d, count));
1375         }
1376         __ b(finish);
1377       }
1378     }
1379 
1380     __ bind(copy_big);
1381     if (is_backwards) {
1382       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1383       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1384     }
1385 
1386     // Now we've got the small case out of the way we can align the
1387     // source address on a 2-word boundary.
1388 
1389     Label aligned;
1390 
1391     if (is_aligned) {
1392       // We may have to adjust by 1 word to get s 2-word-aligned.
1393       __ tbz(s, exact_log2(wordSize), aligned);
1394       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1395       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1396       __ sub(count, count, wordSize/granularity);
1397     } else {
1398       if (is_backwards) {
1399         __ andr(rscratch2, s, 2 * wordSize - 1);
1400       } else {
1401         __ neg(rscratch2, s);
1402         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1403       }
1404       // rscratch2 is the byte adjustment needed to align s.
1405       __ cbz(rscratch2, aligned);
1406       int shift = exact_log2(granularity);
1407       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1408       __ sub(count, count, rscratch2);
1409 
1410 #if 0
1411       // ?? This code is only correct for a disjoint copy.  It may or
1412       // may not make sense to use it in that case.
1413 
1414       // Copy the first pair; s and d may not be aligned.
1415       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1416       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1417 
1418       // Align s and d, adjust count
1419       if (is_backwards) {
1420         __ sub(s, s, rscratch2);
1421         __ sub(d, d, rscratch2);
1422       } else {
1423         __ add(s, s, rscratch2);
1424         __ add(d, d, rscratch2);
1425       }
1426 #else
1427       copy_memory_small(s, d, rscratch2, rscratch1, step);
1428 #endif
1429     }
1430 
1431     __ bind(aligned);
1432 
1433     // s is now 2-word-aligned.
1434 
1435     // We have a count of units and some trailing bytes.  Adjust the
1436     // count and do a bulk copy of words.
1437     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1438     if (direction == copy_forwards)
1439       __ bl(copy_f);
1440     else
1441       __ bl(copy_b);
1442 
1443     // And the tail.
1444     copy_memory_small(s, d, count, tmp, step);
1445 
1446     if (granularity >= 8) __ bind(copy8);
1447     if (granularity >= 4) __ bind(copy4);
1448     __ bind(finish);
1449   }
1450 
1451 
1452   void clobber_registers() {
1453 #ifdef ASSERT
1454     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1455     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1456     for (Register r = r3; r <= r18; r++)
1457       if (r != rscratch1) __ mov(r, rscratch1);
1458 #endif
1459   }
1460 
1461   // Scan over array at a for count oops, verifying each one.
1462   // Preserves a and count, clobbers rscratch1 and rscratch2.
1463   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1464     Label loop, end;
1465     __ mov(rscratch1, a);
1466     __ mov(rscratch2, zr);
1467     __ bind(loop);
1468     __ cmp(rscratch2, count);
1469     __ br(Assembler::HS, end);
1470     if (size == (size_t)wordSize) {
1471       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1472       __ verify_oop(temp);
1473     } else {
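      // Note: callers in this file pass temp == r16, so the narrow oop
      // loaded into r16 below is the one decoded (and verified) via temp.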
1474       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1475       __ decode_heap_oop(temp); // calls verify_oop
1476     }
1477     __ add(rscratch2, rscratch2, size);
1478     __ b(loop);
1479     __ bind(end);
1480   }
1481 
1482   // Arguments:
1483   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1484   //             ignored
1485   //   is_oop  - true => oop array, so generate store check code
1486   //   name    - stub name string
1487   //
1488   // Inputs:
1489   //   c_rarg0   - source array address
1490   //   c_rarg1   - destination array address
1491   //   c_rarg2   - element count, treated as ssize_t, can be zero
1492   //
1493   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1494   // the hardware handle it.  The two dwords within qwords that span
1495   // cache line boundaries will still be loaded and stored atomically.
1496   //
1497   // Side Effects:
1498   //   disjoint_int_copy_entry is set to the no-overlap entry point
1499   //   used by generate_conjoint_int_oop_copy().
1500   //
1501   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1502                                   const char *name, bool dest_uninitialized = false) {
1503     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1504     RegSet saved_reg = RegSet::of(s, d, count);
1505     __ align(CodeEntryAlignment);
1506     StubCodeMark mark(this, "StubRoutines", name);
1507     address start = __ pc();
1508     __ enter();
1509 
1510     if (entry != NULL) {
1511       *entry = __ pc();
1512       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1513       BLOCK_COMMENT("Entry:");
1514     }
1515 
1516     if (is_oop) {
1517       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized, saved_reg);
1518       // save regs before copy_memory
1519       __ push(RegSet::of(d, count), sp);
1520     }
1521     copy_memory(aligned, s, d, count, rscratch1, size);
1522     if (is_oop) {
1523       __ pop(RegSet::of(d, count), sp);
1524       if (VerifyOops)
1525         verify_oop_array(size, d, count, r16);
1526       __ sub(count, count, 1); // make an inclusive end pointer
1527       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1528       gen_write_ref_array_post_barrier(d, count, rscratch1, RegSet());
1529     }
1530     __ leave();
1531     __ mov(r0, zr); // return 0
1532     __ ret(lr);
1533 #ifdef BUILTIN_SIM
1534     {
1535       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1536       sim->notifyCompile(const_cast<char*>(name), start);
1537     }
1538 #endif
1539     return start;
1540   }
1541 
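  // A hedged, C-level sketch (illustrative only, never compiled) of what the
  // stub generated above does for the oop case.  pre_barrier, forward_copy
  // and post_barrier are hypothetical stand-ins for
  // gen_write_ref_array_pre_barrier, copy_memory and
  // gen_write_ref_array_post_barrier.
#if 0
  static int disjoint_copy_sketch(char* s, char* d, size_t count, size_t size,
                                  bool is_oop) {
    if (is_oop) pre_barrier(d, count);                     // pre-write barrier
    forward_copy(s, d, count * size);                      // bulk ascending copy
    if (is_oop) post_barrier(d, d + (count - 1) * size);   // card marks, inclusive end
    return 0;                                              // the stub returns 0 in r0
  }
#endif
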
1542   // Arguments:
1543   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1544   //             ignored
1545   //   is_oop  - true => oop array, so generate store check code
1546   //   name    - stub name string
1547   //
1548   // Inputs:
1549   //   c_rarg0   - source array address
1550   //   c_rarg1   - destination array address
1551   //   c_rarg2   - element count, treated as ssize_t, can be zero
1552   //
1553   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1554   // the hardware handle it.  The two dwords within qwords that span
1555   // cache line boundaries will still be loaded and stored atomically.
1556   //
1557   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1558                                  address *entry, const char *name,
1559                                  bool dest_uninitialized = false) {
1560     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1561     RegSet saved_regs = RegSet::of(s, d, count);
1562     StubCodeMark mark(this, "StubRoutines", name);
1563     address start = __ pc();
1564     __ enter();
1565 
1566     if (entry != NULL) {
1567       *entry = __ pc();
1568       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1569       BLOCK_COMMENT("Entry:");
1570     }
1571 
1572     // use fwd copy when (d-s) above_equal (count*size)
1573     __ sub(rscratch1, d, s);
1574     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1575     __ br(Assembler::HS, nooverlap_target);
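    // (A hedged worked example: for a jint copy of 8 elements, count*size is
    //  32 bytes.  If d == s + 32 the regions are disjoint and we branch to
    //  the forward copy; if d < s, the unsigned difference wraps to a huge
    //  value, which also selects the forward copy -- correct even when the
    //  regions overlap, because the destination lies below the source.
    //  Only an overlapping copy with d above s falls through to the
    //  backward copy below.)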
1576 
1577     if (is_oop) {
1578       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized, saved_regs);
1579       // save regs before copy_memory
1580       __ push(RegSet::of(d, count), sp);
1581     }
1582     copy_memory(aligned, s, d, count, rscratch1, -size);
1583     if (is_oop) {
1584       __ pop(RegSet::of(d, count), sp);
1585       if (VerifyOops)
1586         verify_oop_array(size, d, count, r16);
1587       __ sub(count, count, 1); // make an inclusive end pointer
1588       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1589       gen_write_ref_array_post_barrier(d, count, rscratch1, RegSet());
1590     }
1591     __ leave();
1592     __ mov(r0, zr); // return 0
1593     __ ret(lr);
1594 #ifdef BUILTIN_SIM
1595     {
1596       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1597       sim->notifyCompile(const_cast<char*>(name), start);
1598     }
1599 #endif
1600     return start;
1601   }
1602 
1603   // Arguments:
1604   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1605   //             ignored
1606   //   name    - stub name string
1607   //
1608   // Inputs:
1609   //   c_rarg0   - source array address
1610   //   c_rarg1   - destination array address
1611   //   c_rarg2   - element count, treated as ssize_t, can be zero
1612   //
1613   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1614   // we let the hardware handle it.  The one to eight bytes within words,
1615   // dwords or qwords that span cache line boundaries will still be loaded
1616   // and stored atomically.
1617   //
1625   // Side Effects:
1626   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1627   //   used by generate_conjoint_byte_copy().
1628   //
1629   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1630     const bool not_oop = false;
1631     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1632   }
1633 
1634   // Arguments:
1635   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1636   //             ignored
1637   //   name    - stub name string
1638   //
1639   // Inputs:
1640   //   c_rarg0   - source array address
1641   //   c_rarg1   - destination array address
1642   //   c_rarg2   - element count, treated as ssize_t, can be zero
1643   //
1644   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1645   // we let the hardware handle it.  The one to eight bytes within words,
1646   // dwords or qwords that span cache line boundaries will still be loaded
1647   // and stored atomically.
1648   //
1649   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1650                                       address* entry, const char *name) {
1651     const bool not_oop = false;
1652     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1653   }
1654 
1655   // Arguments:
1656   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1657   //             ignored
1658   //   name    - stub name string
1659   //
1660   // Inputs:
1661   //   c_rarg0   - source array address
1662   //   c_rarg1   - destination array address
1663   //   c_rarg2   - element count, treated as ssize_t, can be zero
1664   //
1665   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1666   // let the hardware handle it.  The two or four words within dwords
1667   // or qwords that span cache line boundaries will still be loaded
1668   // and stored atomically.
1669   //
1670   // Side Effects:
1671   //   disjoint_short_copy_entry is set to the no-overlap entry point
1672   //   used by generate_conjoint_short_copy().
1673   //
1674   address generate_disjoint_short_copy(bool aligned,
1675                                        address* entry, const char *name) {
1676     const bool not_oop = false;
1677     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1678   }
1679 
1680   // Arguments:
1681   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1682   //             ignored
1683   //   name    - stub name string
1684   //
1685   // Inputs:
1686   //   c_rarg0   - source array address
1687   //   c_rarg1   - destination array address
1688   //   c_rarg2   - element count, treated as ssize_t, can be zero
1689   //
1690   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1691   // let the hardware handle it.  The two or four words within dwords
1692   // or qwords that span cache line boundaries will still be loaded
1693   // and stored atomically.
1694   //
1695   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1696                                        address *entry, const char *name) {
1697     const bool not_oop = false;
1698     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1699   }
1700
1701   // Arguments:
1702   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1703   //             ignored
1704   //   name    - stub name string
1705   //
1706   // Inputs:
1707   //   c_rarg0   - source array address
1708   //   c_rarg1   - destination array address
1709   //   c_rarg2   - element count, treated as ssize_t, can be zero
1710   //
1711   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1712   // the hardware handle it.  The two dwords within qwords that span
1713   // cache line boundaries will still be loaded and stored atomically.
1714   //
1715   // Side Effects:
1716   //   disjoint_int_copy_entry is set to the no-overlap entry point
1717   //   used by generate_conjoint_int_oop_copy().
1718   //
1719   address generate_disjoint_int_copy(bool aligned, address *entry,
1720                                          const char *name, bool dest_uninitialized = false) {
1721     const bool not_oop = false;
1722     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1723   }
1724 
1725   // Arguments:
1726   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1727   //             ignored
1728   //   name    - stub name string
1729   //
1730   // Inputs:
1731   //   c_rarg0   - source array address
1732   //   c_rarg1   - destination array address
1733   //   c_rarg2   - element count, treated as ssize_t, can be zero
1734   //
1735   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1736   // the hardware handle it.  The two dwords within qwords that span
1737   // cache line boundaries will still be loaded and stored atomically.
1738   //
1739   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1740                                      address *entry, const char *name,
1741                                      bool dest_uninitialized = false) {
1742     const bool not_oop = false;
1743     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1744   }
1745 
1746 
1747   // Arguments:
1748   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1749   //             ignored
1750   //   name    - stub name string
1751   //
1752   // Inputs:
1753   //   c_rarg0   - source array address
1754   //   c_rarg1   - destination array address
1755   //   c_rarg2   - element count, treated as size_t, can be zero
1756   //
1757   // Side Effects:
1758   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1759   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1760   //
1761   address generate_disjoint_long_copy(bool aligned, address *entry,
1762                                           const char *name, bool dest_uninitialized = false) {
1763     const bool not_oop = false;
1764     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1765   }
1766 
1767   // Arguments:
1768   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1769   //             ignored
1770   //   name    - stub name string
1771   //
1772   // Inputs:
1773   //   c_rarg0   - source array address
1774   //   c_rarg1   - destination array address
1775   //   c_rarg2   - element count, treated as size_t, can be zero
1776   //
1777   address generate_conjoint_long_copy(bool aligned,
1778                                       address nooverlap_target, address *entry,
1779                                       const char *name, bool dest_uninitialized = false) {
1780     const bool not_oop = false;
1781     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1782   }
1783 
1784   // Arguments:
1785   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1786   //             ignored
1787   //   name    - stub name string
1788   //
1789   // Inputs:
1790   //   c_rarg0   - source array address
1791   //   c_rarg1   - destination array address
1792   //   c_rarg2   - element count, treated as size_t, can be zero
1793   //
1794   // Side Effects:
1795   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1796   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1797   //
1798   address generate_disjoint_oop_copy(bool aligned, address *entry,
1799                                      const char *name, bool dest_uninitialized) {
1800     const bool is_oop = true;
1801     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1802     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1803   }
1804 
1805   // Arguments:
1806   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1807   //             ignored
1808   //   name    - stub name string
1809   //
1810   // Inputs:
1811   //   c_rarg0   - source array address
1812   //   c_rarg1   - destination array address
1813   //   c_rarg2   - element count, treated as size_t, can be zero
1814   //
1815   address generate_conjoint_oop_copy(bool aligned,
1816                                      address nooverlap_target, address *entry,
1817                                      const char *name, bool dest_uninitialized) {
1818     const bool is_oop = true;
1819     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1820     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1821                                   name, dest_uninitialized);
1822   }
1823 
1824 
1825   // Helper for generating a dynamic type check.
1826   // Smashes rscratch1.
1827   void generate_type_check(Register sub_klass,
1828                            Register super_check_offset,
1829                            Register super_klass,
1830                            Label& L_success) {
1831     assert_different_registers(sub_klass, super_check_offset, super_klass);
1832 
1833     BLOCK_COMMENT("type_check:");
1834 
1835     Label L_miss;
1836 
1837     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1838                                      super_check_offset);
1839     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1840 
1841     // Fall through on failure!
1842     __ BIND(L_miss);
1843   }
1844 
1845   //
1846   //  Generate checkcasting array copy stub
1847   //
1848   //  Input:
1849   //    c_rarg0   - source array address
1850   //    c_rarg1   - destination array address
1851   //    c_rarg2   - element count, treated as ssize_t, can be zero
1852   //    c_rarg3   - size_t ckoff (super_check_offset)
1853   //    c_rarg4   - oop ckval (super_klass)
1854   //
1855   //  Output:
1856   //    r0 ==  0  -  success
1857   //    r0 == -1^K - failure, where K is partial transfer count
1858   //
1859   address generate_checkcast_copy(const char *name, address *entry,
1860                                   bool dest_uninitialized = false) {
1861 
1862     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1863 
1864     // Input registers (after setup_arg_regs)
1865     const Register from        = c_rarg0;   // source array address
1866     const Register to          = c_rarg1;   // destination array address
1867     const Register count       = c_rarg2;   // elements count
1868     const Register ckoff       = c_rarg3;   // super_check_offset
1869     const Register ckval       = c_rarg4;   // super_klass
1870 
1871     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1872     RegSet wb_post_saved_regs = RegSet::of(count);
1873 
1874     // Registers used as temps (r18, r19, r20 are save-on-entry)
1875     const Register count_save  = r21;       // orig elements count
1876     const Register start_to    = r20;       // destination array start address
1877     const Register copied_oop  = r18;       // actual oop copied
1878     const Register r19_klass   = r19;       // oop._klass
1879 
1880     //---------------------------------------------------------------
1881     // Assembler stub will be used for this call to arraycopy
1882     // if the two arrays are subtypes of Object[] but the
1883     // destination array type is not equal to or a supertype
1884     // of the source type.  Each element must be separately
1885     // checked.
1886 
1887     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1888                                copied_oop, r19_klass, count_save);
1889 
1890     __ align(CodeEntryAlignment);
1891     StubCodeMark mark(this, "StubRoutines", name);
1892     address start = __ pc();
1893 
1894     __ enter(); // required for proper stackwalking of RuntimeStub frame
1895 
1896 #ifdef ASSERT
1897     // caller guarantees that the arrays really are different
1898     // otherwise, we would have to make conjoint checks
1899     { Label L;
1900       array_overlap_test(L, TIMES_OOP);
1901       __ stop("checkcast_copy within a single array");
1902       __ bind(L);
1903     }
1904 #endif //ASSERT
1905 
1906     // Caller of this entry point must set up the argument registers.
1907     if (entry != NULL) {
1908       *entry = __ pc();
1909       BLOCK_COMMENT("Entry:");
1910     }
1911 
1912      // Empty array:  Nothing to do.
1913     __ cbz(count, L_done);
1914 
1915     __ push(RegSet::of(r18, r19, r20, r21), sp);
1916 
1917 #ifdef ASSERT
1918     BLOCK_COMMENT("assert consistent ckoff/ckval");
1919     // The ckoff and ckval must be mutually consistent,
1920     // even though caller generates both.
1921     { Label L;
1922       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1923       __ ldrw(start_to, Address(ckval, sco_offset));
1924       __ cmpw(ckoff, start_to);
1925       __ br(Assembler::EQ, L);
1926       __ stop("super_check_offset inconsistent");
1927       __ bind(L);
1928     }
1929 #endif //ASSERT
1930 
1931     gen_write_ref_array_pre_barrier(to, count, dest_uninitialized, wb_pre_saved_regs);
1932 
1933     // save the original count
1934     __ mov(count_save, count);
1935 
1936     // Copy from low to high addresses
1937     __ mov(start_to, to);              // Save destination array start address
1938     __ b(L_load_element);
1939 
1940     // ======== begin loop ========
1941     // (Loop is rotated; its entry is L_load_element.)
1942     // Loop control:
1943     //   for (; count != 0; count--) {
1944     //     copied_oop = load_heap_oop(from++);
1945     //     ... generate_type_check ...;
1946     //     store_heap_oop(to++, copied_oop);
1947     //   }
1948     __ align(OptoLoopAlignment);
1949 
1950     __ BIND(L_store_element);
1951     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1952     __ sub(count, count, 1);
1953     __ cbz(count, L_do_card_marks);
1954 
1955     // ======== loop entry is here ========
1956     __ BIND(L_load_element);
1957     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1958     __ cbz(copied_oop, L_store_element);
1959 
1960     __ load_klass(r19_klass, copied_oop);// query the object klass
1961     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1962     // ======== end loop ========
1963 
1964     // It was a real error; we must depend on the caller to finish the job.
1965     // Register count = remaining oops, count_orig = total oops.
1966     // Emit GC store barriers for the oops we have copied and report
1967     // their number to the caller.
1968 
1969     __ subs(count, count_save, count);     // K = partially copied oop count
1970     __ eon(count, count, zr);                   // report (-1^K) to caller
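    // (Worked example: if 3 of 10 oops were copied before a type check
    //  failed, count_save - count == 3 and eon with zr yields ~3 == -4,
    //  i.e. -1^3; the caller inverts r0 to recover the partial count.)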
1971     __ br(Assembler::EQ, L_done_pop);
1972 
1973     __ BIND(L_do_card_marks);
1974     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1975     gen_write_ref_array_post_barrier(start_to, to, rscratch1, wb_post_saved_regs);
1976 
1977     __ bind(L_done_pop);
1978     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1979     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1980 
1981     __ bind(L_done);
1982     __ mov(r0, count);
1983     __ leave();
1984     __ ret(lr);
1985 
1986     return start;
1987   }
1988 
1989   // Perform range checks on the proposed arraycopy.
1990   // Kills temp, but nothing else.
1991   // Also, clean the sign bits of src_pos and dst_pos.
1992   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1993                               Register src_pos, // source position (c_rarg1)
1994                               Register dst,     // destination array oop (c_rarg2)
1995                               Register dst_pos, // destination position (c_rarg3)
1996                               Register length,
1997                               Register temp,
1998                               Label& L_failed) {
1999     BLOCK_COMMENT("arraycopy_range_checks:");
2000 
2001     assert_different_registers(rscratch1, temp);
2002 
2003     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2004     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2005     __ addw(temp, length, src_pos);
2006     __ cmpw(temp, rscratch1);
2007     __ br(Assembler::HI, L_failed);
2008 
2009     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2010     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2011     __ addw(temp, length, dst_pos);
2012     __ cmpw(temp, rscratch1);
2013     __ br(Assembler::HI, L_failed);
2014 
2015     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
2016     __ movw(src_pos, src_pos);
2017     __ movw(dst_pos, dst_pos);
2018 
2019     BLOCK_COMMENT("arraycopy_range_checks done");
2020   }
2021 
2022   // These stubs get called from some dumb test routine.
2023   // I'll write them properly when they're called from
2024   // something that's actually doing something.
2025   static void fake_arraycopy_stub(address src, address dst, int count) {
2026     assert(count == 0, "huh?");
2027   }
2028 
2029 
2030   //
2031   //  Generate 'unsafe' array copy stub
2032   //  Though just as safe as the other stubs, it takes an unscaled
2033   //  size_t argument instead of an element count.
2034   //
2035   //  Input:
2036   //    c_rarg0   - source array address
2037   //    c_rarg1   - destination array address
2038   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2039   //
2040   // Examines the alignment of the operands and dispatches
2041   // to a long, int, short, or byte copy loop.
2042   //
2043   address generate_unsafe_copy(const char *name,
2044                                address byte_copy_entry,
2045                                address short_copy_entry,
2046                                address int_copy_entry,
2047                                address long_copy_entry) {
2048     Label L_long_aligned, L_int_aligned, L_short_aligned;
2049     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2050 
2051     __ align(CodeEntryAlignment);
2052     StubCodeMark mark(this, "StubRoutines", name);
2053     address start = __ pc();
2054     __ enter(); // required for proper stackwalking of RuntimeStub frame
2055 
2056     // bump this on entry, not on exit:
2057     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2058 
2059     __ orr(rscratch1, s, d);
2060     __ orr(rscratch1, rscratch1, count);
2061 
2062     __ andr(rscratch1, rscratch1, BytesPerLong-1);
2063     __ cbz(rscratch1, L_long_aligned);
2064     __ andr(rscratch1, rscratch1, BytesPerInt-1);
2065     __ cbz(rscratch1, L_int_aligned);
2066     __ tbz(rscratch1, 0, L_short_aligned);
2067     __ b(RuntimeAddress(byte_copy_entry));
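    // (Worked example: s = 0x1002, d = 0x2006, count = 10 gives
    //  s|d|count = 0x300e; the low three bits (6) are non-zero, so not
    //  long-aligned, and 6 & 3 == 2 is non-zero, so not int-aligned, but
    //  bit 0 is clear, so we take the jshort copy entry below.)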
2068 
2069     __ BIND(L_short_aligned);
2070     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2071     __ b(RuntimeAddress(short_copy_entry));
2072     __ BIND(L_int_aligned);
2073     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2074     __ b(RuntimeAddress(int_copy_entry));
2075     __ BIND(L_long_aligned);
2076     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2077     __ b(RuntimeAddress(long_copy_entry));
2078 
2079     return start;
2080   }
2081 
2082   //
2083   //  Generate generic array copy stubs
2084   //
2085   //  Input:
2086   //    c_rarg0    -  src oop
2087   //    c_rarg1    -  src_pos (32-bits)
2088   //    c_rarg2    -  dst oop
2089   //    c_rarg3    -  dst_pos (32-bits)
2090   //    c_rarg4    -  element count (32-bits)
2091   //
2092   //  Output:
2093   //    r0 ==  0  -  success
2094   //    r0 == -1^K - failure, where K is partial transfer count
2095   //
2096   address generate_generic_copy(const char *name,
2097                                 address byte_copy_entry, address short_copy_entry,
2098                                 address int_copy_entry, address oop_copy_entry,
2099                                 address long_copy_entry, address checkcast_copy_entry) {
2100 
2101     Label L_failed, L_failed_0, L_objArray;
2102     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2103 
2104     // Input registers
2105     const Register src        = c_rarg0;  // source array oop
2106     const Register src_pos    = c_rarg1;  // source position
2107     const Register dst        = c_rarg2;  // destination array oop
2108     const Register dst_pos    = c_rarg3;  // destination position
2109     const Register length     = c_rarg4;
2110 
2111     StubCodeMark mark(this, "StubRoutines", name);
2112 
2113     __ align(CodeEntryAlignment);
2114     address start = __ pc();
2115 
2116     __ enter(); // required for proper stackwalking of RuntimeStub frame
2117 
2118     // bump this on entry, not on exit:
2119     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2120 
2121     //-----------------------------------------------------------------------
2122     // Assembler stub will be used for this call to arraycopy
2123     // if the following conditions are met:
2124     //
2125     // (1) src and dst must not be null.
2126     // (2) src_pos must not be negative.
2127     // (3) dst_pos must not be negative.
2128     // (4) length  must not be negative.
2129     // (5) src klass and dst klass should be the same and not NULL.
2130     // (6) src and dst should be arrays.
2131     // (7) src_pos + length must not exceed length of src.
2132     // (8) dst_pos + length must not exceed length of dst.
2133     //
2134 
2135     //  if (src == NULL) return -1;
2136     __ cbz(src, L_failed);
2137 
2138     //  if (src_pos < 0) return -1;
2139     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2140 
2141     //  if (dst == NULL) return -1;
2142     __ cbz(dst, L_failed);
2143 
2144     //  if (dst_pos < 0) return -1;
2145     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2146 
2147     // registers used as temp
2148     const Register scratch_length    = r16; // elements count to copy
2149     const Register scratch_src_klass = r17; // array klass
2150     const Register lh                = r18; // layout helper
2151 
2152     //  if (length < 0) return -1;
2153     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2154     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2155 
2156     __ load_klass(scratch_src_klass, src);
2157 #ifdef ASSERT
2158     //  assert(src->klass() != NULL);
2159     {
2160       BLOCK_COMMENT("assert klasses not null {");
2161       Label L1, L2;
2162       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2163       __ bind(L1);
2164       __ stop("broken null klass");
2165       __ bind(L2);
2166       __ load_klass(rscratch1, dst);
2167       __ cbz(rscratch1, L1);     // this would be broken also
2168       BLOCK_COMMENT("} assert klasses not null done");
2169     }
2170 #endif
2171 
2172     // Load layout helper (32-bits)
2173     //
2174     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2175     // 32        30    24            16              8     2                 0
2176     //
2177     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2178     //
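    // (A hedged worked example: for a jint[] the tag field marks a typeArray,
    //  the element_type field holds T_INT and log2_element_size is 2, so the
    //  element-size bit tests below land in the int copy dispatch.)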
2179 
2180     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2181 
2182     // Handle objArrays completely differently...
2183     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2184     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2185     __ movw(rscratch1, objArray_lh);
2186     __ eorw(rscratch2, lh, rscratch1);
2187     __ cbzw(rscratch2, L_objArray);
2188 
2189     //  if (src->klass() != dst->klass()) return -1;
2190     __ load_klass(rscratch2, dst);
2191     __ eor(rscratch2, rscratch2, scratch_src_klass);
2192     __ cbnz(rscratch2, L_failed);
2193 
2194     //  if (!src->is_Array()) return -1;
2195     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2196 
2197     // At this point, it is known to be a typeArray (array_tag 0x3).
2198 #ifdef ASSERT
2199     {
2200       BLOCK_COMMENT("assert primitive array {");
2201       Label L;
2202       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2203       __ cmpw(lh, rscratch2);
2204       __ br(Assembler::GE, L);
2205       __ stop("must be a primitive array");
2206       __ bind(L);
2207       BLOCK_COMMENT("} assert primitive array done");
2208     }
2209 #endif
2210 
2211     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2212                            rscratch2, L_failed);
2213 
2214     // TypeArrayKlass
2215     //
2216     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2217     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2218     //
2219 
2220     const Register rscratch1_offset = rscratch1;    // array offset
2221     const Register r18_elsize = lh; // element size
2222 
2223     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2224            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2225     __ add(src, src, rscratch1_offset);           // src array offset
2226     __ add(dst, dst, rscratch1_offset);           // dst array offset
2227     BLOCK_COMMENT("choose copy loop based on element size");
2228 
2229     // next registers should be set before the jump to corresponding stub
2230     const Register from     = c_rarg0;  // source array address
2231     const Register to       = c_rarg1;  // destination array address
2232     const Register count    = c_rarg2;  // elements count
2233 
2234     // 'from', 'to' and 'count' must be set in this order, since they
2235     // alias 'src', 'src_pos' and 'dst' respectively.
2236 
2237     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2238 
2239     // The possible values of elsize are 0-3, i.e. exact_log2(element
2240     // size in bytes).  We do a simple bitwise binary search.
2241   __ BIND(L_copy_bytes);
2242     __ tbnz(r18_elsize, 1, L_copy_ints);
2243     __ tbnz(r18_elsize, 0, L_copy_shorts);
2244     __ lea(from, Address(src, src_pos));// src_addr
2245     __ lea(to,   Address(dst, dst_pos));// dst_addr
2246     __ movw(count, scratch_length); // length
2247     __ b(RuntimeAddress(byte_copy_entry));
2248 
2249   __ BIND(L_copy_shorts);
2250     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2251     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2252     __ movw(count, scratch_length); // length
2253     __ b(RuntimeAddress(short_copy_entry));
2254 
2255   __ BIND(L_copy_ints);
2256     __ tbnz(r18_elsize, 0, L_copy_longs);
2257     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2258     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2259     __ movw(count, scratch_length); // length
2260     __ b(RuntimeAddress(int_copy_entry));
2261 
2262   __ BIND(L_copy_longs);
2263 #ifdef ASSERT
2264     {
2265       BLOCK_COMMENT("assert long copy {");
2266       Label L;
2267       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2268       __ cmpw(r18_elsize, LogBytesPerLong);
2269       __ br(Assembler::EQ, L);
2270       __ stop("must be long copy, but elsize is wrong");
2271       __ bind(L);
2272       BLOCK_COMMENT("} assert long copy done");
2273     }
2274 #endif
2275     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2276     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2277     __ movw(count, scratch_length); // length
2278     __ b(RuntimeAddress(long_copy_entry));
2279 
2280     // ObjArrayKlass
2281   __ BIND(L_objArray);
2282     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2283 
2284     Label L_plain_copy, L_checkcast_copy;
2285     //  test array classes for subtyping
2286     __ load_klass(r18, dst);
2287     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2288     __ br(Assembler::NE, L_checkcast_copy);
2289 
2290     // Identically typed arrays can be copied without element-wise checks.
2291     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2292                            rscratch2, L_failed);
2293 
2294     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2295     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2296     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2297     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2298     __ movw(count, scratch_length); // length
2299   __ BIND(L_plain_copy);
2300     __ b(RuntimeAddress(oop_copy_entry));
2301 
2302   __ BIND(L_checkcast_copy);
2303     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2304     {
2305       // Before looking at dst.length, make sure dst is also an objArray.
2306       __ ldrw(rscratch1, Address(r18, lh_offset));
2307       __ movw(rscratch2, objArray_lh);
2308       __ eorw(rscratch1, rscratch1, rscratch2);
2309       __ cbnzw(rscratch1, L_failed);
2310 
2311       // It is safe to examine both src.length and dst.length.
2312       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2313                              r18, L_failed);
2314 
2315       const Register rscratch2_dst_klass = rscratch2;
2316       __ load_klass(rscratch2_dst_klass, dst); // reload
2317 
2318       // Marshal the base address arguments now, freeing registers.
2319       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2320       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2321       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2322       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2323       __ movw(count, length);           // length (reloaded)
2324       Register sco_temp = c_rarg3;      // this register is free now
2325       assert_different_registers(from, to, count, sco_temp,
2326                                  rscratch2_dst_klass, scratch_src_klass);
2327       // assert_clean_int(count, sco_temp);
2328 
2329       // Generate the type check.
2330       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2331       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2332       // assert_clean_int(sco_temp, r18);
2333       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2334 
2335       // Fetch destination element klass from the ObjArrayKlass header.
2336       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2337       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2338       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2339 
2340       // the checkcast_copy loop needs two extra arguments:
2341       assert(c_rarg3 == sco_temp, "#3 already in place");
2342       // Set up arguments for checkcast_copy_entry.
2343       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2344       __ b(RuntimeAddress(checkcast_copy_entry));
2345     }
2346 
2347   __ BIND(L_failed);
2348     __ mov(r0, -1);
2349     __ leave();   // required for proper stackwalking of RuntimeStub frame
2350     __ ret(lr);
2351 
2352     return start;
2353   }
2354 
2355   //
2356   // Generate stub for array fill. If "aligned" is true, the
2357   // "to" address is assumed to be heapword aligned.
2358   //
2359   // Arguments for generated stub:
2360   //   to:    c_rarg0
2361   //   value: c_rarg1
2362   //   count: c_rarg2 treated as signed
2363   //
2364   address generate_fill(BasicType t, bool aligned, const char *name) {
2365     __ align(CodeEntryAlignment);
2366     StubCodeMark mark(this, "StubRoutines", name);
2367     address start = __ pc();
2368 
2369     BLOCK_COMMENT("Entry:");
2370 
2371     const Register to        = c_rarg0;  // destination array address
2372     const Register value     = c_rarg1;  // value
2373     const Register count     = c_rarg2;  // elements count
2374 
2375     const Register bz_base = r10;        // base for block_zero routine
2376     const Register cnt_words = r11;      // temp register
2377 
2378     __ enter();
2379 
2380     Label L_fill_elements, L_exit1;
2381 
2382     int shift = -1;
2383     switch (t) {
2384       case T_BYTE:
2385         shift = 0;
2386         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2387         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2388         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2389         __ br(Assembler::LO, L_fill_elements);
2390         break;
2391       case T_SHORT:
2392         shift = 1;
2393         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2394         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2395         __ br(Assembler::LO, L_fill_elements);
2396         break;
2397       case T_INT:
2398         shift = 2;
2399         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2400         __ br(Assembler::LO, L_fill_elements);
2401         break;
2402       default: ShouldNotReachHere();
2403     }
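    // (Worked example for T_BYTE: a fill value of 0xAB becomes 0xABAB after
    //  the first bfi and 0xABABABAB after the second; the 32->64 bit widening
    //  to 0xABABABABABABABAB happens below, just before the word fill.)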
2404 
2405     // Align the destination address to an 8-byte boundary.
2406     Label L_skip_align1, L_skip_align2, L_skip_align4;
2407     if (!aligned) {
2408       switch (t) {
2409         case T_BYTE:
2410           // One byte misalignment happens only for byte arrays.
2411           __ tbz(to, 0, L_skip_align1);
2412           __ strb(value, Address(__ post(to, 1)));
2413           __ subw(count, count, 1);
2414           __ bind(L_skip_align1);
2415           // Fallthrough
2416         case T_SHORT:
2417           // Two bytes misalignment happens only for byte and short (char) arrays.
2418           __ tbz(to, 1, L_skip_align2);
2419           __ strh(value, Address(__ post(to, 2)));
2420           __ subw(count, count, 2 >> shift);
2421           __ bind(L_skip_align2);
2422           // Fallthrough
2423         case T_INT:
2424           // Align to 8 bytes; we know we are 4-byte aligned to start.
2425           __ tbz(to, 2, L_skip_align4);
2426           __ strw(value, Address(__ post(to, 4)));
2427           __ subw(count, count, 4 >> shift);
2428           __ bind(L_skip_align4);
2429           break;
2430         default: ShouldNotReachHere();
2431       }
2432     }
2433 
2434     //
2435     //  Fill large chunks
2436     //
2437     __ lsrw(cnt_words, count, 3 - shift); // number of words
2438     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2439     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2440     if (UseBlockZeroing) {
2441       Label non_block_zeroing, rest;
2442       // If the fill value is zero we can use the fast zero_words().
2443       __ cbnz(value, non_block_zeroing);
2444       __ mov(bz_base, to);
2445       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2446       __ zero_words(bz_base, cnt_words);
2447       __ b(rest);
2448       __ bind(non_block_zeroing);
2449       __ fill_words(to, cnt_words, value);
2450       __ bind(rest);
2451     } else {
2452       __ fill_words(to, cnt_words, value);
2453     }
2454 
2455     // Remaining count is less than 8 bytes. Fill it by a single store.
2456     // Note that the total length is no less than 8 bytes.
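    // (Worked example for T_BYTE: if 5 bytes remain, advancing 'to' by 5 and
    //  storing 8 bytes at [to - 8] rewrites the last 3 already-filled bytes
    //  with the same value and covers the 5 remaining ones in one store.)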
2457     if (t == T_BYTE || t == T_SHORT) {
2458       Label L_exit1;
2459       __ cbzw(count, L_exit1);
2460       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2461       __ str(value, Address(to, -8));    // overwrite some elements
2462       __ bind(L_exit1);
2463       __ leave();
2464       __ ret(lr);
2465     }
2466 
2467     // Handle fills of less than 8 bytes.
2468     Label L_fill_2, L_fill_4, L_exit2;
2469     __ bind(L_fill_elements);
2470     switch (t) {
2471       case T_BYTE:
2472         __ tbz(count, 0, L_fill_2);
2473         __ strb(value, Address(__ post(to, 1)));
2474         __ bind(L_fill_2);
2475         __ tbz(count, 1, L_fill_4);
2476         __ strh(value, Address(__ post(to, 2)));
2477         __ bind(L_fill_4);
2478         __ tbz(count, 2, L_exit2);
2479         __ strw(value, Address(to));
2480         break;
2481       case T_SHORT:
2482         __ tbz(count, 0, L_fill_4);
2483         __ strh(value, Address(__ post(to, 2)));
2484         __ bind(L_fill_4);
2485         __ tbz(count, 1, L_exit2);
2486         __ strw(value, Address(to));
2487         break;
2488       case T_INT:
2489         __ cbzw(count, L_exit2);
2490         __ strw(value, Address(to));
2491         break;
2492       default: ShouldNotReachHere();
2493     }
2494     __ bind(L_exit2);
2495     __ leave();
2496     __ ret(lr);
2497     return start;
2498   }
2499 
2500   void generate_arraycopy_stubs() {
2501     address entry;
2502     address entry_jbyte_arraycopy;
2503     address entry_jshort_arraycopy;
2504     address entry_jint_arraycopy;
2505     address entry_oop_arraycopy;
2506     address entry_jlong_arraycopy;
2507     address entry_checkcast_arraycopy;
2508 
2509     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2510     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2511 
2512     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2513 
2514     //*** jbyte
2515     // Always need aligned and unaligned versions
2516     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2517                                                                                   "jbyte_disjoint_arraycopy");
2518     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2519                                                                                   &entry_jbyte_arraycopy,
2520                                                                                   "jbyte_arraycopy");
2521     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2522                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2523     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2524                                                                                   "arrayof_jbyte_arraycopy");
2525 
2526     //*** jshort
2527     // Always need aligned and unaligned versions
2528     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2529                                                                                     "jshort_disjoint_arraycopy");
2530     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2531                                                                                     &entry_jshort_arraycopy,
2532                                                                                     "jshort_arraycopy");
2533     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2534                                                                                     "arrayof_jshort_disjoint_arraycopy");
2535     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2536                                                                                     "arrayof_jshort_arraycopy");
2537 
2538     //*** jint
2539     // Aligned versions
2540     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2541                                                                                 "arrayof_jint_disjoint_arraycopy");
2542     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2543                                                                                 "arrayof_jint_arraycopy");
2544     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2545     // entry_jint_arraycopy always points to the unaligned version
2546     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2547                                                                                 "jint_disjoint_arraycopy");
2548     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2549                                                                                 &entry_jint_arraycopy,
2550                                                                                 "jint_arraycopy");
2551 
2552     //*** jlong
2553     // It is always aligned
2554     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2555                                                                                   "arrayof_jlong_disjoint_arraycopy");
2556     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2557                                                                                   "arrayof_jlong_arraycopy");
2558     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2559     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2560 
2561     //*** oops
2562     {
2563       // With compressed oops we need unaligned versions; notice that
2564       // we overwrite entry_oop_arraycopy.
2565       bool aligned = !UseCompressedOops;
2566 
2567       StubRoutines::_arrayof_oop_disjoint_arraycopy
2568         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2569                                      /*dest_uninitialized*/false);
2570       StubRoutines::_arrayof_oop_arraycopy
2571         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2572                                      /*dest_uninitialized*/false);
2573       // Aligned versions without pre-barriers
2574       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2575         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2576                                      /*dest_uninitialized*/true);
2577       StubRoutines::_arrayof_oop_arraycopy_uninit
2578         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2579                                      /*dest_uninitialized*/true);
2580     }
2581 
2582     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2583     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2584     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2585     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2586 
2587     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2588     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2589                                                                         /*dest_uninitialized*/true);
2590 
2591     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2592                                                               entry_jbyte_arraycopy,
2593                                                               entry_jshort_arraycopy,
2594                                                               entry_jint_arraycopy,
2595                                                               entry_jlong_arraycopy);
2596 
2597     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2598                                                                entry_jbyte_arraycopy,
2599                                                                entry_jshort_arraycopy,
2600                                                                entry_jint_arraycopy,
2601                                                                entry_oop_arraycopy,
2602                                                                entry_jlong_arraycopy,
2603                                                                entry_checkcast_arraycopy);
2604 
2605     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2606     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2607     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2608     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2609     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2610     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2611   }
2612 
2613   void generate_math_stubs() { Unimplemented(); }
2614 
2615   // Arguments:
2616   //
2617   // Inputs:
2618   //   c_rarg0   - source byte array address
2619   //   c_rarg1   - destination byte array address
2620   //   c_rarg2   - K (key) in little endian int array
2621   //
2622   address generate_aescrypt_encryptBlock() {
2623     __ align(CodeEntryAlignment);
2624     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2625 
2626     Label L_doLast;
2627 
2628     const Register from        = c_rarg0;  // source array address
2629     const Register to          = c_rarg1;  // destination array address
2630     const Register key         = c_rarg2;  // key array address
2631     const Register keylen      = rscratch1;
2632 
2633     address start = __ pc();
2634     __ enter();
2635 
2636     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
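    // keylen is the expanded key length in ints: 44 for AES-128 (10 rounds),
    // 52 for AES-192 (12 rounds) and 60 for AES-256 (14 rounds).  The two
    // cmpw(keylen, ...) checks below decide how many extra rounds to run.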
2637 
2638     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2639 
2640     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2641     __ rev32(v1, __ T16B, v1);
2642     __ rev32(v2, __ T16B, v2);
2643     __ rev32(v3, __ T16B, v3);
2644     __ rev32(v4, __ T16B, v4);
2645     __ aese(v0, v1);
2646     __ aesmc(v0, v0);
2647     __ aese(v0, v2);
2648     __ aesmc(v0, v0);
2649     __ aese(v0, v3);
2650     __ aesmc(v0, v0);
2651     __ aese(v0, v4);
2652     __ aesmc(v0, v0);
2653 
2654     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2655     __ rev32(v1, __ T16B, v1);
2656     __ rev32(v2, __ T16B, v2);
2657     __ rev32(v3, __ T16B, v3);
2658     __ rev32(v4, __ T16B, v4);
2659     __ aese(v0, v1);
2660     __ aesmc(v0, v0);
2661     __ aese(v0, v2);
2662     __ aesmc(v0, v0);
2663     __ aese(v0, v3);
2664     __ aesmc(v0, v0);
2665     __ aese(v0, v4);
2666     __ aesmc(v0, v0);
2667 
2668     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2669     __ rev32(v1, __ T16B, v1);
2670     __ rev32(v2, __ T16B, v2);
2671 
2672     __ cmpw(keylen, 44);
2673     __ br(Assembler::EQ, L_doLast);
2674 
2675     __ aese(v0, v1);
2676     __ aesmc(v0, v0);
2677     __ aese(v0, v2);
2678     __ aesmc(v0, v0);
2679 
2680     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2681     __ rev32(v1, __ T16B, v1);
2682     __ rev32(v2, __ T16B, v2);
2683 
2684     __ cmpw(keylen, 52);
2685     __ br(Assembler::EQ, L_doLast);
2686 
2687     __ aese(v0, v1);
2688     __ aesmc(v0, v0);
2689     __ aese(v0, v2);
2690     __ aesmc(v0, v0);
2691 
2692     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2693     __ rev32(v1, __ T16B, v1);
2694     __ rev32(v2, __ T16B, v2);
2695 
2696     __ BIND(L_doLast);
2697 
2698     __ aese(v0, v1);
2699     __ aesmc(v0, v0);
2700     __ aese(v0, v2);
2701 
2702     __ ld1(v1, __ T16B, key);
2703     __ rev32(v1, __ T16B, v1);
2704     __ eor(v0, __ T16B, v0, v1);
2705 
2706     __ st1(v0, __ T16B, to);
2707 
2708     __ mov(r0, 0);
2709 
2710     __ leave();
2711     __ ret(lr);
2712 
2713     return start;
2714   }
2715 
2716   // Arguments:
2717   //
2718   // Inputs:
2719   //   c_rarg0   - source byte array address
2720   //   c_rarg1   - destination byte array address
2721   //   c_rarg2   - K (key) in little endian int array
2722   //
2723   address generate_aescrypt_decryptBlock() {
2724     assert(UseAES, "need AES instruction support");
2725     __ align(CodeEntryAlignment);
2726     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2727     Label L_doLast;
2728 
2729     const Register from        = c_rarg0;  // source array address
2730     const Register to          = c_rarg1;  // destination array address
2731     const Register key         = c_rarg2;  // key array address
2732     const Register keylen      = rscratch1;
2733 
2734     address start = __ pc();
2735     __ enter(); // required for proper stackwalking of RuntimeStub frame
2736 
2737     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2738 
2739     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2740 
2741     __ ld1(v5, __ T16B, __ post(key, 16));
2742     __ rev32(v5, __ T16B, v5);
2743 
2744     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2745     __ rev32(v1, __ T16B, v1);
2746     __ rev32(v2, __ T16B, v2);
2747     __ rev32(v3, __ T16B, v3);
2748     __ rev32(v4, __ T16B, v4);
2749     __ aesd(v0, v1);
2750     __ aesimc(v0, v0);
2751     __ aesd(v0, v2);
2752     __ aesimc(v0, v0);
2753     __ aesd(v0, v3);
2754     __ aesimc(v0, v0);
2755     __ aesd(v0, v4);
2756     __ aesimc(v0, v0);
2757 
2758     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2759     __ rev32(v1, __ T16B, v1);
2760     __ rev32(v2, __ T16B, v2);
2761     __ rev32(v3, __ T16B, v3);
2762     __ rev32(v4, __ T16B, v4);
2763     __ aesd(v0, v1);
2764     __ aesimc(v0, v0);
2765     __ aesd(v0, v2);
2766     __ aesimc(v0, v0);
2767     __ aesd(v0, v3);
2768     __ aesimc(v0, v0);
2769     __ aesd(v0, v4);
2770     __ aesimc(v0, v0);
2771 
2772     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2773     __ rev32(v1, __ T16B, v1);
2774     __ rev32(v2, __ T16B, v2);
2775 
2776     __ cmpw(keylen, 44);
2777     __ br(Assembler::EQ, L_doLast);
2778 
2779     __ aesd(v0, v1);
2780     __ aesimc(v0, v0);
2781     __ aesd(v0, v2);
2782     __ aesimc(v0, v0);
2783 
2784     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2785     __ rev32(v1, __ T16B, v1);
2786     __ rev32(v2, __ T16B, v2);
2787 
2788     __ cmpw(keylen, 52);
2789     __ br(Assembler::EQ, L_doLast);
2790 
2791     __ aesd(v0, v1);
2792     __ aesimc(v0, v0);
2793     __ aesd(v0, v2);
2794     __ aesimc(v0, v0);
2795 
2796     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2797     __ rev32(v1, __ T16B, v1);
2798     __ rev32(v2, __ T16B, v2);
2799 
2800     __ BIND(L_doLast);
2801 
2802     __ aesd(v0, v1);
2803     __ aesimc(v0, v0);
2804     __ aesd(v0, v2);
2805 
2806     __ eor(v0, __ T16B, v0, v5);
2807 
2808     __ st1(v0, __ T16B, to);
2809 
2810     __ mov(r0, 0);
2811 
2812     __ leave();
2813     __ ret(lr);
2814 
2815     return start;
2816   }
2817 
2818   // Arguments:
2819   //
2820   // Inputs:
2821   //   c_rarg0   - source byte array address
2822   //   c_rarg1   - destination byte array address
2823   //   c_rarg2   - K (key) in little endian int array
2824   //   c_rarg3   - r vector byte array address
2825   //   c_rarg4   - input length
2826   //
2827   // Output:
2828   //   r0        - input length
2829   //
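  // A minimal C-level sketch of the CBC chaining implemented below
  // (illustrative only; block_encrypt stands for the AES rounds above):
  //
  //   for (int i = 0; i < len; i += 16) {
  //     rvec = block_encrypt(in[i..i+15] ^ rvec, key);
  //     out[i..i+15] = rvec;
  //   }
  //   return len;
  //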
2830   address generate_cipherBlockChaining_encryptAESCrypt() {
2831     assert(UseAES, "need AES instructions and misaligned SSE support");
2832     __ align(CodeEntryAlignment);
2833     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2834 
2835     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2836 
2837     const Register from        = c_rarg0;  // source array address
2838     const Register to          = c_rarg1;  // destination array address
2839     const Register key         = c_rarg2;  // key array address
2840     const Register rvec        = c_rarg3;  // r byte array initialized from the initvector array address,
2841                                            // and left holding the last encryption block on exit
2842     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2843     const Register keylen      = rscratch1;
2844 
2845     address start = __ pc();
2846 
2847       __ enter();
2848 
2849       __ movw(rscratch2, len_reg);
2850 
2851       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2852 
2853       __ ld1(v0, __ T16B, rvec);
2854 
2855       __ cmpw(keylen, 52);
2856       __ br(Assembler::CC, L_loadkeys_44);
2857       __ br(Assembler::EQ, L_loadkeys_52);
2858 
2859       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2860       __ rev32(v17, __ T16B, v17);
2861       __ rev32(v18, __ T16B, v18);
2862     __ BIND(L_loadkeys_52);
2863       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2864       __ rev32(v19, __ T16B, v19);
2865       __ rev32(v20, __ T16B, v20);
2866     __ BIND(L_loadkeys_44);
2867       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2868       __ rev32(v21, __ T16B, v21);
2869       __ rev32(v22, __ T16B, v22);
2870       __ rev32(v23, __ T16B, v23);
2871       __ rev32(v24, __ T16B, v24);
2872       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2873       __ rev32(v25, __ T16B, v25);
2874       __ rev32(v26, __ T16B, v26);
2875       __ rev32(v27, __ T16B, v27);
2876       __ rev32(v28, __ T16B, v28);
2877       __ ld1(v29, v30, v31, __ T16B, key);
2878       __ rev32(v29, __ T16B, v29);
2879       __ rev32(v30, __ T16B, v30);
2880       __ rev32(v31, __ T16B, v31);
2881 
2882     __ BIND(L_aes_loop);
2883       __ ld1(v1, __ T16B, __ post(from, 16));
2884       __ eor(v0, __ T16B, v0, v1);
2885 
2886       __ br(Assembler::CC, L_rounds_44);
2887       __ br(Assembler::EQ, L_rounds_52);
2888 
2889       __ aese(v0, v17); __ aesmc(v0, v0);
2890       __ aese(v0, v18); __ aesmc(v0, v0);
2891     __ BIND(L_rounds_52);
2892       __ aese(v0, v19); __ aesmc(v0, v0);
2893       __ aese(v0, v20); __ aesmc(v0, v0);
2894     __ BIND(L_rounds_44);
2895       __ aese(v0, v21); __ aesmc(v0, v0);
2896       __ aese(v0, v22); __ aesmc(v0, v0);
2897       __ aese(v0, v23); __ aesmc(v0, v0);
2898       __ aese(v0, v24); __ aesmc(v0, v0);
2899       __ aese(v0, v25); __ aesmc(v0, v0);
2900       __ aese(v0, v26); __ aesmc(v0, v0);
2901       __ aese(v0, v27); __ aesmc(v0, v0);
2902       __ aese(v0, v28); __ aesmc(v0, v0);
2903       __ aese(v0, v29); __ aesmc(v0, v0);
2904       __ aese(v0, v30);
2905       __ eor(v0, __ T16B, v0, v31);
2906 
2907       __ st1(v0, __ T16B, __ post(to, 16));
2908 
2909       __ subw(len_reg, len_reg, 16);
2910       __ cbnzw(len_reg, L_aes_loop);
2911 
2912       __ st1(v0, __ T16B, rvec);
2913 
2914       __ mov(r0, rscratch2);
2915 
2916       __ leave();
2917       __ ret(lr);
2918 
2919       return start;
2920   }
2921 
2922   // Arguments:
2923   //
2924   // Inputs:
2925   //   c_rarg0   - source byte array address
2926   //   c_rarg1   - destination byte array address
2927   //   c_rarg2   - K (key) in little endian int array
2928   //   c_rarg3   - r vector byte array address
2929   //   c_rarg4   - input length
2930   //
2931   // Output:
2932   //   r0        - input length
2933   //
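  // Sketch of the decrypt chaining (illustrative only): each input block has
  // to be preserved before it is decrypted, because it becomes the next
  // chaining value; hence the extra copy through v1/v2 in the loop below.
  //
  //   for (int i = 0; i < len; i += 16) {
  //     tmp = in[i..i+15];
  //     out[i..i+15] = block_decrypt(in[i..i+15], key) ^ rvec;
  //     rvec = tmp;
  //   }
  //   return len;
  //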
2934   address generate_cipherBlockChaining_decryptAESCrypt() {
2935     assert(UseAES, "need AES instructions and misaligned SSE support");
2936     __ align(CodeEntryAlignment);
2937     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2938 
2939     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2940 
2941     const Register from        = c_rarg0;  // source array address
2942     const Register to          = c_rarg1;  // destination array address
2943     const Register key         = c_rarg2;  // key array address
2944     const Register rvec        = c_rarg3;  // r byte array initialized from the initvector array address,
2945                                            // and left holding the last input (cipher) block on exit
2946     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2947     const Register keylen      = rscratch1;
2948 
2949     address start = __ pc();
2950 
2951       __ enter();
2952 
2953       __ movw(rscratch2, len_reg);
2954 
2955       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2956 
2957       __ ld1(v2, __ T16B, rvec);
2958 
2959       __ ld1(v31, __ T16B, __ post(key, 16));
2960       __ rev32(v31, __ T16B, v31);
2961 
2962       __ cmpw(keylen, 52);
2963       __ br(Assembler::CC, L_loadkeys_44);
2964       __ br(Assembler::EQ, L_loadkeys_52);
2965 
2966       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2967       __ rev32(v17, __ T16B, v17);
2968       __ rev32(v18, __ T16B, v18);
2969     __ BIND(L_loadkeys_52);
2970       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2971       __ rev32(v19, __ T16B, v19);
2972       __ rev32(v20, __ T16B, v20);
2973     __ BIND(L_loadkeys_44);
2974       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2975       __ rev32(v21, __ T16B, v21);
2976       __ rev32(v22, __ T16B, v22);
2977       __ rev32(v23, __ T16B, v23);
2978       __ rev32(v24, __ T16B, v24);
2979       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2980       __ rev32(v25, __ T16B, v25);
2981       __ rev32(v26, __ T16B, v26);
2982       __ rev32(v27, __ T16B, v27);
2983       __ rev32(v28, __ T16B, v28);
2984       __ ld1(v29, v30, __ T16B, key);
2985       __ rev32(v29, __ T16B, v29);
2986       __ rev32(v30, __ T16B, v30);
2987 
2988     __ BIND(L_aes_loop);
2989       __ ld1(v0, __ T16B, __ post(from, 16));
2990       __ orr(v1, __ T16B, v0, v0);
2991 
2992       __ br(Assembler::CC, L_rounds_44);
2993       __ br(Assembler::EQ, L_rounds_52);
2994 
2995       __ aesd(v0, v17); __ aesimc(v0, v0);
2996       __ aesd(v0, v18); __ aesimc(v0, v0);
2997     __ BIND(L_rounds_52);
2998       __ aesd(v0, v19); __ aesimc(v0, v0);
2999       __ aesd(v0, v20); __ aesimc(v0, v0);
3000     __ BIND(L_rounds_44);
3001       __ aesd(v0, v21); __ aesimc(v0, v0);
3002       __ aesd(v0, v22); __ aesimc(v0, v0);
3003       __ aesd(v0, v23); __ aesimc(v0, v0);
3004       __ aesd(v0, v24); __ aesimc(v0, v0);
3005       __ aesd(v0, v25); __ aesimc(v0, v0);
3006       __ aesd(v0, v26); __ aesimc(v0, v0);
3007       __ aesd(v0, v27); __ aesimc(v0, v0);
3008       __ aesd(v0, v28); __ aesimc(v0, v0);
3009       __ aesd(v0, v29); __ aesimc(v0, v0);
3010       __ aesd(v0, v30);
3011       __ eor(v0, __ T16B, v0, v31);
3012       __ eor(v0, __ T16B, v0, v2);
3013 
3014       __ st1(v0, __ T16B, __ post(to, 16));
3015       __ orr(v2, __ T16B, v1, v1);
3016 
3017       __ subw(len_reg, len_reg, 16);
3018       __ cbnzw(len_reg, L_aes_loop);
3019 
3020       __ st1(v2, __ T16B, rvec);
3021 
3022       __ mov(r0, rscratch2);
3023 
3024       __ leave();
3025       __ ret(lr);
3026 
3027     return start;
3028   }
3029 
3030   // Arguments:
3031   //
3032   // Inputs:
3033   //   c_rarg0   - byte[]  source+offset
3034   //   c_rarg1   - int[]   SHA.state
3035   //   c_rarg2   - int     offset
3036   //   c_rarg3   - int     limit
3037   //
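  // The compression below processes 80 SHA-1 rounds, four per loop iteration,
  // so the register-rotating loop runs 20 times.  The four round constants
  // (one per group of 20 rounds) are the words emitted at the keys label:
  //
  //   K = 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6
  //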
3038   address generate_sha1_implCompress(bool multi_block, const char *name) {
3039     __ align(CodeEntryAlignment);
3040     StubCodeMark mark(this, "StubRoutines", name);
3041     address start = __ pc();
3042 
3043     Register buf   = c_rarg0;
3044     Register state = c_rarg1;
3045     Register ofs   = c_rarg2;
3046     Register limit = c_rarg3;
3047 
3048     Label keys;
3049     Label sha1_loop;
3050 
3051     // load the keys into v0..v3
3052     __ adr(rscratch1, keys);
3053     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3054     // load 5 words state into v6, v7
3055     __ ldrq(v6, Address(state, 0));
3056     __ ldrs(v7, Address(state, 16));
3057 
3058 
3059     __ BIND(sha1_loop);
3060     // load 64 bytes of data into v16..v19
3061     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3062     __ rev32(v16, __ T16B, v16);
3063     __ rev32(v17, __ T16B, v17);
3064     __ rev32(v18, __ T16B, v18);
3065     __ rev32(v19, __ T16B, v19);
3066 
3067     // do the sha1
3068     __ addv(v4, __ T4S, v16, v0);
3069     __ orr(v20, __ T16B, v6, v6);
3070 
3071     FloatRegister d0 = v16;
3072     FloatRegister d1 = v17;
3073     FloatRegister d2 = v18;
3074     FloatRegister d3 = v19;
3075 
3076     for (int round = 0; round < 20; round++) {
3077       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3078       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3079       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3080       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3081       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3082 
3083       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3084       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3085       __ sha1h(tmp2, __ T4S, v20);
3086       if (round < 5)
3087         __ sha1c(v20, __ T4S, tmp3, tmp4);
3088       else if (round < 10 || round >= 15)
3089         __ sha1p(v20, __ T4S, tmp3, tmp4);
3090       else
3091         __ sha1m(v20, __ T4S, tmp3, tmp4);
3092       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3093 
3094       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3095     }
3096 
3097     __ addv(v7, __ T2S, v7, v21);
3098     __ addv(v6, __ T4S, v6, v20);
3099 
3100     if (multi_block) {
3101       __ add(ofs, ofs, 64);
3102       __ cmp(ofs, limit);
3103       __ br(Assembler::LE, sha1_loop);
3104       __ mov(c_rarg0, ofs); // return ofs
3105     }
3106 
3107     __ strq(v6, Address(state, 0));
3108     __ strs(v7, Address(state, 16));
3109 
3110     __ ret(lr);
3111 
3112     __ bind(keys);
3113     __ emit_int32(0x5a827999);
3114     __ emit_int32(0x6ed9eba1);
3115     __ emit_int32(0x8f1bbcdc);
3116     __ emit_int32(0xca62c1d6);
3117 
3118     return start;
3119   }
3120 
3121 
3122   // Arguments:
3123   //
3124   // Inputs:
3125   //   c_rarg0   - byte[]  source+offset
3126   //   c_rarg1   - int[]   SHA.state
3127   //   c_rarg2   - int     offset
3128   //   c_rarg3   - int     limit
3129   //
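  // The compression below keeps the 64 SHA-256 round constants K[0..63] in
  // v16..v31 (four per register) and performs four rounds per loop iteration,
  // i.e. 16 iterations x 4 rounds = 64 rounds for each 64-byte block.
  //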
3130   address generate_sha256_implCompress(bool multi_block, const char *name) {
3131     static const uint32_t round_consts[64] = {
3132       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3133       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3134       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3135       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3136       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3137       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3138       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3139       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3140       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3141       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3142       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3143       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3144       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3145       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3146       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3147       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3148     };
3149     __ align(CodeEntryAlignment);
3150     StubCodeMark mark(this, "StubRoutines", name);
3151     address start = __ pc();
3152 
3153     Register buf   = c_rarg0;
3154     Register state = c_rarg1;
3155     Register ofs   = c_rarg2;
3156     Register limit = c_rarg3;
3157 
3158     Label sha1_loop;
3159 
3160     __ stpd(v8, v9, __ pre(sp, -32));
3161     __ stpd(v10, v11, Address(sp, 16));
3162 
3163 // dga == v0
3164 // dgb == v1
3165 // dg0 == v2
3166 // dg1 == v3
3167 // dg2 == v4
3168 // t0 == v6
3169 // t1 == v7
3170 
3171     // load 16 keys to v16..v31
3172     __ lea(rscratch1, ExternalAddress((address)round_consts));
3173     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3174     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3175     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3176     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3177 
3178     // load 8 words (256 bits) state
3179     __ ldpq(v0, v1, state);
3180 
3181     __ BIND(sha1_loop);
3182     // load 64 bytes of data into v8..v11
3183     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3184     __ rev32(v8, __ T16B, v8);
3185     __ rev32(v9, __ T16B, v9);
3186     __ rev32(v10, __ T16B, v10);
3187     __ rev32(v11, __ T16B, v11);
3188 
3189     __ addv(v6, __ T4S, v8, v16);
3190     __ orr(v2, __ T16B, v0, v0);
3191     __ orr(v3, __ T16B, v1, v1);
3192 
3193     FloatRegister d0 = v8;
3194     FloatRegister d1 = v9;
3195     FloatRegister d2 = v10;
3196     FloatRegister d3 = v11;
3197 
3198 
3199     for (int round = 0; round < 16; round++) {
3200       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3201       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3202       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3203       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3204 
3205       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3206        __ orr(v4, __ T16B, v2, v2);
3207       if (round < 15)
3208         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3209       __ sha256h(v2, __ T4S, v3, tmp2);
3210       __ sha256h2(v3, __ T4S, v4, tmp2);
3211       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3212 
3213       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3214     }
3215 
3216     __ addv(v0, __ T4S, v0, v2);
3217     __ addv(v1, __ T4S, v1, v3);
3218 
3219     if (multi_block) {
3220       __ add(ofs, ofs, 64);
3221       __ cmp(ofs, limit);
3222       __ br(Assembler::LE, sha1_loop);
3223       __ mov(c_rarg0, ofs); // return ofs
3224     }
3225 
3226     __ ldpd(v10, v11, Address(sp, 16));
3227     __ ldpd(v8, v9, __ post(sp, 32));
3228 
3229     __ stpq(v0, v1, state);
3230 
3231     __ ret(lr);
3232 
3233     return start;
3234   }
3235 
3236 #ifndef BUILTIN_SIM
3237   // Safefetch stubs.
3238   void generate_safefetch(const char* name, int size, address* entry,
3239                           address* fault_pc, address* continuation_pc) {
3240     // safefetch signatures:
3241     //   int      SafeFetch32(int*      adr, int      errValue);
3242     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3243     //
3244     // arguments:
3245     //   c_rarg0 = adr
3246     //   c_rarg1 = errValue
3247     //
3248     // result:
3249     //   r0 = *adr or errValue
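    //
    // Caller-side usage sketch (illustrative; the fault_pc/continuation_pc
    // recorded here let the signal handler resume a faulting load at the
    // continuation with the error value):
    //
    //   int v = SafeFetch32(possibly_bad_ptr, /*errValue*/ -1);
    //   // v == *possibly_bad_ptr if the location is readable, otherwise -1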
3250 
3251     StubCodeMark mark(this, "StubRoutines", name);
3252 
3253     // Entry point, pc or function descriptor.
3254     *entry = __ pc();
3255 
3256     // Load *adr into c_rarg1, may fault.
3257     *fault_pc = __ pc();
3258     switch (size) {
3259       case 4:
3260         // int32_t
3261         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3262         break;
3263       case 8:
3264         // int64_t
3265         __ ldr(c_rarg1, Address(c_rarg0, 0));
3266         break;
3267       default:
3268         ShouldNotReachHere();
3269     }
3270 
3271     // return errValue or *adr
3272     *continuation_pc = __ pc();
3273     __ mov(r0, c_rarg1);
3274     __ ret(lr);
3275   }
3276 #endif
3277 
3278   /**
3279    *  Arguments:
3280    *
3281    * Inputs:
3282    *   c_rarg0   - int crc
3283    *   c_rarg1   - byte* buf
3284    *   c_rarg2   - int length
3285    *
3286    * Output:
3287    *       r0   - int crc result
3288    */
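  //
  // For reference, a bitwise sketch of the CRC-32 computation this stub
  // accelerates (illustrative only; kernel_crc32 uses a much faster kernel):
  //
  //   crc = ~crc;
  //   for (int i = 0; i < len; i++) {
  //     crc ^= buf[i];
  //     for (int k = 0; k < 8; k++)
  //       crc = (crc >> 1) ^ (0xEDB88320 & (0 - (crc & 1)));
  //   }
  //   return ~crc;
  //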
3289   address generate_updateBytesCRC32() {
3290     assert(UseCRC32Intrinsics, "what are we doing here?");
3291 
3292     __ align(CodeEntryAlignment);
3293     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3294 
3295     address start = __ pc();
3296 
3297     const Register crc   = c_rarg0;  // crc
3298     const Register buf   = c_rarg1;  // source java byte array address
3299     const Register len   = c_rarg2;  // length
3300     const Register table0 = c_rarg3; // crc_table address
3301     const Register table1 = c_rarg4;
3302     const Register table2 = c_rarg5;
3303     const Register table3 = c_rarg6;
3304     const Register tmp3 = c_rarg7;
3305 
3306     BLOCK_COMMENT("Entry:");
3307     __ enter(); // required for proper stackwalking of RuntimeStub frame
3308 
3309     __ kernel_crc32(crc, buf, len,
3310               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3311 
3312     __ leave(); // required for proper stackwalking of RuntimeStub frame
3313     __ ret(lr);
3314 
3315     return start;
3316   }
3317 
3318   /**
3319    *  Arguments:
3320    *
3321    * Inputs:
3322    *   c_rarg0   - int crc
3323    *   c_rarg1   - byte* buf
3324    *   c_rarg2   - int length
3325    *   c_rarg3   - int* table
3326    *
3327    * Output:
3328    *       r0   - int crc result
3329    */
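  //
  // Same shape as updateBytesCRC32 above, but for CRC-32C (Castagnoli): in a
  // bitwise sketch the per-bit step uses the reflected polynomial constant
  //
  //   crc = (crc >> 1) ^ (0x82F63B78 & (0 - (crc & 1)));
  //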
3330   address generate_updateBytesCRC32C() {
3331     assert(UseCRC32CIntrinsics, "what are we doing here?");
3332 
3333     __ align(CodeEntryAlignment);
3334     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3335 
3336     address start = __ pc();
3337 
3338     const Register crc   = c_rarg0;  // crc
3339     const Register buf   = c_rarg1;  // source java byte array address
3340     const Register len   = c_rarg2;  // length
3341     const Register table0 = c_rarg3; // crc_table address
3342     const Register table1 = c_rarg4;
3343     const Register table2 = c_rarg5;
3344     const Register table3 = c_rarg6;
3345     const Register tmp3 = c_rarg7;
3346 
3347     BLOCK_COMMENT("Entry:");
3348     __ enter(); // required for proper stackwalking of RuntimeStub frame
3349 
3350     __ kernel_crc32c(crc, buf, len,
3351               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3352 
3353     __ leave(); // required for proper stackwalking of RuntimeStub frame
3354     __ ret(lr);
3355 
3356     return start;
3357   }
3358 
3359   /**
3360    *  Arguments:
3361    *
3362    *  Inputs:
3363    *   c_rarg0   - int   adler
3364    *   c_rarg1   - byte* buff
3365    *   c_rarg2   - int   len
3366    *
3367    * Output:
3368    *   c_rarg0   - int adler result
3369    */
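  //
  // Reference definition of Adler-32 (an illustrative sketch of what the
  // optimized code below computes; bytes are treated as unsigned):
  //
  //   uint32_t s1 = adler & 0xffff, s2 = adler >> 16;
  //   for (int i = 0; i < len; i++) {
  //     s1 = (s1 + buff[i]) % 65521;
  //     s2 = (s2 + s1) % 65521;
  //   }
  //   return (s2 << 16) | s1;
  //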
3370   address generate_updateBytesAdler32() {
3371     __ align(CodeEntryAlignment);
3372     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3373     address start = __ pc();
3374 
3375     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3376 
3377     // Aliases
3378     Register adler  = c_rarg0;
3379     Register s1     = c_rarg0;
3380     Register s2     = c_rarg3;
3381     Register buff   = c_rarg1;
3382     Register len    = c_rarg2;
3383     Register nmax   = r4;
3384     Register base   = r5;
3385     Register count  = r6;
3386     Register temp0  = rscratch1;
3387     Register temp1  = rscratch2;
3388     Register temp2  = r7;
3389 
3390     // Max number of bytes we can process before having to take the mod
3391     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3392     unsigned long BASE = 0xfff1;
3393     unsigned long NMAX = 0x15B0;
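    // The mod-BASE reductions below avoid division: since 2^16 mod 65521 == 15,
    //   x mod 65521  can be folded as  x = (x & 0xffff) + 15 * (x >> 16)
    // (applied twice for large x), followed by a single conditional subtract of
    // BASE; that is what the lsr/lsl/sub/add/subs/csel sequences compute.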
3394 
3395     __ mov(base, BASE);
3396     __ mov(nmax, NMAX);
3397 
3398     // s1 is initialized to the lower 16 bits of adler
3399     // s2 is initialized to the upper 16 bits of adler
3400     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3401     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3402 
3403     // The pipelined loop needs at least 16 elements for one iteration.
3404     // It does check this, but it is more effective to skip straight to the cleanup loop.
3405     __ cmp(len, 16);
3406     __ br(Assembler::HS, L_nmax);
3407     __ cbz(len, L_combine);
3408 
3409     __ bind(L_simple_by1_loop);
3410     __ ldrb(temp0, Address(__ post(buff, 1)));
3411     __ add(s1, s1, temp0);
3412     __ add(s2, s2, s1);
3413     __ subs(len, len, 1);
3414     __ br(Assembler::HI, L_simple_by1_loop);
3415 
3416     // s1 = s1 % BASE
3417     __ subs(temp0, s1, base);
3418     __ csel(s1, temp0, s1, Assembler::HS);
3419 
3420     // s2 = s2 % BASE
3421     __ lsr(temp0, s2, 16);
3422     __ lsl(temp1, temp0, 4);
3423     __ sub(temp1, temp1, temp0);
3424     __ add(s2, temp1, s2, ext::uxth);
3425 
3426     __ subs(temp0, s2, base);
3427     __ csel(s2, temp0, s2, Assembler::HS);
3428 
3429     __ b(L_combine);
3430 
3431     __ bind(L_nmax);
3432     __ subs(len, len, nmax);
3433     __ sub(count, nmax, 16);
3434     __ br(Assembler::LO, L_by16);
3435 
3436     __ bind(L_nmax_loop);
3437 
3438     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3439 
3440     __ add(s1, s1, temp0, ext::uxtb);
3441     __ ubfx(temp2, temp0, 8, 8);
3442     __ add(s2, s2, s1);
3443     __ add(s1, s1, temp2);
3444     __ ubfx(temp2, temp0, 16, 8);
3445     __ add(s2, s2, s1);
3446     __ add(s1, s1, temp2);
3447     __ ubfx(temp2, temp0, 24, 8);
3448     __ add(s2, s2, s1);
3449     __ add(s1, s1, temp2);
3450     __ ubfx(temp2, temp0, 32, 8);
3451     __ add(s2, s2, s1);
3452     __ add(s1, s1, temp2);
3453     __ ubfx(temp2, temp0, 40, 8);
3454     __ add(s2, s2, s1);
3455     __ add(s1, s1, temp2);
3456     __ ubfx(temp2, temp0, 48, 8);
3457     __ add(s2, s2, s1);
3458     __ add(s1, s1, temp2);
3459     __ add(s2, s2, s1);
3460     __ add(s1, s1, temp0, Assembler::LSR, 56);
3461     __ add(s2, s2, s1);
3462 
3463     __ add(s1, s1, temp1, ext::uxtb);
3464     __ ubfx(temp2, temp1, 8, 8);
3465     __ add(s2, s2, s1);
3466     __ add(s1, s1, temp2);
3467     __ ubfx(temp2, temp1, 16, 8);
3468     __ add(s2, s2, s1);
3469     __ add(s1, s1, temp2);
3470     __ ubfx(temp2, temp1, 24, 8);
3471     __ add(s2, s2, s1);
3472     __ add(s1, s1, temp2);
3473     __ ubfx(temp2, temp1, 32, 8);
3474     __ add(s2, s2, s1);
3475     __ add(s1, s1, temp2);
3476     __ ubfx(temp2, temp1, 40, 8);
3477     __ add(s2, s2, s1);
3478     __ add(s1, s1, temp2);
3479     __ ubfx(temp2, temp1, 48, 8);
3480     __ add(s2, s2, s1);
3481     __ add(s1, s1, temp2);
3482     __ add(s2, s2, s1);
3483     __ add(s1, s1, temp1, Assembler::LSR, 56);
3484     __ add(s2, s2, s1);
3485 
3486     __ subs(count, count, 16);
3487     __ br(Assembler::HS, L_nmax_loop);
3488 
3489     // s1 = s1 % BASE
3490     __ lsr(temp0, s1, 16);
3491     __ lsl(temp1, temp0, 4);
3492     __ sub(temp1, temp1, temp0);
3493     __ add(temp1, temp1, s1, ext::uxth);
3494 
3495     __ lsr(temp0, temp1, 16);
3496     __ lsl(s1, temp0, 4);
3497     __ sub(s1, s1, temp0);
3498     __ add(s1, s1, temp1, ext::uxth);
3499 
3500     __ subs(temp0, s1, base);
3501     __ csel(s1, temp0, s1, Assembler::HS);
3502 
3503     // s2 = s2 % BASE
3504     __ lsr(temp0, s2, 16);
3505     __ lsl(temp1, temp0, 4);
3506     __ sub(temp1, temp1, temp0);
3507     __ add(temp1, temp1, s2, ext::uxth);
3508 
3509     __ lsr(temp0, temp1, 16);
3510     __ lsl(s2, temp0, 4);
3511     __ sub(s2, s2, temp0);
3512     __ add(s2, s2, temp1, ext::uxth);
3513 
3514     __ subs(temp0, s2, base);
3515     __ csel(s2, temp0, s2, Assembler::HS);
3516 
3517     __ subs(len, len, nmax);
3518     __ sub(count, nmax, 16);
3519     __ br(Assembler::HS, L_nmax_loop);
3520 
3521     __ bind(L_by16);
3522     __ adds(len, len, count);
3523     __ br(Assembler::LO, L_by1);
3524 
3525     __ bind(L_by16_loop);
3526 
3527     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3528 
3529     __ add(s1, s1, temp0, ext::uxtb);
3530     __ ubfx(temp2, temp0, 8, 8);
3531     __ add(s2, s2, s1);
3532     __ add(s1, s1, temp2);
3533     __ ubfx(temp2, temp0, 16, 8);
3534     __ add(s2, s2, s1);
3535     __ add(s1, s1, temp2);
3536     __ ubfx(temp2, temp0, 24, 8);
3537     __ add(s2, s2, s1);
3538     __ add(s1, s1, temp2);
3539     __ ubfx(temp2, temp0, 32, 8);
3540     __ add(s2, s2, s1);
3541     __ add(s1, s1, temp2);
3542     __ ubfx(temp2, temp0, 40, 8);
3543     __ add(s2, s2, s1);
3544     __ add(s1, s1, temp2);
3545     __ ubfx(temp2, temp0, 48, 8);
3546     __ add(s2, s2, s1);
3547     __ add(s1, s1, temp2);
3548     __ add(s2, s2, s1);
3549     __ add(s1, s1, temp0, Assembler::LSR, 56);
3550     __ add(s2, s2, s1);
3551 
3552     __ add(s1, s1, temp1, ext::uxtb);
3553     __ ubfx(temp2, temp1, 8, 8);
3554     __ add(s2, s2, s1);
3555     __ add(s1, s1, temp2);
3556     __ ubfx(temp2, temp1, 16, 8);
3557     __ add(s2, s2, s1);
3558     __ add(s1, s1, temp2);
3559     __ ubfx(temp2, temp1, 24, 8);
3560     __ add(s2, s2, s1);
3561     __ add(s1, s1, temp2);
3562     __ ubfx(temp2, temp1, 32, 8);
3563     __ add(s2, s2, s1);
3564     __ add(s1, s1, temp2);
3565     __ ubfx(temp2, temp1, 40, 8);
3566     __ add(s2, s2, s1);
3567     __ add(s1, s1, temp2);
3568     __ ubfx(temp2, temp1, 48, 8);
3569     __ add(s2, s2, s1);
3570     __ add(s1, s1, temp2);
3571     __ add(s2, s2, s1);
3572     __ add(s1, s1, temp1, Assembler::LSR, 56);
3573     __ add(s2, s2, s1);
3574 
3575     __ subs(len, len, 16);
3576     __ br(Assembler::HS, L_by16_loop);
3577 
3578     __ bind(L_by1);
3579     __ adds(len, len, 15);
3580     __ br(Assembler::LO, L_do_mod);
3581 
3582     __ bind(L_by1_loop);
3583     __ ldrb(temp0, Address(__ post(buff, 1)));
3584     __ add(s1, temp0, s1);
3585     __ add(s2, s2, s1);
3586     __ subs(len, len, 1);
3587     __ br(Assembler::HS, L_by1_loop);
3588 
3589     __ bind(L_do_mod);
3590     // s1 = s1 % BASE
3591     __ lsr(temp0, s1, 16);
3592     __ lsl(temp1, temp0, 4);
3593     __ sub(temp1, temp1, temp0);
3594     __ add(temp1, temp1, s1, ext::uxth);
3595 
3596     __ lsr(temp0, temp1, 16);
3597     __ lsl(s1, temp0, 4);
3598     __ sub(s1, s1, temp0);
3599     __ add(s1, s1, temp1, ext::uxth);
3600 
3601     __ subs(temp0, s1, base);
3602     __ csel(s1, temp0, s1, Assembler::HS);
3603 
3604     // s2 = s2 % BASE
3605     __ lsr(temp0, s2, 16);
3606     __ lsl(temp1, temp0, 4);
3607     __ sub(temp1, temp1, temp0);
3608     __ add(temp1, temp1, s2, ext::uxth);
3609 
3610     __ lsr(temp0, temp1, 16);
3611     __ lsl(s2, temp0, 4);
3612     __ sub(s2, s2, temp0);
3613     __ add(s2, s2, temp1, ext::uxth);
3614 
3615     __ subs(temp0, s2, base);
3616     __ csel(s2, temp0, s2, Assembler::HS);
3617 
3618     // Combine lower bits and higher bits
3619     __ bind(L_combine);
3620     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3621 
3622     __ ret(lr);
3623 
3624     return start;
3625   }
3626 
3627   /**
3628    *  Arguments:
3629    *
3630    *  Input:
3631    *    c_rarg0   - x address
3632    *    c_rarg1   - x length
3633    *    c_rarg2   - y address
3634    *    c_rarg3   - y length
3635    *    c_rarg4   - z address
3636    *    c_rarg5   - z length
3637    */
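  //
  // Backs BigInteger.multiplyToLen; a schoolbook sketch of the intended result
  // (illustrative only, assuming z[] starts out zeroed; multiply_to_len itself
  // is heavily unrolled and pipelined):
  //
  //   // x, y and z are big-endian arrays of 32-bit digits
  //   for (int i = xlen - 1; i >= 0; i--) {
  //     uint64_t carry = 0;
  //     for (int j = ylen - 1, k = ylen + i; j >= 0; j--, k--) {
  //       uint64_t p = (uint64_t)x[i] * y[j] + z[k] + carry;
  //       z[k] = (uint32_t)p;
  //       carry = p >> 32;
  //     }
  //     z[i] = (uint32_t)carry;
  //   }
  //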
3638   address generate_multiplyToLen() {
3639     __ align(CodeEntryAlignment);
3640     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3641 
3642     address start = __ pc();
3643     const Register x     = r0;
3644     const Register xlen  = r1;
3645     const Register y     = r2;
3646     const Register ylen  = r3;
3647     const Register z     = r4;
3648     const Register zlen  = r5;
3649 
3650     const Register tmp1  = r10;
3651     const Register tmp2  = r11;
3652     const Register tmp3  = r12;
3653     const Register tmp4  = r13;
3654     const Register tmp5  = r14;
3655     const Register tmp6  = r15;
3656     const Register tmp7  = r16;
3657 
3658     BLOCK_COMMENT("Entry:");
3659     __ enter(); // required for proper stackwalking of RuntimeStub frame
3660     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3661     __ leave(); // required for proper stackwalking of RuntimeStub frame
3662     __ ret(lr);
3663 
3664     return start;
3665   }
3666 
3667   address generate_squareToLen() {
3668     // The squareToLen algorithm for sizes 1..127 described in the Java code works
3669     // faster than multiply_to_len on some CPUs and slower on others, but
3670     // multiply_to_len shows slightly better results overall.
3671     __ align(CodeEntryAlignment);
3672     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3673     address start = __ pc();
3674 
3675     const Register x     = r0;
3676     const Register xlen  = r1;
3677     const Register z     = r2;
3678     const Register zlen  = r3;
3679     const Register y     = r4; // == x
3680     const Register ylen  = r5; // == xlen
3681 
3682     const Register tmp1  = r10;
3683     const Register tmp2  = r11;
3684     const Register tmp3  = r12;
3685     const Register tmp4  = r13;
3686     const Register tmp5  = r14;
3687     const Register tmp6  = r15;
3688     const Register tmp7  = r16;
3689 
3690     RegSet spilled_regs = RegSet::of(y, ylen);
3691     BLOCK_COMMENT("Entry:");
3692     __ enter();
3693     __ push(spilled_regs, sp);
3694     __ mov(y, x);
3695     __ mov(ylen, xlen);
3696     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3697     __ pop(spilled_regs, sp);
3698     __ leave();
3699     __ ret(lr);
3700     return start;
3701   }
3702 
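  // Backs BigInteger's mulAdd; a rough digit-loop sketch of the intended
  // effect (illustrative only; the index bookkeeping in the real code differs):
  //
  //   uint64_t carry = 0;
  //   for (int j = len - 1; j >= 0; j--) {
  //     uint64_t p = (uint64_t)in[j] * k + out[offset] + carry;
  //     out[offset--] = (uint32_t)p;
  //     carry = p >> 32;
  //   }
  //   return carry;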
3703   address generate_mulAdd() {
3704     __ align(CodeEntryAlignment);
3705     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3706 
3707     address start = __ pc();
3708 
3709     const Register out     = r0;
3710     const Register in      = r1;
3711     const Register offset  = r2;
3712     const Register len     = r3;
3713     const Register k       = r4;
3714 
3715     BLOCK_COMMENT("Entry:");
3716     __ enter();
3717     __ mul_add(out, in, offset, len, k);
3718     __ leave();
3719     __ ret(lr);
3720 
3721     return start;
3722   }
3723 
3724   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3725                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3726                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3727     // Karatsuba multiplication performs a 128*128 -> 256-bit
3728     // multiplication in three 128-bit multiplications and a few
3729     // additions.
3730     //
3731     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3732     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3733     //
3734     // Inputs:
3735     //
3736     // A0 in a.d[0]     (subkey)
3737     // A1 in a.d[1]
3738     // (A1+A0) in a1_xor_a0.d[0]
3739     //
3740     // B0 in b.d[0]     (state)
3741     // B1 in b.d[1]
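    //
    // Working over GF(2), where "+" is XOR:
    //   (A1+A0)*(B1+B0) = A1*B1 + A0*B0 + (A1*B0 + A0*B1)
    // so the middle 128 bits of the product are E + C + D, which is what the
    // eor() sequence below computes before the halves are redistributed into
    // <result_hi:result_lo>.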
3742 
3743     __ ext(tmp1, __ T16B, b, b, 0x08);
3744     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3745     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3746     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3747     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3748 
3749     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3750     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3751     __ eor(tmp2, __ T16B, tmp2, tmp4);
3752     __ eor(tmp2, __ T16B, tmp2, tmp3);
3753 
3754     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3755     __ ins(result_hi, __ D, tmp2, 0, 1);
3756     __ ins(result_lo, __ D, tmp2, 1, 0);
3757   }
3758 
3759   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3760                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3761     const FloatRegister t0 = result;
3762 
3763     // The GCM field polynomial f is z^128 + p(z), where p =
3764     // z^7+z^2+z+1.
3765     //
3766     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3767     //
3768     // so, given that the product we're reducing is
3769     //    a == lo + hi * z^128
3770     // substituting,
3771     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3772     //
3773     // we reduce by multiplying hi by p(z) and subtracting the result
3774     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3775     // bits we can do this with two 64-bit multiplications, lo*p and
3776     // hi*p.
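    //
    // Writing hi = h1:h0 (two 64-bit halves), the folding is done in two
    // steps (sketch of the instruction sequence below):
    //   t = h1 * p;                     // pmull2, at most 71 bits
    //   h0 ^= t >> 64;                  // ext + eor: fold t's overflow into h0
    //   lo ^= t << 64;                  // ext + eor: rest of t lands in lo's top half
    //   result = lo ^ (h0 * p);         // pmull + eor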
3777 
3778     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3779     __ ext(t1, __ T16B, t0, z, 8);
3780     __ eor(hi, __ T16B, hi, t1);
3781     __ ext(t1, __ T16B, z, t0, 8);
3782     __ eor(lo, __ T16B, lo, t1);
3783     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3784     __ eor(result, __ T16B, lo, t0);
3785   }
3786 
3787   address generate_has_negatives(address &has_negatives_long) {
3788     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3789     const int large_loop_size = 64;
3790     const uint64_t UPPER_BIT_MASK = 0x8080808080808080;
3791     int dcache_line = VM_Version::dcache_line_size();
3792 
3793     Register ary1 = r1, len = r2, result = r0;
3794 
3795     __ align(CodeEntryAlignment);
3796     address entry = __ pc();
3797 
3798     __ enter();
3799 
3800   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3801         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3802 
3803   __ cmp(len, 15);
3804   __ br(Assembler::GT, LEN_OVER_15);
3805   // The only case when execution falls into this code is when the pointer is near
3806   // the end of a memory page and we have to avoid reading the next page.
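  // (e.g. for len == 3 we load the 8 bytes that end at ary1 + len and use lsrv
  //  to shift out the 5 unwanted low-order bytes before testing the sign bits)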
3807   __ add(ary1, ary1, len);
3808   __ subs(len, len, 8);
3809   __ br(Assembler::GT, LEN_OVER_8);
3810   __ ldr(rscratch2, Address(ary1, -8));
3811   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3812   __ lsrv(rscratch2, rscratch2, rscratch1);
3813   __ tst(rscratch2, UPPER_BIT_MASK);
3814   __ cset(result, Assembler::NE);
3815   __ leave();
3816   __ ret(lr);
3817   __ bind(LEN_OVER_8);
3818   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3819   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3820   __ tst(rscratch2, UPPER_BIT_MASK);
3821   __ br(Assembler::NE, RET_TRUE_NO_POP);
3822   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3823   __ lsrv(rscratch1, rscratch1, rscratch2);
3824   __ tst(rscratch1, UPPER_BIT_MASK);
3825   __ cset(result, Assembler::NE);
3826   __ leave();
3827   __ ret(lr);
3828 
3829   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3830   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3831 
3832   has_negatives_long = __ pc(); // 2nd entry point
3833 
3834   __ enter();
3835 
3836   __ bind(LEN_OVER_15);
3837     __ push(spilled_regs, sp);
3838     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3839     __ cbz(rscratch2, ALIGNED);
3840     __ ldp(tmp6, tmp1, Address(ary1));
3841     __ mov(tmp5, 16);
3842     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3843     __ add(ary1, ary1, rscratch1);
3844     __ sub(len, len, rscratch1);
3845     __ orr(tmp6, tmp6, tmp1);
3846     __ tst(tmp6, UPPER_BIT_MASK);
3847     __ br(Assembler::NE, RET_TRUE);
3848 
3849   __ bind(ALIGNED);
3850     __ cmp(len, large_loop_size);
3851     __ br(Assembler::LT, CHECK_16);
3852     // Perform a 16-byte load as an early return in the pre-loop to handle the
3853     // situation when an initially aligned large array has negative values at its
3854     // starting bytes, where LARGE_LOOP would do 4 reads instead of 1 (in the worst
3855     // case), which is slower. Cases with negative bytes further ahead won't be
3856     // affected much; in fact, it will be faster thanks to the early loads, fewer
3857     // instructions and fewer branches in LARGE_LOOP.
3858     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3859     __ sub(len, len, 16);
3860     __ orr(tmp6, tmp6, tmp1);
3861     __ tst(tmp6, UPPER_BIT_MASK);
3862     __ br(Assembler::NE, RET_TRUE);
3863     __ cmp(len, large_loop_size);
3864     __ br(Assembler::LT, CHECK_16);
3865 
3866     if (SoftwarePrefetchHintDistance >= 0
3867         && SoftwarePrefetchHintDistance >= dcache_line) {
3868       // initial prefetch
3869       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3870     }
3871   __ bind(LARGE_LOOP);
3872     if (SoftwarePrefetchHintDistance >= 0) {
3873       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3874     }
3875     // Issue the load instructions first, since that can save a few CPU/MEM cycles.
3876     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);" (one per ldp),
3877     // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves 3
3878     // instructions per iteration and has fewer branches, but this approach disables
3879     // early return, so all 64 bytes are loaded and checked every time.
3880     __ ldp(tmp2, tmp3, Address(ary1));
3881     __ ldp(tmp4, tmp5, Address(ary1, 16));
3882     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3883     __ ldp(tmp6, tmp1, Address(ary1, 48));
3884     __ add(ary1, ary1, large_loop_size);
3885     __ sub(len, len, large_loop_size);
3886     __ orr(tmp2, tmp2, tmp3);
3887     __ orr(tmp4, tmp4, tmp5);
3888     __ orr(rscratch1, rscratch1, rscratch2);
3889     __ orr(tmp6, tmp6, tmp1);
3890     __ orr(tmp2, tmp2, tmp4);
3891     __ orr(rscratch1, rscratch1, tmp6);
3892     __ orr(tmp2, tmp2, rscratch1);
3893     __ tst(tmp2, UPPER_BIT_MASK);
3894     __ br(Assembler::NE, RET_TRUE);
3895     __ cmp(len, large_loop_size);
3896     __ br(Assembler::GE, LARGE_LOOP);
3897 
3898   __ bind(CHECK_16); // small 16-byte load pre-loop
3899     __ cmp(len, 16);
3900     __ br(Assembler::LT, POST_LOOP16);
3901 
3902   __ bind(LOOP16); // small 16-byte load loop
3903     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3904     __ sub(len, len, 16);
3905     __ orr(tmp2, tmp2, tmp3);
3906     __ tst(tmp2, UPPER_BIT_MASK);
3907     __ br(Assembler::NE, RET_TRUE);
3908     __ cmp(len, 16);
3909     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3910 
3911   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3912     __ cmp(len, 8);
3913     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3914     __ ldr(tmp3, Address(__ post(ary1, 8)));
3915     __ sub(len, len, 8);
3916     __ tst(tmp3, UPPER_BIT_MASK);
3917     __ br(Assembler::NE, RET_TRUE);
3918 
3919   __ bind(POST_LOOP16_LOAD_TAIL);
3920     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3921     __ ldr(tmp1, Address(ary1));
3922     __ mov(tmp2, 64);
3923     __ sub(tmp4, tmp2, len, __ LSL, 3);
3924     __ lslv(tmp1, tmp1, tmp4);
3925     __ tst(tmp1, UPPER_BIT_MASK);
3926     __ br(Assembler::NE, RET_TRUE);
3927     // Fallthrough
3928 
3929   __ bind(RET_FALSE);
3930     __ pop(spilled_regs, sp);
3931     __ leave();
3932     __ mov(result, zr);
3933     __ ret(lr);
3934 
3935   __ bind(RET_TRUE);
3936     __ pop(spilled_regs, sp);
3937   __ bind(RET_TRUE_NO_POP);
3938     __ leave();
3939     __ mov(result, 1);
3940     __ ret(lr);
3941 
3942   __ bind(DONE);
3943     __ pop(spilled_regs, sp);
3944     __ leave();
3945     __ ret(lr);
3946     return entry;
3947   }
3948   /**
3949    *  Arguments:
3950    *
3951    *  Input:
3952    *  c_rarg0   - current state address
3953    *  c_rarg1   - H key address
3954    *  c_rarg2   - data address
3955    *  c_rarg3   - number of blocks
3956    *
3957    *  Output:
3958    *  Updated state at c_rarg0
3959    */
3960   address generate_ghash_processBlocks() {
3961     // Bafflingly, GCM uses little-endian for the byte order, but
3962     // big-endian for the bit order.  For example, the polynomial 1 is
3963     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3964     //
3965     // So, we must either reverse the bytes in each word and do
3966     // everything big-endian or reverse the bits in each byte and do
3967     // it little-endian.  On AArch64 it's more idiomatic to reverse
3968     // the bits in each byte (we have an instruction, RBIT, to do
3969     // that) and keep the data in little-endian bit order throughout the
3970     // calculation, bit-reversing the inputs and outputs.
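    //
    // For example, the byte string above (80 00 .. 00) becomes the ordinary
    // little-endian integer 1 once each byte has been passed through RBIT,
    // after which plain carry-less arithmetic on the reversed data is correct.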
3971 
3972     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3973     __ align(wordSize * 2);
3974     address p = __ pc();
3975     __ emit_int64(0x87);  // The low-order bits of the field
3976                           // polynomial (i.e. p = z^7+z^2+z+1)
3977                           // repeated in the low and high parts of a
3978                           // 128-bit vector
3979     __ emit_int64(0x87);
3980 
3981     __ align(CodeEntryAlignment);
3982     address start = __ pc();
3983 
3984     Register state   = c_rarg0;
3985     Register subkeyH = c_rarg1;
3986     Register data    = c_rarg2;
3987     Register blocks  = c_rarg3;
3988 
3989     FloatRegister vzr = v30;
3990     __ eor(vzr, __ T16B, vzr, vzr); // zero register
3991 
3992     __ ldrq(v0, Address(state));
3993     __ ldrq(v1, Address(subkeyH));
3994 
3995     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
3996     __ rbit(v0, __ T16B, v0);
3997     __ rev64(v1, __ T16B, v1);
3998     __ rbit(v1, __ T16B, v1);
3999 
4000     __ ldrq(v26, p);
4001 
4002     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4003     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4004 
4005     {
4006       Label L_ghash_loop;
4007       __ bind(L_ghash_loop);
4008 
4009       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4010                                                  // reversing each byte
4011       __ rbit(v2, __ T16B, v2);
4012       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4013 
4014       // Multiply state in v2 by subkey in v1
4015       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4016                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4017                      /*temps*/v6, v20, v18, v21);
4018       // Reduce v7:v5 by the field polynomial
4019       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4020 
4021       __ sub(blocks, blocks, 1);
4022       __ cbnz(blocks, L_ghash_loop);
4023     }
4024 
4025     // The bit-reversed result is at this point in v0
4026     __ rev64(v1, __ T16B, v0);
4027     __ rbit(v1, __ T16B, v1);
4028 
4029     __ st1(v1, __ T16B, state);
4030     __ ret(lr);
4031 
4032     return start;
4033   }
4034 
4035   // Continuation point for throwing of implicit exceptions that are
4036   // not handled in the current activation. Fabricates an exception
4037   // oop and initiates normal exception dispatching in this
4038   // frame. Since we need to preserve callee-saved values (currently
4039   // only for C2, but done for C1 as well) we need a callee-saved oop
4040   // map and therefore have to make these stubs into RuntimeStubs
4041   // rather than BufferBlobs.  If the compiler needs all registers to
4042   // be preserved between the fault point and the exception handler
4043   // then it must assume responsibility for that in
4044   // AbstractCompiler::continuation_for_implicit_null_exception or
4045   // continuation_for_implicit_division_by_zero_exception. All other
4046   // implicit exceptions (e.g., NullPointerException or
4047   // AbstractMethodError on entry) are either at call sites or
4048   // otherwise assume that stack unwinding will be initiated, so
4049   // caller saved registers were assumed volatile in the compiler.
4050 
4051 #undef __
4052 #define __ masm->
4053 
4054   address generate_throw_exception(const char* name,
4055                                    address runtime_entry,
4056                                    Register arg1 = noreg,
4057                                    Register arg2 = noreg) {
4058     // Information about frame layout at time of blocking runtime call.
4059     // Note that we only have to preserve callee-saved registers since
4060     // the compilers are responsible for supplying a continuation point
4061     // if they expect all registers to be preserved.
4062     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4063     enum layout {
4064       rfp_off = 0,
4065       rfp_off2,
4066       return_off,
4067       return_off2,
4068       framesize // inclusive of return address
4069     };
4070 
4071     int insts_size = 512;
4072     int locs_size  = 64;
4073 
4074     CodeBuffer code(name, insts_size, locs_size);
4075     OopMapSet* oop_maps  = new OopMapSet();
4076     MacroAssembler* masm = new MacroAssembler(&code);
4077 
4078     address start = __ pc();
4079 
4080     // This is an inlined and slightly modified version of call_VM
4081     // which has the ability to fetch the return PC out of
4082     // thread-local storage and also sets up last_Java_sp slightly
4083     // differently than the real call_VM
4084 
4085     __ enter(); // Save FP and LR before call
4086 
4087     assert(is_even(framesize/2), "sp not 16-byte aligned");
4088 
4089     // lr and fp are already in place
4090     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4091 
4092     int frame_complete = __ pc() - start;
4093 
4094     // Set up last_Java_sp and last_Java_fp
4095     address the_pc = __ pc();
4096     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
4097 
4098     // Call runtime
4099     if (arg1 != noreg) {
4100       assert(arg2 != c_rarg1, "clobbered");
4101       __ mov(c_rarg1, arg1);
4102     }
4103     if (arg2 != noreg) {
4104       __ mov(c_rarg2, arg2);
4105     }
4106     __ mov(c_rarg0, rthread);
4107     BLOCK_COMMENT("call runtime_entry");
4108     __ mov(rscratch1, runtime_entry);
4109     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
4110 
4111     // Generate oop map
4112     OopMap* map = new OopMap(framesize, 0);
4113 
4114     oop_maps->add_gc_map(the_pc - start, map);
4115 
4116     __ reset_last_Java_frame(true);
4117     __ maybe_isb();
4118 
4119     __ leave();
4120 
4121     // check for pending exceptions
4122 #ifdef ASSERT
4123     Label L;
4124     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4125     __ cbnz(rscratch1, L);
4126     __ should_not_reach_here();
4127     __ bind(L);
4128 #endif // ASSERT
4129     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4130 
4131 
4132     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4133     RuntimeStub* stub =
4134       RuntimeStub::new_runtime_stub(name,
4135                                     &code,
4136                                     frame_complete,
4137                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4138                                     oop_maps, false);
4139     return stub->entry_point();
4140   }
4141 
4142   class MontgomeryMultiplyGenerator : public MacroAssembler {
4143 
4144     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4145       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4146 
4147     RegSet _toSave;
4148     bool _squaring;
4149 
4150   public:
4151     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4152       : MacroAssembler(as->code()), _squaring(squaring) {
4153 
4154       // Register allocation
4155 
4156       Register reg = c_rarg0;
4157       Pa_base = reg;       // Argument registers
4158       if (squaring)
4159         Pb_base = Pa_base;
4160       else
4161         Pb_base = ++reg;
4162       Pn_base = ++reg;
4163       Rlen= ++reg;
4164       inv = ++reg;
4165       Pm_base = ++reg;
4166 
4167                           // Working registers:
4168       Ra =  ++reg;        // The current digit of a, b, n, and m.
4169       Rb =  ++reg;
4170       Rm =  ++reg;
4171       Rn =  ++reg;
4172 
4173       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4174       Pb =  ++reg;
4175       Pm =  ++reg;
4176       Pn =  ++reg;
4177 
4178       t0 =  ++reg;        // Three registers which form a
4179       t1 =  ++reg;        // triple-precision accumulator.
4180       t2 =  ++reg;
4181 
4182       Ri =  ++reg;        // Inner and outer loop indexes.
4183       Rj =  ++reg;
4184 
4185       Rhi_ab = ++reg;     // Product registers: low and high parts
4186       Rlo_ab = ++reg;     // of a*b and m*n.
4187       Rhi_mn = ++reg;
4188       Rlo_mn = ++reg;
4189 
4190       // r19 and up are callee-saved.
4191       _toSave = RegSet::range(r19, reg) + Pm_base;
4192     }
4193 
4194   private:
4195     void save_regs() {
4196       push(_toSave, sp);
4197     }
4198 
4199     void restore_regs() {
4200       pop(_toSave, sp);
4201     }
4202 
4203     template <typename T>
4204     void unroll_2(Register count, T block) {
4205       Label loop, end, odd;
4206       tbnz(count, 0, odd);
4207       cbz(count, end);
4208       align(16);
4209       bind(loop);
4210       (this->*block)();
4211       bind(odd);
4212       (this->*block)();
4213       subs(count, count, 2);
4214       br(Assembler::GT, loop);
4215       bind(end);
4216     }
4217 
4218     template <typename T>
4219     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4220       Label loop, end, odd;
4221       tbnz(count, 0, odd);
4222       cbz(count, end);
4223       align(16);
4224       bind(loop);
4225       (this->*block)(d, s, tmp);
4226       bind(odd);
4227       (this->*block)(d, s, tmp);
4228       subs(count, count, 2);
4229       br(Assembler::GT, loop);
4230       bind(end);
4231     }
4232 
4233     void pre1(RegisterOrConstant i) {
4234       block_comment("pre1");
4235       // Pa = Pa_base;
4236       // Pb = Pb_base + i;
4237       // Pm = Pm_base;
4238       // Pn = Pn_base + i;
4239       // Ra = *Pa;
4240       // Rb = *Pb;
4241       // Rm = *Pm;
4242       // Rn = *Pn;
4243       ldr(Ra, Address(Pa_base));
4244       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4245       ldr(Rm, Address(Pm_base));
4246       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4247       lea(Pa, Address(Pa_base));
4248       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4249       lea(Pm, Address(Pm_base));
4250       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4251 
4252       // Zero the m*n result.
4253       mov(Rhi_mn, zr);
4254       mov(Rlo_mn, zr);
4255     }
4256 
4257     // The core multiply-accumulate step of a Montgomery
4258     // multiplication.  The idea is to schedule operations as a
4259     // pipeline so that instructions with long latencies (loads and
4260     // multiplies) have time to complete before their results are
4261     // used.  This most benefits in-order implementations of the
4262     // architecture but out-of-order ones also benefit.
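    //
    // In the commented pseudo-code below, MACC(a, b, t0, t1, t2) denotes a
    // 64x64->128-bit multiply-accumulate into the 192-bit accumulator t2:t1:t0,
    // i.e. (illustrative notation):
    //   (t2:t1:t0) += (unsigned __int128)a * b;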
4263     void step() {
4264       block_comment("step");
4265       // MACC(Ra, Rb, t0, t1, t2);
4266       // Ra = *++Pa;
4267       // Rb = *--Pb;
4268       umulh(Rhi_ab, Ra, Rb);
4269       mul(Rlo_ab, Ra, Rb);
4270       ldr(Ra, pre(Pa, wordSize));
4271       ldr(Rb, pre(Pb, -wordSize));
4272       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
4273                                        // previous iteration.
4274       // MACC(Rm, Rn, t0, t1, t2);
4275       // Rm = *++Pm;
4276       // Rn = *--Pn;
4277       umulh(Rhi_mn, Rm, Rn);
4278       mul(Rlo_mn, Rm, Rn);
4279       ldr(Rm, pre(Pm, wordSize));
4280       ldr(Rn, pre(Pn, -wordSize));
4281       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4282     }
4283 
4284     void post1() {
4285       block_comment("post1");
4286 
4287       // MACC(Ra, Rb, t0, t1, t2);
4288       // Ra = *++Pa;
4289       // Rb = *--Pb;
4290       umulh(Rhi_ab, Ra, Rb);
4291       mul(Rlo_ab, Ra, Rb);
4292       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4293       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4294 
4295       // *Pm = Rm = t0 * inv;
4296       mul(Rm, t0, inv);
4297       str(Rm, Address(Pm));
4298 
4299       // MACC(Rm, Rn, t0, t1, t2);
4300       // t0 = t1; t1 = t2; t2 = 0;
4301       umulh(Rhi_mn, Rm, Rn);
4302 
4303 #ifndef PRODUCT
4304       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4305       {
4306         mul(Rlo_mn, Rm, Rn);
4307         add(Rlo_mn, t0, Rlo_mn);
4308         Label ok;
4309         cbz(Rlo_mn, ok); {
4310           stop("broken Montgomery multiply");
4311         } bind(ok);
4312       }
4313 #endif
4314       // We have very carefully set things up so that
4315       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4316       // the lower half of Rm * Rn because we know the result already:
4317       // it must be -t0.  t0 + (-t0) must generate a carry iff
4318       // t0 != 0.  So, rather than do a mul and an adds we just set
4319       // the carry flag iff t0 is nonzero.
4320       //
4321       // mul(Rlo_mn, Rm, Rn);
4322       // adds(zr, t0, Rlo_mn);
4323       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4324       adcs(t0, t1, Rhi_mn);
4325       adc(t1, t2, zr);
4326       mov(t2, zr);
4327     }
4328 
4329     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
4330       block_comment("pre2");
4331       // Pa = Pa_base + i-len;
4332       // Pb = Pb_base + len;
4333       // Pm = Pm_base + i-len;
4334       // Pn = Pn_base + len;
4335 
4336       if (i.is_register()) {
4337         sub(Rj, i.as_register(), len);
4338       } else {
4339         mov(Rj, i.as_constant());
4340         sub(Rj, Rj, len);
4341       }
4342       // Rj == i-len
4343 
4344       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
4345       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
4346       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4347       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
4348 
4349       // Ra = *++Pa;
4350       // Rb = *--Pb;
4351       // Rm = *++Pm;
4352       // Rn = *--Pn;
4353       ldr(Ra, pre(Pa, wordSize));
4354       ldr(Rb, pre(Pb, -wordSize));
4355       ldr(Rm, pre(Pm, wordSize));
4356       ldr(Rn, pre(Pn, -wordSize));
4357 
4358       mov(Rhi_mn, zr);
4359       mov(Rlo_mn, zr);
4360     }
4361 
4362     void post2(RegisterOrConstant i, RegisterOrConstant len) {
4363       block_comment("post2");
4364       if (i.is_constant()) {
4365         mov(Rj, i.as_constant()-len.as_constant());
4366       } else {
4367         sub(Rj, i.as_register(), len);
4368       }
4369 
4370       adds(t0, t0, Rlo_mn); // The pending m*n, low part
4371 
4372       // As soon as we know the least significant digit of our result,
4373       // store it.
4374       // Pm_base[i-len] = t0;
4375       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4376 
4377       // t0 = t1; t1 = t2; t2 = 0;
4378       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
4379       adc(t1, t2, zr);
4380       mov(t2, zr);
4381     }
4382 
4383     // A carry in t0 after Montgomery multiplication means that we
4384     // should subtract multiples of n from our result in m.  We'll
4385     // keep doing that until there is no carry.
4386     void normalize(RegisterOrConstant len) {
4387       block_comment("normalize");
4388       // while (t0)
4389       //   t0 = sub(Pm_base, Pn_base, t0, len);
4390       Label loop, post, again;
4391       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
4392       cbz(t0, post); {
4393         bind(again); {
4394           mov(i, zr);
4395           mov(cnt, len);
4396           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4397           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4398           subs(zr, zr, zr); // set carry flag, i.e. no borrow
4399           align(16);
4400           bind(loop); {
4401             sbcs(Rm, Rm, Rn);
4402             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4403             add(i, i, 1);
4404             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4405             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4406             sub(cnt, cnt, 1);
4407           } cbnz(cnt, loop);
4408           sbc(t0, t0, zr);
4409         } cbnz(t0, again);
4410       } bind(post);
4411     }
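
    // In C, the sub() used above is approximately the following (a
    // hedged sketch of the borrow-propagating subtract that the sbcs
    // loop implements; sub() itself is not spelled out in the reference
    // code further down):
    //
    //   unsigned long sub(unsigned long Pm_base[], unsigned long Pn_base[],
    //                     unsigned long t0, int len) {
    //     unsigned long borrow = 0;
    //     for (int i = 0; i < len; i++) {
    //       unsigned long mi = Pm_base[i], ni = Pn_base[i];
    //       Pm_base[i] = mi - ni - borrow;
    //       borrow = (mi < ni) || (mi == ni && borrow);
    //     }
    //     return t0 - borrow;  // propagate the final borrow out of t0
    //   }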
4412 
4413     // Move memory at s to d, reversing words.
4414     //    Increments d to end of copied memory
4415     //    Destroys tmp1, tmp2
4416     //    Preserves len
4417     //    Leaves s pointing to the address which was in d at start
4418     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
4419       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
4420 
4421       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
4422       mov(tmp1, len);
4423       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
4424       sub(s, d, len, ext::uxtw, LogBytesPerWord);
4425     }
4426     // where
4427     void reverse1(Register d, Register s, Register tmp) {
4428       ldr(tmp, pre(s, -wordSize));
4429       ror(tmp, tmp, 32);
4430       str(tmp, post(d, wordSize));
4431     }
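
    // In C++, approximately (a hedged sketch of the net effect; d and s
    // model registers, so their final values matter, and the rotate by
    // 32 swaps the two jint halves of each 64-bit word as it is copied):
    //
    //   void reverse(unsigned long *&d, unsigned long *&s, int len) {
    //     unsigned long *d0 = d;
    //     s += len;                         // len counts 64-bit words
    //     while (len--) {
    //       unsigned long w = *--s;
    //       *d++ = (w << 32) | (w >> 32);   // ror #32
    //     }
    //     s = d0;                           // s ends up at the old d
    //   }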
4432 
4433     void step_squaring() {
      // An extra ACC: accumulate the a*b product a second time, which
      // is the MACC2 (doubled product) of the reference C code.
4435       step();
4436       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4437     }
4438 
4439     void last_squaring(RegisterOrConstant i) {
4440       Label dont;
4441       // if ((i & 1) == 0) {
4442       tbnz(i.as_register(), 0, dont); {
4443         // MACC(Ra, Rb, t0, t1, t2);
4444         // Ra = *++Pa;
4445         // Rb = *--Pb;
4446         umulh(Rhi_ab, Ra, Rb);
4447         mul(Rlo_ab, Ra, Rb);
4448         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4449       } bind(dont);
4450     }
4451 
4452     void extra_step_squaring() {
4453       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4454 
4455       // MACC(Rm, Rn, t0, t1, t2);
4456       // Rm = *++Pm;
4457       // Rn = *--Pn;
4458       umulh(Rhi_mn, Rm, Rn);
4459       mul(Rlo_mn, Rm, Rn);
4460       ldr(Rm, pre(Pm, wordSize));
4461       ldr(Rn, pre(Pn, -wordSize));
4462     }
4463 
4464     void post1_squaring() {
4465       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4466 
4467       // *Pm = Rm = t0 * inv;
4468       mul(Rm, t0, inv);
4469       str(Rm, Address(Pm));
4470 
4471       // MACC(Rm, Rn, t0, t1, t2);
4472       // t0 = t1; t1 = t2; t2 = 0;
4473       umulh(Rhi_mn, Rm, Rn);
4474 
4475 #ifndef PRODUCT
4476       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4477       {
4478         mul(Rlo_mn, Rm, Rn);
4479         add(Rlo_mn, t0, Rlo_mn);
4480         Label ok;
4481         cbz(Rlo_mn, ok); {
4482           stop("broken Montgomery multiply");
4483         } bind(ok);
4484       }
4485 #endif
4486       // We have very carefully set things up so that
4487       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4488       // the lower half of Rm * Rn because we know the result already:
      // it must be -t0.  t0 + (-t0) must generate a carry iff
      // t0 != 0.  So, rather than doing a mul and an adds, we just
      // set the carry flag iff t0 is nonzero.
4492       //
4493       // mul(Rlo_mn, Rm, Rn);
4494       // adds(zr, t0, Rlo_mn);
4495       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4496       adcs(t0, t1, Rhi_mn);
4497       adc(t1, t2, zr);
4498       mov(t2, zr);
4499     }
4500 
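    // Accumulate a double-word product into the triple-precision
    // accumulator: t2:t1:t0 += Rhi:Rlo.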
4501     void acc(Register Rhi, Register Rlo,
4502              Register t0, Register t1, Register t2) {
4503       adds(t0, t0, Rlo);
4504       adcs(t1, t1, Rhi);
4505       adc(t2, t2, zr);
4506     }
4507 
4508   public:
4509     /**
4510      * Fast Montgomery multiplication.  The derivation of the
4511      * algorithm is in A Cryptographic Library for the Motorola
4512      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
4513      *
4514      * Arguments:
4515      *
4516      * Inputs for multiplication:
4517      *   c_rarg0   - int array elements a
4518      *   c_rarg1   - int array elements b
4519      *   c_rarg2   - int array elements n (the modulus)
4520      *   c_rarg3   - int length
4521      *   c_rarg4   - int inv
4522      *   c_rarg5   - int array elements m (the result)
4523      *
4524      * Inputs for squaring:
4525      *   c_rarg0   - int array elements a
4526      *   c_rarg1   - int array elements n (the modulus)
4527      *   c_rarg2   - int length
4528      *   c_rarg3   - int inv
4529      *   c_rarg4   - int array elements m (the result)
4530      *
4531      */
4532     address generate_multiply() {
4533       Label argh, nothing;
4534       bind(argh);
4535       stop("MontgomeryMultiply total_allocation must be <= 8192");
4536 
4537       align(CodeEntryAlignment);
4538       address entry = pc();
4539 
4540       cbzw(Rlen, nothing);
4541 
4542       enter();
4543 
4544       // Make room.
4545       cmpw(Rlen, 512);
4546       br(Assembler::HI, argh);
4547       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4548       andr(sp, Ra, -2 * wordSize);
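      // Ra now points at a 4 * Rlen * sizeof (jint) byte area (at most
      // 8192 bytes) that will hold the reversed copies of the operands
      // and the result; sp has been realigned to 16 bytes just below it.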
4549 
4550       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4551 
4552       {
4553         // Copy input args, reversing as we go.  We use Ra as a
4554         // temporary variable.
4555         reverse(Ra, Pa_base, Rlen, t0, t1);
4556         if (!_squaring)
4557           reverse(Ra, Pb_base, Rlen, t0, t1);
4558         reverse(Ra, Pn_base, Rlen, t0, t1);
4559       }
4560 
4561       // Push all call-saved registers and also Pm_base which we'll need
4562       // at the end.
4563       save_regs();
4564 
4565 #ifndef PRODUCT
4566       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
4567       {
4568         ldr(Rn, Address(Pn_base, 0));
4569         mul(Rlo_mn, Rn, inv);
4570         cmp(Rlo_mn, -1);
4571         Label ok;
4572         br(EQ, ok); {
4573           stop("broken inverse in Montgomery multiply");
4574         } bind(ok);
4575       }
4576 #endif
4577 
4578       mov(Pm_base, Ra);
4579 
4580       mov(t0, zr);
4581       mov(t1, zr);
4582       mov(t2, zr);
4583 
4584       block_comment("for (int i = 0; i < len; i++) {");
4585       mov(Ri, zr); {
4586         Label loop, end;
4587         cmpw(Ri, Rlen);
4588         br(Assembler::GE, end);
4589 
4590         bind(loop);
4591         pre1(Ri);
4592 
4593         block_comment("  for (j = i; j; j--) {"); {
4594           movw(Rj, Ri);
4595           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4596         } block_comment("  } // j");
4597 
4598         post1();
4599         addw(Ri, Ri, 1);
4600         cmpw(Ri, Rlen);
4601         br(Assembler::LT, loop);
4602         bind(end);
4603         block_comment("} // i");
4604       }
4605 
4606       block_comment("for (int i = len; i < 2*len; i++) {");
4607       mov(Ri, Rlen); {
4608         Label loop, end;
4609         cmpw(Ri, Rlen, Assembler::LSL, 1);
4610         br(Assembler::GE, end);
4611 
4612         bind(loop);
4613         pre2(Ri, Rlen);
4614 
4615         block_comment("  for (j = len*2-i-1; j; j--) {"); {
4616           lslw(Rj, Rlen, 1);
4617           subw(Rj, Rj, Ri);
4618           subw(Rj, Rj, 1);
4619           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4620         } block_comment("  } // j");
4621 
4622         post2(Ri, Rlen);
4623         addw(Ri, Ri, 1);
4624         cmpw(Ri, Rlen, Assembler::LSL, 1);
4625         br(Assembler::LT, loop);
4626         bind(end);
4627       }
4628       block_comment("} // i");
4629 
4630       normalize(Rlen);
4631 
4632       mov(Ra, Pm_base);  // Save Pm_base in Ra
4633       restore_regs();  // Restore caller's Pm_base
4634 
4635       // Copy our result into caller's Pm_base
4636       reverse(Pm_base, Ra, Rlen, t0, t1);
4637 
4638       leave();
4639       bind(nothing);
4640       ret(lr);
4641 
4642       return entry;
4643     }
4644     // In C, approximately:
4645 
4646     // void
4647     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
4648     //                     unsigned long Pn_base[], unsigned long Pm_base[],
4649     //                     unsigned long inv, int len) {
4650     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4651     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4652     //   unsigned long Ra, Rb, Rn, Rm;
4653 
4654     //   int i;
4655 
4656     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4657 
4658     //   for (i = 0; i < len; i++) {
4659     //     int j;
4660 
4661     //     Pa = Pa_base;
4662     //     Pb = Pb_base + i;
4663     //     Pm = Pm_base;
4664     //     Pn = Pn_base + i;
4665 
4666     //     Ra = *Pa;
4667     //     Rb = *Pb;
4668     //     Rm = *Pm;
4669     //     Rn = *Pn;
4670 
4671     //     int iters = i;
4672     //     for (j = 0; iters--; j++) {
4673     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4674     //       MACC(Ra, Rb, t0, t1, t2);
4675     //       Ra = *++Pa;
4676     //       Rb = *--Pb;
4677     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4678     //       MACC(Rm, Rn, t0, t1, t2);
4679     //       Rm = *++Pm;
4680     //       Rn = *--Pn;
4681     //     }
4682 
4683     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
4684     //     MACC(Ra, Rb, t0, t1, t2);
4685     //     *Pm = Rm = t0 * inv;
4686     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4687     //     MACC(Rm, Rn, t0, t1, t2);
4688 
4689     //     assert(t0 == 0, "broken Montgomery multiply");
4690 
4691     //     t0 = t1; t1 = t2; t2 = 0;
4692     //   }
4693 
4694     //   for (i = len; i < 2*len; i++) {
4695     //     int j;
4696 
4697     //     Pa = Pa_base + i-len;
4698     //     Pb = Pb_base + len;
4699     //     Pm = Pm_base + i-len;
4700     //     Pn = Pn_base + len;
4701 
4702     //     Ra = *++Pa;
4703     //     Rb = *--Pb;
4704     //     Rm = *++Pm;
4705     //     Rn = *--Pn;
4706 
4707     //     int iters = len*2-i-1;
4708     //     for (j = i-len+1; iters--; j++) {
4709     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4710     //       MACC(Ra, Rb, t0, t1, t2);
4711     //       Ra = *++Pa;
4712     //       Rb = *--Pb;
4713     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4714     //       MACC(Rm, Rn, t0, t1, t2);
4715     //       Rm = *++Pm;
4716     //       Rn = *--Pn;
4717     //     }
4718 
4719     //     Pm_base[i-len] = t0;
4720     //     t0 = t1; t1 = t2; t2 = 0;
4721     //   }
4722 
4723     //   while (t0)
4724     //     t0 = sub(Pm_base, Pn_base, t0, len);
4725     // }
4726 
4727     /**
     * Fast Montgomery squaring.  This uses asymptotically 25% fewer
     * multiplies than Montgomery multiplication, so it should be up to
     * 25% faster: squaring needs only about half of the a[i]*a[j]
     * products, while the len^2 reduction products m*n are unchanged,
     * giving roughly 1.5*len^2 multiplies instead of 2*len^2.  However,
     * its loop control is more complex and it may actually run slower
     * on some machines.
4732      *
4733      * Arguments:
4734      *
4735      * Inputs:
4736      *   c_rarg0   - int array elements a
4737      *   c_rarg1   - int array elements n (the modulus)
4738      *   c_rarg2   - int length
4739      *   c_rarg3   - int inv
4740      *   c_rarg4   - int array elements m (the result)
4741      *
4742      */
4743     address generate_square() {
4744       Label argh;
4745       bind(argh);
4746       stop("MontgomeryMultiply total_allocation must be <= 8192");
4747 
4748       align(CodeEntryAlignment);
4749       address entry = pc();
4750 
4751       enter();
4752 
4753       // Make room.
4754       cmpw(Rlen, 512);
4755       br(Assembler::HI, argh);
4756       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4757       andr(sp, Ra, -2 * wordSize);
4758 
4759       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4760 
4761       {
4762         // Copy input args, reversing as we go.  We use Ra as a
4763         // temporary variable.
4764         reverse(Ra, Pa_base, Rlen, t0, t1);
4765         reverse(Ra, Pn_base, Rlen, t0, t1);
4766       }
4767 
4768       // Push all call-saved registers and also Pm_base which we'll need
4769       // at the end.
4770       save_regs();
4771 
4772       mov(Pm_base, Ra);
4773 
4774       mov(t0, zr);
4775       mov(t1, zr);
4776       mov(t2, zr);
4777 
4778       block_comment("for (int i = 0; i < len; i++) {");
4779       mov(Ri, zr); {
4780         Label loop, end;
4781         bind(loop);
4782         cmp(Ri, Rlen);
4783         br(Assembler::GE, end);
4784 
4785         pre1(Ri);
4786 
4787         block_comment("for (j = (i+1)/2; j; j--) {"); {
4788           add(Rj, Ri, 1);
4789           lsr(Rj, Rj, 1);
4790           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4791         } block_comment("  } // j");
4792 
4793         last_squaring(Ri);
4794 
4795         block_comment("  for (j = i/2; j; j--) {"); {
4796           lsr(Rj, Ri, 1);
4797           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4798         } block_comment("  } // j");
4799 
4800         post1_squaring();
4801         add(Ri, Ri, 1);
4802         cmp(Ri, Rlen);
4803         br(Assembler::LT, loop);
4804 
4805         bind(end);
4806         block_comment("} // i");
4807       }
4808 
4809       block_comment("for (int i = len; i < 2*len; i++) {");
4810       mov(Ri, Rlen); {
4811         Label loop, end;
4812         bind(loop);
4813         cmp(Ri, Rlen, Assembler::LSL, 1);
4814         br(Assembler::GE, end);
4815 
4816         pre2(Ri, Rlen);
4817 
4818         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4819           lsl(Rj, Rlen, 1);
4820           sub(Rj, Rj, Ri);
4821           sub(Rj, Rj, 1);
4822           lsr(Rj, Rj, 1);
4823           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4824         } block_comment("  } // j");
4825 
4826         last_squaring(Ri);
4827 
4828         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4829           lsl(Rj, Rlen, 1);
4830           sub(Rj, Rj, Ri);
4831           lsr(Rj, Rj, 1);
4832           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4833         } block_comment("  } // j");
4834 
4835         post2(Ri, Rlen);
4836         add(Ri, Ri, 1);
4837         cmp(Ri, Rlen, Assembler::LSL, 1);
4838 
4839         br(Assembler::LT, loop);
4840         bind(end);
4841         block_comment("} // i");
4842       }
4843 
4844       normalize(Rlen);
4845 
4846       mov(Ra, Pm_base);  // Save Pm_base in Ra
4847       restore_regs();  // Restore caller's Pm_base
4848 
4849       // Copy our result into caller's Pm_base
4850       reverse(Pm_base, Ra, Rlen, t0, t1);
4851 
4852       leave();
4853       ret(lr);
4854 
4855       return entry;
4856     }
4857     // In C, approximately:
4858 
4859     // void
4860     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4861     //                   unsigned long Pm_base[], unsigned long inv, int len) {
4862     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4863     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4864     //   unsigned long Ra, Rb, Rn, Rm;
4865 
4866     //   int i;
4867 
4868     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4869 
4870     //   for (i = 0; i < len; i++) {
4871     //     int j;
4872 
4873     //     Pa = Pa_base;
4874     //     Pb = Pa_base + i;
4875     //     Pm = Pm_base;
4876     //     Pn = Pn_base + i;
4877 
4878     //     Ra = *Pa;
4879     //     Rb = *Pb;
4880     //     Rm = *Pm;
4881     //     Rn = *Pn;
4882 
4883     //     int iters = (i+1)/2;
4884     //     for (j = 0; iters--; j++) {
4885     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4886     //       MACC2(Ra, Rb, t0, t1, t2);
4887     //       Ra = *++Pa;
4888     //       Rb = *--Pb;
4889     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4890     //       MACC(Rm, Rn, t0, t1, t2);
4891     //       Rm = *++Pm;
4892     //       Rn = *--Pn;
4893     //     }
4894     //     if ((i & 1) == 0) {
4895     //       assert(Ra == Pa_base[j], "must be");
4896     //       MACC(Ra, Ra, t0, t1, t2);
4897     //     }
4898     //     iters = i/2;
4899     //     assert(iters == i-j, "must be");
4900     //     for (; iters--; j++) {
4901     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4902     //       MACC(Rm, Rn, t0, t1, t2);
4903     //       Rm = *++Pm;
4904     //       Rn = *--Pn;
4905     //     }
4906 
4907     //     *Pm = Rm = t0 * inv;
4908     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4909     //     MACC(Rm, Rn, t0, t1, t2);
4910 
4911     //     assert(t0 == 0, "broken Montgomery multiply");
4912 
4913     //     t0 = t1; t1 = t2; t2 = 0;
4914     //   }
4915 
4916     //   for (i = len; i < 2*len; i++) {
4917     //     int start = i-len+1;
4918     //     int end = start + (len - start)/2;
4919     //     int j;
4920 
4921     //     Pa = Pa_base + i-len;
4922     //     Pb = Pa_base + len;
4923     //     Pm = Pm_base + i-len;
4924     //     Pn = Pn_base + len;
4925 
4926     //     Ra = *++Pa;
4927     //     Rb = *--Pb;
4928     //     Rm = *++Pm;
4929     //     Rn = *--Pn;
4930 
4931     //     int iters = (2*len-i-1)/2;
4932     //     assert(iters == end-start, "must be");
4933     //     for (j = start; iters--; j++) {
4934     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4935     //       MACC2(Ra, Rb, t0, t1, t2);
4936     //       Ra = *++Pa;
4937     //       Rb = *--Pb;
4938     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4939     //       MACC(Rm, Rn, t0, t1, t2);
4940     //       Rm = *++Pm;
4941     //       Rn = *--Pn;
4942     //     }
4943     //     if ((i & 1) == 0) {
4944     //       assert(Ra == Pa_base[j], "must be");
4945     //       MACC(Ra, Ra, t0, t1, t2);
4946     //     }
4947     //     iters =  (2*len-i)/2;
4948     //     assert(iters == len-j, "must be");
4949     //     for (; iters--; j++) {
4950     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4951     //       MACC(Rm, Rn, t0, t1, t2);
4952     //       Rm = *++Pm;
4953     //       Rn = *--Pn;
4954     //     }
4955     //     Pm_base[i-len] = t0;
4956     //     t0 = t1; t1 = t2; t2 = 0;
4957     //   }
4958 
4959     //   while (t0)
4960     //     t0 = sub(Pm_base, Pn_base, t0, len);
4961     // }
4962   };
4963 
4964 
4965   // Initialization
4966   void generate_initial() {
    // Generate the initial stubs and initialize the entry points.
4968 
    // Entry points that exist on all platforms.  Note: this is code
    // that could be shared among different platforms; however, the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure.  See also the
    // comment in stubRoutines.hpp.
4974 
4975     StubRoutines::_forward_exception_entry = generate_forward_exception();
4976 
4977     StubRoutines::_call_stub_entry =
4978       generate_call_stub(StubRoutines::_call_stub_return_address);
4979 
    // This entry point is referenced by megamorphic calls.
4981     StubRoutines::_catch_exception_entry = generate_catch_exception();
4982 
4983     // Build this early so it's available for the interpreter.
4984     StubRoutines::_throw_StackOverflowError_entry =
4985       generate_throw_exception("StackOverflowError throw_exception",
4986                                CAST_FROM_FN_PTR(address,
4987                                                 SharedRuntime::throw_StackOverflowError));
4988     StubRoutines::_throw_delayed_StackOverflowError_entry =
4989       generate_throw_exception("delayed StackOverflowError throw_exception",
4990                                CAST_FROM_FN_PTR(address,
4991                                                 SharedRuntime::throw_delayed_StackOverflowError));
4992     if (UseCRC32Intrinsics) {
      // Set the table address before generating the stubs that use it.
4994       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4995       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4996     }
4997 
4998     if (UseCRC32CIntrinsics) {
4999       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5000     }
5001   }
5002 
5003   void generate_all() {
5004     // support for verify_oop (must happen after universe_init)
5005     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
5006     StubRoutines::_throw_AbstractMethodError_entry =
5007       generate_throw_exception("AbstractMethodError throw_exception",
5008                                CAST_FROM_FN_PTR(address,
5009                                                 SharedRuntime::
5010                                                 throw_AbstractMethodError));
5011 
5012     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5013       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5014                                CAST_FROM_FN_PTR(address,
5015                                                 SharedRuntime::
5016                                                 throw_IncompatibleClassChangeError));
5017 
5018     StubRoutines::_throw_NullPointerException_at_call_entry =
5019       generate_throw_exception("NullPointerException at call throw_exception",
5020                                CAST_FROM_FN_PTR(address,
5021                                                 SharedRuntime::
5022                                                 throw_NullPointerException_at_call));
5023 
5024     // arraycopy stubs used by compilers
5025     generate_arraycopy_stubs();
5026 
    // has_negatives stubs for large arrays.
5028     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5029 
5030     if (UseMultiplyToLenIntrinsic) {
5031       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5032     }
5033 
5034     if (UseSquareToLenIntrinsic) {
5035       StubRoutines::_squareToLen = generate_squareToLen();
5036     }
5037 
5038     if (UseMulAddIntrinsic) {
5039       StubRoutines::_mulAdd = generate_mulAdd();
5040     }
5041 
5042     if (UseMontgomeryMultiplyIntrinsic) {
5043       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5044       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5045       StubRoutines::_montgomeryMultiply = g.generate_multiply();
5046     }
5047 
5048     if (UseMontgomerySquareIntrinsic) {
5049       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5050       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5051       // We use generate_multiply() rather than generate_square()
5052       // because it's faster for the sizes of modulus we care about.
5053       StubRoutines::_montgomerySquare = g.generate_multiply();
5054     }
5055 
5056     if (UseShenandoahGC && (ShenandoahWriteBarrier || ShenandoahStoreValWriteBarrier)) {
5057       StubRoutines::aarch64::_shenandoah_wb = generate_shenandoah_wb(false, true);
5058       StubRoutines::_shenandoah_wb_C = generate_shenandoah_wb(true, !ShenandoahWriteBarrierCsetTestInIR);
5059     }
5060 
5061 #ifndef BUILTIN_SIM
5062     // generate GHASH intrinsics code
5063     if (UseGHASHIntrinsics) {
5064       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5065     }
5066 
5067     if (UseAESIntrinsics) {
5068       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5069       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5070       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5071       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
5072     }
5073 
5074     if (UseSHA1Intrinsics) {
5075       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5076       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5077     }
5078     if (UseSHA256Intrinsics) {
5079       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5080       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5081     }
5082 
5083     // generate Adler32 intrinsics code
5084     if (UseAdler32Intrinsics) {
5085       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
5086     }
5087 
5088     // Safefetch stubs.
5089     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5090                                                        &StubRoutines::_safefetch32_fault_pc,
5091                                                        &StubRoutines::_safefetch32_continuation_pc);
5092     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5093                                                        &StubRoutines::_safefetchN_fault_pc,
5094                                                        &StubRoutines::_safefetchN_continuation_pc);
5095 #endif
5096     StubRoutines::aarch64::set_completed();
5097   }
5098 
5099  public:
5100   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5101     if (all) {
5102       generate_all();
5103     } else {
5104       generate_initial();
5105     }
5106   }
5107 }; // end class declaration
5108 
5109 void StubGenerator_generate(CodeBuffer* code, bool all) {
5110   StubGenerator g(code, all);
5111 }