1 /*
   2  * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetCodeGen.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "nativeInst_aarch64.hpp"
  33 #include "oops/instanceOop.hpp"
  34 #include "oops/method.hpp"
  35 #include "oops/objArrayKlass.hpp"
  36 #include "oops/oop.inline.hpp"
  37 #include "prims/methodHandles.hpp"
  38 #include "runtime/frame.inline.hpp"
  39 #include "runtime/handles.inline.hpp"
  40 #include "runtime/sharedRuntime.hpp"
  41 #include "runtime/stubCodeGenerator.hpp"
  42 #include "runtime/stubRoutines.hpp"
  43 #include "runtime/thread.inline.hpp"
  44 #include "utilities/align.hpp"
  45 #ifdef COMPILER2
  46 #include "opto/runtime.hpp"
  47 #endif
  48 
  49 #ifdef BUILTIN_SIM
  50 #include "../../../../../../simulator/simulator.hpp"
  51 #endif
  52 
  53 // Declaration and definition of StubGenerator (no .hpp file).
  54 // For a more detailed description of the stub routine structure
  55 // see the comment in stubRoutines.hpp
  56 
  57 #undef __
  58 #define __ _masm->
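     // TIMES_OOP scales an index register by the size of an in-heap oop:
     // 4 bytes when compressed oops are in use, 8 bytes otherwise.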
  59 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  60 
  61 #ifdef PRODUCT
  62 #define BLOCK_COMMENT(str) /* nothing */
  63 #else
  64 #define BLOCK_COMMENT(str) __ block_comment(str)
  65 #endif
  66 
  67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  68 
  69 // Stub Code definitions
  70 
  71 class StubGenerator: public StubCodeGenerator {
  72  private:
  73 
  74 #ifdef PRODUCT
  75 #define inc_counter_np(counter) ((void)0)
  76 #else
  77   void inc_counter_np_(int& counter) {
  78     __ lea(rscratch2, ExternalAddress((address)&counter));
  79     __ ldrw(rscratch1, Address(rscratch2));
  80     __ addw(rscratch1, rscratch1, 1);
  81     __ strw(rscratch1, Address(rscratch2));
  82   }
  83 #define inc_counter_np(counter) \
  84   BLOCK_COMMENT("inc_counter " #counter); \
  85   inc_counter_np_(counter);
  86 #endif
  87 
  88   // Call stubs are used to call Java from C
  89   //
  90   // Arguments:
  91   //    c_rarg0:   call wrapper address                   address
  92   //    c_rarg1:   result                                 address
  93   //    c_rarg2:   result type                            BasicType
  94   //    c_rarg3:   method                                 Method*
  95   //    c_rarg4:   (interpreter) entry point              address
  96   //    c_rarg5:   parameters                             intptr_t*
  97   //    c_rarg6:   parameter size (in words)              int
  98   //    c_rarg7:   thread                                 Thread*
  99   //
 100   // There is no return from the stub itself as any Java result
 101   // is written to result
 102   //
 103   // we save r30 (lr) as the return PC at the base of the frame and
 104   // link r29 (fp) below it as the frame pointer, copying sp (r31)
 105   // into fp.
 106   //
 107   // we save r0-r7, which accounts for all the c arguments.
 108   //
 109   // TODO: strictly do we need to save them all? they are treated as
 110   // volatile by C so could we omit saving the ones we are going to
 111   // place in global registers (thread? method?) or those we only use
 112   // during setup of the Java call?
 113   //
 114   // we don't need to save r8 which C uses as an indirect result location
 115   // return register.
 116   //
 117   // we don't need to save r9-r15 which both C and Java treat as
 118   // volatile
 119   //
 120   // we don't need to save r16-18 because Java does not use them
 121   //
 122   // we save r19-r28 which Java uses as scratch registers and C
 123   // expects to be callee-save
 124   //
 125   // we save the bottom 64 bits of each value stored in v8-v15; it is
 126   // the responsibility of the caller to preserve larger values.
 127   //
 128   // so the stub frame looks like this when we enter Java code
 129   //
 130   //     [ return_from_Java     ] <--- sp
 131   //     [ argument word n      ]
 132   //      ...
 133   // -27 [ argument word 1      ]
 134   // -26 [ saved v15            ] <--- sp_after_call
 135   // -25 [ saved v14            ]
 136   // -24 [ saved v13            ]
 137   // -23 [ saved v12            ]
 138   // -22 [ saved v11            ]
 139   // -21 [ saved v10            ]
 140   // -20 [ saved v9             ]
 141   // -19 [ saved v8             ]
 142   // -18 [ saved r28            ]
 143   // -17 [ saved r27            ]
 144   // -16 [ saved r26            ]
 145   // -15 [ saved r25            ]
 146   // -14 [ saved r24            ]
 147   // -13 [ saved r23            ]
 148   // -12 [ saved r22            ]
 149   // -11 [ saved r21            ]
 150   // -10 [ saved r20            ]
 151   //  -9 [ saved r19            ]
 152   //  -8 [ call wrapper    (r0) ]
 153   //  -7 [ result          (r1) ]
 154   //  -6 [ result type     (r2) ]
 155   //  -5 [ method          (r3) ]
 156   //  -4 [ entry point     (r4) ]
 157   //  -3 [ parameters      (r5) ]
 158   //  -2 [ parameter size  (r6) ]
 159   //  -1 [ thread (r7)          ]
 160   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 161   //   1 [ saved lr       (r30) ]
 162 
 163   // Call stub stack layout word offsets from fp
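       // n.b. most registers are saved in pairs with stp/stpd, so only the
       // offset of the lower-addressed slot of each pair is named here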
 164   enum call_stub_layout {
 165     sp_after_call_off = -26,
 166 
 167     d15_off            = -26,
 168     d13_off            = -24,
 169     d11_off            = -22,
 170     d9_off             = -20,
 171 
 172     r28_off            = -18,
 173     r26_off            = -16,
 174     r24_off            = -14,
 175     r22_off            = -12,
 176     r20_off            = -10,
 177     call_wrapper_off   =  -8,
 178     result_off         =  -7,
 179     result_type_off    =  -6,
 180     method_off         =  -5,
 181     entry_point_off    =  -4,
 182     parameter_size_off =  -2,
 183     thread_off         =  -1,
 184     fp_f               =   0,
 185     retaddr_off        =   1,
 186   };
 187 
 188   address generate_call_stub(address& return_address) {
 189     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 190            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 191            "adjust this code");
 192 
 193     StubCodeMark mark(this, "StubRoutines", "call_stub");
 194     address start = __ pc();
 195 
 196     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 197 
 198     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 199     const Address result        (rfp, result_off         * wordSize);
 200     const Address result_type   (rfp, result_type_off    * wordSize);
 201     const Address method        (rfp, method_off         * wordSize);
 202     const Address entry_point   (rfp, entry_point_off    * wordSize);
 203     const Address parameter_size(rfp, parameter_size_off * wordSize);
 204 
 205     const Address thread        (rfp, thread_off         * wordSize);
 206 
 207     const Address d15_save      (rfp, d15_off * wordSize);
 208     const Address d13_save      (rfp, d13_off * wordSize);
 209     const Address d11_save      (rfp, d11_off * wordSize);
 210     const Address d9_save       (rfp, d9_off * wordSize);
 211 
 212     const Address r28_save      (rfp, r28_off * wordSize);
 213     const Address r26_save      (rfp, r26_off * wordSize);
 214     const Address r24_save      (rfp, r24_off * wordSize);
 215     const Address r22_save      (rfp, r22_off * wordSize);
 216     const Address r20_save      (rfp, r20_off * wordSize);
 217 
 218     // stub code
 219 
 220     // we need a C prolog to bootstrap the x86 caller into the sim
 221     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 222 
 223     address aarch64_entry = __ pc();
 224 
 225 #ifdef BUILTIN_SIM
 226     // Save sender's SP for stack traces.
 227     __ mov(rscratch1, sp);
 228     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 229 #endif
 230     // set up frame and move sp to end of save area
 231     __ enter();
 232     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 233 
 234     // save register parameters and Java scratch/global registers
 235     // n.b. we save thread even though it gets installed in
 236     // rthread because we want to sanity check rthread later
 237     __ str(c_rarg7,  thread);
 238     __ strw(c_rarg6, parameter_size);
 239     __ stp(c_rarg4, c_rarg5,  entry_point);
 240     __ stp(c_rarg2, c_rarg3,  result_type);
 241     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 242 
 243     __ stp(r20, r19,   r20_save);
 244     __ stp(r22, r21,   r22_save);
 245     __ stp(r24, r23,   r24_save);
 246     __ stp(r26, r25,   r26_save);
 247     __ stp(r28, r27,   r28_save);
 248 
 249     __ stpd(v9,  v8,   d9_save);
 250     __ stpd(v11, v10,  d11_save);
 251     __ stpd(v13, v12,  d13_save);
 252     __ stpd(v15, v14,  d15_save);
 253 
 254     // install Java thread in global register now that we have saved
 255     // whatever value it held
 256     __ mov(rthread, c_rarg7);
 257     // And method
 258     __ mov(rmethod, c_rarg3);
 259 
 260     // set up the heapbase register
 261     __ reinit_heapbase();
 262 
 263 #ifdef ASSERT
 264     // make sure we have no pending exceptions
 265     {
 266       Label L;
 267       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 268       __ cmp(rscratch1, (unsigned)NULL_WORD);
 269       __ br(Assembler::EQ, L);
 270       __ stop("StubRoutines::call_stub: entered with pending exception");
 271       __ BIND(L);
 272     }
 273 #endif
 274     // pass parameters if any
 275     __ mov(esp, sp);
 276     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 277     __ andr(sp, rscratch1, -2 * wordSize);
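         // n.b. the AArch64 ABI requires sp to stay 16-byte aligned, hence
         // the rounding down by 2 * wordSize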
 278 
 279     BLOCK_COMMENT("pass parameters if any");
 280     Label parameters_done;
 281     // parameter count is still in c_rarg6
 282     // and parameter pointer identifying param 1 is in c_rarg5
 283     __ cbzw(c_rarg6, parameters_done);
 284 
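         // copy the parameters one word at a time onto the stack, walking
         // forwards through the parameter array (c_rarg5) while counting
         // down in c_rarg6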
 285     address loop = __ pc();
 286     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 287     __ subsw(c_rarg6, c_rarg6, 1);
 288     __ push(rscratch1);
 289     __ br(Assembler::GT, loop);
 290 
 291     __ BIND(parameters_done);
 292 
 293     // call Java entry -- passing Method* and current sp
 294     //      rmethod: Method*
 295     //      r13: sender sp
 296     BLOCK_COMMENT("call Java function");
 297     __ mov(r13, sp);
 298     __ blr(c_rarg4);
 299 
 300     // tell the simulator we have returned to the stub
 301 
 302     // we do this here because the notify will already have been done
 303     // if we get to the next instruction via an exception
 304     //
 305     // n.b. adding this instruction here affects the calculation of
 306     // whether or not a routine returns to the call stub (used when
 307     // doing stack walks) since the normal test is to check the return
 308     // pc against the address saved below. so we may need to allow for
 309     // this extra instruction in the check.
 310 
 311     if (NotifySimulator) {
 312       __ notify(Assembler::method_reentry);
 313     }
 314     // save current address for use by exception handling code
 315 
 316     return_address = __ pc();
 317 
 318     // store result depending on type (everything that is not
 319     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 320     // n.b. this assumes Java returns an integral result in r0
 321     // and a floating result in j_farg0
 322     __ ldr(j_rarg2, result);
 323     Label is_long, is_float, is_double, exit;
 324     __ ldr(j_rarg1, result_type);
 325     __ cmp(j_rarg1, T_OBJECT);
 326     __ br(Assembler::EQ, is_long);
 327     __ cmp(j_rarg1, T_LONG);
 328     __ br(Assembler::EQ, is_long);
 329     __ cmp(j_rarg1, T_FLOAT);
 330     __ br(Assembler::EQ, is_float);
 331     __ cmp(j_rarg1, T_DOUBLE);
 332     __ br(Assembler::EQ, is_double);
 333 
 334     // handle T_INT case
 335     __ strw(r0, Address(j_rarg2));
 336 
 337     __ BIND(exit);
 338 
 339     // pop parameters
 340     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 341 
 342 #ifdef ASSERT
 343     // verify that threads correspond
 344     {
 345       Label L, S;
 346       __ ldr(rscratch1, thread);
 347       __ cmp(rthread, rscratch1);
 348       __ br(Assembler::NE, S);
 349       __ get_thread(rscratch1);
 350       __ cmp(rthread, rscratch1);
 351       __ br(Assembler::EQ, L);
 352       __ BIND(S);
 353       __ stop("StubRoutines::call_stub: threads must correspond");
 354       __ BIND(L);
 355     }
 356 #endif
 357 
 358     // restore callee-save registers
 359     __ ldpd(v15, v14,  d15_save);
 360     __ ldpd(v13, v12,  d13_save);
 361     __ ldpd(v11, v10,  d11_save);
 362     __ ldpd(v9,  v8,   d9_save);
 363 
 364     __ ldp(r28, r27,   r28_save);
 365     __ ldp(r26, r25,   r26_save);
 366     __ ldp(r24, r23,   r24_save);
 367     __ ldp(r22, r21,   r22_save);
 368     __ ldp(r20, r19,   r20_save);
 369 
 370     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 371     __ ldrw(c_rarg2, result_type);
 372     __ ldr(c_rarg3,  method);
 373     __ ldp(c_rarg4, c_rarg5,  entry_point);
 374     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 375 
 376 #ifndef PRODUCT
 377     // tell the simulator we are about to end Java execution
 378     if (NotifySimulator) {
 379       __ notify(Assembler::method_exit);
 380     }
 381 #endif
 382     // leave frame and return to caller
 383     __ leave();
 384     __ ret(lr);
 385 
 386     // handle return types different from T_INT
 387 
 388     __ BIND(is_long);
 389     __ str(r0, Address(j_rarg2, 0));
 390     __ br(Assembler::AL, exit);
 391 
 392     __ BIND(is_float);
 393     __ strs(j_farg0, Address(j_rarg2, 0));
 394     __ br(Assembler::AL, exit);
 395 
 396     __ BIND(is_double);
 397     __ strd(j_farg0, Address(j_rarg2, 0));
 398     __ br(Assembler::AL, exit);
 399 
 400     return start;
 401   }
 402 
 403   // Return point for a Java call if there's an exception thrown in
 404   // Java code.  The exception is caught and transformed into a
 405   // pending exception stored in JavaThread that can be tested from
 406   // within the VM.
 407   //
 408   // Note: Usually the parameters are removed by the callee. In case
 409   // of an exception crossing an activation frame boundary, that is
 410   // not the case if the callee is compiled code => need to setup the
 411   // rsp.
 412   //
 413   // r0: exception oop
 414 
 415   // NOTE: this is used as a target from the signal handler so it
 416   // needs an x86 prolog which returns into the current simulator
 417   // executing the generated catch_exception code. so the prolog
 418   // needs to install rax in a sim register and adjust the sim's
 419   // restart pc to enter the generated code at the start position
 420   // then return from native to simulated execution.
 421 
 422   address generate_catch_exception() {
 423     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 424     address start = __ pc();
 425 
 426     // same as in generate_call_stub():
 427     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 428     const Address thread        (rfp, thread_off         * wordSize);
 429 
 430 #ifdef ASSERT
 431     // verify that threads correspond
 432     {
 433       Label L, S;
 434       __ ldr(rscratch1, thread);
 435       __ cmp(rthread, rscratch1);
 436       __ br(Assembler::NE, S);
 437       __ get_thread(rscratch1);
 438       __ cmp(rthread, rscratch1);
 439       __ br(Assembler::EQ, L);
 440       __ bind(S);
 441       __ stop("StubRoutines::catch_exception: threads must correspond");
 442       __ bind(L);
 443     }
 444 #endif
 445 
 446     // set pending exception
 447     __ verify_oop(r0);
 448 
 449     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 450     __ mov(rscratch1, (address)__FILE__);
 451     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 452     __ movw(rscratch1, (int)__LINE__);
 453     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 454 
 455     // complete return to VM
 456     assert(StubRoutines::_call_stub_return_address != NULL,
 457            "_call_stub_return_address must have been generated before");
 458     __ b(StubRoutines::_call_stub_return_address);
 459 
 460     return start;
 461   }
 462 
 463   // Continuation point for runtime calls returning with a pending
 464   // exception.  The pending exception check happened in the runtime
 465   // or native call stub.  The pending exception in Thread is
 466   // converted into a Java-level exception.
 467   //
 468   // Contract with Java-level exception handlers:
 469   // r0: exception
 470   // r3: throwing pc
 471   //
 472   // NOTE: At entry of this stub, exception-pc must be in LR !!
 473 
 474   // NOTE: this is always used as a jump target within generated code
 475   // so it just needs to be generated code with no x86 prolog
 476 
 477   address generate_forward_exception() {
 478     StubCodeMark mark(this, "StubRoutines", "forward exception");
 479     address start = __ pc();
 480 
 481     // Upon entry, LR points to the return address returning into
 482     // Java (interpreted or compiled) code; i.e., the return address
 483     // becomes the throwing pc.
 484     //
 485     // Arguments pushed before the runtime call are still on the stack
 486     // but the exception handler will reset the stack pointer ->
 487     // ignore them.  A potential result in registers can be ignored as
 488     // well.
 489 
 490 #ifdef ASSERT
 491     // make sure this code is only executed if there is a pending exception
 492     {
 493       Label L;
 494       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 495       __ cbnz(rscratch1, L);
 496       __ stop("StubRoutines::forward exception: no pending exception (1)");
 497       __ bind(L);
 498     }
 499 #endif
 500 
 501     // compute exception handler into r19
 502 
 503     // call the VM to find the handler address associated with the
 504     // caller address. pass thread in r0 and caller pc (ret address)
 505     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 506     // the stack.
 507     __ mov(c_rarg1, lr);
 508     // lr will be trashed by the VM call so we move it to R19
 509     // (callee-saved) because we also need to pass it to the handler
 510     // returned by this call.
 511     __ mov(r19, lr);
 512     BLOCK_COMMENT("call exception_handler_for_return_address");
 513     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 514                          SharedRuntime::exception_handler_for_return_address),
 515                     rthread, c_rarg1);
 516     // we should not really care that lr is no longer the callee
 517     // address. we saved the value the handler needs in r19 so we can
 518     // just copy it to r3. however, the C2 handler will push its own
 519     // frame and then call into the VM, and the VM code asserts that
 520     // the PC for the frame above the handler belongs to a compiled
 521     // Java method. So, we restore lr here to satisfy that assert.
 522     __ mov(lr, r19);
 523     // setup r0 & r3 & clear pending exception
 524     __ mov(r3, r19);
 525     __ mov(r19, r0);
 526     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 527     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 528 
 529 #ifdef ASSERT
 530     // make sure exception is set
 531     {
 532       Label L;
 533       __ cbnz(r0, L);
 534       __ stop("StubRoutines::forward exception: no pending exception (2)");
 535       __ bind(L);
 536     }
 537 #endif
 538 
 539     // continue at exception handler
 540     // r0: exception
 541     // r3: throwing pc
 542     // r19: exception handler
 543     __ verify_oop(r0);
 544     __ br(r19);
 545 
 546     return start;
 547   }
 548 
 549   // Non-destructive plausibility checks for oops
 550   //
 551   // Arguments:
 552   //    r0: oop to verify
 553   //    rscratch1: error message
 554   //
 555   // Stack after saving c_rarg3:
 556   //    [tos + 0]: saved c_rarg3
 557   //    [tos + 1]: saved c_rarg2
 558   //    [tos + 2]: saved lr
 559   //    [tos + 3]: saved rscratch2
 560   //    [tos + 4]: saved r0
 561   //    [tos + 5]: saved rscratch1
 562   address generate_verify_oop() {
 563 
 564     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 565     address start = __ pc();
 566 
 567     Label exit, error;
 568 
 569     // save c_rarg2 and c_rarg3
 570     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 571 
 572     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 573     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 574     __ ldr(c_rarg3, Address(c_rarg2));
 575     __ add(c_rarg3, c_rarg3, 1);
 576     __ str(c_rarg3, Address(c_rarg2));
 577 
 578     // object is in r0
 579     // make sure object is 'reasonable'
 580     __ cbz(r0, exit); // if obj is NULL it is OK
 581 
 582     // Check if the oop is in the right area of memory
 583     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 584     __ andr(c_rarg2, r0, c_rarg3);
 585     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 586 
 587     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 588     // instruction here because the flags register is live.
 589     __ eor(c_rarg2, c_rarg2, c_rarg3);
 590     __ cbnz(c_rarg2, error);
 591 
 592     // make sure klass is 'reasonable', i.e. not zero.
 593     __ load_klass(r0, r0);  // get klass
 594     __ cbz(r0, error);      // if klass is NULL it is broken
 595 
 596     // return if everything seems ok
 597     __ bind(exit);
 598 
 599     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 600     __ ret(lr);
 601 
 602     // handle errors
 603     __ bind(error);
 604     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 605 
 606     __ push(RegSet::range(r0, r29), sp);
 607     // debug(char* msg, int64_t pc, int64_t regs[])
 608     __ mov(c_rarg0, rscratch1);      // pass address of error message
 609     __ mov(c_rarg1, lr);             // pass return address
 610     __ mov(c_rarg2, sp);             // pass address of regs on stack
 611 #ifndef PRODUCT
 612     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 613 #endif
 614     BLOCK_COMMENT("call MacroAssembler::debug");
 615     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 616     __ blrt(rscratch1, 3, 0, 1);
 617 
 618     return start;
 619   }
 620 
 621   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 622 
 623   // The inner part of zero_words().  This is the bulk operation,
 624   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 625   // caller is responsible for zeroing the last few words.
 626   //
 627   // Inputs:
 628   // r10: the HeapWord-aligned base address of an array to zero.
 629   // r11: the count in HeapWords, r11 > 0.
 630   //
 631   // Returns r10 and r11, adjusted for the caller to clear.
 632   // r10: the base address of the tail of words left to clear.
 633   // r11: the number of words in the tail.
 634   //      r11 < MacroAssembler::zero_words_block_size.
 635 
 636   address generate_zero_blocks() {
 637     Label store_pair, loop_store_pair, done;
 638     Label base_aligned;
 639 
 640     Register base = r10, cnt = r11;
 641 
 642     __ align(CodeEntryAlignment);
 643     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 644     address start = __ pc();
 645 
 646     if (UseBlockZeroing) {
 647       int zva_length = VM_Version::zva_length();
 648 
 649       // Ensure ZVA length can be divided by 16. This is required by
 650       // the subsequent operations.
 651       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 652 
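           // if base is only 8-byte aligned, zero a single word so that the
           // block-zeroing code below sees a 16-byte aligned base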
 653       __ tbz(base, 3, base_aligned);
 654       __ str(zr, Address(__ post(base, 8)));
 655       __ sub(cnt, cnt, 1);
 656       __ bind(base_aligned);
 657 
 658       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 659       // alignment.
 660       Label small;
 661       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
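           // low_limit is in bytes but cnt is in (8-byte) heap words, hence
           // the shift right by 3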
 662       __ subs(rscratch1, cnt, low_limit >> 3);
 663       __ br(Assembler::LT, small);
 664       __ zero_dcache_blocks(base, cnt);
 665       __ bind(small);
 666     }
 667 
 668     {
 669       // Number of stp instructions we'll unroll
 670       const int unroll =
 671         MacroAssembler::zero_words_block_size / 2;
 672       // Clear the remaining blocks.
 673       Label loop;
 674       __ subs(cnt, cnt, unroll * 2);
 675       __ br(Assembler::LT, done);
 676       __ bind(loop);
 677       for (int i = 0; i < unroll; i++)
 678         __ stp(zr, zr, __ post(base, 16));
 679       __ subs(cnt, cnt, unroll * 2);
 680       __ br(Assembler::GE, loop);
 681       __ bind(done);
 682       __ add(cnt, cnt, unroll * 2);
 683     }
 684 
 685     __ ret(lr);
 686 
 687     return start;
 688   }
 689 
 690 
 691   typedef enum {
 692     copy_forwards = 1,
 693     copy_backwards = -1
 694   } copy_direction;
 695 
 696   // Bulk copy of blocks of 8 words.
 697   //
 698   // count is a count of words.
 699   //
 700   // Precondition: count >= 8
 701   //
 702   // Postconditions:
 703   //
 704   // The least significant bit of count contains the remaining count
 705   // of words to copy.  The rest of count is trash.
 706   //
 707   // s and d are adjusted to point to the remaining words to copy
 708   //
 709   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 710                            copy_direction direction) {
 711     int unit = wordSize * direction;
 712     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
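         // n.b. for a forwards copy, s and d are biased downwards by this
         // amount (see below) so that the pre-indexed access which ends each
         // 8-word block also steps the pointer on to the next block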
 713 
 714     int offset;
 715     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 716       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 717     const Register stride = r13;
 718 
 719     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 720     assert_different_registers(s, d, count, rscratch1);
 721 
 722     Label again, drain;
 723     const char *stub_name;
 724     if (direction == copy_forwards)
 725       stub_name = "forward_copy_longs";
 726     else
 727       stub_name = "backward_copy_longs";
 728     StubCodeMark mark(this, "StubRoutines", stub_name);
 729     __ align(CodeEntryAlignment);
 730     __ bind(start);
 731 
 732     Label unaligned_copy_long;
 733     if (AvoidUnalignedAccesses) {
 734       __ tbnz(d, 3, unaligned_copy_long);
 735     }
 736 
 737     if (direction == copy_forwards) {
 738       __ sub(s, s, bias);
 739       __ sub(d, d, bias);
 740     }
 741 
 742 #ifdef ASSERT
 743     // Make sure we are never given < 8 words
 744     {
 745       Label L;
 746       __ cmp(count, 8);
 747       __ br(Assembler::GE, L);
 748       __ stop("generate_copy_longs called with < 8 words");
 749       __ bind(L);
 750     }
 751 #endif
 752 
 753     // Fill 8 registers
 754     if (UseSIMDForMemoryOps) {
 755       __ ldpq(v0, v1, Address(s, 4 * unit));
 756       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 757     } else {
 758       __ ldp(t0, t1, Address(s, 2 * unit));
 759       __ ldp(t2, t3, Address(s, 4 * unit));
 760       __ ldp(t4, t5, Address(s, 6 * unit));
 761       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 762     }
 763 
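         // eight words are already in registers; subtract 16 so the main loop
         // below always has a full block loaded ahead.  If fewer than 16
         // words were requested we skip straight to the drain code and just
         // store what has been loaded.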
 764     __ subs(count, count, 16);
 765     __ br(Assembler::LO, drain);
 766 
 767     int prefetch = PrefetchCopyIntervalInBytes;
 768     bool use_stride = false;
 769     if (direction == copy_backwards) {
 770        use_stride = prefetch > 256;
 771        prefetch = -prefetch;
 772        if (use_stride) __ mov(stride, prefetch);
 773     }
 774 
 775     __ bind(again);
 776 
 777     if (PrefetchCopyIntervalInBytes > 0)
 778       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 779 
 780     if (UseSIMDForMemoryOps) {
 781       __ stpq(v0, v1, Address(d, 4 * unit));
 782       __ ldpq(v0, v1, Address(s, 4 * unit));
 783       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 784       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 785     } else {
 786       __ stp(t0, t1, Address(d, 2 * unit));
 787       __ ldp(t0, t1, Address(s, 2 * unit));
 788       __ stp(t2, t3, Address(d, 4 * unit));
 789       __ ldp(t2, t3, Address(s, 4 * unit));
 790       __ stp(t4, t5, Address(d, 6 * unit));
 791       __ ldp(t4, t5, Address(s, 6 * unit));
 792       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 793       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 794     }
 795 
 796     __ subs(count, count, 8);
 797     __ br(Assembler::HS, again);
 798 
 799     // Drain
 800     __ bind(drain);
 801     if (UseSIMDForMemoryOps) {
 802       __ stpq(v0, v1, Address(d, 4 * unit));
 803       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 804     } else {
 805       __ stp(t0, t1, Address(d, 2 * unit));
 806       __ stp(t2, t3, Address(d, 4 * unit));
 807       __ stp(t4, t5, Address(d, 6 * unit));
 808       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 809     }
 810 
 811     {
 812       Label L1, L2;
 813       __ tbz(count, exact_log2(4), L1);
 814       if (UseSIMDForMemoryOps) {
 815         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 816         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 817       } else {
 818         __ ldp(t0, t1, Address(s, 2 * unit));
 819         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 820         __ stp(t0, t1, Address(d, 2 * unit));
 821         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 822       }
 823       __ bind(L1);
 824 
 825       if (direction == copy_forwards) {
 826         __ add(s, s, bias);
 827         __ add(d, d, bias);
 828       }
 829 
 830       __ tbz(count, 1, L2);
 831       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 832       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 833       __ bind(L2);
 834     }
 835 
 836     __ ret(lr);
 837 
 838     if (AvoidUnalignedAccesses) {
 839       Label drain, again;
 840       // Register order for storing. Order is different for backward copy.
 841 
 842       __ bind(unaligned_copy_long);
 843 
 844       // source address is even aligned, target odd aligned (in units of 8-byte words)
 845       //
 846       // when forward copying word pairs we read long pairs at offsets
 847       // {0, 2, 4, 6} (in long words). when backwards copying we read
 848       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 849       // address by -2 in the forwards case so we can compute the
 850       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 851       // or -1.
 852       //
 853       // when forward copying we need to store 1 word, 3 pairs and
 854       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 855       // zero offset we adjust the destination by -1, which means we
 856       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 857       //
 858       // When backwards copying we need to store 1 word, 3 pairs and
 859       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 860       // offsets {1, 3, 5, 7, 8} * unit.
 861 
 862       if (direction == copy_forwards) {
 863         __ sub(s, s, 16);
 864         __ sub(d, d, 8);
 865       }
 866 
 867       // Fill 8 registers
 868       //
 869       // for forwards copy s was offset by -16 from the original input
 870       // value of s so the register contents are at these offsets
 871       // relative to the 64 byte block addressed by that original input
 872       // and so on for each successive 64 byte block when s is updated
 873       //
 874       // t0 at offset 0,  t1 at offset 8
 875       // t2 at offset 16, t3 at offset 24
 876       // t4 at offset 32, t5 at offset 40
 877       // t6 at offset 48, t7 at offset 56
 878 
 879       // for backwards copy s was not offset so the register contents
 880       // are at these offsets into the preceding 64 byte block
 881       // relative to that original input and so on for each successive
 882       // preceding 64 byte block when s is updated. this explains the
 883       // slightly counter-intuitive looking pattern of register usage
 884       // in the stp instructions for backwards copy.
 885       //
 886       // t0 at offset -16, t1 at offset -8
 887       // t2 at offset -32, t3 at offset -24
 888       // t4 at offset -48, t5 at offset -40
 889       // t6 at offset -64, t7 at offset -56
 890 
 891       __ ldp(t0, t1, Address(s, 2 * unit));
 892       __ ldp(t2, t3, Address(s, 4 * unit));
 893       __ ldp(t4, t5, Address(s, 6 * unit));
 894       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 895 
 896       __ subs(count, count, 16);
 897       __ br(Assembler::LO, drain);
 898 
 899       int prefetch = PrefetchCopyIntervalInBytes;
 900       bool use_stride = false;
 901       if (direction == copy_backwards) {
 902          use_stride = prefetch > 256;
 903          prefetch = -prefetch;
 904          if (use_stride) __ mov(stride, prefetch);
 905       }
 906 
 907       __ bind(again);
 908 
 909       if (PrefetchCopyIntervalInBytes > 0)
 910         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 911 
 912       if (direction == copy_forwards) {
 913        // allowing for the offset of -8 the store instructions place
 914        // registers into the target 64 byte block at the following
 915        // offsets
 916        //
 917        // t0 at offset 0
 918        // t1 at offset 8,  t2 at offset 16
 919        // t3 at offset 24, t4 at offset 32
 920        // t5 at offset 40, t6 at offset 48
 921        // t7 at offset 56
 922 
 923         __ str(t0, Address(d, 1 * unit));
 924         __ stp(t1, t2, Address(d, 2 * unit));
 925         __ ldp(t0, t1, Address(s, 2 * unit));
 926         __ stp(t3, t4, Address(d, 4 * unit));
 927         __ ldp(t2, t3, Address(s, 4 * unit));
 928         __ stp(t5, t6, Address(d, 6 * unit));
 929         __ ldp(t4, t5, Address(s, 6 * unit));
 930         __ str(t7, Address(__ pre(d, 8 * unit)));
 931         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 932       } else {
 933        // d was not offset when we started so the registers are
 934        // written into the 64 byte block preceding d with the following
 935        // offsets
 936        //
 937        // t1 at offset -8
 938        // t3 at offset -24, t0 at offset -16
 939        // t5 at offset -40, t2 at offset -32
 940        // t7 at offset -56, t4 at offset -48
 941        //                   t6 at offset -64
 942        //
 943        // note that this matches the offsets previously noted for the
 944        // loads
 945 
 946         __ str(t1, Address(d, 1 * unit));
 947         __ stp(t3, t0, Address(d, 3 * unit));
 948         __ ldp(t0, t1, Address(s, 2 * unit));
 949         __ stp(t5, t2, Address(d, 5 * unit));
 950         __ ldp(t2, t3, Address(s, 4 * unit));
 951         __ stp(t7, t4, Address(d, 7 * unit));
 952         __ ldp(t4, t5, Address(s, 6 * unit));
 953         __ str(t6, Address(__ pre(d, 8 * unit)));
 954         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 955       }
 956 
 957       __ subs(count, count, 8);
 958       __ br(Assembler::HS, again);
 959 
 960       // Drain
 961       //
 962       // this uses the same pattern of offsets and register arguments
 963       // as above
 964       __ bind(drain);
 965       if (direction == copy_forwards) {
 966         __ str(t0, Address(d, 1 * unit));
 967         __ stp(t1, t2, Address(d, 2 * unit));
 968         __ stp(t3, t4, Address(d, 4 * unit));
 969         __ stp(t5, t6, Address(d, 6 * unit));
 970         __ str(t7, Address(__ pre(d, 8 * unit)));
 971       } else {
 972         __ str(t1, Address(d, 1 * unit));
 973         __ stp(t3, t0, Address(d, 3 * unit));
 974         __ stp(t5, t2, Address(d, 5 * unit));
 975         __ stp(t7, t4, Address(d, 7 * unit));
 976         __ str(t6, Address(__ pre(d, 8 * unit)));
 977       }
 978       // now we need to copy any remaining part block which may
 979       // include a 4 word subblock and/or a 2 word subblock.
 980       // bits 2 and 1 in the count are the tell-tale for whether we
 981       // have each such subblock
 982       {
 983         Label L1, L2;
 984         __ tbz(count, exact_log2(4), L1);
 985        // this is the same as above but copying only 4 longs hence
 986        // with only one intervening stp between the str instructions
 987        // but note that the offsets and registers still follow the
 988        // same pattern
 989         __ ldp(t0, t1, Address(s, 2 * unit));
 990         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 991         if (direction == copy_forwards) {
 992           __ str(t0, Address(d, 1 * unit));
 993           __ stp(t1, t2, Address(d, 2 * unit));
 994           __ str(t3, Address(__ pre(d, 4 * unit)));
 995         } else {
 996           __ str(t1, Address(d, 1 * unit));
 997           __ stp(t3, t0, Address(d, 3 * unit));
 998           __ str(t2, Address(__ pre(d, 4 * unit)));
 999         }
1000         __ bind(L1);
1001 
1002         __ tbz(count, 1, L2);
1003        // this is the same as above but copying only 2 longs hence
1004        // there is no intervening stp between the str instructions
1005        // but note that the offset and register patterns are still
1006        // the same
1007         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1008         if (direction == copy_forwards) {
1009           __ str(t0, Address(d, 1 * unit));
1010           __ str(t1, Address(__ pre(d, 2 * unit)));
1011         } else {
1012           __ str(t1, Address(d, 1 * unit));
1013           __ str(t0, Address(__ pre(d, 2 * unit)));
1014         }
1015         __ bind(L2);
1016 
1017        // for forwards copy we need to re-adjust the offsets we
1018        // applied so that s and d follow the last words written
1019 
1020        if (direction == copy_forwards) {
1021          __ add(s, s, 16);
1022          __ add(d, d, 8);
1023        }
1024 
1025       }
1026 
1027       __ ret(lr);
1028     }
1029   }
1030 
1031   // Small copy: less than 16 bytes.
1032   //
1033   // NB: Ignores all of the bits of count which represent more than 15
1034   // bytes, so a caller doesn't have to mask them.
1035 
1036   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1037     bool is_backwards = step < 0;
1038     size_t granularity = uabs(step);
1039     int direction = is_backwards ? -1 : 1;
1040     int unit = wordSize * direction;
1041 
1042     Label Lpair, Lword, Lint, Lshort, Lbyte;
1043 
1044     assert(granularity
1045            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1046 
1047     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1048 
1049     // ??? I don't know if this bit-test-and-branch is the right thing
1050     // to do.  It does a lot of jumping, resulting in several
1051     // mispredicted branches.  It might make more sense to do this
1052     // with something like Duff's device with a single computed branch.
1053 
1054     __ tbz(count, 3 - exact_log2(granularity), Lword);
1055     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1056     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1057     __ bind(Lword);
1058 
1059     if (granularity <= sizeof (jint)) {
1060       __ tbz(count, 2 - exact_log2(granularity), Lint);
1061       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1062       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1063       __ bind(Lint);
1064     }
1065 
1066     if (granularity <= sizeof (jshort)) {
1067       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1068       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1069       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1070       __ bind(Lshort);
1071     }
1072 
1073     if (granularity <= sizeof (jbyte)) {
1074       __ tbz(count, 0, Lbyte);
1075       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1076       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1077       __ bind(Lbyte);
1078     }
1079   }
1080 
1081   Label copy_f, copy_b;
1082 
1083   // All-singing all-dancing memory copy.
1084   //
1085   // Copy count units of memory from s to d.  The size of a unit is
1086   // step, which can be positive or negative depending on the direction
1087   // of copy.  If is_aligned is false, we align the source address.
1088   //
1089 
1090   void copy_memory(bool is_aligned, Register s, Register d,
1091                    Register count, Register tmp, int step) {
1092     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1093     bool is_backwards = step < 0;
1094     int granularity = uabs(step);
1095     const Register t0 = r3, t1 = r4;
1096 
1097     // Copy <= 96 bytes inline. Direction doesn't matter because we always
1098     // load all the data before writing anything.
1099     Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
1100     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1101     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1102     const Register send = r17, dend = r18;
1103 
1104     if (PrefetchCopyIntervalInBytes > 0)
1105       __ prfm(Address(s, 0), PLDL1KEEP);
1106     __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
1107     __ br(Assembler::HI, copy_big);
1108 
1109     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1110     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
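         // send and dend point just past the last source and destination
         // bytes to be copied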
1111 
1112     __ cmp(count, 16/granularity);
1113     __ br(Assembler::LS, copy16);
1114 
1115     __ cmp(count, 64/granularity);
1116     __ br(Assembler::HI, copy80);
1117 
1118     __ cmp(count, 32/granularity);
1119     __ br(Assembler::LS, copy32);
1120 
1121     // 33..64 bytes
1122     if (UseSIMDForMemoryOps) {
1123       __ ldpq(v0, v1, Address(s, 0));
1124       __ ldpq(v2, v3, Address(send, -32));
1125       __ stpq(v0, v1, Address(d, 0));
1126       __ stpq(v2, v3, Address(dend, -32));
1127     } else {
1128       __ ldp(t0, t1, Address(s, 0));
1129       __ ldp(t2, t3, Address(s, 16));
1130       __ ldp(t4, t5, Address(send, -32));
1131       __ ldp(t6, t7, Address(send, -16));
1132 
1133       __ stp(t0, t1, Address(d, 0));
1134       __ stp(t2, t3, Address(d, 16));
1135       __ stp(t4, t5, Address(dend, -32));
1136       __ stp(t6, t7, Address(dend, -16));
1137     }
1138     __ b(finish);
1139 
1140     // 17..32 bytes
1141     __ bind(copy32);
1142     __ ldp(t0, t1, Address(s, 0));
1143     __ ldp(t2, t3, Address(send, -16));
1144     __ stp(t0, t1, Address(d, 0));
1145     __ stp(t2, t3, Address(dend, -16));
1146     __ b(finish);
1147 
1148     // 65..80/96 bytes
1149     // (96 bytes if SIMD because we do 32 bytes per instruction)
1150     __ bind(copy80);
1151     if (UseSIMDForMemoryOps) {
1152       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1153       __ ldpq(v4, v5, Address(send, -32));
1154       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1155       __ stpq(v4, v5, Address(dend, -32));
1156     } else {
1157       __ ldp(t0, t1, Address(s, 0));
1158       __ ldp(t2, t3, Address(s, 16));
1159       __ ldp(t4, t5, Address(s, 32));
1160       __ ldp(t6, t7, Address(s, 48));
1161       __ ldp(t8, t9, Address(send, -16));
1162 
1163       __ stp(t0, t1, Address(d, 0));
1164       __ stp(t2, t3, Address(d, 16));
1165       __ stp(t4, t5, Address(d, 32));
1166       __ stp(t6, t7, Address(d, 48));
1167       __ stp(t8, t9, Address(dend, -16));
1168     }
1169     __ b(finish);
1170 
1171     // 0..16 bytes
1172     __ bind(copy16);
1173     __ cmp(count, 8/granularity);
1174     __ br(Assembler::LO, copy8);
1175 
1176     // 8..16 bytes
1177     __ ldr(t0, Address(s, 0));
1178     __ ldr(t1, Address(send, -8));
1179     __ str(t0, Address(d, 0));
1180     __ str(t1, Address(dend, -8));
1181     __ b(finish);
1182 
1183     if (granularity < 8) {
1184       // 4..7 bytes
1185       __ bind(copy8);
1186       __ tbz(count, 2 - exact_log2(granularity), copy4);
1187       __ ldrw(t0, Address(s, 0));
1188       __ ldrw(t1, Address(send, -4));
1189       __ strw(t0, Address(d, 0));
1190       __ strw(t1, Address(dend, -4));
1191       __ b(finish);
1192       if (granularity < 4) {
1193         // 0..3 bytes
1194         __ bind(copy4);
1195         __ cbz(count, finish); // get rid of 0 case
1196         if (granularity == 2) {
1197           __ ldrh(t0, Address(s, 0));
1198           __ strh(t0, Address(d, 0));
1199         } else { // granularity == 1
1200           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1201           // the first and last byte.
1202           // Handle the 3 byte case by loading and storing base + count/2
1203           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1204           // This does mean that in the 1 byte case we load/store the same
1205           // byte 3 times.
1206           __ lsr(count, count, 1);
1207           __ ldrb(t0, Address(s, 0));
1208           __ ldrb(t1, Address(send, -1));
1209           __ ldrb(t2, Address(s, count));
1210           __ strb(t0, Address(d, 0));
1211           __ strb(t1, Address(dend, -1));
1212           __ strb(t2, Address(d, count));
1213         }
1214         __ b(finish);
1215       }
1216     }
1217 
1218     __ bind(copy_big);
1219     if (is_backwards) {
1220       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1221       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1222     }
1223 
1224     // Now that we've got the small case out of the way, we can align
1225     // the source address on a 2-word boundary.
1226 
1227     Label aligned;
1228 
1229     if (is_aligned) {
1230       // We may have to adjust by 1 word to get s 2-word-aligned.
1231       __ tbz(s, exact_log2(wordSize), aligned);
1232       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1233       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1234       __ sub(count, count, wordSize/granularity);
1235     } else {
1236       if (is_backwards) {
1237         __ andr(rscratch2, s, 2 * wordSize - 1);
1238       } else {
1239         __ neg(rscratch2, s);
1240         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1241       }
1242       // rscratch2 is the byte adjustment needed to align s.
1243       __ cbz(rscratch2, aligned);
1244       int shift = exact_log2(granularity);
1245       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1246       __ sub(count, count, rscratch2);
1247 
1248 #if 0
1249       // ?? This code is only correct for a disjoint copy.  It may or
1250       // may not make sense to use it in that case.
1251 
1252       // Copy the first pair; s and d may not be aligned.
1253       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1254       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1255 
1256       // Align s and d, adjust count
1257       if (is_backwards) {
1258         __ sub(s, s, rscratch2);
1259         __ sub(d, d, rscratch2);
1260       } else {
1261         __ add(s, s, rscratch2);
1262         __ add(d, d, rscratch2);
1263       }
1264 #else
1265       copy_memory_small(s, d, rscratch2, rscratch1, step);
1266 #endif
1267     }
1268 
1269     __ bind(aligned);
1270 
1271     // s is now 2-word-aligned.
1272 
1273     // We have a count of units and some trailing bytes.  Adjust the
1274     // count and do a bulk copy of words.
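         // (the bulk copy stubs copy_f and copy_b take their word count in
         // rscratch2)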
1275     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1276     if (direction == copy_forwards)
1277       __ bl(copy_f);
1278     else
1279       __ bl(copy_b);
1280 
1281     // And the tail.
1282     copy_memory_small(s, d, count, tmp, step);
1283 
1284     if (granularity >= 8) __ bind(copy8);
1285     if (granularity >= 4) __ bind(copy4);
1286     __ bind(finish);
1287   }
1288 
1289 
1290   void clobber_registers() {
1291 #ifdef ASSERT
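         // fill the killed registers with an easily recognized poison value,
         // 0xdeadbeefdeadbeef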
1292     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1293     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1294     for (Register r = r3; r <= r18; r++)
1295       if (r != rscratch1) __ mov(r, rscratch1);
1296 #endif
1297   }
1298 
1299   // Scan over array at a for count oops, verifying each one.
1300   // Preserves a and count, clobbers rscratch1 and rscratch2.
1301   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1302     Label loop, end;
1303     __ mov(rscratch1, a);
1304     __ mov(rscratch2, zr);
1305     __ bind(loop);
1306     __ cmp(rscratch2, count);
1307     __ br(Assembler::HS, end);
1308     if (size == (size_t)wordSize) {
1309       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1310       __ verify_oop(temp);
1311     } else {
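           // n.b. temp is r16 at every call site, so the narrow oop loaded
           // into r16 below is decoded and verified in place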
1312       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1313       __ decode_heap_oop(temp); // calls verify_oop
1314     }
1315     __ add(rscratch2, rscratch2, size);
1316     __ b(loop);
1317     __ bind(end);
1318   }
1319 
1320   // Arguments:
1321   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1322   //             ignored
1323   //   is_oop  - true => oop array, so generate store check code
1324   //   name    - stub name string
1325   //
1326   // Inputs:
1327   //   c_rarg0   - source array address
1328   //   c_rarg1   - destination array address
1329   //   c_rarg2   - element count, treated as ssize_t, can be zero
1330   //
1331   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1332   // the hardware handle it.  The two dwords within qwords that span
1333   // cache line boundaries will still be loaded and stored atomically.
1334   //
1335   // Side Effects:
1336   //   disjoint_int_copy_entry is set to the no-overlap entry point
1337   //   used by generate_conjoint_int_oop_copy().
1338   //
1339   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1340                                   const char *name, bool dest_uninitialized = false) {
1341     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1342     RegSet saved_reg = RegSet::of(s, d, count);
1343     __ align(CodeEntryAlignment);
1344     StubCodeMark mark(this, "StubRoutines", name);
1345     address start = __ pc();
1346     __ enter();
1347 
1348     if (entry != NULL) {
1349       *entry = __ pc();
1350       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1351       BLOCK_COMMENT("Entry:");
1352     }
1353 
1354     BarrierSetCodeGen *bs = Universe::heap()->barrier_set()->code_gen();
1355     DecoratorSet decorators = ARRAYCOPY_DISJOINT;
1356     if (dest_uninitialized) {
1357       decorators |= AS_DEST_NOT_INITIALIZED;
1358     }
1359     if (aligned) {
1360       decorators |= ARRAYCOPY_ALIGNED;
1361     }
1362 
1363     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg);
1364 
1365     if (is_oop) {
1366       // save regs before copy_memory
1367       __ push(RegSet::of(d, count), sp);
1368     }
1369     copy_memory(aligned, s, d, count, rscratch1, size);
1370 
1371     if (is_oop) {
1372       __ pop(RegSet::of(d, count), sp);
1373       if (VerifyOops)
1374         verify_oop_array(size, d, count, r16);
1375       __ sub(count, count, 1); // make an inclusive end pointer
1376       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1377     }
1378 
1379     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1380 
1381     __ leave();
1382     __ mov(r0, zr); // return 0
1383     __ ret(lr);
1384 #ifdef BUILTIN_SIM
1385     {
1386       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1387       sim->notifyCompile(const_cast<char*>(name), start);
1388     }
1389 #endif
1390     return start;
1391   }
1392 
1393   // Arguments:
1394   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1395   //             ignored
1396   //   is_oop  - true => oop array, so generate store check code
1397   //   name    - stub name string
1398   //
1399   // Inputs:
1400   //   c_rarg0   - source array address
1401   //   c_rarg1   - destination array address
1402   //   c_rarg2   - element count, treated as ssize_t, can be zero
1403   //
1404   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1405   // the hardware handle it.  The two dwords within qwords that span
1406   // cache line boundaries will still be loaded and stored atomically.
1407   //
1408   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1409                                  address *entry, const char *name,
1410                                  bool dest_uninitialized = false) {
1411     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1412     RegSet saved_regs = RegSet::of(s, d, count);
1413     StubCodeMark mark(this, "StubRoutines", name);
1414     address start = __ pc();
1415     __ enter();
1416 
1417     if (entry != NULL) {
1418       *entry = __ pc();
1419       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1420       BLOCK_COMMENT("Entry:");
1421     }
1422 
1423     // use fwd copy when (d-s) above_equal (count*size)
1424     __ sub(rscratch1, d, s);
1425     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1426     __ br(Assembler::HS, nooverlap_target);
1427 
1428     BarrierSetCodeGen *bs = Universe::heap()->barrier_set()->code_gen();
1429     DecoratorSet decorators = 0;
1430     if (dest_uninitialized) {
1431       decorators |= AS_DEST_NOT_INITIALIZED;
1432     }
1433     if (aligned) {
1434       decorators |= ARRAYCOPY_ALIGNED;
1435     }
1436     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs);
1437 
1438     if (is_oop) {
1439       // save regs before copy_memory
1440       __ push(RegSet::of(d, count), sp);
1441     }
1442     copy_memory(aligned, s, d, count, rscratch1, -size);
1443     if (is_oop) {
1444       __ pop(RegSet::of(d, count), sp);
1445       if (VerifyOops)
1446         verify_oop_array(size, d, count, r16);
1447       __ sub(count, count, 1); // make an inclusive end pointer
1448       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1449     }
1450     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1451     __ leave();
1452     __ mov(r0, zr); // return 0
1453     __ ret(lr);
1454 #ifdef BUILTIN_SIM
1455     {
1456       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1457       sim->notifyCompile(const_cast<char*>(name), start);
1458     }
1459 #endif
1460     return start;
1461   }
1462 
1463   // Arguments:
1464   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1465   //             ignored
1466   //   name    - stub name string
1467   //
1468   // Inputs:
1469   //   c_rarg0   - source array address
1470   //   c_rarg1   - destination array address
1471   //   c_rarg2   - element count, treated as ssize_t, can be zero
1472   //
1473   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1474   // we let the hardware handle it.  The one to eight bytes within words,
1475   // dwords or qwords that span cache line boundaries will still be loaded
1476   // and stored atomically.
1477   //
1478   // Side Effects:
1486   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1487   //   used by generate_conjoint_byte_copy().
1488   //
1489   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1490     const bool not_oop = false;
1491     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1492   }
1493 
1494   // Arguments:
1495   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1496   //             ignored
1497   //   name    - stub name string
1498   //
1499   // Inputs:
1500   //   c_rarg0   - source array address
1501   //   c_rarg1   - destination array address
1502   //   c_rarg2   - element count, treated as ssize_t, can be zero
1503   //
1504   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1505   // we let the hardware handle it.  The one to eight bytes within words,
1506   // dwords or qwords that span cache line boundaries will still be loaded
1507   // and stored atomically.
1508   //
1509   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1510                                       address* entry, const char *name) {
1511     const bool not_oop = false;
1512     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1513   }
1514 
1515   // Arguments:
1516   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1517   //             ignored
1518   //   name    - stub name string
1519   //
1520   // Inputs:
1521   //   c_rarg0   - source array address
1522   //   c_rarg1   - destination array address
1523   //   c_rarg2   - element count, treated as ssize_t, can be zero
1524   //
1525   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1526   // let the hardware handle it.  The two or four words within dwords
1527   // or qwords that span cache line boundaries will still be loaded
1528   // and stored atomically.
1529   //
1530   // Side Effects:
1531   //   disjoint_short_copy_entry is set to the no-overlap entry point
1532   //   used by generate_conjoint_short_copy().
1533   //
1534   address generate_disjoint_short_copy(bool aligned,
1535                                        address* entry, const char *name) {
1536     const bool not_oop = false;
1537     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1538   }
1539 
1540   // Arguments:
1541   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1542   //             ignored
1543   //   name    - stub name string
1544   //
1545   // Inputs:
1546   //   c_rarg0   - source array address
1547   //   c_rarg1   - destination array address
1548   //   c_rarg2   - element count, treated as ssize_t, can be zero
1549   //
1550   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1551   // let the hardware handle it.  The two or four words within dwords
1552   // or qwords that span cache line boundaries will still be loaded
1553   // and stored atomically.
1554   //
1555   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1556                                        address *entry, const char *name) {
1557     const bool not_oop = false;
1558     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1559   }
1560
1561   // Arguments:
1562   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1563   //             ignored
1564   //   name    - stub name string
1565   //
1566   // Inputs:
1567   //   c_rarg0   - source array address
1568   //   c_rarg1   - destination array address
1569   //   c_rarg2   - element count, treated as ssize_t, can be zero
1570   //
1571   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1572   // the hardware handle it.  The two dwords within qwords that span
1573   // cache line boundaries will still be loaded and stored atomically.
1574   //
1575   // Side Effects:
1576   //   disjoint_int_copy_entry is set to the no-overlap entry point
1577   //   used by generate_conjoint_int_copy().
1578   //
1579   address generate_disjoint_int_copy(bool aligned, address *entry,
1580                                          const char *name, bool dest_uninitialized = false) {
1581     const bool not_oop = false;
1582     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1583   }
1584 
1585   // Arguments:
1586   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1587   //             ignored
1588   //   name    - stub name string
1589   //
1590   // Inputs:
1591   //   c_rarg0   - source array address
1592   //   c_rarg1   - destination array address
1593   //   c_rarg2   - element count, treated as ssize_t, can be zero
1594   //
1595   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1596   // the hardware handle it.  The two dwords within qwords that span
1597   // cache line boundaries will still be loaded and stored atomically.
1598   //
1599   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1600                                      address *entry, const char *name,
1601                                      bool dest_uninitialized = false) {
1602     const bool not_oop = false;
1603     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1604   }
1605 
1606 
1607   // Arguments:
1608   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1609   //             ignored
1610   //   name    - stub name string
1611   //
1612   // Inputs:
1613   //   c_rarg0   - source array address
1614   //   c_rarg1   - destination array address
1615   //   c_rarg2   - element count, treated as size_t, can be zero
1616   //
1617   // Side Effects:
1618   //   disjoint_long_copy_entry is set to the no-overlap entry point
1619   //   used by generate_conjoint_long_copy().
1620   //
1621   address generate_disjoint_long_copy(bool aligned, address *entry,
1622                                           const char *name, bool dest_uninitialized = false) {
1623     const bool not_oop = false;
1624     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1625   }
1626 
1627   // Arguments:
1628   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1629   //             ignored
1630   //   name    - stub name string
1631   //
1632   // Inputs:
1633   //   c_rarg0   - source array address
1634   //   c_rarg1   - destination array address
1635   //   c_rarg2   - element count, treated as size_t, can be zero
1636   //
1637   address generate_conjoint_long_copy(bool aligned,
1638                                       address nooverlap_target, address *entry,
1639                                       const char *name, bool dest_uninitialized = false) {
1640     const bool not_oop = false;
1641     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1642   }
1643 
1644   // Arguments:
1645   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1646   //             ignored
1647   //   name    - stub name string
1648   //
1649   // Inputs:
1650   //   c_rarg0   - source array address
1651   //   c_rarg1   - destination array address
1652   //   c_rarg2   - element count, treated as size_t, can be zero
1653   //
1654   // Side Effects:
1655   //   disjoint_oop_copy_entry is set to the no-overlap entry point
1656   //   used by generate_conjoint_oop_copy().
1657   //
1658   address generate_disjoint_oop_copy(bool aligned, address *entry,
1659                                      const char *name, bool dest_uninitialized) {
1660     const bool is_oop = true;
1661     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1662     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1663   }
1664 
1665   // Arguments:
1666   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1667   //             ignored
1668   //   name    - stub name string
1669   //
1670   // Inputs:
1671   //   c_rarg0   - source array address
1672   //   c_rarg1   - destination array address
1673   //   c_rarg2   - element count, treated as size_t, can be zero
1674   //
1675   address generate_conjoint_oop_copy(bool aligned,
1676                                      address nooverlap_target, address *entry,
1677                                      const char *name, bool dest_uninitialized) {
1678     const bool is_oop = true;
1679     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1680     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1681                                   name, dest_uninitialized);
1682   }
1683 
1684 
1685   // Helper for generating a dynamic type check.
1686   // Smashes rscratch1.
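       // In rough pseudo-C the fast path below is (a simplified sketch that
       // ignores the secondary-supers cache details):
       //
       //   if (*(Klass**)((address)sub_klass + super_check_offset) == super_klass)
       //     goto L_success;
       //
       // otherwise the slow path scans sub_klass's secondary supers and branches
       // to L_success on a match; on failure we fall through to L_miss.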
1687   void generate_type_check(Register sub_klass,
1688                            Register super_check_offset,
1689                            Register super_klass,
1690                            Label& L_success) {
1691     assert_different_registers(sub_klass, super_check_offset, super_klass);
1692 
1693     BLOCK_COMMENT("type_check:");
1694 
1695     Label L_miss;
1696 
1697     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1698                                      super_check_offset);
1699     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1700 
1701     // Fall through on failure!
1702     __ BIND(L_miss);
1703   }
1704 
1705   //
1706   //  Generate checkcasting array copy stub
1707   //
1708   //  Input:
1709   //    c_rarg0   - source array address
1710   //    c_rarg1   - destination array address
1711   //    c_rarg2   - element count, treated as ssize_t, can be zero
1712   //    c_rarg3   - size_t ckoff (super_check_offset)
1713   //    c_rarg4   - oop ckval (super_klass)
1714   //
1715   //  Output:
1716   //    r0 ==  0  -  success
1717   //    r0 == -1^K - failure, where K is partial transfer count
1718   //
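       //  For example, if 3 of 10 elements had been copied when a failing
       //  element was reached, the stub returns -1^3 == ~3 == -4, and the
       //  caller recovers the partial transfer count as K = ~r0.
       //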
1719   address generate_checkcast_copy(const char *name, address *entry,
1720                                   bool dest_uninitialized = false) {
1721 
1722     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1723 
1724     // Input registers (after setup_arg_regs)
1725     const Register from        = c_rarg0;   // source array address
1726     const Register to          = c_rarg1;   // destination array address
1727     const Register count       = c_rarg2;   // elements count
1728     const Register ckoff       = c_rarg3;   // super_check_offset
1729     const Register ckval       = c_rarg4;   // super_klass
1730 
1731     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1732     RegSet wb_post_saved_regs = RegSet::of(count);
1733 
1734     // Registers used as temps (r18, r19, r20 are save-on-entry)
1735     const Register count_save  = r21;       // orig elements count
1736     const Register start_to    = r20;       // destination array start address
1737     const Register copied_oop  = r18;       // actual oop copied
1738     const Register r19_klass   = r19;       // oop._klass
1739 
1740     //---------------------------------------------------------------
1741     // Assembler stub will be used for this call to arraycopy
1742     // if the two arrays are subtypes of Object[] but the
1743     // destination array type is not equal to or a supertype
1744     // of the source type.  Each element must be separately
1745     // checked.
1746 
1747     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1748                                copied_oop, r19_klass, count_save);
1749 
1750     __ align(CodeEntryAlignment);
1751     StubCodeMark mark(this, "StubRoutines", name);
1752     address start = __ pc();
1753 
1754     __ enter(); // required for proper stackwalking of RuntimeStub frame
1755 
1756 #ifdef ASSERT
1757     // caller guarantees that the arrays really are different
1758     // otherwise, we would have to make conjoint checks
1759     { Label L;
1760       array_overlap_test(L, TIMES_OOP);
1761       __ stop("checkcast_copy within a single array");
1762       __ bind(L);
1763     }
1764 #endif //ASSERT
1765 
1766     // Caller of this entry point must set up the argument registers.
1767     if (entry != NULL) {
1768       *entry = __ pc();
1769       BLOCK_COMMENT("Entry:");
1770     }
1771 
1772      // Empty array:  Nothing to do.
1773     __ cbz(count, L_done);
1774 
1775     __ push(RegSet::of(r18, r19, r20, r21), sp);
1776 
1777 #ifdef ASSERT
1778     BLOCK_COMMENT("assert consistent ckoff/ckval");
1779     // The ckoff and ckval must be mutually consistent,
1780     // even though caller generates both.
1781     { Label L;
1782       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1783       __ ldrw(start_to, Address(ckval, sco_offset));
1784       __ cmpw(ckoff, start_to);
1785       __ br(Assembler::EQ, L);
1786       __ stop("super_check_offset inconsistent");
1787       __ bind(L);
1788     }
1789 #endif //ASSERT
1790 
1791     BarrierSetCodeGen *bs = Universe::heap()->barrier_set()->code_gen();
1792     DecoratorSet decorators = ARRAYCOPY_CHECKCAST;
1793     bool is_oop = true;
1794     if (dest_uninitialized) {
1795       decorators |= AS_DEST_NOT_INITIALIZED;
1796     }
1797 
1798     bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs);
1799 
1800     // save the original count
1801     __ mov(count_save, count);
1802 
1803     // Copy from low to high addresses
1804     __ mov(start_to, to);              // Save destination array start address
1805     __ b(L_load_element);
1806 
1807     // ======== begin loop ========
1808     // (Loop is rotated; its entry is L_load_element.)
1809     // Loop control:
1810     //   for (; count != 0; count--) {
1811     //     copied_oop = load_heap_oop(from++);
1812     //     ... generate_type_check ...;
1813     //     store_heap_oop(to++, copied_oop);
1814     //   }
1815     __ align(OptoLoopAlignment);
1816 
1817     __ BIND(L_store_element);
1818     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1819     __ sub(count, count, 1);
1820     __ cbz(count, L_do_card_marks);
1821 
1822     // ======== loop entry is here ========
1823     __ BIND(L_load_element);
1824     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1825     __ cbz(copied_oop, L_store_element);
1826 
1827     __ load_klass(r19_klass, copied_oop);// query the object klass
1828     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1829     // ======== end loop ========
1830 
1831     // It was a real error; we must depend on the caller to finish the job.
1832     // Register count = remaining oops, count_save = total oops.
1833     // Emit GC store barriers for the oops we have copied and report
1834     // their number to the caller.
1835 
1836     __ subs(count, count_save, count);     // K = partially copied oop count
1837     __ eon(count, count, zr);                   // report (-1^K) to caller
1838     __ br(Assembler::EQ, L_done_pop);
1839 
1840     __ BIND(L_do_card_marks);
1841     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1842     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1, wb_post_saved_regs);
1843 
1844     __ bind(L_done_pop);
1845     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1846     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1847 
1848     __ bind(L_done);
1849     __ mov(r0, count);
1850     __ leave();
1851     __ ret(lr);
1852 
1853     return start;
1854   }
1855 
1856   // Perform range checks on the proposed arraycopy.
1857   // Kills temp, but nothing else.
1858   // Also, clean the sign bits of src_pos and dst_pos.
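       // In C terms the checks below are roughly (sketch only):
       //
       //   if ((juint)(src_pos + length) > (juint)arrayOop(src)->length())  goto L_failed;
       //   if ((juint)(dst_pos + length) > (juint)arrayOop(dst)->length())  goto L_failed;
       //   src_pos = (juint)src_pos;  dst_pos = (juint)dst_pos;     // clear the high 32 bits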
1859   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1860                               Register src_pos, // source position (c_rarg1)
1861                               Register dst,     // destination array oop (c_rarg2)
1862                               Register dst_pos, // destination position (c_rarg3)
1863                               Register length,
1864                               Register temp,
1865                               Label& L_failed) {
1866     BLOCK_COMMENT("arraycopy_range_checks:");
1867 
1868     assert_different_registers(rscratch1, temp);
1869 
1870     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1871     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1872     __ addw(temp, length, src_pos);
1873     __ cmpw(temp, rscratch1);
1874     __ br(Assembler::HI, L_failed);
1875 
1876     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1877     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1878     __ addw(temp, length, dst_pos);
1879     __ cmpw(temp, rscratch1);
1880     __ br(Assembler::HI, L_failed);
1881 
1882     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1883     __ movw(src_pos, src_pos);
1884     __ movw(dst_pos, dst_pos);
1885 
1886     BLOCK_COMMENT("arraycopy_range_checks done");
1887   }
1888 
1889   // These stubs are currently only called from a simple test routine.
1890   // They will be implemented properly once they are called from code
1891   // that actually does real work.
1892   static void fake_arraycopy_stub(address src, address dst, int count) {
1893     assert(count == 0, "huh?");
1894   }
1895 
1896 
1897   //
1898   //  Generate 'unsafe' array copy stub
1899   //  Though just as safe as the other stubs, it takes an unscaled
1900   //  size_t argument instead of an element count.
1901   //
1902   //  Input:
1903   //    c_rarg0   - source array address
1904   //    c_rarg1   - destination array address
1905   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1906   //
1907   // Examines the alignment of the operands and dispatches
1908   // to a long, int, short, or byte copy loop.
1909   //
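       // Roughly (illustrative sketch), the dispatch below is:
       //
       //   int bits = s | d | count;
       //   if      ((bits & (BytesPerLong - 1)) == 0) { count >>= LogBytesPerLong;  goto long_copy;  }
       //   else if ((bits & (BytesPerInt  - 1)) == 0) { count >>= LogBytesPerInt;   goto int_copy;   }
       //   else if ((bits & 1) == 0)                  { count >>= LogBytesPerShort; goto short_copy; }
       //   else                                         goto byte_copy;
       //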
1910   address generate_unsafe_copy(const char *name,
1911                                address byte_copy_entry,
1912                                address short_copy_entry,
1913                                address int_copy_entry,
1914                                address long_copy_entry) {
1915     Label L_long_aligned, L_int_aligned, L_short_aligned;
1916     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1917 
1918     __ align(CodeEntryAlignment);
1919     StubCodeMark mark(this, "StubRoutines", name);
1920     address start = __ pc();
1921     __ enter(); // required for proper stackwalking of RuntimeStub frame
1922 
1923     // bump this on entry, not on exit:
1924     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1925 
1926     __ orr(rscratch1, s, d);
1927     __ orr(rscratch1, rscratch1, count);
1928 
1929     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1930     __ cbz(rscratch1, L_long_aligned);
1931     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1932     __ cbz(rscratch1, L_int_aligned);
1933     __ tbz(rscratch1, 0, L_short_aligned);
1934     __ b(RuntimeAddress(byte_copy_entry));
1935 
1936     __ BIND(L_short_aligned);
1937     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1938     __ b(RuntimeAddress(short_copy_entry));
1939     __ BIND(L_int_aligned);
1940     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1941     __ b(RuntimeAddress(int_copy_entry));
1942     __ BIND(L_long_aligned);
1943     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1944     __ b(RuntimeAddress(long_copy_entry));
1945 
1946     return start;
1947   }
1948 
1949   //
1950   //  Generate generic array copy stubs
1951   //
1952   //  Input:
1953   //    c_rarg0    -  src oop
1954   //    c_rarg1    -  src_pos (32-bits)
1955   //    c_rarg2    -  dst oop
1956   //    c_rarg3    -  dst_pos (32-bits)
1957   //    c_rarg4    -  element count (32-bits)
1958   //
1959   //  Output:
1960   //    r0 ==  0  -  success
1961   //    r0 == -1^K - failure, where K is partial transfer count
1962   //
1963   address generate_generic_copy(const char *name,
1964                                 address byte_copy_entry, address short_copy_entry,
1965                                 address int_copy_entry, address oop_copy_entry,
1966                                 address long_copy_entry, address checkcast_copy_entry) {
1967 
1968     Label L_failed, L_failed_0, L_objArray;
1969     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1970 
1971     // Input registers
1972     const Register src        = c_rarg0;  // source array oop
1973     const Register src_pos    = c_rarg1;  // source position
1974     const Register dst        = c_rarg2;  // destination array oop
1975     const Register dst_pos    = c_rarg3;  // destination position
1976     const Register length     = c_rarg4;
1977 
1978     StubCodeMark mark(this, "StubRoutines", name);
1979 
1980     __ align(CodeEntryAlignment);
1981     address start = __ pc();
1982 
1983     __ enter(); // required for proper stackwalking of RuntimeStub frame
1984 
1985     // bump this on entry, not on exit:
1986     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1987 
1988     //-----------------------------------------------------------------------
1989     // Assembler stub will be used for this call to arraycopy
1990     // if the following conditions are met:
1991     //
1992     // (1) src and dst must not be null.
1993     // (2) src_pos must not be negative.
1994     // (3) dst_pos must not be negative.
1995     // (4) length  must not be negative.
1996     // (5) src klass and dst klass should be the same and not NULL.
1997     // (6) src and dst should be arrays.
1998     // (7) src_pos + length must not exceed length of src.
1999     // (8) dst_pos + length must not exceed length of dst.
2000     //
2001 
2002     //  if (src == NULL) return -1;
2003     __ cbz(src, L_failed);
2004 
2005     //  if (src_pos < 0) return -1;
2006     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2007 
2008     //  if (dst == NULL) return -1;
2009     __ cbz(dst, L_failed);
2010 
2011     //  if (dst_pos < 0) return -1;
2012     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2013 
2014     // registers used as temp
2015     const Register scratch_length    = r16; // elements count to copy
2016     const Register scratch_src_klass = r17; // array klass
2017     const Register lh                = r18; // layout helper
2018 
2019     //  if (length < 0) return -1;
2020     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2021     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2022 
2023     __ load_klass(scratch_src_klass, src);
2024 #ifdef ASSERT
2025     //  assert(src->klass() != NULL);
2026     {
2027       BLOCK_COMMENT("assert klasses not null {");
2028       Label L1, L2;
2029       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2030       __ bind(L1);
2031       __ stop("broken null klass");
2032       __ bind(L2);
2033       __ load_klass(rscratch1, dst);
2034       __ cbz(rscratch1, L1);     // this would be broken also
2035       BLOCK_COMMENT("} assert klasses not null done");
2036     }
2037 #endif
2038 
2039     // Load layout helper (32-bits)
2040     //
2041     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2042     // 32        30    24            16              8     2                 0
2043     //
2044     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2045     //
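         // Decoded in C this is roughly (illustrative sketch; these are the
         // same Klass shift/mask constants used by the code below):
         //
         //   int tag        = lh >> Klass::_lh_array_tag_shift;       // 0x3 typeArray, 0x2 objArray
         //   int hdr_size   = (lh >> Klass::_lh_header_size_shift)
         //                    & Klass::_lh_header_size_mask;          // offset of element 0 in bytes
         //   int log2_esize = lh & Klass::_lh_log2_element_size_mask; // 0..3 for primitive arrays
         //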
2046 
2047     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2048 
2049     // Handle objArrays completely differently...
2050     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2051     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2052     __ movw(rscratch1, objArray_lh);
2053     __ eorw(rscratch2, lh, rscratch1);
2054     __ cbzw(rscratch2, L_objArray);
2055 
2056     //  if (src->klass() != dst->klass()) return -1;
2057     __ load_klass(rscratch2, dst);
2058     __ eor(rscratch2, rscratch2, scratch_src_klass);
2059     __ cbnz(rscratch2, L_failed);
2060 
2061     //  if (!src->is_Array()) return -1;
2062     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2063 
2064     // At this point, it is known to be a typeArray (array_tag 0x3).
2065 #ifdef ASSERT
2066     {
2067       BLOCK_COMMENT("assert primitive array {");
2068       Label L;
2069       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2070       __ cmpw(lh, rscratch2);
2071       __ br(Assembler::GE, L);
2072       __ stop("must be a primitive array");
2073       __ bind(L);
2074       BLOCK_COMMENT("} assert primitive array done");
2075     }
2076 #endif
2077 
2078     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2079                            rscratch2, L_failed);
2080 
2081     // TypeArrayKlass
2082     //
2083     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2084     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2085     //
2086 
2087     const Register rscratch1_offset = rscratch1;    // array offset
2088     const Register r18_elsize = lh; // element size
2089 
2090     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2091            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2092     __ add(src, src, rscratch1_offset);           // src array offset
2093     __ add(dst, dst, rscratch1_offset);           // dst array offset
2094     BLOCK_COMMENT("choose copy loop based on element size");
2095 
2096     // next registers should be set before the jump to corresponding stub
2097     const Register from     = c_rarg0;  // source array address
2098     const Register to       = c_rarg1;  // destination array address
2099     const Register count    = c_rarg2;  // elements count
2100 
2101     // 'from', 'to' and 'count' must be written in this order: 'to' aliases
2102     // 'src_pos' (needed to compute 'from'), and 'count' aliases 'dst' (needed to compute 'to').
2103 
2104     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2105 
2106     // The possible values of elsize are 0-3, i.e. exact_log2(element
2107     // size in bytes).  We do a simple bitwise binary search.
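         // That is, bit 1 of the log2 element size separates {byte, short}
         // from {int, long}, and bit 0 then picks within each pair:
         //
         //   log2 size:  0 -> jbyte,  1 -> jshort,  2 -> jint,  3 -> jlong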
2108   __ BIND(L_copy_bytes);
2109     __ tbnz(r18_elsize, 1, L_copy_ints);
2110     __ tbnz(r18_elsize, 0, L_copy_shorts);
2111     __ lea(from, Address(src, src_pos));// src_addr
2112     __ lea(to,   Address(dst, dst_pos));// dst_addr
2113     __ movw(count, scratch_length); // length
2114     __ b(RuntimeAddress(byte_copy_entry));
2115 
2116   __ BIND(L_copy_shorts);
2117     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2118     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2119     __ movw(count, scratch_length); // length
2120     __ b(RuntimeAddress(short_copy_entry));
2121 
2122   __ BIND(L_copy_ints);
2123     __ tbnz(r18_elsize, 0, L_copy_longs);
2124     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2125     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2126     __ movw(count, scratch_length); // length
2127     __ b(RuntimeAddress(int_copy_entry));
2128 
2129   __ BIND(L_copy_longs);
2130 #ifdef ASSERT
2131     {
2132       BLOCK_COMMENT("assert long copy {");
2133       Label L;
2134       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2135       __ cmpw(r18_elsize, LogBytesPerLong);
2136       __ br(Assembler::EQ, L);
2137       __ stop("must be long copy, but elsize is wrong");
2138       __ bind(L);
2139       BLOCK_COMMENT("} assert long copy done");
2140     }
2141 #endif
2142     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2143     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2144     __ movw(count, scratch_length); // length
2145     __ b(RuntimeAddress(long_copy_entry));
2146 
2147     // ObjArrayKlass
2148   __ BIND(L_objArray);
2149     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2150 
2151     Label L_plain_copy, L_checkcast_copy;
2152     //  test array classes for subtyping
2153     __ load_klass(r18, dst);
2154     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2155     __ br(Assembler::NE, L_checkcast_copy);
2156 
2157     // Identically typed arrays can be copied without element-wise checks.
2158     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2159                            rscratch2, L_failed);
2160 
2161     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2162     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2163     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2164     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2165     __ movw(count, scratch_length); // length
2166   __ BIND(L_plain_copy);
2167     __ b(RuntimeAddress(oop_copy_entry));
2168 
2169   __ BIND(L_checkcast_copy);
2170     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2171     {
2172       // Before looking at dst.length, make sure dst is also an objArray.
2173       __ ldrw(rscratch1, Address(r18, lh_offset));
2174       __ movw(rscratch2, objArray_lh);
2175       __ eorw(rscratch1, rscratch1, rscratch2);
2176       __ cbnzw(rscratch1, L_failed);
2177 
2178       // It is safe to examine both src.length and dst.length.
2179       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2180                              r18, L_failed);
2181 
2182       const Register rscratch2_dst_klass = rscratch2;
2183       __ load_klass(rscratch2_dst_klass, dst); // reload
2184 
2185       // Marshal the base address arguments now, freeing registers.
2186       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2187       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2188       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2189       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2190       __ movw(count, length);           // length (reloaded)
2191       Register sco_temp = c_rarg3;      // this register is free now
2192       assert_different_registers(from, to, count, sco_temp,
2193                                  rscratch2_dst_klass, scratch_src_klass);
2194       // assert_clean_int(count, sco_temp);
2195 
2196       // Generate the type check.
2197       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2198       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2199       // assert_clean_int(sco_temp, r18);
2200       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2201 
2202       // Fetch destination element klass from the ObjArrayKlass header.
2203       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2204       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2205       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2206 
2207       // the checkcast_copy loop needs two extra arguments:
2208       assert(c_rarg3 == sco_temp, "#3 already in place");
2209       // Set up arguments for checkcast_copy_entry.
2210       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2211       __ b(RuntimeAddress(checkcast_copy_entry));
2212     }
2213 
2214   __ BIND(L_failed);
2215     __ mov(r0, -1);
2216     __ leave();   // required for proper stackwalking of RuntimeStub frame
2217     __ ret(lr);
2218 
2219     return start;
2220   }
2221 
2222   //
2223   // Generate stub for array fill. If "aligned" is true, the
2224   // "to" address is assumed to be heapword aligned.
2225   //
2226   // Arguments for generated stub:
2227   //   to:    c_rarg0
2228   //   value: c_rarg1
2229   //   count: c_rarg2 treated as signed
2230   //
2231   address generate_fill(BasicType t, bool aligned, const char *name) {
2232     __ align(CodeEntryAlignment);
2233     StubCodeMark mark(this, "StubRoutines", name);
2234     address start = __ pc();
2235 
2236     BLOCK_COMMENT("Entry:");
2237 
2238     const Register to        = c_rarg0;  // source array address
2239     const Register value     = c_rarg1;  // value
2240     const Register count     = c_rarg2;  // elements count
2241 
2242     const Register bz_base = r10;        // base for block_zero routine
2243     const Register cnt_words = r11;      // temp register
2244 
2245     __ enter();
2246 
2247     Label L_fill_elements, L_exit1;
2248 
2249     int shift = -1;
2250     switch (t) {
2251       case T_BYTE:
2252         shift = 0;
2253         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2254         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2255         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2256         __ br(Assembler::LO, L_fill_elements);
2257         break;
2258       case T_SHORT:
2259         shift = 1;
2260         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2261         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2262         __ br(Assembler::LO, L_fill_elements);
2263         break;
2264       case T_INT:
2265         shift = 2;
2266         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2267         __ br(Assembler::LO, L_fill_elements);
2268         break;
2269       default: ShouldNotReachHere();
2270     }
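         // At this point 'value' holds the fill pattern replicated to 32 bits.
         // For example (sketch), filling bytes with 0x5A proceeds as:
         //   bfi(value, value, 8, 8)   : 0x0000005A -> 0x00005A5A
         //   bfi(value, value, 16, 16) : 0x00005A5A -> 0x5A5A5A5A
         // and the later bfi(value, value, 32, 32) widens that to
         // 0x5A5A5A5A5A5A5A5A so whole 64-bit words can be stored at once.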
2271 
2272     // Align source address at 8 bytes address boundary.
2273     Label L_skip_align1, L_skip_align2, L_skip_align4;
2274     if (!aligned) {
2275       switch (t) {
2276         case T_BYTE:
2277           // One byte misalignment happens only for byte arrays.
2278           __ tbz(to, 0, L_skip_align1);
2279           __ strb(value, Address(__ post(to, 1)));
2280           __ subw(count, count, 1);
2281           __ bind(L_skip_align1);
2282           // Fallthrough
2283         case T_SHORT:
2284           // Two bytes misalignment happens only for byte and short (char) arrays.
2285           __ tbz(to, 1, L_skip_align2);
2286           __ strh(value, Address(__ post(to, 2)));
2287           __ subw(count, count, 2 >> shift);
2288           __ bind(L_skip_align2);
2289           // Fallthrough
2290         case T_INT:
2291           // Align to 8 bytes, we know we are 4 byte aligned to start.
2292           __ tbz(to, 2, L_skip_align4);
2293           __ strw(value, Address(__ post(to, 4)));
2294           __ subw(count, count, 4 >> shift);
2295           __ bind(L_skip_align4);
2296           break;
2297         default: ShouldNotReachHere();
2298       }
2299     }
2300 
2301     //
2302     //  Fill large chunks
2303     //
2304     __ lsrw(cnt_words, count, 3 - shift); // number of words
2305     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2306     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2307     if (UseBlockZeroing) {
2308       Label non_block_zeroing, rest;
2309       // If the fill value is zero we can use the fast zero_words().
2310       __ cbnz(value, non_block_zeroing);
2311       __ mov(bz_base, to);
2312       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2313       __ zero_words(bz_base, cnt_words);
2314       __ b(rest);
2315       __ bind(non_block_zeroing);
2316       __ fill_words(to, cnt_words, value);
2317       __ bind(rest);
2318     } else {
2319       __ fill_words(to, cnt_words, value);
2320     }
2321 
2322     // Remaining count is less than 8 bytes. Fill it by a single store.
2323     // Note that the total length is no less than 8 bytes.
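         // (The single 64-bit store below is placed at the very end of the
         //  region, so it may rewrite up to 7 bytes that were already filled;
         //  that is harmless because every byte of 'value' holds the same
         //  pattern, and the total length of at least 8 bytes guarantees the
         //  store never reaches below the start of the array.)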
2324     if (t == T_BYTE || t == T_SHORT) {
2325       Label L_exit1;
2326       __ cbzw(count, L_exit1);
2327       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2328       __ str(value, Address(to, -8));    // overwrite some elements
2329       __ bind(L_exit1);
2330       __ leave();
2331       __ ret(lr);
2332     }
2333 
2334     // Handle copies less than 8 bytes.
2335     Label L_fill_2, L_fill_4, L_exit2;
2336     __ bind(L_fill_elements);
2337     switch (t) {
2338       case T_BYTE:
2339         __ tbz(count, 0, L_fill_2);
2340         __ strb(value, Address(__ post(to, 1)));
2341         __ bind(L_fill_2);
2342         __ tbz(count, 1, L_fill_4);
2343         __ strh(value, Address(__ post(to, 2)));
2344         __ bind(L_fill_4);
2345         __ tbz(count, 2, L_exit2);
2346         __ strw(value, Address(to));
2347         break;
2348       case T_SHORT:
2349         __ tbz(count, 0, L_fill_4);
2350         __ strh(value, Address(__ post(to, 2)));
2351         __ bind(L_fill_4);
2352         __ tbz(count, 1, L_exit2);
2353         __ strw(value, Address(to));
2354         break;
2355       case T_INT:
2356         __ cbzw(count, L_exit2);
2357         __ strw(value, Address(to));
2358         break;
2359       default: ShouldNotReachHere();
2360     }
2361     __ bind(L_exit2);
2362     __ leave();
2363     __ ret(lr);
2364     return start;
2365   }
2366 
2367   void generate_arraycopy_stubs() {
2368     address entry;
2369     address entry_jbyte_arraycopy;
2370     address entry_jshort_arraycopy;
2371     address entry_jint_arraycopy;
2372     address entry_oop_arraycopy;
2373     address entry_jlong_arraycopy;
2374     address entry_checkcast_arraycopy;
2375 
2376     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2377     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2378 
2379     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2380 
2381     //*** jbyte
2382     // Always need aligned and unaligned versions
2383     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2384                                                                                   "jbyte_disjoint_arraycopy");
2385     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2386                                                                                   &entry_jbyte_arraycopy,
2387                                                                                   "jbyte_arraycopy");
2388     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2389                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2390     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2391                                                                                   "arrayof_jbyte_arraycopy");
2392 
2393     //*** jshort
2394     // Always need aligned and unaligned versions
2395     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2396                                                                                     "jshort_disjoint_arraycopy");
2397     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2398                                                                                     &entry_jshort_arraycopy,
2399                                                                                     "jshort_arraycopy");
2400     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2401                                                                                     "arrayof_jshort_disjoint_arraycopy");
2402     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2403                                                                                     "arrayof_jshort_arraycopy");
2404 
2405     //*** jint
2406     // Aligned versions
2407     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2408                                                                                 "arrayof_jint_disjoint_arraycopy");
2409     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2410                                                                                 "arrayof_jint_arraycopy");
2411     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2412     // entry_jint_arraycopy always points to the unaligned version
2413     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2414                                                                                 "jint_disjoint_arraycopy");
2415     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2416                                                                                 &entry_jint_arraycopy,
2417                                                                                 "jint_arraycopy");
2418 
2419     //*** jlong
2420     // It is always aligned
2421     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2422                                                                                   "arrayof_jlong_disjoint_arraycopy");
2423     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2424                                                                                   "arrayof_jlong_arraycopy");
2425     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2426     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2427 
2428     //*** oops
2429     {
2430       // With compressed oops we need unaligned versions; notice that
2431       // we overwrite entry_oop_arraycopy.
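           // (With compressed oops each element is only 4 bytes wide, so
           // 8-byte HeapWord alignment of the copied addresses cannot be
           // assumed even for the arrayof_ entry points.)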
2432       bool aligned = !UseCompressedOops;
2433 
2434       StubRoutines::_arrayof_oop_disjoint_arraycopy
2435         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2436                                      /*dest_uninitialized*/false);
2437       StubRoutines::_arrayof_oop_arraycopy
2438         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2439                                      /*dest_uninitialized*/false);
2440       // Aligned versions without pre-barriers
2441       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2442         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2443                                      /*dest_uninitialized*/true);
2444       StubRoutines::_arrayof_oop_arraycopy_uninit
2445         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2446                                      /*dest_uninitialized*/true);
2447     }
2448 
2449     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2450     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2451     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2452     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2453 
2454     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2455     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2456                                                                         /*dest_uninitialized*/true);
2457 
2458     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2459                                                               entry_jbyte_arraycopy,
2460                                                               entry_jshort_arraycopy,
2461                                                               entry_jint_arraycopy,
2462                                                               entry_jlong_arraycopy);
2463 
2464     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2465                                                                entry_jbyte_arraycopy,
2466                                                                entry_jshort_arraycopy,
2467                                                                entry_jint_arraycopy,
2468                                                                entry_oop_arraycopy,
2469                                                                entry_jlong_arraycopy,
2470                                                                entry_checkcast_arraycopy);
2471 
2472     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2473     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2474     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2475     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2476     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2477     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2478   }
2479 
2480   void generate_math_stubs() { Unimplemented(); }
2481 
2482   // Arguments:
2483   //
2484   // Inputs:
2485   //   c_rarg0   - source byte array address
2486   //   c_rarg1   - destination byte array address
2487   //   c_rarg2   - K (key) in little endian int array
2488   //
2489   address generate_aescrypt_encryptBlock() {
2490     __ align(CodeEntryAlignment);
2491     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2492 
2493     Label L_doLast;
2494 
2495     const Register from        = c_rarg0;  // source array address
2496     const Register to          = c_rarg1;  // destination array address
2497     const Register key         = c_rarg2;  // key array address
2498     const Register keylen      = rscratch1;
2499 
2500     address start = __ pc();
2501     __ enter();
2502 
2503     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
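         // keylen is the expanded key length in 32-bit words: 44, 52 or 60 for
         // AES-128, AES-192 and AES-256 respectively (4 * (rounds + 1)), which
         // is how the comparisons against 44 and 52 below select 10, 12 or 14
         // rounds.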
2504 
2505     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2506 
2507     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2508     __ rev32(v1, __ T16B, v1);
2509     __ rev32(v2, __ T16B, v2);
2510     __ rev32(v3, __ T16B, v3);
2511     __ rev32(v4, __ T16B, v4);
2512     __ aese(v0, v1);
2513     __ aesmc(v0, v0);
2514     __ aese(v0, v2);
2515     __ aesmc(v0, v0);
2516     __ aese(v0, v3);
2517     __ aesmc(v0, v0);
2518     __ aese(v0, v4);
2519     __ aesmc(v0, v0);
2520 
2521     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2522     __ rev32(v1, __ T16B, v1);
2523     __ rev32(v2, __ T16B, v2);
2524     __ rev32(v3, __ T16B, v3);
2525     __ rev32(v4, __ T16B, v4);
2526     __ aese(v0, v1);
2527     __ aesmc(v0, v0);
2528     __ aese(v0, v2);
2529     __ aesmc(v0, v0);
2530     __ aese(v0, v3);
2531     __ aesmc(v0, v0);
2532     __ aese(v0, v4);
2533     __ aesmc(v0, v0);
2534 
2535     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2536     __ rev32(v1, __ T16B, v1);
2537     __ rev32(v2, __ T16B, v2);
2538 
2539     __ cmpw(keylen, 44);
2540     __ br(Assembler::EQ, L_doLast);
2541 
2542     __ aese(v0, v1);
2543     __ aesmc(v0, v0);
2544     __ aese(v0, v2);
2545     __ aesmc(v0, v0);
2546 
2547     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2548     __ rev32(v1, __ T16B, v1);
2549     __ rev32(v2, __ T16B, v2);
2550 
2551     __ cmpw(keylen, 52);
2552     __ br(Assembler::EQ, L_doLast);
2553 
2554     __ aese(v0, v1);
2555     __ aesmc(v0, v0);
2556     __ aese(v0, v2);
2557     __ aesmc(v0, v0);
2558 
2559     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2560     __ rev32(v1, __ T16B, v1);
2561     __ rev32(v2, __ T16B, v2);
2562 
2563     __ BIND(L_doLast);
2564 
2565     __ aese(v0, v1);
2566     __ aesmc(v0, v0);
2567     __ aese(v0, v2);
2568 
2569     __ ld1(v1, __ T16B, key);
2570     __ rev32(v1, __ T16B, v1);
2571     __ eor(v0, __ T16B, v0, v1);
2572 
2573     __ st1(v0, __ T16B, to);
2574 
2575     __ mov(r0, 0);
2576 
2577     __ leave();
2578     __ ret(lr);
2579 
2580     return start;
2581   }
2582 
2583   // Arguments:
2584   //
2585   // Inputs:
2586   //   c_rarg0   - source byte array address
2587   //   c_rarg1   - destination byte array address
2588   //   c_rarg2   - K (key) in little endian int array
2589   //
2590   address generate_aescrypt_decryptBlock() {
2591     assert(UseAES, "need AES instructions and misaligned SSE support");
2592     __ align(CodeEntryAlignment);
2593     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2594     Label L_doLast;
2595 
2596     const Register from        = c_rarg0;  // source array address
2597     const Register to          = c_rarg1;  // destination array address
2598     const Register key         = c_rarg2;  // key array address
2599     const Register keylen      = rscratch1;
2600 
2601     address start = __ pc();
2602     __ enter(); // required for proper stackwalking of RuntimeStub frame
2603 
2604     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2605 
2606     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2607 
2608     __ ld1(v5, __ T16B, __ post(key, 16));
2609     __ rev32(v5, __ T16B, v5);
2610 
2611     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2612     __ rev32(v1, __ T16B, v1);
2613     __ rev32(v2, __ T16B, v2);
2614     __ rev32(v3, __ T16B, v3);
2615     __ rev32(v4, __ T16B, v4);
2616     __ aesd(v0, v1);
2617     __ aesimc(v0, v0);
2618     __ aesd(v0, v2);
2619     __ aesimc(v0, v0);
2620     __ aesd(v0, v3);
2621     __ aesimc(v0, v0);
2622     __ aesd(v0, v4);
2623     __ aesimc(v0, v0);
2624 
2625     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2626     __ rev32(v1, __ T16B, v1);
2627     __ rev32(v2, __ T16B, v2);
2628     __ rev32(v3, __ T16B, v3);
2629     __ rev32(v4, __ T16B, v4);
2630     __ aesd(v0, v1);
2631     __ aesimc(v0, v0);
2632     __ aesd(v0, v2);
2633     __ aesimc(v0, v0);
2634     __ aesd(v0, v3);
2635     __ aesimc(v0, v0);
2636     __ aesd(v0, v4);
2637     __ aesimc(v0, v0);
2638 
2639     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2640     __ rev32(v1, __ T16B, v1);
2641     __ rev32(v2, __ T16B, v2);
2642 
2643     __ cmpw(keylen, 44);
2644     __ br(Assembler::EQ, L_doLast);
2645 
2646     __ aesd(v0, v1);
2647     __ aesimc(v0, v0);
2648     __ aesd(v0, v2);
2649     __ aesimc(v0, v0);
2650 
2651     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2652     __ rev32(v1, __ T16B, v1);
2653     __ rev32(v2, __ T16B, v2);
2654 
2655     __ cmpw(keylen, 52);
2656     __ br(Assembler::EQ, L_doLast);
2657 
2658     __ aesd(v0, v1);
2659     __ aesimc(v0, v0);
2660     __ aesd(v0, v2);
2661     __ aesimc(v0, v0);
2662 
2663     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2664     __ rev32(v1, __ T16B, v1);
2665     __ rev32(v2, __ T16B, v2);
2666 
2667     __ BIND(L_doLast);
2668 
2669     __ aesd(v0, v1);
2670     __ aesimc(v0, v0);
2671     __ aesd(v0, v2);
2672 
2673     __ eor(v0, __ T16B, v0, v5);
2674 
2675     __ st1(v0, __ T16B, to);
2676 
2677     __ mov(r0, 0);
2678 
2679     __ leave();
2680     __ ret(lr);
2681 
2682     return start;
2683   }
2684 
2685   // Arguments:
2686   //
2687   // Inputs:
2688   //   c_rarg0   - source byte array address
2689   //   c_rarg1   - destination byte array address
2690   //   c_rarg2   - K (key) in little endian int array
2691   //   c_rarg3   - r vector byte array address
2692   //   c_rarg4   - input length
2693   //
2694   // Output:
2695   //   x0        - input length
2696   //
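       // This is standard CBC encryption (sketch):
       //
       //   C[0] = E_K(P[0] ^ IV),  C[i] = E_K(P[i] ^ C[i-1])   per 16-byte block
       //
       // where the IV is loaded from rvec on entry and the last ciphertext
       // block is written back to rvec on exit for use by the next call.
       //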
2697   address generate_cipherBlockChaining_encryptAESCrypt() {
2698     assert(UseAES, "need AES instructions and misaligned SSE support");
2699     __ align(CodeEntryAlignment);
2700     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2701 
2702     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2703 
2704     const Register from        = c_rarg0;  // source array address
2705     const Register to          = c_rarg1;  // destination array address
2706     const Register key         = c_rarg2;  // key array address
2707     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2708                                            // and left with the results of the last encryption block
2709     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2710     const Register keylen      = rscratch1;
2711 
2712     address start = __ pc();
2713 
2714       __ enter();
2715 
2716       __ movw(rscratch2, len_reg);
2717 
2718       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2719 
2720       __ ld1(v0, __ T16B, rvec);
2721 
2722       __ cmpw(keylen, 52);
2723       __ br(Assembler::CC, L_loadkeys_44);
2724       __ br(Assembler::EQ, L_loadkeys_52);
2725 
2726       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2727       __ rev32(v17, __ T16B, v17);
2728       __ rev32(v18, __ T16B, v18);
2729     __ BIND(L_loadkeys_52);
2730       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2731       __ rev32(v19, __ T16B, v19);
2732       __ rev32(v20, __ T16B, v20);
2733     __ BIND(L_loadkeys_44);
2734       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2735       __ rev32(v21, __ T16B, v21);
2736       __ rev32(v22, __ T16B, v22);
2737       __ rev32(v23, __ T16B, v23);
2738       __ rev32(v24, __ T16B, v24);
2739       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2740       __ rev32(v25, __ T16B, v25);
2741       __ rev32(v26, __ T16B, v26);
2742       __ rev32(v27, __ T16B, v27);
2743       __ rev32(v28, __ T16B, v28);
2744       __ ld1(v29, v30, v31, __ T16B, key);
2745       __ rev32(v29, __ T16B, v29);
2746       __ rev32(v30, __ T16B, v30);
2747       __ rev32(v31, __ T16B, v31);
2748 
2749     __ BIND(L_aes_loop);
2750       __ ld1(v1, __ T16B, __ post(from, 16));
2751       __ eor(v0, __ T16B, v0, v1);
2752 
2753       __ br(Assembler::CC, L_rounds_44);
2754       __ br(Assembler::EQ, L_rounds_52);
2755 
2756       __ aese(v0, v17); __ aesmc(v0, v0);
2757       __ aese(v0, v18); __ aesmc(v0, v0);
2758     __ BIND(L_rounds_52);
2759       __ aese(v0, v19); __ aesmc(v0, v0);
2760       __ aese(v0, v20); __ aesmc(v0, v0);
2761     __ BIND(L_rounds_44);
2762       __ aese(v0, v21); __ aesmc(v0, v0);
2763       __ aese(v0, v22); __ aesmc(v0, v0);
2764       __ aese(v0, v23); __ aesmc(v0, v0);
2765       __ aese(v0, v24); __ aesmc(v0, v0);
2766       __ aese(v0, v25); __ aesmc(v0, v0);
2767       __ aese(v0, v26); __ aesmc(v0, v0);
2768       __ aese(v0, v27); __ aesmc(v0, v0);
2769       __ aese(v0, v28); __ aesmc(v0, v0);
2770       __ aese(v0, v29); __ aesmc(v0, v0);
2771       __ aese(v0, v30);
2772       __ eor(v0, __ T16B, v0, v31);
2773 
2774       __ st1(v0, __ T16B, __ post(to, 16));
2775 
2776       __ subw(len_reg, len_reg, 16);
2777       __ cbnzw(len_reg, L_aes_loop);
2778 
2779       __ st1(v0, __ T16B, rvec);
2780 
2781       __ mov(r0, rscratch2);
2782 
2783       __ leave();
2784       __ ret(lr);
2785 
2786       return start;
2787   }
2788 
2789   // Arguments:
2790   //
2791   // Inputs:
2792   //   c_rarg0   - source byte array address
2793   //   c_rarg1   - destination byte array address
2794   //   c_rarg2   - K (key) in little endian int array
2795   //   c_rarg3   - r vector byte array address
2796   //   c_rarg4   - input length
2797   //
2798   // Output:
2799   //   r0        - input length
2800   //
2801   address generate_cipherBlockChaining_decryptAESCrypt() {
    assert(UseAES, "need AES cryptographic extension support");
2803     __ align(CodeEntryAlignment);
2804     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2805 
2806     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2807 
2808     const Register from        = c_rarg0;  // source array address
2809     const Register to          = c_rarg1;  // destination array address
2810     const Register key         = c_rarg2;  // key array address
2811     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the last ciphertext block (the next IV)
2813     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2814     const Register keylen      = rscratch1;
2815 
2816     address start = __ pc();
2817 
2818       __ enter();
2819 
2820       __ movw(rscratch2, len_reg);
2821 
2822       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2823 
2824       __ ld1(v2, __ T16B, rvec);
2825 
2826       __ ld1(v31, __ T16B, __ post(key, 16));
2827       __ rev32(v31, __ T16B, v31);
2828 
2829       __ cmpw(keylen, 52);
2830       __ br(Assembler::CC, L_loadkeys_44);
2831       __ br(Assembler::EQ, L_loadkeys_52);
2832 
2833       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2834       __ rev32(v17, __ T16B, v17);
2835       __ rev32(v18, __ T16B, v18);
2836     __ BIND(L_loadkeys_52);
2837       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2838       __ rev32(v19, __ T16B, v19);
2839       __ rev32(v20, __ T16B, v20);
2840     __ BIND(L_loadkeys_44);
2841       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2842       __ rev32(v21, __ T16B, v21);
2843       __ rev32(v22, __ T16B, v22);
2844       __ rev32(v23, __ T16B, v23);
2845       __ rev32(v24, __ T16B, v24);
2846       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2847       __ rev32(v25, __ T16B, v25);
2848       __ rev32(v26, __ T16B, v26);
2849       __ rev32(v27, __ T16B, v27);
2850       __ rev32(v28, __ T16B, v28);
2851       __ ld1(v29, v30, __ T16B, key);
2852       __ rev32(v29, __ T16B, v29);
2853       __ rev32(v30, __ T16B, v30);
2854 
2855     __ BIND(L_aes_loop);
2856       __ ld1(v0, __ T16B, __ post(from, 16));
2857       __ orr(v1, __ T16B, v0, v0);
2858 
2859       __ br(Assembler::CC, L_rounds_44);
2860       __ br(Assembler::EQ, L_rounds_52);
2861 
2862       __ aesd(v0, v17); __ aesimc(v0, v0);
2863       __ aesd(v0, v18); __ aesimc(v0, v0);
2864     __ BIND(L_rounds_52);
2865       __ aesd(v0, v19); __ aesimc(v0, v0);
2866       __ aesd(v0, v20); __ aesimc(v0, v0);
2867     __ BIND(L_rounds_44);
2868       __ aesd(v0, v21); __ aesimc(v0, v0);
2869       __ aesd(v0, v22); __ aesimc(v0, v0);
2870       __ aesd(v0, v23); __ aesimc(v0, v0);
2871       __ aesd(v0, v24); __ aesimc(v0, v0);
2872       __ aesd(v0, v25); __ aesimc(v0, v0);
2873       __ aesd(v0, v26); __ aesimc(v0, v0);
2874       __ aesd(v0, v27); __ aesimc(v0, v0);
2875       __ aesd(v0, v28); __ aesimc(v0, v0);
2876       __ aesd(v0, v29); __ aesimc(v0, v0);
2877       __ aesd(v0, v30);
2878       __ eor(v0, __ T16B, v0, v31);
2879       __ eor(v0, __ T16B, v0, v2);
2880 
2881       __ st1(v0, __ T16B, __ post(to, 16));
2882       __ orr(v2, __ T16B, v1, v1);
2883 
2884       __ subw(len_reg, len_reg, 16);
2885       __ cbnzw(len_reg, L_aes_loop);
2886 
2887       __ st1(v2, __ T16B, rvec);
2888 
2889       __ mov(r0, rscratch2);
2890 
2891       __ leave();
2892       __ ret(lr);
2893 
2894     return start;
2895   }
2896 
2897   // Arguments:
2898   //
2899   // Inputs:
2900   //   c_rarg0   - byte[]  source+offset
2901   //   c_rarg1   - int[]   SHA.state
2902   //   c_rarg2   - int     offset
2903   //   c_rarg3   - int     limit
2904   //
2905   address generate_sha1_implCompress(bool multi_block, const char *name) {
2906     __ align(CodeEntryAlignment);
2907     StubCodeMark mark(this, "StubRoutines", name);
2908     address start = __ pc();
2909 
2910     Register buf   = c_rarg0;
2911     Register state = c_rarg1;
2912     Register ofs   = c_rarg2;
2913     Register limit = c_rarg3;
2914 
2915     Label keys;
2916     Label sha1_loop;
2917 
2918     // load the keys into v0..v3
2919     __ adr(rscratch1, keys);
2920     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load the 5-word (160-bit) SHA-1 state into v6, v7
2922     __ ldrq(v6, Address(state, 0));
2923     __ ldrs(v7, Address(state, 16));
2924 
2925 
2926     __ BIND(sha1_loop);
2927     // load 64 bytes of data into v16..v19
2928     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2929     __ rev32(v16, __ T16B, v16);
2930     __ rev32(v17, __ T16B, v17);
2931     __ rev32(v18, __ T16B, v18);
2932     __ rev32(v19, __ T16B, v19);
2933 
2934     // do the sha1
2935     __ addv(v4, __ T4S, v16, v0);
2936     __ orr(v20, __ T16B, v6, v6);
2937 
2938     FloatRegister d0 = v16;
2939     FloatRegister d1 = v17;
2940     FloatRegister d2 = v18;
2941     FloatRegister d3 = v19;
2942 
2943     for (int round = 0; round < 20; round++) {
2944       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2945       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2946       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2947       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2948       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2949 
2950       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2951       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2952       __ sha1h(tmp2, __ T4S, v20);
2953       if (round < 5)
2954         __ sha1c(v20, __ T4S, tmp3, tmp4);
2955       else if (round < 10 || round >= 15)
2956         __ sha1p(v20, __ T4S, tmp3, tmp4);
2957       else
2958         __ sha1m(v20, __ T4S, tmp3, tmp4);
2959       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2960 
2961       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2962     }
2963 
2964     __ addv(v7, __ T2S, v7, v21);
2965     __ addv(v6, __ T4S, v6, v20);
2966 
2967     if (multi_block) {
2968       __ add(ofs, ofs, 64);
2969       __ cmp(ofs, limit);
2970       __ br(Assembler::LE, sha1_loop);
2971       __ mov(c_rarg0, ofs); // return ofs
2972     }
2973 
2974     __ strq(v6, Address(state, 0));
2975     __ strs(v7, Address(state, 16));
2976 
2977     __ ret(lr);
2978 
2979     __ bind(keys);
2980     __ emit_int32(0x5a827999);
2981     __ emit_int32(0x6ed9eba1);
2982     __ emit_int32(0x8f1bbcdc);
2983     __ emit_int32(0xca62c1d6);
2984 
2985     return start;
2986   }
2987 
2988 
2989   // Arguments:
2990   //
2991   // Inputs:
2992   //   c_rarg0   - byte[]  source+offset
2993   //   c_rarg1   - int[]   SHA.state
2994   //   c_rarg2   - int     offset
2995   //   c_rarg3   - int     limit
2996   //
2997   address generate_sha256_implCompress(bool multi_block, const char *name) {
2998     static const uint32_t round_consts[64] = {
2999       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3000       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3001       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3002       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3003       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3004       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3005       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3006       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3007       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3008       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3009       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3010       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3011       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3012       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3013       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3014       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3015     };
3016     __ align(CodeEntryAlignment);
3017     StubCodeMark mark(this, "StubRoutines", name);
3018     address start = __ pc();
3019 
3020     Register buf   = c_rarg0;
3021     Register state = c_rarg1;
3022     Register ofs   = c_rarg2;
3023     Register limit = c_rarg3;
3024 
3025     Label sha1_loop;
3026 
3027     __ stpd(v8, v9, __ pre(sp, -32));
3028     __ stpd(v10, v11, Address(sp, 16));
3029 
3030 // dga == v0
3031 // dgb == v1
3032 // dg0 == v2
3033 // dg1 == v3
3034 // dg2 == v4
3035 // t0 == v6
3036 // t1 == v7
3037 
    // load the 64 round constants (16 vectors) into v16..v31
3039     __ lea(rscratch1, ExternalAddress((address)round_consts));
3040     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3041     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3042     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3043     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3044 
3045     // load 8 words (256 bits) state
3046     __ ldpq(v0, v1, state);
3047 
3048     __ BIND(sha1_loop);
3049     // load 64 bytes of data into v8..v11
3050     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3051     __ rev32(v8, __ T16B, v8);
3052     __ rev32(v9, __ T16B, v9);
3053     __ rev32(v10, __ T16B, v10);
3054     __ rev32(v11, __ T16B, v11);
3055 
3056     __ addv(v6, __ T4S, v8, v16);
3057     __ orr(v2, __ T16B, v0, v0);
3058     __ orr(v3, __ T16B, v1, v1);
3059 
3060     FloatRegister d0 = v8;
3061     FloatRegister d1 = v9;
3062     FloatRegister d2 = v10;
3063     FloatRegister d3 = v11;
3064 
3065 
3066     for (int round = 0; round < 16; round++) {
3067       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3068       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3069       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3070       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3071 
3072       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3073        __ orr(v4, __ T16B, v2, v2);
3074       if (round < 15)
3075         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3076       __ sha256h(v2, __ T4S, v3, tmp2);
3077       __ sha256h2(v3, __ T4S, v4, tmp2);
3078       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3079 
3080       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3081     }
3082 
3083     __ addv(v0, __ T4S, v0, v2);
3084     __ addv(v1, __ T4S, v1, v3);
3085 
3086     if (multi_block) {
3087       __ add(ofs, ofs, 64);
3088       __ cmp(ofs, limit);
3089       __ br(Assembler::LE, sha1_loop);
3090       __ mov(c_rarg0, ofs); // return ofs
3091     }
3092 
3093     __ ldpd(v10, v11, Address(sp, 16));
3094     __ ldpd(v8, v9, __ post(sp, 32));
3095 
3096     __ stpq(v0, v1, state);
3097 
3098     __ ret(lr);
3099 
3100     return start;
3101   }
3102 
3103 #ifndef BUILTIN_SIM
3104   // Safefetch stubs.
3105   void generate_safefetch(const char* name, int size, address* entry,
3106                           address* fault_pc, address* continuation_pc) {
3107     // safefetch signatures:
3108     //   int      SafeFetch32(int*      adr, int      errValue);
3109     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3110     //
3111     // arguments:
3112     //   c_rarg0 = adr
3113     //   c_rarg1 = errValue
3114     //
3115     // result:
    //   r0       = *adr or errValue
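    //
    // If the load at *fault_pc faults, the signal handler resumes execution
    // at *continuation_pc with c_rarg1 still holding errValue, so the caller
    // sees errValue instead of a crash.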
3117 
3118     StubCodeMark mark(this, "StubRoutines", name);
3119 
3120     // Entry point, pc or function descriptor.
3121     *entry = __ pc();
3122 
3123     // Load *adr into c_rarg1, may fault.
3124     *fault_pc = __ pc();
3125     switch (size) {
3126       case 4:
3127         // int32_t
3128         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3129         break;
3130       case 8:
3131         // int64_t
3132         __ ldr(c_rarg1, Address(c_rarg0, 0));
3133         break;
3134       default:
3135         ShouldNotReachHere();
3136     }
3137 
3138     // return errValue or *adr
3139     *continuation_pc = __ pc();
3140     __ mov(r0, c_rarg1);
3141     __ ret(lr);
3142   }
3143 #endif
3144 
3145   /**
3146    *  Arguments:
3147    *
3148    * Inputs:
3149    *   c_rarg0   - int crc
3150    *   c_rarg1   - byte* buf
3151    *   c_rarg2   - int length
3152    *
   * Output:
   *       r0    - int crc result
3155    */
3156   address generate_updateBytesCRC32() {
3157     assert(UseCRC32Intrinsics, "what are we doing here?");
3158 
3159     __ align(CodeEntryAlignment);
3160     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3161 
3162     address start = __ pc();
3163 
3164     const Register crc   = c_rarg0;  // crc
3165     const Register buf   = c_rarg1;  // source java byte array address
3166     const Register len   = c_rarg2;  // length
3167     const Register table0 = c_rarg3; // crc_table address
3168     const Register table1 = c_rarg4;
3169     const Register table2 = c_rarg5;
3170     const Register table3 = c_rarg6;
3171     const Register tmp3 = c_rarg7;
3172 
3173     BLOCK_COMMENT("Entry:");
3174     __ enter(); // required for proper stackwalking of RuntimeStub frame
3175 
3176     __ kernel_crc32(crc, buf, len,
3177               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3178 
3179     __ leave(); // required for proper stackwalking of RuntimeStub frame
3180     __ ret(lr);
3181 
3182     return start;
3183   }
3184 
3185   /**
3186    *  Arguments:
3187    *
3188    * Inputs:
3189    *   c_rarg0   - int crc
3190    *   c_rarg1   - byte* buf
3191    *   c_rarg2   - int length
3192    *   c_rarg3   - int* table
3193    *
   * Output:
3195    *       r0   - int crc result
3196    */
3197   address generate_updateBytesCRC32C() {
3198     assert(UseCRC32CIntrinsics, "what are we doing here?");
3199 
3200     __ align(CodeEntryAlignment);
3201     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3202 
3203     address start = __ pc();
3204 
3205     const Register crc   = c_rarg0;  // crc
3206     const Register buf   = c_rarg1;  // source java byte array address
3207     const Register len   = c_rarg2;  // length
3208     const Register table0 = c_rarg3; // crc_table address
3209     const Register table1 = c_rarg4;
3210     const Register table2 = c_rarg5;
3211     const Register table3 = c_rarg6;
3212     const Register tmp3 = c_rarg7;
3213 
3214     BLOCK_COMMENT("Entry:");
3215     __ enter(); // required for proper stackwalking of RuntimeStub frame
3216 
3217     __ kernel_crc32c(crc, buf, len,
3218               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3219 
3220     __ leave(); // required for proper stackwalking of RuntimeStub frame
3221     __ ret(lr);
3222 
3223     return start;
3224   }
3225 
3226   /***
3227    *  Arguments:
3228    *
3229    *  Inputs:
3230    *   c_rarg0   - int   adler
3231    *   c_rarg1   - byte* buff
3232    *   c_rarg2   - int   len
3233    *
3234    * Output:
3235    *   c_rarg0   - int adler result
3236    */
3237   address generate_updateBytesAdler32() {
3238     __ align(CodeEntryAlignment);
3239     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3240     address start = __ pc();
3241 
3242     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3243 
3244     // Aliases
3245     Register adler  = c_rarg0;
3246     Register s1     = c_rarg0;
3247     Register s2     = c_rarg3;
3248     Register buff   = c_rarg1;
3249     Register len    = c_rarg2;
3250     Register nmax  = r4;
3251     Register base = r5;
3252     Register count = r6;
3253     Register temp0 = rscratch1;
3254     Register temp1 = rscratch2;
3255     Register temp2 = r7;
3256 
3257     // Max number of bytes we can process before having to take the mod
3258     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3259     unsigned long BASE = 0xfff1;
3260     unsigned long NMAX = 0x15B0;
3261 
3262     __ mov(base, BASE);
3263     __ mov(nmax, NMAX);
3264 
3265     // s1 is initialized to the lower 16 bits of adler
3266     // s2 is initialized to the upper 16 bits of adler
3267     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3268     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3269 
    // The pipelined loop needs at least 16 elements for one iteration.
    // It checks this itself, but it is cheaper to branch straight to the
    // cleanup loop for short inputs.
3272     __ cmp(len, 16);
3273     __ br(Assembler::HS, L_nmax);
3274     __ cbz(len, L_combine);
3275 
3276     __ bind(L_simple_by1_loop);
3277     __ ldrb(temp0, Address(__ post(buff, 1)));
3278     __ add(s1, s1, temp0);
3279     __ add(s2, s2, s1);
3280     __ subs(len, len, 1);
3281     __ br(Assembler::HI, L_simple_by1_loop);
3282 
3283     // s1 = s1 % BASE
3284     __ subs(temp0, s1, base);
3285     __ csel(s1, temp0, s1, Assembler::HS);
3286 
3287     // s2 = s2 % BASE
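    // Reduction trick used here and below: 2^16 == 15 (mod 65521), so
    // s % BASE can be computed by folding, s = 15 * (s >> 16) + (s & 0xffff),
    // applied once or twice until s < 2 * BASE, then one conditional
    // subtraction of BASE.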
3288     __ lsr(temp0, s2, 16);
3289     __ lsl(temp1, temp0, 4);
3290     __ sub(temp1, temp1, temp0);
3291     __ add(s2, temp1, s2, ext::uxth);
3292 
3293     __ subs(temp0, s2, base);
3294     __ csel(s2, temp0, s2, Assembler::HS);
3295 
3296     __ b(L_combine);
3297 
3298     __ bind(L_nmax);
3299     __ subs(len, len, nmax);
3300     __ sub(count, nmax, 16);
3301     __ br(Assembler::LO, L_by16);
3302 
3303     __ bind(L_nmax_loop);
3304 
3305     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3306 
3307     __ add(s1, s1, temp0, ext::uxtb);
3308     __ ubfx(temp2, temp0, 8, 8);
3309     __ add(s2, s2, s1);
3310     __ add(s1, s1, temp2);
3311     __ ubfx(temp2, temp0, 16, 8);
3312     __ add(s2, s2, s1);
3313     __ add(s1, s1, temp2);
3314     __ ubfx(temp2, temp0, 24, 8);
3315     __ add(s2, s2, s1);
3316     __ add(s1, s1, temp2);
3317     __ ubfx(temp2, temp0, 32, 8);
3318     __ add(s2, s2, s1);
3319     __ add(s1, s1, temp2);
3320     __ ubfx(temp2, temp0, 40, 8);
3321     __ add(s2, s2, s1);
3322     __ add(s1, s1, temp2);
3323     __ ubfx(temp2, temp0, 48, 8);
3324     __ add(s2, s2, s1);
3325     __ add(s1, s1, temp2);
3326     __ add(s2, s2, s1);
3327     __ add(s1, s1, temp0, Assembler::LSR, 56);
3328     __ add(s2, s2, s1);
3329 
3330     __ add(s1, s1, temp1, ext::uxtb);
3331     __ ubfx(temp2, temp1, 8, 8);
3332     __ add(s2, s2, s1);
3333     __ add(s1, s1, temp2);
3334     __ ubfx(temp2, temp1, 16, 8);
3335     __ add(s2, s2, s1);
3336     __ add(s1, s1, temp2);
3337     __ ubfx(temp2, temp1, 24, 8);
3338     __ add(s2, s2, s1);
3339     __ add(s1, s1, temp2);
3340     __ ubfx(temp2, temp1, 32, 8);
3341     __ add(s2, s2, s1);
3342     __ add(s1, s1, temp2);
3343     __ ubfx(temp2, temp1, 40, 8);
3344     __ add(s2, s2, s1);
3345     __ add(s1, s1, temp2);
3346     __ ubfx(temp2, temp1, 48, 8);
3347     __ add(s2, s2, s1);
3348     __ add(s1, s1, temp2);
3349     __ add(s2, s2, s1);
3350     __ add(s1, s1, temp1, Assembler::LSR, 56);
3351     __ add(s2, s2, s1);
3352 
3353     __ subs(count, count, 16);
3354     __ br(Assembler::HS, L_nmax_loop);
3355 
3356     // s1 = s1 % BASE
3357     __ lsr(temp0, s1, 16);
3358     __ lsl(temp1, temp0, 4);
3359     __ sub(temp1, temp1, temp0);
3360     __ add(temp1, temp1, s1, ext::uxth);
3361 
3362     __ lsr(temp0, temp1, 16);
3363     __ lsl(s1, temp0, 4);
3364     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
3366 
3367     __ subs(temp0, s1, base);
3368     __ csel(s1, temp0, s1, Assembler::HS);
3369 
3370     // s2 = s2 % BASE
3371     __ lsr(temp0, s2, 16);
3372     __ lsl(temp1, temp0, 4);
3373     __ sub(temp1, temp1, temp0);
3374     __ add(temp1, temp1, s2, ext::uxth);
3375 
3376     __ lsr(temp0, temp1, 16);
3377     __ lsl(s2, temp0, 4);
3378     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
3380 
3381     __ subs(temp0, s2, base);
3382     __ csel(s2, temp0, s2, Assembler::HS);
3383 
3384     __ subs(len, len, nmax);
3385     __ sub(count, nmax, 16);
3386     __ br(Assembler::HS, L_nmax_loop);
3387 
3388     __ bind(L_by16);
3389     __ adds(len, len, count);
3390     __ br(Assembler::LO, L_by1);
3391 
3392     __ bind(L_by16_loop);
3393 
3394     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3395 
3396     __ add(s1, s1, temp0, ext::uxtb);
3397     __ ubfx(temp2, temp0, 8, 8);
3398     __ add(s2, s2, s1);
3399     __ add(s1, s1, temp2);
3400     __ ubfx(temp2, temp0, 16, 8);
3401     __ add(s2, s2, s1);
3402     __ add(s1, s1, temp2);
3403     __ ubfx(temp2, temp0, 24, 8);
3404     __ add(s2, s2, s1);
3405     __ add(s1, s1, temp2);
3406     __ ubfx(temp2, temp0, 32, 8);
3407     __ add(s2, s2, s1);
3408     __ add(s1, s1, temp2);
3409     __ ubfx(temp2, temp0, 40, 8);
3410     __ add(s2, s2, s1);
3411     __ add(s1, s1, temp2);
3412     __ ubfx(temp2, temp0, 48, 8);
3413     __ add(s2, s2, s1);
3414     __ add(s1, s1, temp2);
3415     __ add(s2, s2, s1);
3416     __ add(s1, s1, temp0, Assembler::LSR, 56);
3417     __ add(s2, s2, s1);
3418 
3419     __ add(s1, s1, temp1, ext::uxtb);
3420     __ ubfx(temp2, temp1, 8, 8);
3421     __ add(s2, s2, s1);
3422     __ add(s1, s1, temp2);
3423     __ ubfx(temp2, temp1, 16, 8);
3424     __ add(s2, s2, s1);
3425     __ add(s1, s1, temp2);
3426     __ ubfx(temp2, temp1, 24, 8);
3427     __ add(s2, s2, s1);
3428     __ add(s1, s1, temp2);
3429     __ ubfx(temp2, temp1, 32, 8);
3430     __ add(s2, s2, s1);
3431     __ add(s1, s1, temp2);
3432     __ ubfx(temp2, temp1, 40, 8);
3433     __ add(s2, s2, s1);
3434     __ add(s1, s1, temp2);
3435     __ ubfx(temp2, temp1, 48, 8);
3436     __ add(s2, s2, s1);
3437     __ add(s1, s1, temp2);
3438     __ add(s2, s2, s1);
3439     __ add(s1, s1, temp1, Assembler::LSR, 56);
3440     __ add(s2, s2, s1);
3441 
3442     __ subs(len, len, 16);
3443     __ br(Assembler::HS, L_by16_loop);
3444 
3445     __ bind(L_by1);
3446     __ adds(len, len, 15);
3447     __ br(Assembler::LO, L_do_mod);
3448 
3449     __ bind(L_by1_loop);
3450     __ ldrb(temp0, Address(__ post(buff, 1)));
3451     __ add(s1, temp0, s1);
3452     __ add(s2, s2, s1);
3453     __ subs(len, len, 1);
3454     __ br(Assembler::HS, L_by1_loop);
3455 
3456     __ bind(L_do_mod);
3457     // s1 = s1 % BASE
3458     __ lsr(temp0, s1, 16);
3459     __ lsl(temp1, temp0, 4);
3460     __ sub(temp1, temp1, temp0);
3461     __ add(temp1, temp1, s1, ext::uxth);
3462 
3463     __ lsr(temp0, temp1, 16);
3464     __ lsl(s1, temp0, 4);
3465     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
3467 
3468     __ subs(temp0, s1, base);
3469     __ csel(s1, temp0, s1, Assembler::HS);
3470 
3471     // s2 = s2 % BASE
3472     __ lsr(temp0, s2, 16);
3473     __ lsl(temp1, temp0, 4);
3474     __ sub(temp1, temp1, temp0);
3475     __ add(temp1, temp1, s2, ext::uxth);
3476 
3477     __ lsr(temp0, temp1, 16);
3478     __ lsl(s2, temp0, 4);
3479     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
3481 
3482     __ subs(temp0, s2, base);
3483     __ csel(s2, temp0, s2, Assembler::HS);
3484 
3485     // Combine lower bits and higher bits
3486     __ bind(L_combine);
3487     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3488 
3489     __ ret(lr);
3490 
3491     return start;
3492   }
3493 
3494   /**
3495    *  Arguments:
3496    *
3497    *  Input:
3498    *    c_rarg0   - x address
3499    *    c_rarg1   - x length
3500    *    c_rarg2   - y address
   *    c_rarg3   - y length
3502    *    c_rarg4   - z address
3503    *    c_rarg5   - z length
3504    */
3505   address generate_multiplyToLen() {
3506     __ align(CodeEntryAlignment);
3507     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3508 
3509     address start = __ pc();
3510     const Register x     = r0;
3511     const Register xlen  = r1;
3512     const Register y     = r2;
3513     const Register ylen  = r3;
3514     const Register z     = r4;
3515     const Register zlen  = r5;
3516 
3517     const Register tmp1  = r10;
3518     const Register tmp2  = r11;
3519     const Register tmp3  = r12;
3520     const Register tmp4  = r13;
3521     const Register tmp5  = r14;
3522     const Register tmp6  = r15;
3523     const Register tmp7  = r16;
3524 
3525     BLOCK_COMMENT("Entry:");
3526     __ enter(); // required for proper stackwalking of RuntimeStub frame
3527     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3528     __ leave(); // required for proper stackwalking of RuntimeStub frame
3529     __ ret(lr);
3530 
3531     return start;
3532   }
3533 
3534   address generate_squareToLen() {
    // The squareToLen algorithm for sizes 1..127 described in the Java code
    // is faster than multiply_to_len on some CPUs and slower on others, but
    // multiply_to_len shows slightly better results overall.
3538     __ align(CodeEntryAlignment);
3539     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3540     address start = __ pc();
3541 
3542     const Register x     = r0;
3543     const Register xlen  = r1;
3544     const Register z     = r2;
3545     const Register zlen  = r3;
3546     const Register y     = r4; // == x
3547     const Register ylen  = r5; // == xlen
3548 
3549     const Register tmp1  = r10;
3550     const Register tmp2  = r11;
3551     const Register tmp3  = r12;
3552     const Register tmp4  = r13;
3553     const Register tmp5  = r14;
3554     const Register tmp6  = r15;
3555     const Register tmp7  = r16;
3556 
3557     RegSet spilled_regs = RegSet::of(y, ylen);
3558     BLOCK_COMMENT("Entry:");
3559     __ enter();
3560     __ push(spilled_regs, sp);
3561     __ mov(y, x);
3562     __ mov(ylen, xlen);
3563     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3564     __ pop(spilled_regs, sp);
3565     __ leave();
3566     __ ret(lr);
3567     return start;
3568   }
3569 
3570   address generate_mulAdd() {
3571     __ align(CodeEntryAlignment);
3572     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3573 
3574     address start = __ pc();
3575 
3576     const Register out     = r0;
3577     const Register in      = r1;
3578     const Register offset  = r2;
3579     const Register len     = r3;
3580     const Register k       = r4;
3581 
3582     BLOCK_COMMENT("Entry:");
3583     __ enter();
3584     __ mul_add(out, in, offset, len, k);
3585     __ leave();
3586     __ ret(lr);
3587 
3588     return start;
3589   }
3590 
3591   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3592                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3593                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3594     // Karatsuba multiplication performs a 128*128 -> 256-bit
3595     // multiplication in three 128-bit multiplications and a few
3596     // additions.
3597     //
3598     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3599     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3600     //
3601     // Inputs:
3602     //
3603     // A0 in a.d[0]     (subkey)
3604     // A1 in a.d[1]
3605     // (A1+A0) in a1_xor_a0.d[0]
3606     //
3607     // B0 in b.d[0]     (state)
3608     // B1 in b.d[1]
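    //
    // Because we work in GF(2)[x], addition is XOR, so
    //   (A1+A0)(B1+B0) = A1*B1 + (A1*B0 + A0*B1) + A0*B0
    // and the middle term A1*B0 + A0*B1 equals E + C + D; the eor sequence
    // computing tmp2 below forms exactly the two middle 64-bit digits of the
    // identity above.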
3609 
3610     __ ext(tmp1, __ T16B, b, b, 0x08);
3611     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3612     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3613     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3614     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3615 
3616     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3617     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3618     __ eor(tmp2, __ T16B, tmp2, tmp4);
3619     __ eor(tmp2, __ T16B, tmp2, tmp3);
3620 
3621     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3622     __ ins(result_hi, __ D, tmp2, 0, 1);
3623     __ ins(result_lo, __ D, tmp2, 1, 0);
3624   }
3625 
3626   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3627                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3628     const FloatRegister t0 = result;
3629 
3630     // The GCM field polynomial f is z^128 + p(z), where p =
3631     // z^7+z^2+z+1.
3632     //
3633     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3634     //
3635     // so, given that the product we're reducing is
3636     //    a == lo + hi * z^128
3637     // substituting,
3638     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3639     //
3640     // we reduce by multiplying hi by p(z) and subtracting the result
3641     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3642     // bits we can do this with two 64-bit multiplications, lo*p and
3643     // hi*p.
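    //
    // Since hi is itself 128 bits wide, hi*p is folded in two steps: the
    // first pmull2 multiplies the top 64 bits of hi by p and xors the
    // (64-bit-shifted) result into hi and lo, then the second pmull
    // multiplies the remaining 64 bits of hi by p and xors that into lo.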
3644 
3645     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3646     __ ext(t1, __ T16B, t0, z, 8);
3647     __ eor(hi, __ T16B, hi, t1);
3648     __ ext(t1, __ T16B, z, t0, 8);
3649     __ eor(lo, __ T16B, lo, t1);
3650     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3651     __ eor(result, __ T16B, lo, t0);
3652   }
3653 
3654   address generate_has_negatives(address &has_negatives_long) {
3655     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3656     const int large_loop_size = 64;
    const uint64_t UPPER_BIT_MASK = 0x8080808080808080;
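    // A (signed) byte is negative iff its top bit is set, so OR-ing bytes
    // together and testing the result against UPPER_BIT_MASK detects a
    // negative byte anywhere in the OR-ed words.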
3658     int dcache_line = VM_Version::dcache_line_size();
3659 
3660     Register ary1 = r1, len = r2, result = r0;
3661 
3662     __ align(CodeEntryAlignment);
3663     address entry = __ pc();
3664 
3665     __ enter();
3666 
3667   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3668         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3669 
3670   __ cmp(len, 15);
3671   __ br(Assembler::GT, LEN_OVER_15);
  // The only case in which execution falls into this code is when the pointer
  // is near the end of a memory page and we have to avoid reading past it.
3674   __ add(ary1, ary1, len);
3675   __ subs(len, len, 8);
3676   __ br(Assembler::GT, LEN_OVER_8);
3677   __ ldr(rscratch2, Address(ary1, -8));
3678   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3679   __ lsrv(rscratch2, rscratch2, rscratch1);
3680   __ tst(rscratch2, UPPER_BIT_MASK);
3681   __ cset(result, Assembler::NE);
3682   __ leave();
3683   __ ret(lr);
3684   __ bind(LEN_OVER_8);
3685   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3686   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3687   __ tst(rscratch2, UPPER_BIT_MASK);
3688   __ br(Assembler::NE, RET_TRUE_NO_POP);
3689   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3690   __ lsrv(rscratch1, rscratch1, rscratch2);
3691   __ tst(rscratch1, UPPER_BIT_MASK);
3692   __ cset(result, Assembler::NE);
3693   __ leave();
3694   __ ret(lr);
3695 
3696   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3697   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3698 
3699   has_negatives_long = __ pc(); // 2nd entry point
3700 
3701   __ enter();
3702 
3703   __ bind(LEN_OVER_15);
3704     __ push(spilled_regs, sp);
3705     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3706     __ cbz(rscratch2, ALIGNED);
3707     __ ldp(tmp6, tmp1, Address(ary1));
3708     __ mov(tmp5, 16);
3709     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3710     __ add(ary1, ary1, rscratch1);
3711     __ sub(len, len, rscratch1);
3712     __ orr(tmp6, tmp6, tmp1);
3713     __ tst(tmp6, UPPER_BIT_MASK);
3714     __ br(Assembler::NE, RET_TRUE);
3715 
3716   __ bind(ALIGNED);
3717     __ cmp(len, large_loop_size);
3718     __ br(Assembler::LT, CHECK_16);
    // Perform a 16-byte load as an early return in the pre-loop to handle the
    // case where an initially aligned large array has negative values in its
    // first bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in the
    // worst case, which is slower. Cases with negative bytes further ahead are
    // not affected much; in fact they become faster due to the early loads,
    // fewer instructions and fewer branches in LARGE_LOOP.
3725     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3726     __ sub(len, len, 16);
3727     __ orr(tmp6, tmp6, tmp1);
3728     __ tst(tmp6, UPPER_BIT_MASK);
3729     __ br(Assembler::NE, RET_TRUE);
3730     __ cmp(len, large_loop_size);
3731     __ br(Assembler::LT, CHECK_16);
3732 
3733     if (SoftwarePrefetchHintDistance >= 0
3734         && SoftwarePrefetchHintDistance >= dcache_line) {
3735       // initial prefetch
3736       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3737     }
3738   __ bind(LARGE_LOOP);
3739     if (SoftwarePrefetchHintDistance >= 0) {
3740       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3741     }
    // Issue the load instructions first, since that can save a few CPU/memory
    // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)"
    // (one per ldp), it is better to generate 7 * orr(...) + 1 andr(...) +
    // 1 cbnz(...), which saves 3 instructions per iteration and has fewer
    // branches, but this approach disables the early return, so all 64 bytes
    // are loaded and checked every time.
3747     __ ldp(tmp2, tmp3, Address(ary1));
3748     __ ldp(tmp4, tmp5, Address(ary1, 16));
3749     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3750     __ ldp(tmp6, tmp1, Address(ary1, 48));
3751     __ add(ary1, ary1, large_loop_size);
3752     __ sub(len, len, large_loop_size);
3753     __ orr(tmp2, tmp2, tmp3);
3754     __ orr(tmp4, tmp4, tmp5);
3755     __ orr(rscratch1, rscratch1, rscratch2);
3756     __ orr(tmp6, tmp6, tmp1);
3757     __ orr(tmp2, tmp2, tmp4);
3758     __ orr(rscratch1, rscratch1, tmp6);
3759     __ orr(tmp2, tmp2, rscratch1);
3760     __ tst(tmp2, UPPER_BIT_MASK);
3761     __ br(Assembler::NE, RET_TRUE);
3762     __ cmp(len, large_loop_size);
3763     __ br(Assembler::GE, LARGE_LOOP);
3764 
3765   __ bind(CHECK_16); // small 16-byte load pre-loop
3766     __ cmp(len, 16);
3767     __ br(Assembler::LT, POST_LOOP16);
3768 
3769   __ bind(LOOP16); // small 16-byte load loop
3770     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3771     __ sub(len, len, 16);
3772     __ orr(tmp2, tmp2, tmp3);
3773     __ tst(tmp2, UPPER_BIT_MASK);
3774     __ br(Assembler::NE, RET_TRUE);
3775     __ cmp(len, 16);
3776     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3777 
3778   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3779     __ cmp(len, 8);
3780     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3781     __ ldr(tmp3, Address(__ post(ary1, 8)));
3782     __ sub(len, len, 8);
3783     __ tst(tmp3, UPPER_BIT_MASK);
3784     __ br(Assembler::NE, RET_TRUE);
3785 
3786   __ bind(POST_LOOP16_LOAD_TAIL);
3787     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3788     __ ldr(tmp1, Address(ary1));
3789     __ mov(tmp2, 64);
3790     __ sub(tmp4, tmp2, len, __ LSL, 3);
3791     __ lslv(tmp1, tmp1, tmp4);
3792     __ tst(tmp1, UPPER_BIT_MASK);
3793     __ br(Assembler::NE, RET_TRUE);
3794     // Fallthrough
3795 
3796   __ bind(RET_FALSE);
3797     __ pop(spilled_regs, sp);
3798     __ leave();
3799     __ mov(result, zr);
3800     __ ret(lr);
3801 
3802   __ bind(RET_TRUE);
3803     __ pop(spilled_regs, sp);
3804   __ bind(RET_TRUE_NO_POP);
3805     __ leave();
3806     __ mov(result, 1);
3807     __ ret(lr);
3808 
3809   __ bind(DONE);
3810     __ pop(spilled_regs, sp);
3811     __ leave();
3812     __ ret(lr);
3813     return entry;
3814   }
3815   /**
3816    *  Arguments:
3817    *
3818    *  Input:
3819    *  c_rarg0   - current state address
3820    *  c_rarg1   - H key address
3821    *  c_rarg2   - data address
3822    *  c_rarg3   - number of blocks
3823    *
3824    *  Output:
3825    *  Updated state at c_rarg0
3826    */
3827   address generate_ghash_processBlocks() {
3828     // Bafflingly, GCM uses little-endian for the byte order, but
3829     // big-endian for the bit order.  For example, the polynomial 1 is
3830     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3831     //
3832     // So, we must either reverse the bytes in each word and do
3833     // everything big-endian or reverse the bits in each byte and do
3834     // it little-endian.  On AArch64 it's more idiomatic to reverse
3835     // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order throughout the
3837     // calculation, bit-reversing the inputs and outputs.
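    //
    // For example, after RBIT the polynomial 1 (the byte string 80 00 ... 00)
    // becomes 01 00 ... 00, i.e. the ordinary little-endian integer 1, so
    // plain XORs and carry-less multiplies give the right GF(2^128) arithmetic.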
3838 
3839     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3840     __ align(wordSize * 2);
3841     address p = __ pc();
3842     __ emit_int64(0x87);  // The low-order bits of the field
3843                           // polynomial (i.e. p = z^7+z^2+z+1)
3844                           // repeated in the low and high parts of a
3845                           // 128-bit vector
3846     __ emit_int64(0x87);
3847 
3848     __ align(CodeEntryAlignment);
3849     address start = __ pc();
3850 
3851     Register state   = c_rarg0;
3852     Register subkeyH = c_rarg1;
3853     Register data    = c_rarg2;
3854     Register blocks  = c_rarg3;
3855 
3856     FloatRegister vzr = v30;
3857     __ eor(vzr, __ T16B, vzr, vzr); // zero register
3858 
3859     __ ldrq(v0, Address(state));
3860     __ ldrq(v1, Address(subkeyH));
3861 
3862     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
3863     __ rbit(v0, __ T16B, v0);
3864     __ rev64(v1, __ T16B, v1);
3865     __ rbit(v1, __ T16B, v1);
3866 
3867     __ ldrq(v26, p);
3868 
3869     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
3870     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
3871 
3872     {
3873       Label L_ghash_loop;
3874       __ bind(L_ghash_loop);
3875 
3876       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
3877                                                  // reversing each byte
3878       __ rbit(v2, __ T16B, v2);
3879       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
3880 
3881       // Multiply state in v2 by subkey in v1
3882       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
3883                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
3884                      /*temps*/v6, v20, v18, v21);
3885       // Reduce v7:v5 by the field polynomial
3886       ghash_reduce(v0, v5, v7, v26, vzr, v20);
3887 
3888       __ sub(blocks, blocks, 1);
3889       __ cbnz(blocks, L_ghash_loop);
3890     }
3891 
3892     // The bit-reversed result is at this point in v0
3893     __ rev64(v1, __ T16B, v0);
3894     __ rbit(v1, __ T16B, v1);
3895 
3896     __ st1(v1, __ T16B, state);
3897     __ ret(lr);
3898 
3899     return start;
3900   }
3901 
3902   // Continuation point for throwing of implicit exceptions that are
3903   // not handled in the current activation. Fabricates an exception
3904   // oop and initiates normal exception dispatching in this
3905   // frame. Since we need to preserve callee-saved values (currently
3906   // only for C2, but done for C1 as well) we need a callee-saved oop
3907   // map and therefore have to make these stubs into RuntimeStubs
3908   // rather than BufferBlobs.  If the compiler needs all registers to
3909   // be preserved between the fault point and the exception handler
3910   // then it must assume responsibility for that in
3911   // AbstractCompiler::continuation_for_implicit_null_exception or
3912   // continuation_for_implicit_division_by_zero_exception. All other
3913   // implicit exceptions (e.g., NullPointerException or
3914   // AbstractMethodError on entry) are either at call sites or
3915   // otherwise assume that stack unwinding will be initiated, so
3916   // caller saved registers were assumed volatile in the compiler.
3917 
3918 #undef __
3919 #define __ masm->
3920 
3921   address generate_throw_exception(const char* name,
3922                                    address runtime_entry,
3923                                    Register arg1 = noreg,
3924                                    Register arg2 = noreg) {
3925     // Information about frame layout at time of blocking runtime call.
3926     // Note that we only have to preserve callee-saved registers since
3927     // the compilers are responsible for supplying a continuation point
3928     // if they expect all registers to be preserved.
3929     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
3930     enum layout {
3931       rfp_off = 0,
3932       rfp_off2,
3933       return_off,
3934       return_off2,
3935       framesize // inclusive of return address
3936     };
3937 
3938     int insts_size = 512;
3939     int locs_size  = 64;
3940 
3941     CodeBuffer code(name, insts_size, locs_size);
3942     OopMapSet* oop_maps  = new OopMapSet();
3943     MacroAssembler* masm = new MacroAssembler(&code);
3944 
3945     address start = __ pc();
3946 
3947     // This is an inlined and slightly modified version of call_VM
3948     // which has the ability to fetch the return PC out of
3949     // thread-local storage and also sets up last_Java_sp slightly
3950     // differently than the real call_VM
3951 
3952     __ enter(); // Save FP and LR before call
3953 
3954     assert(is_even(framesize/2), "sp not 16-byte aligned");
3955 
3956     // lr and fp are already in place
3957     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
3958 
3959     int frame_complete = __ pc() - start;
3960 
3961     // Set up last_Java_sp and last_Java_fp
3962     address the_pc = __ pc();
3963     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
3964 
3965     // Call runtime
3966     if (arg1 != noreg) {
3967       assert(arg2 != c_rarg1, "clobbered");
3968       __ mov(c_rarg1, arg1);
3969     }
3970     if (arg2 != noreg) {
3971       __ mov(c_rarg2, arg2);
3972     }
3973     __ mov(c_rarg0, rthread);
3974     BLOCK_COMMENT("call runtime_entry");
3975     __ mov(rscratch1, runtime_entry);
3976     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
3977 
3978     // Generate oop map
3979     OopMap* map = new OopMap(framesize, 0);
3980 
3981     oop_maps->add_gc_map(the_pc - start, map);
3982 
3983     __ reset_last_Java_frame(true);
3984     __ maybe_isb();
3985 
3986     __ leave();
3987 
3988     // check for pending exceptions
3989 #ifdef ASSERT
3990     Label L;
3991     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
3992     __ cbnz(rscratch1, L);
3993     __ should_not_reach_here();
3994     __ bind(L);
3995 #endif // ASSERT
3996     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3997 
3998 
3999     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4000     RuntimeStub* stub =
4001       RuntimeStub::new_runtime_stub(name,
4002                                     &code,
4003                                     frame_complete,
4004                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4005                                     oop_maps, false);
4006     return stub->entry_point();
4007   }
4008 
4009   class MontgomeryMultiplyGenerator : public MacroAssembler {
4010 
4011     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4012       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4013 
4014     RegSet _toSave;
4015     bool _squaring;
4016 
4017   public:
4018     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4019       : MacroAssembler(as->code()), _squaring(squaring) {
4020 
4021       // Register allocation
4022 
4023       Register reg = c_rarg0;
4024       Pa_base = reg;       // Argument registers
4025       if (squaring)
4026         Pb_base = Pa_base;
4027       else
4028         Pb_base = ++reg;
4029       Pn_base = ++reg;
      Rlen = ++reg;
4031       inv = ++reg;
4032       Pm_base = ++reg;
4033 
4034                           // Working registers:
4035       Ra =  ++reg;        // The current digit of a, b, n, and m.
4036       Rb =  ++reg;
4037       Rm =  ++reg;
4038       Rn =  ++reg;
4039 
4040       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4041       Pb =  ++reg;
4042       Pm =  ++reg;
4043       Pn =  ++reg;
4044 
4045       t0 =  ++reg;        // Three registers which form a
      t1 =  ++reg;        // triple-precision accumulator.
4047       t2 =  ++reg;
4048 
4049       Ri =  ++reg;        // Inner and outer loop indexes.
4050       Rj =  ++reg;
4051 
4052       Rhi_ab = ++reg;     // Product registers: low and high parts
4053       Rlo_ab = ++reg;     // of a*b and m*n.
4054       Rhi_mn = ++reg;
4055       Rlo_mn = ++reg;
4056 
4057       // r19 and up are callee-saved.
4058       _toSave = RegSet::range(r19, reg) + Pm_base;
4059     }
4060 
4061   private:
4062     void save_regs() {
4063       push(_toSave, sp);
4064     }
4065 
4066     void restore_regs() {
4067       pop(_toSave, sp);
4068     }
4069 
4070     template <typename T>
4071     void unroll_2(Register count, T block) {
4072       Label loop, end, odd;
4073       tbnz(count, 0, odd);
4074       cbz(count, end);
4075       align(16);
4076       bind(loop);
4077       (this->*block)();
4078       bind(odd);
4079       (this->*block)();
4080       subs(count, count, 2);
4081       br(Assembler::GT, loop);
4082       bind(end);
4083     }
4084 
4085     template <typename T>
4086     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4087       Label loop, end, odd;
4088       tbnz(count, 0, odd);
4089       cbz(count, end);
4090       align(16);
4091       bind(loop);
4092       (this->*block)(d, s, tmp);
4093       bind(odd);
4094       (this->*block)(d, s, tmp);
4095       subs(count, count, 2);
4096       br(Assembler::GT, loop);
4097       bind(end);
4098     }
4099 
4100     void pre1(RegisterOrConstant i) {
4101       block_comment("pre1");
4102       // Pa = Pa_base;
4103       // Pb = Pb_base + i;
4104       // Pm = Pm_base;
4105       // Pn = Pn_base + i;
4106       // Ra = *Pa;
4107       // Rb = *Pb;
4108       // Rm = *Pm;
4109       // Rn = *Pn;
4110       ldr(Ra, Address(Pa_base));
4111       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4112       ldr(Rm, Address(Pm_base));
4113       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4114       lea(Pa, Address(Pa_base));
4115       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4116       lea(Pm, Address(Pm_base));
4117       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4118 
4119       // Zero the m*n result.
4120       mov(Rhi_mn, zr);
4121       mov(Rlo_mn, zr);
4122     }
4123 
4124     // The core multiply-accumulate step of a Montgomery
4125     // multiplication.  The idea is to schedule operations as a
4126     // pipeline so that instructions with long latencies (loads and
4127     // multiplies) have time to complete before their results are
4128     // used.  This most benefits in-order implementations of the
4129     // architecture but out-of-order ones also benefit.
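    // In the comments below, MACC(x, y, t0, t1, t2) denotes the
    // triple-precision accumulation (t2:t1:t0) += x * y: the 128-bit product
    // is formed with mul/umulh and folded in with acc().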
4130     void step() {
4131       block_comment("step");
4132       // MACC(Ra, Rb, t0, t1, t2);
4133       // Ra = *++Pa;
4134       // Rb = *--Pb;
4135       umulh(Rhi_ab, Ra, Rb);
4136       mul(Rlo_ab, Ra, Rb);
4137       ldr(Ra, pre(Pa, wordSize));
4138       ldr(Rb, pre(Pb, -wordSize));
4139       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
4140                                        // previous iteration.
4141       // MACC(Rm, Rn, t0, t1, t2);
4142       // Rm = *++Pm;
4143       // Rn = *--Pn;
4144       umulh(Rhi_mn, Rm, Rn);
4145       mul(Rlo_mn, Rm, Rn);
4146       ldr(Rm, pre(Pm, wordSize));
4147       ldr(Rn, pre(Pn, -wordSize));
4148       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4149     }
4150 
4151     void post1() {
4152       block_comment("post1");
4153 
4154       // MACC(Ra, Rb, t0, t1, t2);
4155       // Ra = *++Pa;
4156       // Rb = *--Pb;
4157       umulh(Rhi_ab, Ra, Rb);
4158       mul(Rlo_ab, Ra, Rb);
4159       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4160       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4161 
4162       // *Pm = Rm = t0 * inv;
4163       mul(Rm, t0, inv);
4164       str(Rm, Address(Pm));
4165 
4166       // MACC(Rm, Rn, t0, t1, t2);
4167       // t0 = t1; t1 = t2; t2 = 0;
4168       umulh(Rhi_mn, Rm, Rn);
4169 
4170 #ifndef PRODUCT
4171       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4172       {
4173         mul(Rlo_mn, Rm, Rn);
4174         add(Rlo_mn, t0, Rlo_mn);
4175         Label ok;
4176         cbz(Rlo_mn, ok); {
4177           stop("broken Montgomery multiply");
4178         } bind(ok);
4179       }
4180 #endif
4181       // We have very carefully set things up so that
4182       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4183       // the lower half of Rm * Rn because we know the result already:
4184       // it must be -t0.  t0 + (-t0) must generate a carry iff
4185       // t0 != 0.  So, rather than do a mul and an adds we just set
4186       // the carry flag iff t0 is nonzero.
4187       //
4188       // mul(Rlo_mn, Rm, Rn);
4189       // adds(zr, t0, Rlo_mn);
4190       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4191       adcs(t0, t1, Rhi_mn);
4192       adc(t1, t2, zr);
4193       mov(t2, zr);
4194     }
4195 
4196     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
4197       block_comment("pre2");
4198       // Pa = Pa_base + i-len;
4199       // Pb = Pb_base + len;
4200       // Pm = Pm_base + i-len;
4201       // Pn = Pn_base + len;
4202 
4203       if (i.is_register()) {
4204         sub(Rj, i.as_register(), len);
4205       } else {
4206         mov(Rj, i.as_constant());
4207         sub(Rj, Rj, len);
4208       }
4209       // Rj == i-len
4210 
4211       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
4212       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
4213       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4214       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
4215 
4216       // Ra = *++Pa;
4217       // Rb = *--Pb;
4218       // Rm = *++Pm;
4219       // Rn = *--Pn;
4220       ldr(Ra, pre(Pa, wordSize));
4221       ldr(Rb, pre(Pb, -wordSize));
4222       ldr(Rm, pre(Pm, wordSize));
4223       ldr(Rn, pre(Pn, -wordSize));
4224 
4225       mov(Rhi_mn, zr);
4226       mov(Rlo_mn, zr);
4227     }
4228 
4229     void post2(RegisterOrConstant i, RegisterOrConstant len) {
4230       block_comment("post2");
4231       if (i.is_constant()) {
4232         mov(Rj, i.as_constant()-len.as_constant());
4233       } else {
4234         sub(Rj, i.as_register(), len);
4235       }
4236 
4237       adds(t0, t0, Rlo_mn); // The pending m*n, low part
4238 
4239       // As soon as we know the least significant digit of our result,
4240       // store it.
4241       // Pm_base[i-len] = t0;
4242       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4243 
4244       // t0 = t1; t1 = t2; t2 = 0;
4245       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
4246       adc(t1, t2, zr);
4247       mov(t2, zr);
4248     }
4249 
4250     // A carry in t0 after Montgomery multiplication means that we
4251     // should subtract multiples of n from our result in m.  We'll
4252     // keep doing that until there is no carry.
4253     void normalize(RegisterOrConstant len) {
4254       block_comment("normalize");
4255       // while (t0)
4256       //   t0 = sub(Pm_base, Pn_base, t0, len);
4257       Label loop, post, again;
4258       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
4259       cbz(t0, post); {
4260         bind(again); {
4261           mov(i, zr);
4262           mov(cnt, len);
4263           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4264           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4265           subs(zr, zr, zr); // set carry flag, i.e. no borrow
4266           align(16);
4267           bind(loop); {
4268             sbcs(Rm, Rm, Rn);
4269             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4270             add(i, i, 1);
4271             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4272             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4273             sub(cnt, cnt, 1);
4274           } cbnz(cnt, loop);
4275           sbc(t0, t0, zr);
4276         } cbnz(t0, again);
4277       } bind(post);
4278     }
4279 
4280     // Move memory at s to d, reversing words.
4281     //    Increments d to end of copied memory
4282     //    Destroys tmp1, tmp2
4283     //    Preserves len
4284     //    Leaves s pointing to the address which was in d at start
4285     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
4286       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
4287 
4288       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
4289       mov(tmp1, len);
4290       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
4291       sub(s, d, len, ext::uxtw, LogBytesPerWord);
4292     }
4293     // where
4294     void reverse1(Register d, Register s, Register tmp) {
4295       ldr(tmp, pre(s, -wordSize));
4296       ror(tmp, tmp, 32);
4297       str(tmp, post(d, wordSize));
4298     }
4299 
4300     void step_squaring() {
4301       // An extra ACC
4302       step();
4303       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4304     }
4305 
4306     void last_squaring(RegisterOrConstant i) {
4307       Label dont;
4308       // if ((i & 1) == 0) {
4309       tbnz(i.as_register(), 0, dont); {
4310         // MACC(Ra, Rb, t0, t1, t2);
4311         // Ra = *++Pa;
4312         // Rb = *--Pb;
4313         umulh(Rhi_ab, Ra, Rb);
4314         mul(Rlo_ab, Ra, Rb);
4315         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4316       } bind(dont);
4317     }
4318 
4319     void extra_step_squaring() {
4320       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4321 
4322       // MACC(Rm, Rn, t0, t1, t2);
4323       // Rm = *++Pm;
4324       // Rn = *--Pn;
4325       umulh(Rhi_mn, Rm, Rn);
4326       mul(Rlo_mn, Rm, Rn);
4327       ldr(Rm, pre(Pm, wordSize));
4328       ldr(Rn, pre(Pn, -wordSize));
4329     }
4330 
4331     void post1_squaring() {
4332       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4333 
4334       // *Pm = Rm = t0 * inv;
4335       mul(Rm, t0, inv);
4336       str(Rm, Address(Pm));
4337 
4338       // MACC(Rm, Rn, t0, t1, t2);
4339       // t0 = t1; t1 = t2; t2 = 0;
4340       umulh(Rhi_mn, Rm, Rn);
4341 
4342 #ifndef PRODUCT
4343       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4344       {
4345         mul(Rlo_mn, Rm, Rn);
4346         add(Rlo_mn, t0, Rlo_mn);
4347         Label ok;
4348         cbz(Rlo_mn, ok); {
4349           stop("broken Montgomery multiply");
4350         } bind(ok);
4351       }
4352 #endif
4353       // We have very carefully set things up so that
4354       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4355       // the lower half of Rm * Rn because we know the result already:
4356       // it must be -t0.  t0 + (-t0) must generate a carry iff
4357       // t0 != 0.  So, rather than do a mul and an adds we just set
4358       // the carry flag iff t0 is nonzero.
4359       //
4360       // mul(Rlo_mn, Rm, Rn);
4361       // adds(zr, t0, Rlo_mn);
4362       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4363       adcs(t0, t1, Rhi_mn);
4364       adc(t1, t2, zr);
4365       mov(t2, zr);
4366     }
4367 
4368     void acc(Register Rhi, Register Rlo,
4369              Register t0, Register t1, Register t2) {
4370       adds(t0, t0, Rlo);
4371       adcs(t1, t1, Rhi);
4372       adc(t2, t2, zr);
4373     }
4374 
4375   public:
4376     /**
4377      * Fast Montgomery multiplication.  The derivation of the
4378      * algorithm is in A Cryptographic Library for the Motorola
4379      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
4380      *
4381      * Arguments:
4382      *
4383      * Inputs for multiplication:
4384      *   c_rarg0   - int array elements a
4385      *   c_rarg1   - int array elements b
4386      *   c_rarg2   - int array elements n (the modulus)
4387      *   c_rarg3   - int length
4388      *   c_rarg4   - int inv
4389      *   c_rarg5   - int array elements m (the result)
4390      *
4391      * Inputs for squaring:
4392      *   c_rarg0   - int array elements a
4393      *   c_rarg1   - int array elements n (the modulus)
4394      *   c_rarg2   - int length
4395      *   c_rarg3   - int inv
4396      *   c_rarg4   - int array elements m (the result)
4397      *
4398      */
4399     address generate_multiply() {
4400       Label argh, nothing;
4401       bind(argh);
4402       stop("MontgomeryMultiply total_allocation must be <= 8192");
4403 
4404       align(CodeEntryAlignment);
4405       address entry = pc();
4406 
4407       cbzw(Rlen, nothing);
4408 
4409       enter();
4410 
4411       // Make room.
4412       cmpw(Rlen, 512);
4413       br(Assembler::HI, argh);
4414       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4415       andr(sp, Ra, -2 * wordSize);
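           // sp now points to a 16-byte-aligned scratch area of
           // Rlen * 4 * sizeof(jint) bytes (at most 512 * 16 = 8192 bytes,
           // hence the check above) which will hold reversed copies of
           // the input arrays.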
4416 
4417       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4418 
4419       {
4420         // Copy input args, reversing as we go.  We use Ra as a
4421         // temporary variable.
4422         reverse(Ra, Pa_base, Rlen, t0, t1);
4423         if (!_squaring)
4424           reverse(Ra, Pb_base, Rlen, t0, t1);
4425         reverse(Ra, Pn_base, Rlen, t0, t1);
4426       }
4427 
4428       // Push all call-saved registers and also Pm_base which we'll need
4429       // at the end.
4430       save_regs();
4431 
4432 #ifndef PRODUCT
4433       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
4434       {
4435         ldr(Rn, Address(Pn_base, 0));
4436         mul(Rlo_mn, Rn, inv);
4437         cmp(Rlo_mn, -1);
4438         Label ok;
4439         br(EQ, ok); {
4440           stop("broken inverse in Montgomery multiply");
4441         } bind(ok);
4442       }
4443 #endif
4444 
4445       mov(Pm_base, Ra);
4446 
4447       mov(t0, zr);
4448       mov(t1, zr);
4449       mov(t2, zr);
4450 
4451       block_comment("for (int i = 0; i < len; i++) {");
4452       mov(Ri, zr); {
4453         Label loop, end;
4454         cmpw(Ri, Rlen);
4455         br(Assembler::GE, end);
4456 
4457         bind(loop);
4458         pre1(Ri);
4459 
4460         block_comment("  for (j = i; j; j--) {"); {
4461           movw(Rj, Ri);
4462           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4463         } block_comment("  } // j");
4464 
4465         post1();
4466         addw(Ri, Ri, 1);
4467         cmpw(Ri, Rlen);
4468         br(Assembler::LT, loop);
4469         bind(end);
4470         block_comment("} // i");
4471       }
4472 
4473       block_comment("for (int i = len; i < 2*len; i++) {");
4474       mov(Ri, Rlen); {
4475         Label loop, end;
4476         cmpw(Ri, Rlen, Assembler::LSL, 1);
4477         br(Assembler::GE, end);
4478 
4479         bind(loop);
4480         pre2(Ri, Rlen);
4481 
4482         block_comment("  for (j = len*2-i-1; j; j--) {"); {
4483           lslw(Rj, Rlen, 1);
4484           subw(Rj, Rj, Ri);
4485           subw(Rj, Rj, 1);
4486           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4487         } block_comment("  } // j");
4488 
4489         post2(Ri, Rlen);
4490         addw(Ri, Ri, 1);
4491         cmpw(Ri, Rlen, Assembler::LSL, 1);
4492         br(Assembler::LT, loop);
4493         bind(end);
4494       }
4495       block_comment("} // i");
4496 
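           // Final conditional subtraction of the modulus - the
           // "while (t0) t0 = sub(Pm_base, Pn_base, t0, len)" loop in
           // the C reference below.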
4497       normalize(Rlen);
4498 
4499       mov(Ra, Pm_base);  // Save Pm_base in Ra
4500       restore_regs();  // Restore caller's Pm_base
4501 
4502       // Copy our result into caller's Pm_base
4503       reverse(Pm_base, Ra, Rlen, t0, t1);
4504 
4505       leave();
4506       bind(nothing);
4507       ret(lr);
4508 
4509       return entry;
4510     }
4511     // In C, approximately:
4512 
4513     // void
4514     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
4515     //                     unsigned long Pn_base[], unsigned long Pm_base[],
4516     //                     unsigned long inv, int len) {
4517     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4518     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4519     //   unsigned long Ra, Rb, Rn, Rm;
4520 
4521     //   int i;
4522 
4523     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4524 
4525     //   for (i = 0; i < len; i++) {
4526     //     int j;
4527 
4528     //     Pa = Pa_base;
4529     //     Pb = Pb_base + i;
4530     //     Pm = Pm_base;
4531     //     Pn = Pn_base + i;
4532 
4533     //     Ra = *Pa;
4534     //     Rb = *Pb;
4535     //     Rm = *Pm;
4536     //     Rn = *Pn;
4537 
4538     //     int iters = i;
4539     //     for (j = 0; iters--; j++) {
4540     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4541     //       MACC(Ra, Rb, t0, t1, t2);
4542     //       Ra = *++Pa;
4543     //       Rb = *--Pb;
4544     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4545     //       MACC(Rm, Rn, t0, t1, t2);
4546     //       Rm = *++Pm;
4547     //       Rn = *--Pn;
4548     //     }
4549 
4550     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
4551     //     MACC(Ra, Rb, t0, t1, t2);
4552     //     *Pm = Rm = t0 * inv;
4553     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4554     //     MACC(Rm, Rn, t0, t1, t2);
4555 
4556     //     assert(t0 == 0, "broken Montgomery multiply");
4557 
4558     //     t0 = t1; t1 = t2; t2 = 0;
4559     //   }
4560 
4561     //   for (i = len; i < 2*len; i++) {
4562     //     int j;
4563 
4564     //     Pa = Pa_base + i-len;
4565     //     Pb = Pb_base + len;
4566     //     Pm = Pm_base + i-len;
4567     //     Pn = Pn_base + len;
4568 
4569     //     Ra = *++Pa;
4570     //     Rb = *--Pb;
4571     //     Rm = *++Pm;
4572     //     Rn = *--Pn;
4573 
4574     //     int iters = len*2-i-1;
4575     //     for (j = i-len+1; iters--; j++) {
4576     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4577     //       MACC(Ra, Rb, t0, t1, t2);
4578     //       Ra = *++Pa;
4579     //       Rb = *--Pb;
4580     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4581     //       MACC(Rm, Rn, t0, t1, t2);
4582     //       Rm = *++Pm;
4583     //       Rn = *--Pn;
4584     //     }
4585 
4586     //     Pm_base[i-len] = t0;
4587     //     t0 = t1; t1 = t2; t2 = 0;
4588     //   }
4589 
4590     //   while (t0)
4591     //     t0 = sub(Pm_base, Pn_base, t0, len);
4592     // }
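
         // where MACC(A, B, T0, T1, T2) multiply-accumulates the 128-bit
         // product A*B into the triple-word accumulator T2:T1:T0, and
         // MACC2 (used in the squaring reference further down) accumulates
         // the product twice.  A rough C++ sketch, not part of the original
         // code, assuming a 128-bit unsigned type is available:

         // static void MACC(unsigned long a, unsigned long b,
         //                  unsigned long &t0, unsigned long &t1,
         //                  unsigned long &t2) {
         //   unsigned __int128 p = (unsigned __int128)a * b;
         //   unsigned __int128 s = (unsigned __int128)t0 + (unsigned long)p;
         //   t0 = (unsigned long)s;                           // low word
         //   s = (s >> 64) + t1 + (unsigned long)(p >> 64);
         //   t1 = (unsigned long)s;                           // middle word
         //   t2 += (unsigned long)(s >> 64);                  // carry out
         // }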
4593 
4594     /**
4595      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
4596      * multiplies than Montgomery multiplication so it should be up to
4597      * 25% faster.  However, its loop control is more complex and it
4598      * may actually run slower on some machines.
4599      *
4600      * Arguments:
4601      *
4602      * Inputs:
4603      *   c_rarg0   - int array elements a
4604      *   c_rarg1   - int array elements n (the modulus)
4605      *   c_rarg2   - int length
4606      *   c_rarg3   - int inv
4607      *   c_rarg4   - int array elements m (the result)
4608      *
4609      */
4610     address generate_square() {
4611       Label argh;
4612       bind(argh);
4613       stop("MontgomeryMultiply total_allocation must be <= 8192");
4614 
4615       align(CodeEntryAlignment);
4616       address entry = pc();
4617 
4618       enter();
4619 
4620       // Make room.
4621       cmpw(Rlen, 512);
4622       br(Assembler::HI, argh);
4623       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4624       andr(sp, Ra, -2 * wordSize);
4625 
4626       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4627 
4628       {
4629         // Copy input args, reversing as we go.  We use Ra as a
4630         // temporary variable.
4631         reverse(Ra, Pa_base, Rlen, t0, t1);
4632         reverse(Ra, Pn_base, Rlen, t0, t1);
4633       }
4634 
4635       // Push all call-saved registers and also Pm_base which we'll need
4636       // at the end.
4637       save_regs();
4638 
4639       mov(Pm_base, Ra);
4640 
4641       mov(t0, zr);
4642       mov(t1, zr);
4643       mov(t2, zr);
4644 
4645       block_comment("for (int i = 0; i < len; i++) {");
4646       mov(Ri, zr); {
4647         Label loop, end;
4648         bind(loop);
4649         cmp(Ri, Rlen);
4650         br(Assembler::GE, end);
4651 
4652         pre1(Ri);
4653 
4654         block_comment("  for (j = (i+1)/2; j; j--) {"); {
4655           add(Rj, Ri, 1);
4656           lsr(Rj, Rj, 1);
4657           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4658         } block_comment("  } // j");
4659 
4660         last_squaring(Ri);
4661 
4662         block_comment("  for (j = i/2; j; j--) {"); {
4663           lsr(Rj, Ri, 1);
4664           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4665         } block_comment("  } // j");
4666 
4667         post1_squaring();
4668         add(Ri, Ri, 1);
4669         cmp(Ri, Rlen);
4670         br(Assembler::LT, loop);
4671 
4672         bind(end);
4673         block_comment("} // i");
4674       }
4675 
4676       block_comment("for (int i = len; i < 2*len; i++) {");
4677       mov(Ri, Rlen); {
4678         Label loop, end;
4679         bind(loop);
4680         cmp(Ri, Rlen, Assembler::LSL, 1);
4681         br(Assembler::GE, end);
4682 
4683         pre2(Ri, Rlen);
4684 
4685         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4686           lsl(Rj, Rlen, 1);
4687           sub(Rj, Rj, Ri);
4688           sub(Rj, Rj, 1);
4689           lsr(Rj, Rj, 1);
4690           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4691         } block_comment("  } // j");
4692 
4693         last_squaring(Ri);
4694 
4695         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4696           lsl(Rj, Rlen, 1);
4697           sub(Rj, Rj, Ri);
4698           lsr(Rj, Rj, 1);
4699           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4700         } block_comment("  } // j");
4701 
4702         post2(Ri, Rlen);
4703         add(Ri, Ri, 1);
4704         cmp(Ri, Rlen, Assembler::LSL, 1);
4705 
4706         br(Assembler::LT, loop);
4707         bind(end);
4708         block_comment("} // i");
4709       }
4710 
4711       normalize(Rlen);
4712 
4713       mov(Ra, Pm_base);  // Save Pm_base in Ra
4714       restore_regs();  // Restore caller's Pm_base
4715 
4716       // Copy our result into caller's Pm_base
4717       reverse(Pm_base, Ra, Rlen, t0, t1);
4718 
4719       leave();
4720       ret(lr);
4721 
4722       return entry;
4723     }
4724     // In C, approximately:
4725 
4726     // void
4727     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4728     //                   unsigned long Pm_base[], unsigned long inv, int len) {
4729     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4730     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4731     //   unsigned long Ra, Rb, Rn, Rm;
4732 
4733     //   int i;
4734 
4735     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4736 
4737     //   for (i = 0; i < len; i++) {
4738     //     int j;
4739 
4740     //     Pa = Pa_base;
4741     //     Pb = Pa_base + i;
4742     //     Pm = Pm_base;
4743     //     Pn = Pn_base + i;
4744 
4745     //     Ra = *Pa;
4746     //     Rb = *Pb;
4747     //     Rm = *Pm;
4748     //     Rn = *Pn;
4749 
4750     //     int iters = (i+1)/2;
4751     //     for (j = 0; iters--; j++) {
4752     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4753     //       MACC2(Ra, Rb, t0, t1, t2);
4754     //       Ra = *++Pa;
4755     //       Rb = *--Pb;
4756     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4757     //       MACC(Rm, Rn, t0, t1, t2);
4758     //       Rm = *++Pm;
4759     //       Rn = *--Pn;
4760     //     }
4761     //     if ((i & 1) == 0) {
4762     //       assert(Ra == Pa_base[j], "must be");
4763     //       MACC(Ra, Ra, t0, t1, t2);
4764     //     }
4765     //     iters = i/2;
4766     //     assert(iters == i-j, "must be");
4767     //     for (; iters--; j++) {
4768     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4769     //       MACC(Rm, Rn, t0, t1, t2);
4770     //       Rm = *++Pm;
4771     //       Rn = *--Pn;
4772     //     }
4773 
4774     //     *Pm = Rm = t0 * inv;
4775     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4776     //     MACC(Rm, Rn, t0, t1, t2);
4777 
4778     //     assert(t0 == 0, "broken Montgomery multiply");
4779 
4780     //     t0 = t1; t1 = t2; t2 = 0;
4781     //   }
4782 
4783     //   for (i = len; i < 2*len; i++) {
4784     //     int start = i-len+1;
4785     //     int end = start + (len - start)/2;
4786     //     int j;
4787 
4788     //     Pa = Pa_base + i-len;
4789     //     Pb = Pa_base + len;
4790     //     Pm = Pm_base + i-len;
4791     //     Pn = Pn_base + len;
4792 
4793     //     Ra = *++Pa;
4794     //     Rb = *--Pb;
4795     //     Rm = *++Pm;
4796     //     Rn = *--Pn;
4797 
4798     //     int iters = (2*len-i-1)/2;
4799     //     assert(iters == end-start, "must be");
4800     //     for (j = start; iters--; j++) {
4801     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4802     //       MACC2(Ra, Rb, t0, t1, t2);
4803     //       Ra = *++Pa;
4804     //       Rb = *--Pb;
4805     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4806     //       MACC(Rm, Rn, t0, t1, t2);
4807     //       Rm = *++Pm;
4808     //       Rn = *--Pn;
4809     //     }
4810     //     if ((i & 1) == 0) {
4811     //       assert(Ra == Pa_base[j], "must be");
4812     //       MACC(Ra, Ra, t0, t1, t2);
4813     //     }
4814     //     iters = (2*len-i)/2;
4815     //     assert(iters == len-j, "must be");
4816     //     for (; iters--; j++) {
4817     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4818     //       MACC(Rm, Rn, t0, t1, t2);
4819     //       Rm = *++Pm;
4820     //       Rn = *--Pn;
4821     //     }
4822     //     Pm_base[i-len] = t0;
4823     //     t0 = t1; t1 = t2; t2 = 0;
4824     //   }
4825 
4826     //   while (t0)
4827     //     t0 = sub(Pm_base, Pn_base, t0, len);
4828     // }
4829   };
4830 
4831 
4832   // Initialization
4833   void generate_initial() {
4834     // Generates the initial stubs and initializes the entry points
4835 
4836     // Entry points that exist on all platforms.  Note: this is code
4837     // that could be shared among different platforms - however the
4838     // benefit seems to be smaller than the disadvantage of having a
4839     // much more complicated generator structure.  See also the comment
4840     // in stubRoutines.hpp.
4841 
4842     StubRoutines::_forward_exception_entry = generate_forward_exception();
4843 
4844     StubRoutines::_call_stub_entry =
4845       generate_call_stub(StubRoutines::_call_stub_return_address);
4846 
4847     // catch_exception_entry is referenced by megamorphic calls.
4848     StubRoutines::_catch_exception_entry = generate_catch_exception();
4849 
4850     // Build this early so it's available for the interpreter.
4851     StubRoutines::_throw_StackOverflowError_entry =
4852       generate_throw_exception("StackOverflowError throw_exception",
4853                                CAST_FROM_FN_PTR(address,
4854                                                 SharedRuntime::throw_StackOverflowError));
4855     StubRoutines::_throw_delayed_StackOverflowError_entry =
4856       generate_throw_exception("delayed StackOverflowError throw_exception",
4857                                CAST_FROM_FN_PTR(address,
4858                                                 SharedRuntime::throw_delayed_StackOverflowError));
4859     if (UseCRC32Intrinsics) {
4860       // Set the CRC table address before generating the stubs that use it.
4861       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4862       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4863     }
4864 
4865     if (UseCRC32CIntrinsics) {
4866       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
4867     }
4868   }
4869 
4870   void generate_all() {
4871     // support for verify_oop (must happen after universe_init)
4872     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
4873     StubRoutines::_throw_AbstractMethodError_entry =
4874       generate_throw_exception("AbstractMethodError throw_exception",
4875                                CAST_FROM_FN_PTR(address,
4876                                                 SharedRuntime::
4877                                                 throw_AbstractMethodError));
4878 
4879     StubRoutines::_throw_IncompatibleClassChangeError_entry =
4880       generate_throw_exception("IncompatibleClassChangeError throw_exception",
4881                                CAST_FROM_FN_PTR(address,
4882                                                 SharedRuntime::
4883                                                 throw_IncompatibleClassChangeError));
4884 
4885     StubRoutines::_throw_NullPointerException_at_call_entry =
4886       generate_throw_exception("NullPointerException at call throw_exception",
4887                                CAST_FROM_FN_PTR(address,
4888                                                 SharedRuntime::
4889                                                 throw_NullPointerException_at_call));
4890 
4891     // arraycopy stubs used by compilers
4892     generate_arraycopy_stubs();
4893 
4894     // has_negatives stub for large arrays.
4895     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
4896 
4897     if (UseMultiplyToLenIntrinsic) {
4898       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4899     }
4900 
4901     if (UseSquareToLenIntrinsic) {
4902       StubRoutines::_squareToLen = generate_squareToLen();
4903     }
4904 
4905     if (UseMulAddIntrinsic) {
4906       StubRoutines::_mulAdd = generate_mulAdd();
4907     }
4908 
4909     if (UseMontgomeryMultiplyIntrinsic) {
4910       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4911       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4912       StubRoutines::_montgomeryMultiply = g.generate_multiply();
4913     }
4914 
4915     if (UseMontgomerySquareIntrinsic) {
4916       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4917       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4918       // We use generate_multiply() rather than generate_square()
4919       // because it's faster for the sizes of modulus we care about.
4920       StubRoutines::_montgomerySquare = g.generate_multiply();
4921     }
4922 
4923 #ifndef BUILTIN_SIM
4924     // generate GHASH intrinsics code
4925     if (UseGHASHIntrinsics) {
4926       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
4927     }
4928 
4929     if (UseAESIntrinsics) {
4930       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4931       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4932       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4933       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
4934     }
4935 
4936     if (UseSHA1Intrinsics) {
4937       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
4938       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
4939     }
4940     if (UseSHA256Intrinsics) {
4941       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
4942       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
4943     }
4944 
4945     // generate Adler32 intrinsics code
4946     if (UseAdler32Intrinsics) {
4947       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
4948     }
4949 
4950     // Safefetch stubs.
4951     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
4952                                                        &StubRoutines::_safefetch32_fault_pc,
4953                                                        &StubRoutines::_safefetch32_continuation_pc);
4954     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
4955                                                        &StubRoutines::_safefetchN_fault_pc,
4956                                                        &StubRoutines::_safefetchN_continuation_pc);
4957 #endif
4958     StubRoutines::aarch64::set_completed();
4959   }
4960 
4961  public:
4962   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
4963     if (all) {
4964       generate_all();
4965     } else {
4966       generate_initial();
4967     }
4968   }
4969 }; // end class declaration
4970 
4971 void StubGenerator_generate(CodeBuffer* code, bool all) {
4972   StubGenerator g(code, all);
4973 }