1 /*
   2  * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/barrierSetCodeGen.hpp"
  30 #include "gc/shared/cardTable.hpp"
  31 #include "gc/shared/cardTableModRefBS.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "nativeInst_aarch64.hpp"
  34 #include "oops/instanceOop.hpp"
  35 #include "oops/method.hpp"
  36 #include "oops/objArrayKlass.hpp"
  37 #include "oops/oop.inline.hpp"
  38 #include "prims/methodHandles.hpp"
  39 #include "runtime/frame.inline.hpp"
  40 #include "runtime/handles.inline.hpp"
  41 #include "runtime/sharedRuntime.hpp"
  42 #include "runtime/stubCodeGenerator.hpp"
  43 #include "runtime/stubRoutines.hpp"
  44 #include "runtime/thread.inline.hpp"
  45 #include "utilities/align.hpp"
  46 #ifdef COMPILER2
  47 #include "opto/runtime.hpp"
  48 #endif
  49 
  50 #ifdef BUILTIN_SIM
  51 #include "../../../../../../simulator/simulator.hpp"
  52 #endif
  53 
  54 // Declaration and definition of StubGenerator (no .hpp file).
  55 // For a more detailed description of the stub routine structure
  56 // see the comment in stubRoutines.hpp
  57 
  58 #undef __
  59 #define __ _masm->
  60 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  61 
  62 #ifdef PRODUCT
  63 #define BLOCK_COMMENT(str) /* nothing */
  64 #else
  65 #define BLOCK_COMMENT(str) __ block_comment(str)
  66 #endif
  67 
  68 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  69 
  70 // Stub Code definitions
  71 
  72 class StubGenerator: public StubCodeGenerator {
  73  private:
  74 
  75 #ifdef PRODUCT
  76 #define inc_counter_np(counter) ((void)0)
  77 #else
  78   void inc_counter_np_(int& counter) {
  79     __ lea(rscratch2, ExternalAddress((address)&counter));
  80     __ ldrw(rscratch1, Address(rscratch2));
  81     __ addw(rscratch1, rscratch1, 1);
  82     __ strw(rscratch1, Address(rscratch2));
  83   }
  84 #define inc_counter_np(counter) \
  85   BLOCK_COMMENT("inc_counter " #counter); \
  86   inc_counter_np_(counter);
  87 #endif
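
// Example use only (the counters are declared in sharedRuntime.hpp; shown
// here purely for illustration of the macro above):
//
//   inc_counter_np(SharedRuntime::_jint_array_copy_ctr);
//
// In PRODUCT builds this expands to nothing; otherwise it bumps the named
// counter in place using rscratch1/rscratch2.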
  88 
  89   // Call stubs are used to call Java from C
  90   //
  91   // Arguments:
  92   //    c_rarg0:   call wrapper address                   address
  93   //    c_rarg1:   result                                 address
  94   //    c_rarg2:   result type                            BasicType
  95   //    c_rarg3:   method                                 Method*
  96   //    c_rarg4:   (interpreter) entry point              address
  97   //    c_rarg5:   parameters                             intptr_t*
  98   //    c_rarg6:   parameter size (in words)              int
  99   //    c_rarg7:   thread                                 Thread*
 100   //
 101   // There is no return from the stub itself as any Java result
 102   // is written to result
 103   //
 104   // we save r30 (lr) as the return PC at the base of the frame and
 105   // link r29 (fp) below it as the frame pointer installing sp (r31)
 106   // into fp.
 107   //
 108   // we save r0-r7, which accounts for all the c arguments.
 109   //
 110   // TODO: strictly do we need to save them all? they are treated as
 111   // volatile by C so could we omit saving the ones we are going to
 112   // place in global registers (thread? method?) or those we only use
 113   // during setup of the Java call?
 114   //
 115   // we don't need to save r8 which C uses as an indirect result location
 116   // return register.
 117   //
 118   // we don't need to save r9-r15 which both C and Java treat as
 119   // volatile
 120   //
 121   // we don't need to save r16-18 because Java does not use them
 122   //
 123   // we save r19-r28 which Java uses as scratch registers and C
 124   // expects to be callee-save
 125   //
 126   // we save the bottom 64 bits of each value stored in v8-v15; it is
 127   // the responsibility of the caller to preserve larger values.
 128   //
 129   // so the stub frame looks like this when we enter Java code
 130   //
 131   //     [ return_from_Java     ] <--- sp
 132   //     [ argument word n      ]
 133   //      ...
 134   // -27 [ argument word 1      ]
 135   // -26 [ saved v15            ] <--- sp_after_call
 136   // -25 [ saved v14            ]
 137   // -24 [ saved v13            ]
 138   // -23 [ saved v12            ]
 139   // -22 [ saved v11            ]
 140   // -21 [ saved v10            ]
 141   // -20 [ saved v9             ]
 142   // -19 [ saved v8             ]
 143   // -18 [ saved r28            ]
 144   // -17 [ saved r27            ]
 145   // -16 [ saved r26            ]
 146   // -15 [ saved r25            ]
 147   // -14 [ saved r24            ]
 148   // -13 [ saved r23            ]
 149   // -12 [ saved r22            ]
 150   // -11 [ saved r21            ]
 151   // -10 [ saved r20            ]
 152   //  -9 [ saved r19            ]
 153   //  -8 [ call wrapper    (r0) ]
 154   //  -7 [ result          (r1) ]
 155   //  -6 [ result type     (r2) ]
 156   //  -5 [ method          (r3) ]
 157   //  -4 [ entry point     (r4) ]
 158   //  -3 [ parameters      (r5) ]
 159   //  -2 [ parameter size  (r6) ]
 160   //  -1 [ thread (r7)          ]
 161   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 162   //   1 [ saved lr       (r30) ]
 163 
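  // For reference, the VM reaches this stub through the CallStub function
  // pointer type declared in stubRoutines.hpp, which has roughly this shape
  // (parameter names here follow the argument list above):
  //
  //   typedef void (*CallStub)(address   call_wrapper,
  //                            intptr_t* result,
  //                            BasicType result_type,
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            TRAPS);
  //
  // so c_rarg0..c_rarg7 are simply those eight C arguments in order.
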
 164   // Call stub stack layout word offsets from fp
 165   enum call_stub_layout {
 166     sp_after_call_off = -26,
 167 
 168     d15_off            = -26,
 169     d13_off            = -24,
 170     d11_off            = -22,
 171     d9_off             = -20,
 172 
 173     r28_off            = -18,
 174     r26_off            = -16,
 175     r24_off            = -14,
 176     r22_off            = -12,
 177     r20_off            = -10,
 178     call_wrapper_off   =  -8,
 179     result_off         =  -7,
 180     result_type_off    =  -6,
 181     method_off         =  -5,
 182     entry_point_off    =  -4,
 183     parameter_size_off =  -2,
 184     thread_off         =  -1,
 185     fp_f               =   0,
 186     retaddr_off        =   1,
 187   };
 188 
 189   address generate_call_stub(address& return_address) {
 190     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 191            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 192            "adjust this code");
 193 
 194     StubCodeMark mark(this, "StubRoutines", "call_stub");
 195     address start = __ pc();
 196 
 197     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 198 
 199     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 200     const Address result        (rfp, result_off         * wordSize);
 201     const Address result_type   (rfp, result_type_off    * wordSize);
 202     const Address method        (rfp, method_off         * wordSize);
 203     const Address entry_point   (rfp, entry_point_off    * wordSize);
 204     const Address parameter_size(rfp, parameter_size_off * wordSize);
 205 
 206     const Address thread        (rfp, thread_off         * wordSize);
 207 
 208     const Address d15_save      (rfp, d15_off * wordSize);
 209     const Address d13_save      (rfp, d13_off * wordSize);
 210     const Address d11_save      (rfp, d11_off * wordSize);
 211     const Address d9_save       (rfp, d9_off * wordSize);
 212 
 213     const Address r28_save      (rfp, r28_off * wordSize);
 214     const Address r26_save      (rfp, r26_off * wordSize);
 215     const Address r24_save      (rfp, r24_off * wordSize);
 216     const Address r22_save      (rfp, r22_off * wordSize);
 217     const Address r20_save      (rfp, r20_off * wordSize);
 218 
 219     // stub code
 220 
 221     // we need a C prolog to bootstrap the x86 caller into the sim
 222     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 223 
 224     address aarch64_entry = __ pc();
 225 
 226 #ifdef BUILTIN_SIM
 227     // Save sender's SP for stack traces.
 228     __ mov(rscratch1, sp);
 229     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 230 #endif
 231     // set up frame and move sp to end of save area
 232     __ enter();
 233     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 234 
 235     // save register parameters and Java scratch/global registers
 236     // n.b. we save thread even though it gets installed in
 237     // rthread because we want to sanity check rthread later
 238     __ str(c_rarg7,  thread);
 239     __ strw(c_rarg6, parameter_size);
 240     __ stp(c_rarg4, c_rarg5,  entry_point);
 241     __ stp(c_rarg2, c_rarg3,  result_type);
 242     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 243 
 244     __ stp(r20, r19,   r20_save);
 245     __ stp(r22, r21,   r22_save);
 246     __ stp(r24, r23,   r24_save);
 247     __ stp(r26, r25,   r26_save);
 248     __ stp(r28, r27,   r28_save);
 249 
 250     __ stpd(v9,  v8,   d9_save);
 251     __ stpd(v11, v10,  d11_save);
 252     __ stpd(v13, v12,  d13_save);
 253     __ stpd(v15, v14,  d15_save);
 254 
 255     // install Java thread in global register now we have saved
 256     // whatever value it held
 257     __ mov(rthread, c_rarg7);
 258     // And method
 259     __ mov(rmethod, c_rarg3);
 260 
 261     // set up the heapbase register
 262     __ reinit_heapbase();
 263 
 264 #ifdef ASSERT
 265     // make sure we have no pending exceptions
 266     {
 267       Label L;
 268       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 269       __ cmp(rscratch1, (unsigned)NULL_WORD);
 270       __ br(Assembler::EQ, L);
 271       __ stop("StubRoutines::call_stub: entered with pending exception");
 272       __ BIND(L);
 273     }
 274 #endif
 275     // pass parameters if any
 276     __ mov(esp, sp);
 277     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 278     __ andr(sp, rscratch1, -2 * wordSize);
 279 
 280     BLOCK_COMMENT("pass parameters if any");
 281     Label parameters_done;
 282     // parameter count is still in c_rarg6
 283     // and parameter pointer identifying param 1 is in c_rarg5
 284     __ cbzw(c_rarg6, parameters_done);
 285 
 286     address loop = __ pc();
 287     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 288     __ subsw(c_rarg6, c_rarg6, 1);
 289     __ push(rscratch1);
 290     __ br(Assembler::GT, loop);
 291 
 292     __ BIND(parameters_done);
 293 
 294     // call Java entry -- passing Method* and current sp
 295     //      rmethod: Method*
 296     //      r13: sender sp
 297     BLOCK_COMMENT("call Java function");
 298     __ mov(r13, sp);
 299     __ blr(c_rarg4);
 300 
 301     // tell the simulator we have returned to the stub
 302 
 303     // we do this here because the notify will already have been done
 304     // if we get to the next instruction via an exception
 305     //
 306     // n.b. adding this instruction here affects the calculation of
 307     // whether or not a routine returns to the call stub (used when
 308     // doing stack walks) since the normal test is to check the return
 309     // pc against the address saved below. so we may need to allow for
 310     // this extra instruction in the check.
 311 
 312     if (NotifySimulator) {
 313       __ notify(Assembler::method_reentry);
 314     }
 315     // save current address for use by exception handling code
 316 
 317     return_address = __ pc();
 318 
 319     // store result depending on type (everything that is not
 320     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 321     // n.b. this assumes Java returns an integral result in r0
 322     // and a floating result in j_farg0
 323     __ ldr(j_rarg2, result);
 324     Label is_long, is_float, is_double, exit;
 325     __ ldr(j_rarg1, result_type);
 326     __ cmp(j_rarg1, T_OBJECT);
 327     __ br(Assembler::EQ, is_long);
 328     __ cmp(j_rarg1, T_LONG);
 329     __ br(Assembler::EQ, is_long);
 330     __ cmp(j_rarg1, T_FLOAT);
 331     __ br(Assembler::EQ, is_float);
 332     __ cmp(j_rarg1, T_DOUBLE);
 333     __ br(Assembler::EQ, is_double);
 334 
 335     // handle T_INT case
 336     __ strw(r0, Address(j_rarg2));
 337 
 338     __ BIND(exit);
 339 
 340     // pop parameters
 341     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 342 
 343 #ifdef ASSERT
 344     // verify that threads correspond
 345     {
 346       Label L, S;
 347       __ ldr(rscratch1, thread);
 348       __ cmp(rthread, rscratch1);
 349       __ br(Assembler::NE, S);
 350       __ get_thread(rscratch1);
 351       __ cmp(rthread, rscratch1);
 352       __ br(Assembler::EQ, L);
 353       __ BIND(S);
 354       __ stop("StubRoutines::call_stub: threads must correspond");
 355       __ BIND(L);
 356     }
 357 #endif
 358 
 359     // restore callee-save registers
 360     __ ldpd(v15, v14,  d15_save);
 361     __ ldpd(v13, v12,  d13_save);
 362     __ ldpd(v11, v10,  d11_save);
 363     __ ldpd(v9,  v8,   d9_save);
 364 
 365     __ ldp(r28, r27,   r28_save);
 366     __ ldp(r26, r25,   r26_save);
 367     __ ldp(r24, r23,   r24_save);
 368     __ ldp(r22, r21,   r22_save);
 369     __ ldp(r20, r19,   r20_save);
 370 
 371     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 372     __ ldrw(c_rarg2, result_type);
 373     __ ldr(c_rarg3,  method);
 374     __ ldp(c_rarg4, c_rarg5,  entry_point);
 375     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 376 
 377 #ifndef PRODUCT
 378     // tell the simulator we are about to end Java execution
 379     if (NotifySimulator) {
 380       __ notify(Assembler::method_exit);
 381     }
 382 #endif
 383     // leave frame and return to caller
 384     __ leave();
 385     __ ret(lr);
 386 
 387     // handle return types different from T_INT
 388 
 389     __ BIND(is_long);
 390     __ str(r0, Address(j_rarg2, 0));
 391     __ br(Assembler::AL, exit);
 392 
 393     __ BIND(is_float);
 394     __ strs(j_farg0, Address(j_rarg2, 0));
 395     __ br(Assembler::AL, exit);
 396 
 397     __ BIND(is_double);
 398     __ strd(j_farg0, Address(j_rarg2, 0));
 399     __ br(Assembler::AL, exit);
 400 
 401     return start;
 402   }
 403 
 404   // Return point for a Java call if there's an exception thrown in
 405   // Java code.  The exception is caught and transformed into a
 406   // pending exception stored in JavaThread that can be tested from
 407   // within the VM.
 408   //
 409   // Note: Usually the parameters are removed by the callee. In case
 410   // of an exception crossing an activation frame boundary, that is
 411   // not the case if the callee is compiled code => need to setup the
 412   // rsp.
 413   //
 414   // r0: exception oop
 415 
 416   // NOTE: this is used as a target from the signal handler so it
 417   // needs an x86 prolog which returns into the current simulator
 418   // executing the generated catch_exception code. so the prolog
 419   // needs to install rax in a sim register and adjust the sim's
 420   // restart pc to enter the generated code at the start position
 421   // then return from native to simulated execution.
 422 
 423   address generate_catch_exception() {
 424     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 425     address start = __ pc();
 426 
 427     // same as in generate_call_stub():
 428     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 429     const Address thread        (rfp, thread_off         * wordSize);
 430 
 431 #ifdef ASSERT
 432     // verify that threads correspond
 433     {
 434       Label L, S;
 435       __ ldr(rscratch1, thread);
 436       __ cmp(rthread, rscratch1);
 437       __ br(Assembler::NE, S);
 438       __ get_thread(rscratch1);
 439       __ cmp(rthread, rscratch1);
 440       __ br(Assembler::EQ, L);
 441       __ bind(S);
 442       __ stop("StubRoutines::catch_exception: threads must correspond");
 443       __ bind(L);
 444     }
 445 #endif
 446 
 447     // set pending exception
 448     __ verify_oop(r0);
 449 
 450     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 451     __ mov(rscratch1, (address)__FILE__);
 452     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 453     __ movw(rscratch1, (int)__LINE__);
 454     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 455 
 456     // complete return to VM
 457     assert(StubRoutines::_call_stub_return_address != NULL,
 458            "_call_stub_return_address must have been generated before");
 459     __ b(StubRoutines::_call_stub_return_address);
 460 
 461     return start;
 462   }
 463 
 464   // Continuation point for runtime calls returning with a pending
 465   // exception.  The pending exception check happened in the runtime
 466   // or native call stub.  The pending exception in Thread is
 467   // converted into a Java-level exception.
 468   //
 469   // Contract with Java-level exception handlers:
 470   // r0: exception
 471   // r3: throwing pc
 472   //
 473   // NOTE: At entry of this stub, exception-pc must be in LR !!
 474 
 475   // NOTE: this is always used as a jump target within generated code
 476   // so it just needs to be generated code with no x86 prolog
 477 
 478   address generate_forward_exception() {
 479     StubCodeMark mark(this, "StubRoutines", "forward exception");
 480     address start = __ pc();
 481 
 482     // Upon entry, LR points to the return address returning into
 483     // Java (interpreted or compiled) code; i.e., the return address
 484     // becomes the throwing pc.
 485     //
 486     // Arguments pushed before the runtime call are still on the stack
 487     // but the exception handler will reset the stack pointer ->
 488     // ignore them.  A potential result in registers can be ignored as
 489     // well.
 490 
 491 #ifdef ASSERT
 492     // make sure this code is only executed if there is a pending exception
 493     {
 494       Label L;
 495       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 496       __ cbnz(rscratch1, L);
 497       __ stop("StubRoutines::forward exception: no pending exception (1)");
 498       __ bind(L);
 499     }
 500 #endif
 501 
 502     // compute exception handler into r19
 503 
 504     // call the VM to find the handler address associated with the
 505     // caller address. pass thread in r0 and caller pc (ret address)
 506     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 507     // the stack.
 508     __ mov(c_rarg1, lr);
 509     // lr will be trashed by the VM call so we move it to R19
 510     // (callee-saved) because we also need to pass it to the handler
 511     // returned by this call.
 512     __ mov(r19, lr);
 513     BLOCK_COMMENT("call exception_handler_for_return_address");
 514     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 515                          SharedRuntime::exception_handler_for_return_address),
 516                     rthread, c_rarg1);
 517     // we should not really care that lr is no longer the callee
 518     // address. we saved the value the handler needs in r19 so we can
 519     // just copy it to r3. however, the C2 handler will push its own
 520     // frame and then calls into the VM and the VM code asserts that
 521     // the PC for the frame above the handler belongs to a compiled
 522     // Java method. So, we restore lr here to satisfy that assert.
 523     __ mov(lr, r19);
 524     // setup r0 & r3 & clear pending exception
 525     __ mov(r3, r19);
 526     __ mov(r19, r0);
 527     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 528     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 529 
 530 #ifdef ASSERT
 531     // make sure exception is set
 532     {
 533       Label L;
 534       __ cbnz(r0, L);
 535       __ stop("StubRoutines::forward exception: no pending exception (2)");
 536       __ bind(L);
 537     }
 538 #endif
 539 
 540     // continue at exception handler
 541     // r0: exception
 542     // r3: throwing pc
 543     // r19: exception handler
 544     __ verify_oop(r0);
 545     __ br(r19);
 546 
 547     return start;
 548   }
 549 
 550   // Non-destructive plausibility checks for oops
 551   //
 552   // Arguments:
 553   //    r0: oop to verify
 554   //    rscratch1: error message
 555   //
 556   // Stack after saving c_rarg3:
 557   //    [tos + 0]: saved c_rarg3
 558   //    [tos + 1]: saved c_rarg2
 559   //    [tos + 2]: saved lr
 560   //    [tos + 3]: saved rscratch2
 561   //    [tos + 4]: saved r0
 562   //    [tos + 5]: saved rscratch1
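  //
  // In C terms the test generated below is roughly:
  //
  //   ok = obj == NULL
  //        || (((intptr_t)obj & Universe::verify_oop_mask()) == Universe::verify_oop_bits()
  //            && obj->klass() != NULL);
  //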
 563   address generate_verify_oop() {
 564 
 565     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 566     address start = __ pc();
 567 
 568     Label exit, error;
 569 
 570     // save c_rarg2 and c_rarg3
 571     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 572 
 573     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 574     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 575     __ ldr(c_rarg3, Address(c_rarg2));
 576     __ add(c_rarg3, c_rarg3, 1);
 577     __ str(c_rarg3, Address(c_rarg2));
 578 
 579     // object is in r0
 580     // make sure object is 'reasonable'
 581     __ cbz(r0, exit); // if obj is NULL it is OK
 582 
 583     // Check if the oop is in the right area of memory
 584     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 585     __ andr(c_rarg2, r0, c_rarg3);
 586     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 587 
 588     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 589     // instruction here because the flags register is live.
 590     __ eor(c_rarg2, c_rarg2, c_rarg3);
 591     __ cbnz(c_rarg2, error);
 592 
 593     // make sure klass is 'reasonable', which is not zero.
 594     __ load_klass(r0, r0);  // get klass
 595     __ cbz(r0, error);      // if klass is NULL it is broken
 596 
 597     // return if everything seems ok
 598     __ bind(exit);
 599 
 600     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 601     __ ret(lr);
 602 
 603     // handle errors
 604     __ bind(error);
 605     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 606 
 607     __ push(RegSet::range(r0, r29), sp);
 608     // debug(char* msg, int64_t pc, int64_t regs[])
 609     __ mov(c_rarg0, rscratch1);      // pass address of error message
 610     __ mov(c_rarg1, lr);             // pass return address
 611     __ mov(c_rarg2, sp);             // pass address of regs on stack
 612 #ifndef PRODUCT
 613     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 614 #endif
 615     BLOCK_COMMENT("call MacroAssembler::debug");
 616     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 617     __ blrt(rscratch1, 3, 0, 1);
 618 
 619     return start;
 620   }
 621 
 622   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 623 
 624   // The inner part of zero_words().  This is the bulk operation,
 625   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 626   // caller is responsible for zeroing the last few words.
 627   //
 628   // Inputs:
 629   // r10: the HeapWord-aligned base address of an array to zero.
 630   // r11: the count in HeapWords, r11 > 0.
 631   //
 632   // Returns r10 and r11, adjusted for the caller to clear.
 633   // r10: the base address of the tail of words left to clear.
 634   // r11: the number of words in the tail.
 635   //      r11 < MacroAssembler::zero_words_block_size.
 636 
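  // A minimal sketch of the intended division of labour, in pseudo-C
  // (zero_blocks here stands for the stub generated below, using the
  // register contract described above):
  //
  //   void zero_words(HeapWord* base, size_t cnt) {
  //     if (cnt >= MacroAssembler::zero_words_block_size)
  //       zero_blocks(&base, &cnt);           // bulk part, may use DC ZVA
  //     for (size_t i = 0; i < cnt; i++)      // caller clears the short tail
  //       base[i] = 0;
  //   }
  //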
 637   address generate_zero_blocks() {
 638     Label store_pair, loop_store_pair, done;
 639     Label base_aligned;
 640 
 641     Register base = r10, cnt = r11;
 642 
 643     __ align(CodeEntryAlignment);
 644     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 645     address start = __ pc();
 646 
 647     if (UseBlockZeroing) {
 648       int zva_length = VM_Version::zva_length();
 649 
 650       // Ensure ZVA length can be divided by 16. This is required by
 651       // the subsequent operations.
 652       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 653 
 654       __ tbz(base, 3, base_aligned);
 655       __ str(zr, Address(__ post(base, 8)));
 656       __ sub(cnt, cnt, 1);
 657       __ bind(base_aligned);
 658 
 659       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 660       // alignment.
 661       Label small;
 662       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 663       __ subs(rscratch1, cnt, low_limit >> 3);
 664       __ br(Assembler::LT, small);
 665       __ zero_dcache_blocks(base, cnt);
 666       __ bind(small);
 667     }
 668 
 669     {
 670       // Number of stp instructions we'll unroll
 671       const int unroll =
 672         MacroAssembler::zero_words_block_size / 2;
 673       // Clear the remaining blocks.
 674       Label loop;
 675       __ subs(cnt, cnt, unroll * 2);
 676       __ br(Assembler::LT, done);
 677       __ bind(loop);
 678       for (int i = 0; i < unroll; i++)
 679         __ stp(zr, zr, __ post(base, 16));
 680       __ subs(cnt, cnt, unroll * 2);
 681       __ br(Assembler::GE, loop);
 682       __ bind(done);
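      // undo the final subtraction so cnt once again holds the number of
      // words (fewer than zero_words_block_size) left for the caller to clear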
 683       __ add(cnt, cnt, unroll * 2);
 684     }
 685 
 686     __ ret(lr);
 687 
 688     return start;
 689   }
 690 
 691 
 692   typedef enum {
 693     copy_forwards = 1,
 694     copy_backwards = -1
 695   } copy_direction;
 696 
 697   // Bulk copy of blocks of 8 words.
 698   //
 699   // count is a count of words.
 700   //
 701   // Precondition: count >= 8
 702   //
 703   // Postconditions:
 704   //
 705   // The least significant bit of count contains the remaining count
 706   // of words to copy.  The rest of count is trash.
 707   //
 708   // s and d are adjusted to point to the remaining words to copy
 709   //
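  // For example, with count == 13 the code below copies the initial 8 word
  // block and then the 4 word subblock (12 words in all); bit 0 of count is
  // left set, telling the caller that one final word remains.
  //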
 710   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 711                            copy_direction direction) {
 712     int unit = wordSize * direction;
 713     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 714 
 715     int offset;
 716     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 717       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 718     const Register stride = r13;
 719 
 720     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 721     assert_different_registers(s, d, count, rscratch1);
 722 
 723     Label again, drain;
 724     const char *stub_name;
 725     if (direction == copy_forwards)
 726       stub_name = "forward_copy_longs";
 727     else
 728       stub_name = "backward_copy_longs";
 729     StubCodeMark mark(this, "StubRoutines", stub_name);
 730     __ align(CodeEntryAlignment);
 731     __ bind(start);
 732 
 733     Label unaligned_copy_long;
 734     if (AvoidUnalignedAccesses) {
 735       __ tbnz(d, 3, unaligned_copy_long);
 736     }
 737 
 738     if (direction == copy_forwards) {
 739       __ sub(s, s, bias);
 740       __ sub(d, d, bias);
 741     }
 742 
 743 #ifdef ASSERT
 744     // Make sure we are never given < 8 words
 745     {
 746       Label L;
 747       __ cmp(count, 8);
 748       __ br(Assembler::GE, L);
 749       __ stop("generate_copy_longs called with < 8 words");
 750       __ bind(L);
 751     }
 752 #endif
 753 
 754     // Fill 8 registers
 755     if (UseSIMDForMemoryOps) {
 756       __ ldpq(v0, v1, Address(s, 4 * unit));
 757       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 758     } else {
 759       __ ldp(t0, t1, Address(s, 2 * unit));
 760       __ ldp(t2, t3, Address(s, 4 * unit));
 761       __ ldp(t4, t5, Address(s, 6 * unit));
 762       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 763     }
 764 
 765     __ subs(count, count, 16);
 766     __ br(Assembler::LO, drain);
 767 
 768     int prefetch = PrefetchCopyIntervalInBytes;
 769     bool use_stride = false;
 770     if (direction == copy_backwards) {
 771        use_stride = prefetch > 256;
 772        prefetch = -prefetch;
 773        if (use_stride) __ mov(stride, prefetch);
 774     }
 775 
 776     __ bind(again);
 777 
 778     if (PrefetchCopyIntervalInBytes > 0)
 779       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 780 
 781     if (UseSIMDForMemoryOps) {
 782       __ stpq(v0, v1, Address(d, 4 * unit));
 783       __ ldpq(v0, v1, Address(s, 4 * unit));
 784       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 785       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 786     } else {
 787       __ stp(t0, t1, Address(d, 2 * unit));
 788       __ ldp(t0, t1, Address(s, 2 * unit));
 789       __ stp(t2, t3, Address(d, 4 * unit));
 790       __ ldp(t2, t3, Address(s, 4 * unit));
 791       __ stp(t4, t5, Address(d, 6 * unit));
 792       __ ldp(t4, t5, Address(s, 6 * unit));
 793       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 794       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 795     }
 796 
 797     __ subs(count, count, 8);
 798     __ br(Assembler::HS, again);
 799 
 800     // Drain
 801     __ bind(drain);
 802     if (UseSIMDForMemoryOps) {
 803       __ stpq(v0, v1, Address(d, 4 * unit));
 804       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 805     } else {
 806       __ stp(t0, t1, Address(d, 2 * unit));
 807       __ stp(t2, t3, Address(d, 4 * unit));
 808       __ stp(t4, t5, Address(d, 6 * unit));
 809       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 810     }
 811 
 812     {
 813       Label L1, L2;
 814       __ tbz(count, exact_log2(4), L1);
 815       if (UseSIMDForMemoryOps) {
 816         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 817         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 818       } else {
 819         __ ldp(t0, t1, Address(s, 2 * unit));
 820         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 821         __ stp(t0, t1, Address(d, 2 * unit));
 822         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 823       }
 824       __ bind(L1);
 825 
 826       if (direction == copy_forwards) {
 827         __ add(s, s, bias);
 828         __ add(d, d, bias);
 829       }
 830 
 831       __ tbz(count, 1, L2);
 832       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 833       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 834       __ bind(L2);
 835     }
 836 
 837     __ ret(lr);
 838 
 839     if (AvoidUnalignedAccesses) {
 840       Label drain, again;
 841       // Register order for storing. Order is different for backward copy.
 842 
 843       __ bind(unaligned_copy_long);
 844 
 845       // source address is even aligned, target odd aligned
 846       //
 847       // when forward copying word pairs we read long pairs at offsets
 848       // {0, 2, 4, 6} (in long words). when backwards copying we read
 849       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 850       // address by -2 in the forwards case so we can compute the
 851       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 852       // or -1.
 853       //
 854       // when forward copying we need to store 1 word, 3 pairs and
 855       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 856       // zero offset we adjust the destination by -1 which means we
 857       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 858       //
 859       // When backwards copying we need to store 1 word, 3 pairs and
 860       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 861       // offsets {1, 3, 5, 7, 8} * unit.
 862 
 863       if (direction == copy_forwards) {
 864         __ sub(s, s, 16);
 865         __ sub(d, d, 8);
 866       }
 867 
 868       // Fill 8 registers
 869       //
 870       // for forwards copy s was offset by -16 from the original input
 871       // value of s so the register contents are at these offsets
 872       // relative to the 64 byte block addressed by that original input
 873       // and so on for each successive 64 byte block when s is updated
 874       //
 875       // t0 at offset 0,  t1 at offset 8
 876       // t2 at offset 16, t3 at offset 24
 877       // t4 at offset 32, t5 at offset 40
 878       // t6 at offset 48, t7 at offset 56
 879 
 880       // for backwards copy s was not offset so the register contents
 881       // are at these offsets into the preceding 64 byte block
 882       // relative to that original input and so on for each successive
 883       // preceding 64 byte block when s is updated. this explains the
 884       // slightly counter-intuitive looking pattern of register usage
 885       // in the stp instructions for backwards copy.
 886       //
 887       // t0 at offset -16, t1 at offset -8
 888       // t2 at offset -32, t3 at offset -24
 889       // t4 at offset -48, t5 at offset -40
 890       // t6 at offset -64, t7 at offset -56
 891 
 892       __ ldp(t0, t1, Address(s, 2 * unit));
 893       __ ldp(t2, t3, Address(s, 4 * unit));
 894       __ ldp(t4, t5, Address(s, 6 * unit));
 895       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 896 
 897       __ subs(count, count, 16);
 898       __ br(Assembler::LO, drain);
 899 
 900       int prefetch = PrefetchCopyIntervalInBytes;
 901       bool use_stride = false;
 902       if (direction == copy_backwards) {
 903          use_stride = prefetch > 256;
 904          prefetch = -prefetch;
 905          if (use_stride) __ mov(stride, prefetch);
 906       }
 907 
 908       __ bind(again);
 909 
 910       if (PrefetchCopyIntervalInBytes > 0)
 911         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 912 
 913       if (direction == copy_forwards) {
 914        // allowing for the offset of -8 the store instructions place
 915        // registers into the target 64 byte block at the following
 916        // offsets
 917        //
 918        // t0 at offset 0
 919        // t1 at offset 8,  t2 at offset 16
 920        // t3 at offset 24, t4 at offset 32
 921        // t5 at offset 40, t6 at offset 48
 922        // t7 at offset 56
 923 
 924         __ str(t0, Address(d, 1 * unit));
 925         __ stp(t1, t2, Address(d, 2 * unit));
 926         __ ldp(t0, t1, Address(s, 2 * unit));
 927         __ stp(t3, t4, Address(d, 4 * unit));
 928         __ ldp(t2, t3, Address(s, 4 * unit));
 929         __ stp(t5, t6, Address(d, 6 * unit));
 930         __ ldp(t4, t5, Address(s, 6 * unit));
 931         __ str(t7, Address(__ pre(d, 8 * unit)));
 932         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 933       } else {
 934        // d was not offset when we started so the registers are
 935        // written into the 64 byte block preceding d with the following
 936        // offsets
 937        //
 938        // t1 at offset -8
 939        // t3 at offset -24, t0 at offset -16
 940        // t5 at offset -40, t2 at offset -32
 941        // t7 at offset -56, t4 at offset -48
 942        //                   t6 at offset -64
 943        //
 944        // note that this matches the offsets previously noted for the
 945        // loads
 946 
 947         __ str(t1, Address(d, 1 * unit));
 948         __ stp(t3, t0, Address(d, 3 * unit));
 949         __ ldp(t0, t1, Address(s, 2 * unit));
 950         __ stp(t5, t2, Address(d, 5 * unit));
 951         __ ldp(t2, t3, Address(s, 4 * unit));
 952         __ stp(t7, t4, Address(d, 7 * unit));
 953         __ ldp(t4, t5, Address(s, 6 * unit));
 954         __ str(t6, Address(__ pre(d, 8 * unit)));
 955         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 956       }
 957 
 958       __ subs(count, count, 8);
 959       __ br(Assembler::HS, again);
 960 
 961       // Drain
 962       //
 963       // this uses the same pattern of offsets and register arguments
 964       // as above
 965       __ bind(drain);
 966       if (direction == copy_forwards) {
 967         __ str(t0, Address(d, 1 * unit));
 968         __ stp(t1, t2, Address(d, 2 * unit));
 969         __ stp(t3, t4, Address(d, 4 * unit));
 970         __ stp(t5, t6, Address(d, 6 * unit));
 971         __ str(t7, Address(__ pre(d, 8 * unit)));
 972       } else {
 973         __ str(t1, Address(d, 1 * unit));
 974         __ stp(t3, t0, Address(d, 3 * unit));
 975         __ stp(t5, t2, Address(d, 5 * unit));
 976         __ stp(t7, t4, Address(d, 7 * unit));
 977         __ str(t6, Address(__ pre(d, 8 * unit)));
 978       }
 979       // now we need to copy any remaining part block which may
 980       // include a 4 word subblock and/or a 2 word subblock.
 981       // bits 2 and 1 in the count are the tell-tale for whether we
 982       // have each such subblock
 983       {
 984         Label L1, L2;
 985         __ tbz(count, exact_log2(4), L1);
 986        // this is the same as above but copying only 4 longs hence
 987        // with only one intervening stp between the str instructions
 988        // but note that the offsets and registers still follow the
 989        // same pattern
 990         __ ldp(t0, t1, Address(s, 2 * unit));
 991         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 992         if (direction == copy_forwards) {
 993           __ str(t0, Address(d, 1 * unit));
 994           __ stp(t1, t2, Address(d, 2 * unit));
 995           __ str(t3, Address(__ pre(d, 4 * unit)));
 996         } else {
 997           __ str(t1, Address(d, 1 * unit));
 998           __ stp(t3, t0, Address(d, 3 * unit));
 999           __ str(t2, Address(__ pre(d, 4 * unit)));
1000         }
1001         __ bind(L1);
1002 
1003         __ tbz(count, 1, L2);
1004        // this is the same as above but copying only 2 longs hence
1005        // there is no intervening stp between the str instructions
1006        // but note that the offset and register patterns are still
1007        // the same
1008         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1009         if (direction == copy_forwards) {
1010           __ str(t0, Address(d, 1 * unit));
1011           __ str(t1, Address(__ pre(d, 2 * unit)));
1012         } else {
1013           __ str(t1, Address(d, 1 * unit));
1014           __ str(t0, Address(__ pre(d, 2 * unit)));
1015         }
1016         __ bind(L2);
1017 
1018        // for forwards copy we need to re-adjust the offsets we
1019        // applied so that s and d follow the last words written
1020 
1021        if (direction == copy_forwards) {
1022          __ add(s, s, 16);
1023          __ add(d, d, 8);
1024        }
1025 
1026       }
1027 
1028       __ ret(lr);
1029       }
1030   }
1031 
1032   // Small copy: less than 16 bytes.
1033   //
1034   // NB: Ignores all of the bits of count which represent more than 15
1035   // bytes, so a caller doesn't have to mask them.
1036 
1037   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1038     bool is_backwards = step < 0;
1039     size_t granularity = uabs(step);
1040     int direction = is_backwards ? -1 : 1;
1041     int unit = wordSize * direction;
1042 
1043     Label Lpair, Lword, Lint, Lshort, Lbyte;
1044 
1045     assert(granularity
1046            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1047 
1048     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1049 
1050     // ??? I don't know if this bit-test-and-branch is the right thing
1051     // to do.  It does a lot of jumping, resulting in several
1052     // mispredicted branches.  It might make more sense to do this
1053     // with something like Duff's device with a single computed branch.
1054 
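    // Worked example: for a byte copy (step == 1) with count == 11
    // (binary 1011), bit 3 moves 8 bytes, the 4 byte move is skipped
    // because bit 2 is clear, bit 1 moves 2 bytes and bit 0 moves the
    // final byte: 8 + 2 + 1 == 11.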
1055     __ tbz(count, 3 - exact_log2(granularity), Lword);
1056     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1057     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1058     __ bind(Lword);
1059 
1060     if (granularity <= sizeof (jint)) {
1061       __ tbz(count, 2 - exact_log2(granularity), Lint);
1062       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1063       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1064       __ bind(Lint);
1065     }
1066 
1067     if (granularity <= sizeof (jshort)) {
1068       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1069       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1070       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1071       __ bind(Lshort);
1072     }
1073 
1074     if (granularity <= sizeof (jbyte)) {
1075       __ tbz(count, 0, Lbyte);
1076       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1077       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1078       __ bind(Lbyte);
1079     }
1080   }
1081 
1082   Label copy_f, copy_b;
1083 
1084   // All-singing all-dancing memory copy.
1085   //
1086   // Copy count units of memory from s to d.  The size of a unit is
1087   // step, which can be positive or negative depending on the direction
1088   // of copy.  If is_aligned is false, we align the source address.
1089   //
1090 
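  // Roughly, the code generated below does the following (illustrative
  // pseudo-code only; the byte thresholds are those used in the cmp
  // instructions that follow):
  //
  //   if (count * granularity <= (UseSIMDForMemoryOps ? 96 : 80))
  //     straight-line copy chosen by size bucket (0..16, 17..32, 33..64, ...);
  //   else {
  //     align s to a 2-word boundary (head handled by copy_memory_small);
  //     bulk copy of 8-word blocks via copy_f or copy_b;
  //     copy_memory_small(...) for the tail;
  //   }
  //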
1091   void copy_memory(bool is_aligned, Register s, Register d,
1092                    Register count, Register tmp, int step) {
1093     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1094     bool is_backwards = step < 0;
1095     int granularity = uabs(step);
1096     const Register t0 = r3, t1 = r4;
1097 
1098     // <= 96 bytes do inline. Direction doesn't matter because we always
1099     // load all the data before writing anything
1100     Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
1101     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1102     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1103     const Register send = r17, dend = r18;
1104 
1105     if (PrefetchCopyIntervalInBytes > 0)
1106       __ prfm(Address(s, 0), PLDL1KEEP);
1107     __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
1108     __ br(Assembler::HI, copy_big);
1109 
1110     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1111     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1112 
1113     __ cmp(count, 16/granularity);
1114     __ br(Assembler::LS, copy16);
1115 
1116     __ cmp(count, 64/granularity);
1117     __ br(Assembler::HI, copy80);
1118 
1119     __ cmp(count, 32/granularity);
1120     __ br(Assembler::LS, copy32);
1121 
1122     // 33..64 bytes
1123     if (UseSIMDForMemoryOps) {
1124       __ ldpq(v0, v1, Address(s, 0));
1125       __ ldpq(v2, v3, Address(send, -32));
1126       __ stpq(v0, v1, Address(d, 0));
1127       __ stpq(v2, v3, Address(dend, -32));
1128     } else {
1129       __ ldp(t0, t1, Address(s, 0));
1130       __ ldp(t2, t3, Address(s, 16));
1131       __ ldp(t4, t5, Address(send, -32));
1132       __ ldp(t6, t7, Address(send, -16));
1133 
1134       __ stp(t0, t1, Address(d, 0));
1135       __ stp(t2, t3, Address(d, 16));
1136       __ stp(t4, t5, Address(dend, -32));
1137       __ stp(t6, t7, Address(dend, -16));
1138     }
1139     __ b(finish);
1140 
1141     // 17..32 bytes
1142     __ bind(copy32);
1143     __ ldp(t0, t1, Address(s, 0));
1144     __ ldp(t2, t3, Address(send, -16));
1145     __ stp(t0, t1, Address(d, 0));
1146     __ stp(t2, t3, Address(dend, -16));
1147     __ b(finish);
1148 
1149     // 65..80/96 bytes
1150     // (96 bytes if SIMD because we do 32 bytes per instruction)
1151     __ bind(copy80);
1152     if (UseSIMDForMemoryOps) {
1153       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1154       __ ldpq(v4, v5, Address(send, -32));
1155       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1156       __ stpq(v4, v5, Address(dend, -32));
1157     } else {
1158       __ ldp(t0, t1, Address(s, 0));
1159       __ ldp(t2, t3, Address(s, 16));
1160       __ ldp(t4, t5, Address(s, 32));
1161       __ ldp(t6, t7, Address(s, 48));
1162       __ ldp(t8, t9, Address(send, -16));
1163 
1164       __ stp(t0, t1, Address(d, 0));
1165       __ stp(t2, t3, Address(d, 16));
1166       __ stp(t4, t5, Address(d, 32));
1167       __ stp(t6, t7, Address(d, 48));
1168       __ stp(t8, t9, Address(dend, -16));
1169     }
1170     __ b(finish);
1171 
1172     // 0..16 bytes
1173     __ bind(copy16);
1174     __ cmp(count, 8/granularity);
1175     __ br(Assembler::LO, copy8);
1176 
1177     // 8..16 bytes
1178     __ ldr(t0, Address(s, 0));
1179     __ ldr(t1, Address(send, -8));
1180     __ str(t0, Address(d, 0));
1181     __ str(t1, Address(dend, -8));
1182     __ b(finish);
1183 
1184     if (granularity < 8) {
1185       // 4..7 bytes
1186       __ bind(copy8);
1187       __ tbz(count, 2 - exact_log2(granularity), copy4);
1188       __ ldrw(t0, Address(s, 0));
1189       __ ldrw(t1, Address(send, -4));
1190       __ strw(t0, Address(d, 0));
1191       __ strw(t1, Address(dend, -4));
1192       __ b(finish);
1193       if (granularity < 4) {
1194         // 0..3 bytes
1195         __ bind(copy4);
1196         __ cbz(count, finish); // get rid of 0 case
1197         if (granularity == 2) {
1198           __ ldrh(t0, Address(s, 0));
1199           __ strh(t0, Address(d, 0));
1200         } else { // granularity == 1
1201           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1202           // the first and last byte.
1203           // Handle the 3 byte case by loading and storing base + count/2
1204           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1205           // This means in the 1 byte case we load/store the same
1206           // byte 3 times.
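          // e.g. count == 3: t0 takes s[0], t1 takes s[2] (the last byte)
          // and t2 takes s[count/2] == s[1], so all three bytes are copied.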
1207           __ lsr(count, count, 1);
1208           __ ldrb(t0, Address(s, 0));
1209           __ ldrb(t1, Address(send, -1));
1210           __ ldrb(t2, Address(s, count));
1211           __ strb(t0, Address(d, 0));
1212           __ strb(t1, Address(dend, -1));
1213           __ strb(t2, Address(d, count));
1214         }
1215         __ b(finish);
1216       }
1217     }
1218 
1219     __ bind(copy_big);
1220     if (is_backwards) {
1221       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1222       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1223     }
1224 
1225     // Now we've got the small case out of the way, we can align the
1226     // source address on a 2-word boundary.
1227 
1228     Label aligned;
1229 
1230     if (is_aligned) {
1231       // We may have to adjust by 1 word to get s 2-word-aligned.
1232       __ tbz(s, exact_log2(wordSize), aligned);
1233       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1234       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1235       __ sub(count, count, wordSize/granularity);
1236     } else {
1237       if (is_backwards) {
1238         __ andr(rscratch2, s, 2 * wordSize - 1);
1239       } else {
1240         __ neg(rscratch2, s);
1241         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1242       }
1243       // rscratch2 is the byte adjustment needed to align s.
1244       __ cbz(rscratch2, aligned);
1245       int shift = exact_log2(granularity);
1246       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1247       __ sub(count, count, rscratch2);
1248 
1249 #if 0
1250       // ?? This code is only correct for a disjoint copy.  It may or
1251       // may not make sense to use it in that case.
1252 
1253       // Copy the first pair; s and d may not be aligned.
1254       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1255       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1256 
1257       // Align s and d, adjust count
1258       if (is_backwards) {
1259         __ sub(s, s, rscratch2);
1260         __ sub(d, d, rscratch2);
1261       } else {
1262         __ add(s, s, rscratch2);
1263         __ add(d, d, rscratch2);
1264       }
1265 #else
1266       copy_memory_small(s, d, rscratch2, rscratch1, step);
1267 #endif
1268     }
1269 
1270     __ bind(aligned);
1271 
1272     // s is now 2-word-aligned.
1273 
1274     // We have a count of units and some trailing bytes.  Adjust the
1275     // count and do a bulk copy of words.
1276     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1277     if (direction == copy_forwards)
1278       __ bl(copy_f);
1279     else
1280       __ bl(copy_b);
1281 
1282     // And the tail.
1283     copy_memory_small(s, d, count, tmp, step);
1284 
1285     if (granularity >= 8) __ bind(copy8);
1286     if (granularity >= 4) __ bind(copy4);
1287     __ bind(finish);
1288   }
1289 
1290 
1291   void clobber_registers() {
1292 #ifdef ASSERT
1293     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1294     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1295     for (Register r = r3; r <= r18; r++)
1296       if (r != rscratch1) __ mov(r, rscratch1);
1297 #endif
1298   }
1299 
1300   // Scan over array at a for count oops, verifying each one.
1301   // Preserves a and count, clobbers rscratch1 and rscratch2.
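  // Roughly: for (i = 0; i < count; i++) verify_oop(a[i]), decoding
  // narrow oops first when size == 4.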
1302   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1303     Label loop, end;
1304     __ mov(rscratch1, a);
1305     __ mov(rscratch2, zr);
1306     __ bind(loop);
1307     __ cmp(rscratch2, count);
1308     __ br(Assembler::HS, end);
1309     if (size == (size_t)wordSize) {
1310       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1311       __ verify_oop(temp);
1312     } else {
1313       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1314       __ decode_heap_oop(temp); // calls verify_oop
1315     }
1316     __ add(rscratch2, rscratch2, size);
1317     __ b(loop);
1318     __ bind(end);
1319   }
1320 
1321   // Arguments:
1322   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1323   //             ignored
1324   //   is_oop  - true => oop array, so generate store check code
1325   //   name    - stub name string
1326   //
1327   // Inputs:
1328   //   c_rarg0   - source array address
1329   //   c_rarg1   - destination array address
1330   //   c_rarg2   - element count, treated as ssize_t, can be zero
1331   //
1332   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1333   // the hardware handle it.  The two dwords within qwords that span
1334   // cache line boundaries will still be loaded and stored atomically.
1335   //
1336   // Side Effects:
1337   //   disjoint_int_copy_entry is set to the no-overlap entry point
1338   //   used by generate_conjoint_int_oop_copy().
1339   //
1340   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1341                                   const char *name, bool dest_uninitialized = false) {
1342     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1343     RegSet saved_reg = RegSet::of(s, d, count);
1344     __ align(CodeEntryAlignment);
1345     StubCodeMark mark(this, "StubRoutines", name);
1346     address start = __ pc();
1347     __ enter();
1348 
1349     if (entry != NULL) {
1350       *entry = __ pc();
1351       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1352       BLOCK_COMMENT("Entry:");
1353     }
1354 
1355     BarrierSetCodeGen *bs = Universe::heap()->barrier_set()->code_gen();
1356     DecoratorSet decorators = ARRAYCOPY_DISJOINT;
1357     if (dest_uninitialized) {
1358       decorators |= DEST_NOT_INITIALIZED;
1359     }
1360     if (aligned) {
1361       decorators |= ARRAYCOPY_ALIGNED;
1362     }
1363 
1364     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg);
1365 
1366     if (is_oop) {
1367       // save regs before copy_memory
1368       __ push(RegSet::of(d, count), sp);
1369     }
1370     copy_memory(aligned, s, d, count, rscratch1, size);
1371 
1372     if (is_oop) {
1373       __ pop(RegSet::of(d, count), sp);
1374       if (VerifyOops)
1375         verify_oop_array(size, d, count, r16);
1376       __ sub(count, count, 1); // make an inclusive end pointer
1377       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1378     }
1379 
1380     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1381 
1382     __ leave();
1383     __ mov(r0, zr); // return 0
1384     __ ret(lr);
1385 #ifdef BUILTIN_SIM
1386     {
1387       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1388       sim->notifyCompile(const_cast<char*>(name), start);
1389     }
1390 #endif
1391     return start;
1392   }
1393 
1394   // Arguments:
1395   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1396   //             ignored
1397   //   is_oop  - true => oop array, so generate store check code
1398   //   name    - stub name string
1399   //
1400   // Inputs:
1401   //   c_rarg0   - source array address
1402   //   c_rarg1   - destination array address
1403   //   c_rarg2   - element count, treated as ssize_t, can be zero
1404   //
1405   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1406   // the hardware handle it.  The two dwords within qwords that span
1407   // cache line boundaries will still be loaded and stored atomically.
1408   //
1409   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1410                                  address *entry, const char *name,
1411                                  bool dest_uninitialized = false) {
1412     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1413     RegSet saved_regs = RegSet::of(s, d, count);
1414     StubCodeMark mark(this, "StubRoutines", name);
1415     address start = __ pc();
1416     __ enter();
1417 
1418     if (entry != NULL) {
1419       *entry = __ pc();
1420       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1421       BLOCK_COMMENT("Entry:");
1422     }
1423 
1424     // use fwd copy when (d-s) above_equal (count*size)
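    // (unsigned compare, so this also covers d below s, where the
    // difference wraps to a large value and a forward copy is safe)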
1425     __ sub(rscratch1, d, s);
1426     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1427     __ br(Assembler::HS, nooverlap_target);
1428 
1429     BarrierSetCodeGen *bs = Universe::heap()->barrier_set()->code_gen();
1430     DecoratorSet decorators = DECORATOR_DEFAULT;
1431     if (dest_uninitialized) {
1432       decorators |= DEST_NOT_INITIALIZED;
1433     }
1434     if (aligned) {
1435       decorators |= ARRAYCOPY_ALIGNED;
1436     }
1437     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs);
1438 
1439     if (is_oop) {
1440       // save regs before copy_memory
1441       __ push(RegSet::of(d, count), sp);
1442     }
1443     copy_memory(aligned, s, d, count, rscratch1, -size);
1444     if (is_oop) {
1445       __ pop(RegSet::of(d, count), sp);
1446       if (VerifyOops)
1447         verify_oop_array(size, d, count, r16);
1448       __ sub(count, count, 1); // make an inclusive end pointer
1449       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1450     }
1451     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1452     __ leave();
1453     __ mov(r0, zr); // return 0
1454     __ ret(lr);
1455 #ifdef BUILTIN_SIM
1456     {
1457       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1458       sim->notifyCompile(const_cast<char*>(name), start);
1459     }
1460 #endif
1461     return start;
1462   }
1463 
1464   // Arguments:
1465   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1466   //             ignored
1467   //   name    - stub name string
1468   //
1469   // Inputs:
1470   //   c_rarg0   - source array address
1471   //   c_rarg1   - destination array address
1472   //   c_rarg2   - element count, treated as ssize_t, can be zero
1473   //
1474   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1475   // we let the hardware handle it.  The one to eight bytes within words,
1476   // dwords or qwords that span cache line boundaries will still be loaded
1477   // and stored atomically.
1478   //
1479   // Side Effects:
1480   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1481   //   used by generate_conjoint_byte_copy().
1482   //
1490   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1491     const bool not_oop = false;
1492     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1493   }
1494 
1495   // Arguments:
1496   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1497   //             ignored
1498   //   name    - stub name string
1499   //
1500   // Inputs:
1501   //   c_rarg0   - source array address
1502   //   c_rarg1   - destination array address
1503   //   c_rarg2   - element count, treated as ssize_t, can be zero
1504   //
1505   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1506   // we let the hardware handle it.  The one to eight bytes within words,
1507   // dwords or qwords that span cache line boundaries will still be loaded
1508   // and stored atomically.
1509   //
1510   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1511                                       address* entry, const char *name) {
1512     const bool not_oop = false;
1513     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1514   }
1515 
1516   // Arguments:
1517   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1518   //             ignored
1519   //   name    - stub name string
1520   //
1521   // Inputs:
1522   //   c_rarg0   - source array address
1523   //   c_rarg1   - destination array address
1524   //   c_rarg2   - element count, treated as ssize_t, can be zero
1525   //
1526   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1527   // let the hardware handle it.  The two or four words within dwords
1528   // or qwords that span cache line boundaries will still be loaded
1529   // and stored atomically.
1530   //
1531   // Side Effects:
1532   //   disjoint_short_copy_entry is set to the no-overlap entry point
1533   //   used by generate_conjoint_short_copy().
1534   //
1535   address generate_disjoint_short_copy(bool aligned,
1536                                        address* entry, const char *name) {
1537     const bool not_oop = false;
1538     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1539   }
1540 
1541   // Arguments:
1542   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1543   //             ignored
1544   //   name    - stub name string
1545   //
1546   // Inputs:
1547   //   c_rarg0   - source array address
1548   //   c_rarg1   - destination array address
1549   //   c_rarg2   - element count, treated as ssize_t, can be zero
1550   //
1551   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1552   // let the hardware handle it.  The two or four words within dwords
1553   // or qwords that span cache line boundaries will still be loaded
1554   // and stored atomically.
1555   //
1556   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1557                                        address *entry, const char *name) {
1558     const bool not_oop = false;
1559     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1560   }
1561 
1562   // Arguments:
1563   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1564   //             ignored
1565   //   name    - stub name string
1566   //
1567   // Inputs:
1568   //   c_rarg0   - source array address
1569   //   c_rarg1   - destination array address
1570   //   c_rarg2   - element count, treated as ssize_t, can be zero
1571   //
1572   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1573   // the hardware handle it.  The two dwords within qwords that span
1574   // cache line boundaries will still be loaded and stored atomically.
1575   //
1576   // Side Effects:
1577   //   disjoint_int_copy_entry is set to the no-overlap entry point
1578   //   used by generate_conjoint_int_copy().
1579   //
1580   address generate_disjoint_int_copy(bool aligned, address *entry,
1581                                          const char *name, bool dest_uninitialized = false) {
1582     const bool not_oop = false;
1583     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1584   }
1585 
1586   // Arguments:
1587   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1588   //             ignored
1589   //   name    - stub name string
1590   //
1591   // Inputs:
1592   //   c_rarg0   - source array address
1593   //   c_rarg1   - destination array address
1594   //   c_rarg2   - element count, treated as ssize_t, can be zero
1595   //
1596   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1597   // the hardware handle it.  The two dwords within qwords that span
1598   // cache line boundaries will still be loaded and stored atomically.
1599   //
1600   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1601                                      address *entry, const char *name,
1602                                      bool dest_uninitialized = false) {
1603     const bool not_oop = false;
1604     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1605   }
1606 
1607 
1608   // Arguments:
1609   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1610   //             ignored
1611   //   name    - stub name string
1612   //
1613   // Inputs:
1614   //   c_rarg0   - source array address
1615   //   c_rarg1   - destination array address
1616   //   c_rarg2   - element count, treated as size_t, can be zero
1617   //
1618   // Side Effects:
1619   //   disjoint_long_copy_entry is set to the no-overlap entry point
1620   //   used by generate_conjoint_long_copy().
1621   //
1622   address generate_disjoint_long_copy(bool aligned, address *entry,
1623                                           const char *name, bool dest_uninitialized = false) {
1624     const bool not_oop = false;
1625     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1626   }
1627 
1628   // Arguments:
1629   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1630   //             ignored
1631   //   name    - stub name string
1632   //
1633   // Inputs:
1634   //   c_rarg0   - source array address
1635   //   c_rarg1   - destination array address
1636   //   c_rarg2   - element count, treated as size_t, can be zero
1637   //
1638   address generate_conjoint_long_copy(bool aligned,
1639                                       address nooverlap_target, address *entry,
1640                                       const char *name, bool dest_uninitialized = false) {
1641     const bool not_oop = false;
1642     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1643   }
1644 
1645   // Arguments:
1646   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1647   //             ignored
1648   //   name    - stub name string
1649   //
1650   // Inputs:
1651   //   c_rarg0   - source array address
1652   //   c_rarg1   - destination array address
1653   //   c_rarg2   - element count, treated as size_t, can be zero
1654   //
1655   // Side Effects:
1656   //   disjoint_oop_copy_entry is set to the no-overlap entry point
1657   //   used by generate_conjoint_oop_copy().
1658   //
1659   address generate_disjoint_oop_copy(bool aligned, address *entry,
1660                                      const char *name, bool dest_uninitialized) {
1661     const bool is_oop = true;
1662     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1663     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1664   }
1665 
1666   // Arguments:
1667   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1668   //             ignored
1669   //   name    - stub name string
1670   //
1671   // Inputs:
1672   //   c_rarg0   - source array address
1673   //   c_rarg1   - destination array address
1674   //   c_rarg2   - element count, treated as size_t, can be zero
1675   //
1676   address generate_conjoint_oop_copy(bool aligned,
1677                                      address nooverlap_target, address *entry,
1678                                      const char *name, bool dest_uninitialized) {
1679     const bool is_oop = true;
1680     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1681     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1682                                   name, dest_uninitialized);
1683   }
1684 
1685 
1686   // Helper for generating a dynamic type check.
1687   // Smashes rscratch1.
1688   void generate_type_check(Register sub_klass,
1689                            Register super_check_offset,
1690                            Register super_klass,
1691                            Label& L_success) {
1692     assert_different_registers(sub_klass, super_check_offset, super_klass);
1693 
1694     BLOCK_COMMENT("type_check:");
1695 
1696     Label L_miss;
1697 
1698     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1699                                      super_check_offset);
1700     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1701 
1702     // Fall through on failure!
1703     __ BIND(L_miss);
1704   }
1705 
1706   //
1707   //  Generate checkcasting array copy stub
1708   //
1709   //  Input:
1710   //    c_rarg0   - source array address
1711   //    c_rarg1   - destination array address
1712   //    c_rarg2   - element count, treated as ssize_t, can be zero
1713   //    c_rarg3   - size_t ckoff (super_check_offset)
1714   //    c_rarg4   - oop ckval (super_klass)
1715   //
1716   //  Output:
1717   //    r0 ==  0  -  success
1718   //    r0 == -1^K - failure, where K is partial transfer count
1719   //
1720   address generate_checkcast_copy(const char *name, address *entry,
1721                                   bool dest_uninitialized = false) {
1722 
1723     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1724 
1725     // Input registers (after setup_arg_regs)
1726     const Register from        = c_rarg0;   // source array address
1727     const Register to          = c_rarg1;   // destination array address
1728     const Register count       = c_rarg2;   // elements count
1729     const Register ckoff       = c_rarg3;   // super_check_offset
1730     const Register ckval       = c_rarg4;   // super_klass
1731 
1732     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1733     RegSet wb_post_saved_regs = RegSet::of(count);
1734 
1735     // Registers used as temps (r18, r19, r20 are save-on-entry)
1736     const Register count_save  = r21;       // orig elements count
1737     const Register start_to    = r20;       // destination array start address
1738     const Register copied_oop  = r18;       // actual oop copied
1739     const Register r19_klass   = r19;       // oop._klass
1740 
1741     //---------------------------------------------------------------
1742     // Assembler stub will be used for this call to arraycopy
1743     // if the two arrays are subtypes of Object[] but the
1744     // destination array type is not equal to or a supertype
1745     // of the source type.  Each element must be separately
1746     // checked.
1747 
1748     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1749                                copied_oop, r19_klass, count_save);
1750 
1751     __ align(CodeEntryAlignment);
1752     StubCodeMark mark(this, "StubRoutines", name);
1753     address start = __ pc();
1754 
1755     __ enter(); // required for proper stackwalking of RuntimeStub frame
1756 
1757 #ifdef ASSERT
1758     // caller guarantees that the arrays really are different
1759     // otherwise, we would have to make conjoint checks
1760     { Label L;
1761       array_overlap_test(L, TIMES_OOP);
1762       __ stop("checkcast_copy within a single array");
1763       __ bind(L);
1764     }
1765 #endif //ASSERT
1766 
1767     // Caller of this entry point must set up the argument registers.
1768     if (entry != NULL) {
1769       *entry = __ pc();
1770       BLOCK_COMMENT("Entry:");
1771     }
1772 
1773     // Empty array:  Nothing to do.
1774     __ cbz(count, L_done);
1775 
1776     __ push(RegSet::of(r18, r19, r20, r21), sp);
1777 
1778 #ifdef ASSERT
1779     BLOCK_COMMENT("assert consistent ckoff/ckval");
1780     // The ckoff and ckval must be mutually consistent,
1781     // even though caller generates both.
1782     { Label L;
1783       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1784       __ ldrw(start_to, Address(ckval, sco_offset));
1785       __ cmpw(ckoff, start_to);
1786       __ br(Assembler::EQ, L);
1787       __ stop("super_check_offset inconsistent");
1788       __ bind(L);
1789     }
1790 #endif //ASSERT
1791 
1792     BarrierSetCodeGen *bs = Universe::heap()->barrier_set()->code_gen();
1793     DecoratorSet decorators = ARRAYCOPY_CONTRAVARIANT | ARRAYCOPY_DISJOINT;
1794     bool is_oop = true;
1795     if (dest_uninitialized) {
1796       decorators |= DEST_NOT_INITIALIZED;
1797     }
1798 
1799     bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs);
1800 
1801     // save the original count
1802     __ mov(count_save, count);
1803 
1804     // Copy from low to high addresses
1805     __ mov(start_to, to);              // Save destination array start address
1806     __ b(L_load_element);
1807 
1808     // ======== begin loop ========
1809     // (Loop is rotated; its entry is L_load_element.)
1810     // Loop control:
1811     //   for (; count != 0; count--) {
1812     //     copied_oop = load_heap_oop(from++);
1813     //     ... generate_type_check ...;
1814     //     store_heap_oop(to++, copied_oop);
1815     //   }
1816     __ align(OptoLoopAlignment);
1817 
1818     __ BIND(L_store_element);
1819     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1820     __ sub(count, count, 1);
1821     __ cbz(count, L_do_card_marks);
1822 
1823     // ======== loop entry is here ========
1824     __ BIND(L_load_element);
1825     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1826     __ cbz(copied_oop, L_store_element);
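         // A null element needs no type check; store it as-is.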
1827 
1828     __ load_klass(r19_klass, copied_oop); // query the object klass
1829     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1830     // ======== end loop ========
1831 
1832     // It was a real error; we must depend on the caller to finish the job.
1833     // Register count = remaining oops, count_orig = total oops.
1834     // Emit GC store barriers for the oops we have copied and report
1835     // their number to the caller.
1836 
1837     __ subs(count, count_save, count);     // K = partially copied oop count
1838     __ eon(count, count, zr);                   // report (-1^K) to caller
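         // eon with zr is bitwise NOT, so count now holds ~K == -1 ^ K, the
         // documented failure return value.  The flags still reflect the subs
         // above: EQ means K == 0, i.e. nothing was copied, so the card-mark
         // epilogue can be skipped entirely.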
1839     __ br(Assembler::EQ, L_done_pop);
1840 
1841     __ BIND(L_do_card_marks);
1842     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1843     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1, wb_post_saved_regs);
1844 
1845     __ bind(L_done_pop);
1846     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1847     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1848 
1849     __ bind(L_done);
1850     __ mov(r0, count);
1851     __ leave();
1852     __ ret(lr);
1853 
1854     return start;
1855   }
1856 
1857   // Perform range checks on the proposed arraycopy.
1858   // Kills temp, but nothing else.
1859   // Also, clean the sign bits of src_pos and dst_pos.
1860   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1861                               Register src_pos, // source position (c_rarg1)
1862                               Register dst,     // destination array oop (c_rarg2)
1863                               Register dst_pos, // destination position (c_rarg3)
1864                               Register length,
1865                               Register temp,
1866                               Label& L_failed) {
1867     BLOCK_COMMENT("arraycopy_range_checks:");
1868 
1869     assert_different_registers(rscratch1, temp);
1870 
1871     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1872     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1873     __ addw(temp, length, src_pos);
1874     __ cmpw(temp, rscratch1);
1875     __ br(Assembler::HI, L_failed);
1876 
1877     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1878     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1879     __ addw(temp, length, dst_pos);
1880     __ cmpw(temp, rscratch1);
1881     __ br(Assembler::HI, L_failed);
1882 
1883     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1884     __ movw(src_pos, src_pos);
1885     __ movw(dst_pos, dst_pos);
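         // A 32-bit register move zero-extends into the upper half, so the
         // scaled address computations that follow cannot pick up stale
         // high bits.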
1886 
1887     BLOCK_COMMENT("arraycopy_range_checks done");
1888   }
1889 
1890   // These stubs get called from some dumb test routine.
1891   // I'll write them properly when they're called from
1892   // something that's actually doing something.
1893   static void fake_arraycopy_stub(address src, address dst, int count) {
1894     assert(count == 0, "huh?");
1895   }
1896 
1897 
1898   //
1899   //  Generate 'unsafe' array copy stub
1900   //  Though just as safe as the other stubs, it takes an unscaled
1901   //  size_t argument instead of an element count.
1902   //
1903   //  Input:
1904   //    c_rarg0   - source array address
1905   //    c_rarg1   - destination array address
1906   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1907   //
1908   // Examines the alignment of the operands and dispatches
1909   // to a long, int, short, or byte copy loop.
1910   //
1911   address generate_unsafe_copy(const char *name,
1912                                address byte_copy_entry,
1913                                address short_copy_entry,
1914                                address int_copy_entry,
1915                                address long_copy_entry) {
1916     Label L_long_aligned, L_int_aligned, L_short_aligned;
1917     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1918 
1919     __ align(CodeEntryAlignment);
1920     StubCodeMark mark(this, "StubRoutines", name);
1921     address start = __ pc();
1922     __ enter(); // required for proper stackwalking of RuntimeStub frame
1923 
1924     // bump this on entry, not on exit:
1925     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1926 
1927     __ orr(rscratch1, s, d);
1928     __ orr(rscratch1, rscratch1, count);
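         // The OR of source, destination and byte count is N-byte aligned
         // exactly when all three are, so a single value drives the
         // alignment dispatch below.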
1929 
1930     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1931     __ cbz(rscratch1, L_long_aligned);
1932     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1933     __ cbz(rscratch1, L_int_aligned);
1934     __ tbz(rscratch1, 0, L_short_aligned);
1935     __ b(RuntimeAddress(byte_copy_entry));
1936 
1937     __ BIND(L_short_aligned);
1938     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1939     __ b(RuntimeAddress(short_copy_entry));
1940     __ BIND(L_int_aligned);
1941     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1942     __ b(RuntimeAddress(int_copy_entry));
1943     __ BIND(L_long_aligned);
1944     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1945     __ b(RuntimeAddress(long_copy_entry));
1946 
1947     return start;
1948   }
1949 
1950   //
1951   //  Generate generic array copy stubs
1952   //
1953   //  Input:
1954   //    c_rarg0    -  src oop
1955   //    c_rarg1    -  src_pos (32-bits)
1956   //    c_rarg2    -  dst oop
1957   //    c_rarg3    -  dst_pos (32-bits)
1958   //    c_rarg4    -  element count (32-bits)
1959   //
1960   //  Output:
1961   //    r0 ==  0  -  success
1962   //    r0 == -1^K - failure, where K is partial transfer count
1963   //
1964   address generate_generic_copy(const char *name,
1965                                 address byte_copy_entry, address short_copy_entry,
1966                                 address int_copy_entry, address oop_copy_entry,
1967                                 address long_copy_entry, address checkcast_copy_entry) {
1968 
1969     Label L_failed, L_failed_0, L_objArray;
1970     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1971 
1972     // Input registers
1973     const Register src        = c_rarg0;  // source array oop
1974     const Register src_pos    = c_rarg1;  // source position
1975     const Register dst        = c_rarg2;  // destination array oop
1976     const Register dst_pos    = c_rarg3;  // destination position
1977     const Register length     = c_rarg4;
1978 
1979     StubCodeMark mark(this, "StubRoutines", name);
1980 
1981     __ align(CodeEntryAlignment);
1982     address start = __ pc();
1983 
1984     __ enter(); // required for proper stackwalking of RuntimeStub frame
1985 
1986     // bump this on entry, not on exit:
1987     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1988 
1989     //-----------------------------------------------------------------------
1990     // Assembler stub will be used for this call to arraycopy
1991     // if the following conditions are met:
1992     //
1993     // (1) src and dst must not be null.
1994     // (2) src_pos must not be negative.
1995     // (3) dst_pos must not be negative.
1996     // (4) length  must not be negative.
1997     // (5) src klass and dst klass should be the same and not NULL.
1998     // (6) src and dst should be arrays.
1999     // (7) src_pos + length must not exceed length of src.
2000     // (8) dst_pos + length must not exceed length of dst.
2001     //
2002 
2003     //  if (src == NULL) return -1;
2004     __ cbz(src, L_failed);
2005 
2006     //  if (src_pos < 0) return -1;
2007     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2008 
2009     //  if (dst == NULL) return -1;
2010     __ cbz(dst, L_failed);
2011 
2012     //  if (dst_pos < 0) return -1;
2013     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2014 
2015     // registers used as temp
2016     const Register scratch_length    = r16; // elements count to copy
2017     const Register scratch_src_klass = r17; // array klass
2018     const Register lh                = r18; // layout helper
2019 
2020     //  if (length < 0) return -1;
2021     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2022     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2023 
2024     __ load_klass(scratch_src_klass, src);
2025 #ifdef ASSERT
2026     //  assert(src->klass() != NULL);
2027     {
2028       BLOCK_COMMENT("assert klasses not null {");
2029       Label L1, L2;
2030       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2031       __ bind(L1);
2032       __ stop("broken null klass");
2033       __ bind(L2);
2034       __ load_klass(rscratch1, dst);
2035       __ cbz(rscratch1, L1);     // this would be broken also
2036       BLOCK_COMMENT("} assert klasses not null done");
2037     }
2038 #endif
2039 
2040     // Load layout helper (32-bits)
2041     //
2042     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2043     // 32        30    24            16              8     2                 0
2044     //
2045     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2046     //
2047 
2048     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2049 
2050     // Handle objArrays completely differently...
2051     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2052     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2053     __ movw(rscratch1, objArray_lh);
2054     __ eorw(rscratch2, lh, rscratch1);
2055     __ cbzw(rscratch2, L_objArray);
2056 
2057     //  if (src->klass() != dst->klass()) return -1;
2058     __ load_klass(rscratch2, dst);
2059     __ eor(rscratch2, rscratch2, scratch_src_klass);
2060     __ cbnz(rscratch2, L_failed);
2061 
2062     //  if (!src->is_Array()) return -1;
2063     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2064 
2065     // At this point, it is known to be a typeArray (array_tag 0x3).
2066 #ifdef ASSERT
2067     {
2068       BLOCK_COMMENT("assert primitive array {");
2069       Label L;
2070       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2071       __ cmpw(lh, rscratch2);
2072       __ br(Assembler::GE, L);
2073       __ stop("must be a primitive array");
2074       __ bind(L);
2075       BLOCK_COMMENT("} assert primitive array done");
2076     }
2077 #endif
2078 
2079     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2080                            rscratch2, L_failed);
2081 
2082     // TypeArrayKlass
2083     //
2084     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2085     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2086     //
2087 
2088     const Register rscratch1_offset = rscratch1;    // array offset
2089     const Register r18_elsize = lh; // element size
2090 
2091     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2092            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2093     __ add(src, src, rscratch1_offset);           // src array offset
2094     __ add(dst, dst, rscratch1_offset);           // dst array offset
2095     BLOCK_COMMENT("choose copy loop based on element size");
2096 
2097     // next registers should be set before the jump to corresponding stub
2098     const Register from     = c_rarg0;  // source array address
2099     const Register to       = c_rarg1;  // destination array address
2100     const Register count    = c_rarg2;  // elements count
2101 
2102     // 'from', 'to', 'count' registers should be set in such order
2103     // since they are the same as 'src', 'src_pos', 'dst'.
2104 
2105     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2106 
2107     // The possible values of elsize are 0-3, i.e. exact_log2(element
2108     // size in bytes).  We do a simple bitwise binary search.
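         // For example, an int array has elsize == 2 (0b10): bit 1 is set, so
         // we branch to L_copy_ints; there bit 0 is clear, so we stay on the
         // int path instead of falling through to L_copy_longs.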
2109   __ BIND(L_copy_bytes);
2110     __ tbnz(r18_elsize, 1, L_copy_ints);
2111     __ tbnz(r18_elsize, 0, L_copy_shorts);
2112     __ lea(from, Address(src, src_pos));// src_addr
2113     __ lea(to,   Address(dst, dst_pos));// dst_addr
2114     __ movw(count, scratch_length); // length
2115     __ b(RuntimeAddress(byte_copy_entry));
2116 
2117   __ BIND(L_copy_shorts);
2118     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2119     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2120     __ movw(count, scratch_length); // length
2121     __ b(RuntimeAddress(short_copy_entry));
2122 
2123   __ BIND(L_copy_ints);
2124     __ tbnz(r18_elsize, 0, L_copy_longs);
2125     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2126     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2127     __ movw(count, scratch_length); // length
2128     __ b(RuntimeAddress(int_copy_entry));
2129 
2130   __ BIND(L_copy_longs);
2131 #ifdef ASSERT
2132     {
2133       BLOCK_COMMENT("assert long copy {");
2134       Label L;
2135       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2136       __ cmpw(r18_elsize, LogBytesPerLong);
2137       __ br(Assembler::EQ, L);
2138       __ stop("must be long copy, but elsize is wrong");
2139       __ bind(L);
2140       BLOCK_COMMENT("} assert long copy done");
2141     }
2142 #endif
2143     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2144     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2145     __ movw(count, scratch_length); // length
2146     __ b(RuntimeAddress(long_copy_entry));
2147 
2148     // ObjArrayKlass
2149   __ BIND(L_objArray);
2150     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2151 
2152     Label L_plain_copy, L_checkcast_copy;
2153     //  test array classes for subtyping
2154     __ load_klass(r18, dst);
2155     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2156     __ br(Assembler::NE, L_checkcast_copy);
2157 
2158     // Identically typed arrays can be copied without element-wise checks.
2159     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2160                            rscratch2, L_failed);
2161 
2162     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2163     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2164     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2165     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2166     __ movw(count, scratch_length); // length
2167   __ BIND(L_plain_copy);
2168     __ b(RuntimeAddress(oop_copy_entry));
2169 
2170   __ BIND(L_checkcast_copy);
2171     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2172     {
2173       // Before looking at dst.length, make sure dst is also an objArray.
2174       __ ldrw(rscratch1, Address(r18, lh_offset));
2175       __ movw(rscratch2, objArray_lh);
2176       __ eorw(rscratch1, rscratch1, rscratch2);
2177       __ cbnzw(rscratch1, L_failed);
2178 
2179       // It is safe to examine both src.length and dst.length.
2180       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2181                              r18, L_failed);
2182 
2183       const Register rscratch2_dst_klass = rscratch2;
2184       __ load_klass(rscratch2_dst_klass, dst); // reload
2185 
2186       // Marshal the base address arguments now, freeing registers.
2187       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2188       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2189       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2190       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2191       __ movw(count, length);           // length (reloaded)
2192       Register sco_temp = c_rarg3;      // this register is free now
2193       assert_different_registers(from, to, count, sco_temp,
2194                                  rscratch2_dst_klass, scratch_src_klass);
2195       // assert_clean_int(count, sco_temp);
2196 
2197       // Generate the type check.
2198       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2199       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2200       // assert_clean_int(sco_temp, r18);
2201       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2202 
2203       // Fetch destination element klass from the ObjArrayKlass header.
2204       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2205       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2206       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2207 
2208       // the checkcast_copy loop needs two extra arguments:
2209       assert(c_rarg3 == sco_temp, "#3 already in place");
2210       // Set up arguments for checkcast_copy_entry.
2211       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2212       __ b(RuntimeAddress(checkcast_copy_entry));
2213     }
2214 
2215   __ BIND(L_failed);
2216     __ mov(r0, -1);
2217     __ leave();   // required for proper stackwalking of RuntimeStub frame
2218     __ ret(lr);
2219 
2220     return start;
2221   }
2222 
2223   //
2224   // Generate stub for array fill. If "aligned" is true, the
2225   // "to" address is assumed to be heapword aligned.
2226   //
2227   // Arguments for generated stub:
2228   //   to:    c_rarg0
2229   //   value: c_rarg1
2230   //   count: c_rarg2 treated as signed
2231   //
2232   address generate_fill(BasicType t, bool aligned, const char *name) {
2233     __ align(CodeEntryAlignment);
2234     StubCodeMark mark(this, "StubRoutines", name);
2235     address start = __ pc();
2236 
2237     BLOCK_COMMENT("Entry:");
2238 
2239     const Register to        = c_rarg0;  // destination array address
2240     const Register value     = c_rarg1;  // value
2241     const Register count     = c_rarg2;  // elements count
2242 
2243     const Register bz_base = r10;        // base for block_zero routine
2244     const Register cnt_words = r11;      // temp register
2245 
2246     __ enter();
2247 
2248     Label L_fill_elements, L_exit1;
2249 
2250     int shift = -1;
2251     switch (t) {
2252       case T_BYTE:
2253         shift = 0;
2254         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2255         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2256         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2257         __ br(Assembler::LO, L_fill_elements);
2258         break;
2259       case T_SHORT:
2260         shift = 1;
2261         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2262         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2263         __ br(Assembler::LO, L_fill_elements);
2264         break;
2265       case T_INT:
2266         shift = 2;
2267         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2268         __ br(Assembler::LO, L_fill_elements);
2269         break;
2270       default: ShouldNotReachHere();
2271     }
2272 
2273     // Align the destination address on an 8-byte boundary.
2274     Label L_skip_align1, L_skip_align2, L_skip_align4;
2275     if (!aligned) {
2276       switch (t) {
2277         case T_BYTE:
2278           // One byte misalignment happens only for byte arrays.
2279           __ tbz(to, 0, L_skip_align1);
2280           __ strb(value, Address(__ post(to, 1)));
2281           __ subw(count, count, 1);
2282           __ bind(L_skip_align1);
2283           // Fallthrough
2284         case T_SHORT:
2285           // Two bytes misalignment happens only for byte and short (char) arrays.
2286           __ tbz(to, 1, L_skip_align2);
2287           __ strh(value, Address(__ post(to, 2)));
2288           __ subw(count, count, 2 >> shift);
2289           __ bind(L_skip_align2);
2290           // Fallthrough
2291         case T_INT:
2292           // Align to 8 bytes, we know we are 4 byte aligned to start.
2293           __ tbz(to, 2, L_skip_align4);
2294           __ strw(value, Address(__ post(to, 4)));
2295           __ subw(count, count, 4 >> shift);
2296           __ bind(L_skip_align4);
2297           break;
2298         default: ShouldNotReachHere();
2299       }
2300     }
2301 
2302     //
2303     //  Fill large chunks
2304     //
2305     __ lsrw(cnt_words, count, 3 - shift); // number of words
2306     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2307     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
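         // cnt_words is the element count converted to 8-byte words; the
         // subtraction leaves in 'count' only the tail elements (fewer than
         // 8 bytes' worth) that the bulk fill below will not cover.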
2308     if (UseBlockZeroing) {
2309       Label non_block_zeroing, rest;
2310       // If the fill value is zero we can use the fast zero_words().
2311       __ cbnz(value, non_block_zeroing);
2312       __ mov(bz_base, to);
2313       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2314       __ zero_words(bz_base, cnt_words);
2315       __ b(rest);
2316       __ bind(non_block_zeroing);
2317       __ fill_words(to, cnt_words, value);
2318       __ bind(rest);
2319     } else {
2320       __ fill_words(to, cnt_words, value);
2321     }
2322 
2323     // Remaining count is less than 8 bytes. Fill it by a single store.
2324     // Note that the total length is no less than 8 bytes.
2325     if (t == T_BYTE || t == T_SHORT) {
2327       __ cbzw(count, L_exit1);
2328       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2329       __ str(value, Address(to, -8));    // overwrite some elements
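           // The filled region is at least 8 bytes long, so one unaligned
           // 8-byte store ending at its last byte covers the tail; the bytes
           // it rewrites already hold the fill value.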
2330       __ bind(L_exit1);
2331       __ leave();
2332       __ ret(lr);
2333     }
2334 
2335     // Handle fills of less than 8 bytes.
2336     Label L_fill_2, L_fill_4, L_exit2;
2337     __ bind(L_fill_elements);
2338     switch (t) {
2339       case T_BYTE:
2340         __ tbz(count, 0, L_fill_2);
2341         __ strb(value, Address(__ post(to, 1)));
2342         __ bind(L_fill_2);
2343         __ tbz(count, 1, L_fill_4);
2344         __ strh(value, Address(__ post(to, 2)));
2345         __ bind(L_fill_4);
2346         __ tbz(count, 2, L_exit2);
2347         __ strw(value, Address(to));
2348         break;
2349       case T_SHORT:
2350         __ tbz(count, 0, L_fill_4);
2351         __ strh(value, Address(__ post(to, 2)));
2352         __ bind(L_fill_4);
2353         __ tbz(count, 1, L_exit2);
2354         __ strw(value, Address(to));
2355         break;
2356       case T_INT:
2357         __ cbzw(count, L_exit2);
2358         __ strw(value, Address(to));
2359         break;
2360       default: ShouldNotReachHere();
2361     }
2362     __ bind(L_exit2);
2363     __ leave();
2364     __ ret(lr);
2365     return start;
2366   }
2367 
2368   void generate_arraycopy_stubs() {
2369     address entry;
2370     address entry_jbyte_arraycopy;
2371     address entry_jshort_arraycopy;
2372     address entry_jint_arraycopy;
2373     address entry_oop_arraycopy;
2374     address entry_jlong_arraycopy;
2375     address entry_checkcast_arraycopy;
2376 
2377     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2378     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2379 
2380     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2381 
2382     //*** jbyte
2383     // Always need aligned and unaligned versions
2384     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2385                                                                                   "jbyte_disjoint_arraycopy");
2386     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2387                                                                                   &entry_jbyte_arraycopy,
2388                                                                                   "jbyte_arraycopy");
2389     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2390                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2391     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2392                                                                                   "arrayof_jbyte_arraycopy");
2393 
2394     //*** jshort
2395     // Always need aligned and unaligned versions
2396     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2397                                                                                     "jshort_disjoint_arraycopy");
2398     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2399                                                                                     &entry_jshort_arraycopy,
2400                                                                                     "jshort_arraycopy");
2401     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2402                                                                                     "arrayof_jshort_disjoint_arraycopy");
2403     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2404                                                                                     "arrayof_jshort_arraycopy");
2405 
2406     //*** jint
2407     // Aligned versions
2408     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2409                                                                                 "arrayof_jint_disjoint_arraycopy");
2410     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2411                                                                                 "arrayof_jint_arraycopy");
2412     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2413     // entry_jint_arraycopy always points to the unaligned version
2414     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2415                                                                                 "jint_disjoint_arraycopy");
2416     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2417                                                                                 &entry_jint_arraycopy,
2418                                                                                 "jint_arraycopy");
2419 
2420     //*** jlong
2421     // It is always aligned
2422     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2423                                                                                   "arrayof_jlong_disjoint_arraycopy");
2424     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2425                                                                                   "arrayof_jlong_arraycopy");
2426     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2427     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2428 
2429     //*** oops
2430     {
2431       // With compressed oops we need unaligned versions; notice that
2432       // we overwrite entry_oop_arraycopy.
2433       bool aligned = !UseCompressedOops;
2434 
2435       StubRoutines::_arrayof_oop_disjoint_arraycopy
2436         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2437                                      /*dest_uninitialized*/false);
2438       StubRoutines::_arrayof_oop_arraycopy
2439         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2440                                      /*dest_uninitialized*/false);
2441       // Aligned versions without pre-barriers
2442       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2443         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2444                                      /*dest_uninitialized*/true);
2445       StubRoutines::_arrayof_oop_arraycopy_uninit
2446         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2447                                      /*dest_uninitialized*/true);
2448     }
2449 
2450     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2451     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2452     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2453     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2454 
2455     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2456     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2457                                                                         /*dest_uninitialized*/true);
2458 
2459     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2460                                                               entry_jbyte_arraycopy,
2461                                                               entry_jshort_arraycopy,
2462                                                               entry_jint_arraycopy,
2463                                                               entry_jlong_arraycopy);
2464 
2465     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2466                                                                entry_jbyte_arraycopy,
2467                                                                entry_jshort_arraycopy,
2468                                                                entry_jint_arraycopy,
2469                                                                entry_oop_arraycopy,
2470                                                                entry_jlong_arraycopy,
2471                                                                entry_checkcast_arraycopy);
2472 
2473     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2474     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2475     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2476     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2477     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2478     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2479   }
2480 
2481   void generate_math_stubs() { Unimplemented(); }
2482 
2483   // Arguments:
2484   //
2485   // Inputs:
2486   //   c_rarg0   - source byte array address
2487   //   c_rarg1   - destination byte array address
2488   //   c_rarg2   - K (key) in little endian int array
2489   //
2490   address generate_aescrypt_encryptBlock() {
2491     __ align(CodeEntryAlignment);
2492     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2493 
2494     Label L_doLast;
2495 
2496     const Register from        = c_rarg0;  // source array address
2497     const Register to          = c_rarg1;  // destination array address
2498     const Register key         = c_rarg2;  // key array address
2499     const Register keylen      = rscratch1;
2500 
2501     address start = __ pc();
2502     __ enter();
2503 
2504     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
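         // keylen is the expanded key length in ints: 44, 52 or 60 for
         // AES-128, AES-192 and AES-256 respectively, which is how the
         // comparisons below select the number of rounds.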
2505 
2506     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2507 
2508     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2509     __ rev32(v1, __ T16B, v1);
2510     __ rev32(v2, __ T16B, v2);
2511     __ rev32(v3, __ T16B, v3);
2512     __ rev32(v4, __ T16B, v4);
2513     __ aese(v0, v1);
2514     __ aesmc(v0, v0);
2515     __ aese(v0, v2);
2516     __ aesmc(v0, v0);
2517     __ aese(v0, v3);
2518     __ aesmc(v0, v0);
2519     __ aese(v0, v4);
2520     __ aesmc(v0, v0);
2521 
2522     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2523     __ rev32(v1, __ T16B, v1);
2524     __ rev32(v2, __ T16B, v2);
2525     __ rev32(v3, __ T16B, v3);
2526     __ rev32(v4, __ T16B, v4);
2527     __ aese(v0, v1);
2528     __ aesmc(v0, v0);
2529     __ aese(v0, v2);
2530     __ aesmc(v0, v0);
2531     __ aese(v0, v3);
2532     __ aesmc(v0, v0);
2533     __ aese(v0, v4);
2534     __ aesmc(v0, v0);
2535 
2536     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2537     __ rev32(v1, __ T16B, v1);
2538     __ rev32(v2, __ T16B, v2);
2539 
2540     __ cmpw(keylen, 44);
2541     __ br(Assembler::EQ, L_doLast);
2542 
2543     __ aese(v0, v1);
2544     __ aesmc(v0, v0);
2545     __ aese(v0, v2);
2546     __ aesmc(v0, v0);
2547 
2548     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2549     __ rev32(v1, __ T16B, v1);
2550     __ rev32(v2, __ T16B, v2);
2551 
2552     __ cmpw(keylen, 52);
2553     __ br(Assembler::EQ, L_doLast);
2554 
2555     __ aese(v0, v1);
2556     __ aesmc(v0, v0);
2557     __ aese(v0, v2);
2558     __ aesmc(v0, v0);
2559 
2560     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2561     __ rev32(v1, __ T16B, v1);
2562     __ rev32(v2, __ T16B, v2);
2563 
2564     __ BIND(L_doLast);
2565 
2566     __ aese(v0, v1);
2567     __ aesmc(v0, v0);
2568     __ aese(v0, v2);
2569 
2570     __ ld1(v1, __ T16B, key);
2571     __ rev32(v1, __ T16B, v1);
2572     __ eor(v0, __ T16B, v0, v1);
2573 
2574     __ st1(v0, __ T16B, to);
2575 
2576     __ mov(r0, 0);
2577 
2578     __ leave();
2579     __ ret(lr);
2580 
2581     return start;
2582   }
2583 
2584   // Arguments:
2585   //
2586   // Inputs:
2587   //   c_rarg0   - source byte array address
2588   //   c_rarg1   - destination byte array address
2589   //   c_rarg2   - K (key) in little endian int array
2590   //
2591   address generate_aescrypt_decryptBlock() {
2592     assert(UseAES, "need AES instructions");
2593     __ align(CodeEntryAlignment);
2594     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2595     Label L_doLast;
2596 
2597     const Register from        = c_rarg0;  // source array address
2598     const Register to          = c_rarg1;  // destination array address
2599     const Register key         = c_rarg2;  // key array address
2600     const Register keylen      = rscratch1;
2601 
2602     address start = __ pc();
2603     __ enter(); // required for proper stackwalking of RuntimeStub frame
2604 
2605     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2606 
2607     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2608 
2609     __ ld1(v5, __ T16B, __ post(key, 16));
2610     __ rev32(v5, __ T16B, v5);
2611 
2612     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2613     __ rev32(v1, __ T16B, v1);
2614     __ rev32(v2, __ T16B, v2);
2615     __ rev32(v3, __ T16B, v3);
2616     __ rev32(v4, __ T16B, v4);
2617     __ aesd(v0, v1);
2618     __ aesimc(v0, v0);
2619     __ aesd(v0, v2);
2620     __ aesimc(v0, v0);
2621     __ aesd(v0, v3);
2622     __ aesimc(v0, v0);
2623     __ aesd(v0, v4);
2624     __ aesimc(v0, v0);
2625 
2626     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2627     __ rev32(v1, __ T16B, v1);
2628     __ rev32(v2, __ T16B, v2);
2629     __ rev32(v3, __ T16B, v3);
2630     __ rev32(v4, __ T16B, v4);
2631     __ aesd(v0, v1);
2632     __ aesimc(v0, v0);
2633     __ aesd(v0, v2);
2634     __ aesimc(v0, v0);
2635     __ aesd(v0, v3);
2636     __ aesimc(v0, v0);
2637     __ aesd(v0, v4);
2638     __ aesimc(v0, v0);
2639 
2640     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2641     __ rev32(v1, __ T16B, v1);
2642     __ rev32(v2, __ T16B, v2);
2643 
2644     __ cmpw(keylen, 44);
2645     __ br(Assembler::EQ, L_doLast);
2646 
2647     __ aesd(v0, v1);
2648     __ aesimc(v0, v0);
2649     __ aesd(v0, v2);
2650     __ aesimc(v0, v0);
2651 
2652     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2653     __ rev32(v1, __ T16B, v1);
2654     __ rev32(v2, __ T16B, v2);
2655 
2656     __ cmpw(keylen, 52);
2657     __ br(Assembler::EQ, L_doLast);
2658 
2659     __ aesd(v0, v1);
2660     __ aesimc(v0, v0);
2661     __ aesd(v0, v2);
2662     __ aesimc(v0, v0);
2663 
2664     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2665     __ rev32(v1, __ T16B, v1);
2666     __ rev32(v2, __ T16B, v2);
2667 
2668     __ BIND(L_doLast);
2669 
2670     __ aesd(v0, v1);
2671     __ aesimc(v0, v0);
2672     __ aesd(v0, v2);
2673 
2674     __ eor(v0, __ T16B, v0, v5);
2675 
2676     __ st1(v0, __ T16B, to);
2677 
2678     __ mov(r0, 0);
2679 
2680     __ leave();
2681     __ ret(lr);
2682 
2683     return start;
2684   }
2685 
2686   // Arguments:
2687   //
2688   // Inputs:
2689   //   c_rarg0   - source byte array address
2690   //   c_rarg1   - destination byte array address
2691   //   c_rarg2   - K (key) in little endian int array
2692   //   c_rarg3   - r vector byte array address
2693   //   c_rarg4   - input length
2694   //
2695   // Output:
2696   //   x0        - input length
2697   //
2698   address generate_cipherBlockChaining_encryptAESCrypt() {
2699     assert(UseAES, "need AES instructions");
2700     __ align(CodeEntryAlignment);
2701     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2702 
2703     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2704 
2705     const Register from        = c_rarg0;  // source array address
2706     const Register to          = c_rarg1;  // destination array address
2707     const Register key         = c_rarg2;  // key array address
2708     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2709                                            // and left with the results of the last encryption block
2710     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2711     const Register keylen      = rscratch1;
2712 
2713     address start = __ pc();
2714 
2715       __ enter();
2716 
2717       __ movw(rscratch2, len_reg);
2718 
2719       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2720 
2721       __ ld1(v0, __ T16B, rvec);
2722 
2723       __ cmpw(keylen, 52);
2724       __ br(Assembler::CC, L_loadkeys_44);
2725       __ br(Assembler::EQ, L_loadkeys_52);
2726 
2727       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2728       __ rev32(v17, __ T16B, v17);
2729       __ rev32(v18, __ T16B, v18);
2730     __ BIND(L_loadkeys_52);
2731       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2732       __ rev32(v19, __ T16B, v19);
2733       __ rev32(v20, __ T16B, v20);
2734     __ BIND(L_loadkeys_44);
2735       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2736       __ rev32(v21, __ T16B, v21);
2737       __ rev32(v22, __ T16B, v22);
2738       __ rev32(v23, __ T16B, v23);
2739       __ rev32(v24, __ T16B, v24);
2740       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2741       __ rev32(v25, __ T16B, v25);
2742       __ rev32(v26, __ T16B, v26);
2743       __ rev32(v27, __ T16B, v27);
2744       __ rev32(v28, __ T16B, v28);
2745       __ ld1(v29, v30, v31, __ T16B, key);
2746       __ rev32(v29, __ T16B, v29);
2747       __ rev32(v30, __ T16B, v30);
2748       __ rev32(v31, __ T16B, v31);
2749 
2750     __ BIND(L_aes_loop);
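           // CBC: XOR the plaintext block with the previous ciphertext block
           // (initially the IV held in v0) before running the AES rounds.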
2751       __ ld1(v1, __ T16B, __ post(from, 16));
2752       __ eor(v0, __ T16B, v0, v1);
2753 
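           // The condition flags from the keylen comparison before the loop
           // are still live here: nothing in the loop body sets flags, so
           // each iteration re-dispatches on the key size for free.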
2754       __ br(Assembler::CC, L_rounds_44);
2755       __ br(Assembler::EQ, L_rounds_52);
2756 
2757       __ aese(v0, v17); __ aesmc(v0, v0);
2758       __ aese(v0, v18); __ aesmc(v0, v0);
2759     __ BIND(L_rounds_52);
2760       __ aese(v0, v19); __ aesmc(v0, v0);
2761       __ aese(v0, v20); __ aesmc(v0, v0);
2762     __ BIND(L_rounds_44);
2763       __ aese(v0, v21); __ aesmc(v0, v0);
2764       __ aese(v0, v22); __ aesmc(v0, v0);
2765       __ aese(v0, v23); __ aesmc(v0, v0);
2766       __ aese(v0, v24); __ aesmc(v0, v0);
2767       __ aese(v0, v25); __ aesmc(v0, v0);
2768       __ aese(v0, v26); __ aesmc(v0, v0);
2769       __ aese(v0, v27); __ aesmc(v0, v0);
2770       __ aese(v0, v28); __ aesmc(v0, v0);
2771       __ aese(v0, v29); __ aesmc(v0, v0);
2772       __ aese(v0, v30);
2773       __ eor(v0, __ T16B, v0, v31);
2774 
2775       __ st1(v0, __ T16B, __ post(to, 16));
2776 
2777       __ subw(len_reg, len_reg, 16);
2778       __ cbnzw(len_reg, L_aes_loop);
2779 
2780       __ st1(v0, __ T16B, rvec);
2781 
2782       __ mov(r0, rscratch2);
2783 
2784       __ leave();
2785       __ ret(lr);
2786 
2787       return start;
2788   }
2789 
2790   // Arguments:
2791   //
2792   // Inputs:
2793   //   c_rarg0   - source byte array address
2794   //   c_rarg1   - destination byte array address
2795   //   c_rarg2   - K (key) in little endian int array
2796   //   c_rarg3   - r vector byte array address
2797   //   c_rarg4   - input length
2798   //
2799   // Output:
2800   //   r0        - input length
2801   //
2802   address generate_cipherBlockChaining_decryptAESCrypt() {
2803     assert(UseAES, "need AES cryptographic extension support");
2804     __ align(CodeEntryAlignment);
2805     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2806 
2807     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2808 
2809     const Register from        = c_rarg0;  // source array address
2810     const Register to          = c_rarg1;  // destination array address
2811     const Register key         = c_rarg2;  // key array address
2812     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector (IV) address,
2813                                            // and left holding the last ciphertext block on exit
2814     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2815     const Register keylen      = rscratch1;
2816 
2817     address start = __ pc();
2818 
2819       __ enter();
2820 
2821       __ movw(rscratch2, len_reg);
2822 
2823       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2824 
2825       __ ld1(v2, __ T16B, rvec);
2826 
2827       __ ld1(v31, __ T16B, __ post(key, 16));
2828       __ rev32(v31, __ T16B, v31);
2829 
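      // As in the encrypt stub, keylen is 44, 52 or 60 ints for
      // AES-128/192/256; load only the round keys that key size needs.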
2830       __ cmpw(keylen, 52);
2831       __ br(Assembler::CC, L_loadkeys_44);
2832       __ br(Assembler::EQ, L_loadkeys_52);
2833 
2834       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2835       __ rev32(v17, __ T16B, v17);
2836       __ rev32(v18, __ T16B, v18);
2837     __ BIND(L_loadkeys_52);
2838       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2839       __ rev32(v19, __ T16B, v19);
2840       __ rev32(v20, __ T16B, v20);
2841     __ BIND(L_loadkeys_44);
2842       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2843       __ rev32(v21, __ T16B, v21);
2844       __ rev32(v22, __ T16B, v22);
2845       __ rev32(v23, __ T16B, v23);
2846       __ rev32(v24, __ T16B, v24);
2847       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2848       __ rev32(v25, __ T16B, v25);
2849       __ rev32(v26, __ T16B, v26);
2850       __ rev32(v27, __ T16B, v27);
2851       __ rev32(v28, __ T16B, v28);
2852       __ ld1(v29, v30, __ T16B, key);
2853       __ rev32(v29, __ T16B, v29);
2854       __ rev32(v30, __ T16B, v30);
2855 
2856     __ BIND(L_aes_loop);
2857       __ ld1(v0, __ T16B, __ post(from, 16));
2858       __ orr(v1, __ T16B, v0, v0);   // keep a copy of the ciphertext; it becomes the next block's IV
2859 
2860       __ br(Assembler::CC, L_rounds_44);
2861       __ br(Assembler::EQ, L_rounds_52);
2862 
2863       __ aesd(v0, v17); __ aesimc(v0, v0);
2864       __ aesd(v0, v18); __ aesimc(v0, v0);
2865     __ BIND(L_rounds_52);
2866       __ aesd(v0, v19); __ aesimc(v0, v0);
2867       __ aesd(v0, v20); __ aesimc(v0, v0);
2868     __ BIND(L_rounds_44);
2869       __ aesd(v0, v21); __ aesimc(v0, v0);
2870       __ aesd(v0, v22); __ aesimc(v0, v0);
2871       __ aesd(v0, v23); __ aesimc(v0, v0);
2872       __ aesd(v0, v24); __ aesimc(v0, v0);
2873       __ aesd(v0, v25); __ aesimc(v0, v0);
2874       __ aesd(v0, v26); __ aesimc(v0, v0);
2875       __ aesd(v0, v27); __ aesimc(v0, v0);
2876       __ aesd(v0, v28); __ aesimc(v0, v0);
2877       __ aesd(v0, v29); __ aesimc(v0, v0);
2878       __ aesd(v0, v30);
2879       __ eor(v0, __ T16B, v0, v31);
2880       __ eor(v0, __ T16B, v0, v2);
2881 
2882       __ st1(v0, __ T16B, __ post(to, 16));
2883       __ orr(v2, __ T16B, v1, v1);   // saved ciphertext is the r vector for the next block
2884 
2885       __ subw(len_reg, len_reg, 16);
2886       __ cbnzw(len_reg, L_aes_loop);
2887 
2888       __ st1(v2, __ T16B, rvec);
2889 
2890       __ mov(r0, rscratch2);
2891 
2892       __ leave();
2893       __ ret(lr);
2894 
2895     return start;
2896   }
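
  // In C, the decryption loop above is approximately the following sketch
  // (illustrative only; aes_decrypt_block() is a hypothetical helper standing
  // in for the AESD/AESIMC round sequence):
  //
  //   void cbc_decrypt(uint8_t *from, uint8_t *to, uint32_t *key,
  //                    uint8_t *rvec, int len) {
  //     uint8_t plain[16];
  //     for (int i = 0; i < len; i += 16) {
  //       aes_decrypt_block(from + i, plain, key);
  //       for (int j = 0; j < 16; j++)
  //         to[i + j] = plain[j] ^ rvec[j];  // XOR with previous ciphertext (or the IV)
  //       memcpy(rvec, from + i, 16);        // this ciphertext is the next block's IV,
  //     }                                    // which is why the stub copies it aside in v1/v2
  //   }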
2897 
2898   // Arguments:
2899   //
2900   // Inputs:
2901   //   c_rarg0   - byte[]  source+offset
2902   //   c_rarg1   - int[]   SHA.state
2903   //   c_rarg2   - int     offset
2904   //   c_rarg3   - int     limit
2905   //
2906   address generate_sha1_implCompress(bool multi_block, const char *name) {
2907     __ align(CodeEntryAlignment);
2908     StubCodeMark mark(this, "StubRoutines", name);
2909     address start = __ pc();
2910 
2911     Register buf   = c_rarg0;
2912     Register state = c_rarg1;
2913     Register ofs   = c_rarg2;
2914     Register limit = c_rarg3;
2915 
2916     Label keys;
2917     Label sha1_loop;
2918 
2919     // load the keys into v0..v3
2920     __ adr(rscratch1, keys);
2921     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2922     // load the 5-word state: 4 words into v6, the fifth into v7
2923     __ ldrq(v6, Address(state, 0));
2924     __ ldrs(v7, Address(state, 16));
2925 
2926 
2927     __ BIND(sha1_loop);
2928     // load 64 bytes of data into v16..v19
2929     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2930     __ rev32(v16, __ T16B, v16);
2931     __ rev32(v17, __ T16B, v17);
2932     __ rev32(v18, __ T16B, v18);
2933     __ rev32(v19, __ T16B, v19);
2934 
2935     // do the sha1
2936     __ addv(v4, __ T4S, v16, v0);
2937     __ orr(v20, __ T16B, v6, v6);
2938 
2939     FloatRegister d0 = v16;
2940     FloatRegister d1 = v17;
2941     FloatRegister d2 = v18;
2942     FloatRegister d3 = v19;
2943 
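    // Each iteration of this loop performs four SHA-1 rounds, so 20 iterations
    // cover all 80: sha1c (Ch) for rounds 0-19, sha1p (parity) for 20-39 and
    // 60-79, sha1m (Maj) for 40-59. The d0..d3 rotation cycles the message
    // schedule, which sha1su0/sha1su1 keep expanding for later rounds.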
2944     for (int round = 0; round < 20; round++) {
2945       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2946       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2947       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2948       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2949       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2950 
2951       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2952       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2953       __ sha1h(tmp2, __ T4S, v20);
2954       if (round < 5)
2955         __ sha1c(v20, __ T4S, tmp3, tmp4);
2956       else if (round < 10 || round >= 15)
2957         __ sha1p(v20, __ T4S, tmp3, tmp4);
2958       else
2959         __ sha1m(v20, __ T4S, tmp3, tmp4);
2960       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2961 
2962       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2963     }
2964 
2965     __ addv(v7, __ T2S, v7, v21);
2966     __ addv(v6, __ T4S, v6, v20);
2967 
2968     if (multi_block) {
2969       __ add(ofs, ofs, 64);
2970       __ cmp(ofs, limit);
2971       __ br(Assembler::LE, sha1_loop);
2972       __ mov(c_rarg0, ofs); // return ofs
2973     }
2974 
2975     __ strq(v6, Address(state, 0));
2976     __ strs(v7, Address(state, 16));
2977 
2978     __ ret(lr);
2979 
2980     __ bind(keys);
2981     __ emit_int32(0x5a827999);
2982     __ emit_int32(0x6ed9eba1);
2983     __ emit_int32(0x8f1bbcdc);
2984     __ emit_int32(0xca62c1d6);
2985 
2986     return start;
2987   }
2988 
2989 
2990   // Arguments:
2991   //
2992   // Inputs:
2993   //   c_rarg0   - byte[]  source+offset
2994   //   c_rarg1   - int[]   SHA.state
2995   //   c_rarg2   - int     offset
2996   //   c_rarg3   - int     limit
2997   //
2998   address generate_sha256_implCompress(bool multi_block, const char *name) {
2999     static const uint32_t round_consts[64] = {
3000       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3001       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3002       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3003       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3004       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3005       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3006       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3007       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3008       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3009       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3010       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3011       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3012       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3013       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3014       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3015       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3016     };
3017     __ align(CodeEntryAlignment);
3018     StubCodeMark mark(this, "StubRoutines", name);
3019     address start = __ pc();
3020 
3021     Register buf   = c_rarg0;
3022     Register state = c_rarg1;
3023     Register ofs   = c_rarg2;
3024     Register limit = c_rarg3;
3025 
3026     Label sha256_loop;
3027 
3028     __ stpd(v8, v9, __ pre(sp, -32));   // v8..v11 (d regs) are callee-saved; preserve them
3029     __ stpd(v10, v11, Address(sp, 16));
3030 
3031 // dga == v0
3032 // dgb == v1
3033 // dg0 == v2
3034 // dg1 == v3
3035 // dg2 == v4
3036 // t0 == v6
3037 // t1 == v7
3038 
3039     // load the 64 round constants, 4 per vector, into v16..v31
3040     __ lea(rscratch1, ExternalAddress((address)round_consts));
3041     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3042     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3043     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3044     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3045 
3046     // load 8 words (256 bits) state
3047     __ ldpq(v0, v1, state);
3048 
3049     __ BIND(sha256_loop);
3050     // load 64 bytes of data into v8..v11
3051     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3052     __ rev32(v8, __ T16B, v8);
3053     __ rev32(v9, __ T16B, v9);
3054     __ rev32(v10, __ T16B, v10);
3055     __ rev32(v11, __ T16B, v11);
3056 
3057     __ addv(v6, __ T4S, v8, v16);
3058     __ orr(v2, __ T16B, v0, v0);
3059     __ orr(v3, __ T16B, v1, v1);
3060 
3061     FloatRegister d0 = v8;
3062     FloatRegister d1 = v9;
3063     FloatRegister d2 = v10;
3064     FloatRegister d3 = v11;
3065 
3066 
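    // Each iteration of this loop performs four SHA-256 rounds via
    // sha256h/sha256h2, so 16 iterations cover all 64 rounds. The d0..d3
    // rotation cycles the message schedule, which sha256su0/sha256su1 expand
    // for rounds 16-63; v16..v31 supply the round constants four at a time.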
3067     for (int round = 0; round < 16; round++) {
3068       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3069       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3070       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3071       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3072 
3073       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3074        __ orr(v4, __ T16B, v2, v2);
3075       if (round < 15)
3076         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3077       __ sha256h(v2, __ T4S, v3, tmp2);
3078       __ sha256h2(v3, __ T4S, v4, tmp2);
3079       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3080 
3081       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3082     }
3083 
3084     __ addv(v0, __ T4S, v0, v2);
3085     __ addv(v1, __ T4S, v1, v3);
3086 
3087     if (multi_block) {
3088       __ add(ofs, ofs, 64);
3089       __ cmp(ofs, limit);
3090       __ br(Assembler::LE, sha256_loop);
3091       __ mov(c_rarg0, ofs); // return ofs
3092     }
3093 
3094     __ ldpd(v10, v11, Address(sp, 16));
3095     __ ldpd(v8, v9, __ post(sp, 32));
3096 
3097     __ stpq(v0, v1, state);
3098 
3099     __ ret(lr);
3100 
3101     return start;
3102   }
3103 
3104 #ifndef BUILTIN_SIM
3105   // Safefetch stubs.
3106   void generate_safefetch(const char* name, int size, address* entry,
3107                           address* fault_pc, address* continuation_pc) {
3108     // safefetch signatures:
3109     //   int      SafeFetch32(int*      adr, int      errValue);
3110     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3111     //
3112     // arguments:
3113     //   c_rarg0 = adr
3114     //   c_rarg1 = errValue
3115     //
3116     // result:
3117     //   r0       = *adr or errValue
3118 
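    // If the load at *fault_pc faults, the signal handler resumes execution at
    // *continuation_pc; c_rarg1 still holds errValue there, so the stub
    // returns errValue instead of crashing.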
3119     StubCodeMark mark(this, "StubRoutines", name);
3120 
3121     // Entry point, pc or function descriptor.
3122     *entry = __ pc();
3123 
3124     // Load *adr into c_rarg1, may fault.
3125     *fault_pc = __ pc();
3126     switch (size) {
3127       case 4:
3128         // int32_t
3129         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3130         break;
3131       case 8:
3132         // int64_t
3133         __ ldr(c_rarg1, Address(c_rarg0, 0));
3134         break;
3135       default:
3136         ShouldNotReachHere();
3137     }
3138 
3139     // return errValue or *adr
3140     *continuation_pc = __ pc();
3141     __ mov(r0, c_rarg1);
3142     __ ret(lr);
3143   }
3144 #endif
3145 
3146   /**
3147    *  Arguments:
3148    *
3149    * Inputs:
3150    *   c_rarg0   - int crc
3151    *   c_rarg1   - byte* buf
3152    *   c_rarg2   - int length
3153    *
3154    * Output:
3155    *       r0   - int crc result
3156    */
3157   address generate_updateBytesCRC32() {
3158     assert(UseCRC32Intrinsics, "what are we doing here?");
3159 
3160     __ align(CodeEntryAlignment);
3161     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3162 
3163     address start = __ pc();
3164 
3165     const Register crc   = c_rarg0;  // crc
3166     const Register buf   = c_rarg1;  // source java byte array address
3167     const Register len   = c_rarg2;  // length
3168     const Register table0 = c_rarg3; // crc_table address
3169     const Register table1 = c_rarg4;
3170     const Register table2 = c_rarg5;
3171     const Register table3 = c_rarg6;
3172     const Register tmp3 = c_rarg7;
3173 
3174     BLOCK_COMMENT("Entry:");
3175     __ enter(); // required for proper stackwalking of RuntimeStub frame
3176 
3177     __ kernel_crc32(crc, buf, len,
3178               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3179 
3180     __ leave(); // required for proper stackwalking of RuntimeStub frame
3181     __ ret(lr);
3182 
3183     return start;
3184   }
3185 
3186   /**
3187    *  Arguments:
3188    *
3189    * Inputs:
3190    *   c_rarg0   - int crc
3191    *   c_rarg1   - byte* buf
3192    *   c_rarg2   - int length
3193    *   c_rarg3   - int* table
3194    *
3195    * Output:
3196    *       r0   - int crc result
3197    */
3198   address generate_updateBytesCRC32C() {
3199     assert(UseCRC32CIntrinsics, "what are we doing here?");
3200 
3201     __ align(CodeEntryAlignment);
3202     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3203 
3204     address start = __ pc();
3205 
3206     const Register crc   = c_rarg0;  // crc
3207     const Register buf   = c_rarg1;  // source java byte array address
3208     const Register len   = c_rarg2;  // length
3209     const Register table0 = c_rarg3; // crc_table address
3210     const Register table1 = c_rarg4;
3211     const Register table2 = c_rarg5;
3212     const Register table3 = c_rarg6;
3213     const Register tmp3 = c_rarg7;
3214 
3215     BLOCK_COMMENT("Entry:");
3216     __ enter(); // required for proper stackwalking of RuntimeStub frame
3217 
3218     __ kernel_crc32c(crc, buf, len,
3219               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3220 
3221     __ leave(); // required for proper stackwalking of RuntimeStub frame
3222     __ ret(lr);
3223 
3224     return start;
3225   }
3226 
3227   /**
3228    *  Arguments:
3229    *
3230    *  Inputs:
3231    *   c_rarg0   - int   adler
3232    *   c_rarg1   - byte* buff
3233    *   c_rarg2   - int   len
3234    *
3235    * Output:
3236    *   c_rarg0   - int adler result
3237    */
3238   address generate_updateBytesAdler32() {
3239     __ align(CodeEntryAlignment);
3240     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3241     address start = __ pc();
3242 
3243     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3244 
3245     // Aliases
3246     Register adler  = c_rarg0;
3247     Register s1     = c_rarg0;
3248     Register s2     = c_rarg3;
3249     Register buff   = c_rarg1;
3250     Register len    = c_rarg2;
3251     Register nmax  = r4;
3252     Register base = r5;
3253     Register count = r6;
3254     Register temp0 = rscratch1;
3255     Register temp1 = rscratch2;
3256     Register temp2 = r7;
3257 
3258     // Max number of bytes we can process before having to take the mod
3259     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3260     unsigned long BASE = 0xfff1;
3261     unsigned long NMAX = 0x15B0;
3262 
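    // For reference, the checksum computed here is, in C, approximately
    // (a plain scalar sketch of the algorithm this stub unrolls):
    //
    //   uint32_t s1 = adler & 0xffff, s2 = adler >> 16;
    //   for (int i = 0; i < len; i++) {
    //     s1 += buff[i];   // the stub defers the mod: the sums stay exact
    //     s2 += s1;        // for up to NMAX bytes before reducing mod BASE
    //   }
    //   s1 %= BASE; s2 %= BASE;
    //   return s1 | (s2 << 16);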
3263     __ mov(base, BASE);
3264     __ mov(nmax, NMAX);
3265 
3266     // s1 is initialized to the lower 16 bits of adler
3267     // s2 is initialized to the upper 16 bits of adler
3268     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3269     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3270 
3271     // The pipelined loop needs at least 16 elements for one iteration.
3272     // It checks this itself, but for short inputs it is cheaper to branch straight to the cleanup loop.
3273     __ cmp(len, 16);
3274     __ br(Assembler::HS, L_nmax);
3275     __ cbz(len, L_combine);
3276 
3277     __ bind(L_simple_by1_loop);
3278     __ ldrb(temp0, Address(__ post(buff, 1)));
3279     __ add(s1, s1, temp0);
3280     __ add(s2, s2, s1);
3281     __ subs(len, len, 1);
3282     __ br(Assembler::HI, L_simple_by1_loop);
3283 
3284     // s1 = s1 % BASE
3285     __ subs(temp0, s1, base);
3286     __ csel(s1, temp0, s1, Assembler::HS);
3287 
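    // s2 may be well above BASE here, so reduce it using the identity
    // 2^16 == 15 (mod 65521): s2 = 15 * (s2 >> 16) + (s2 & 0xffff) brings it
    // below 2 * BASE, and one conditional subtract finishes the job.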
3288     // s2 = s2 % BASE
3289     __ lsr(temp0, s2, 16);
3290     __ lsl(temp1, temp0, 4);
3291     __ sub(temp1, temp1, temp0);
3292     __ add(s2, temp1, s2, ext::uxth);
3293 
3294     __ subs(temp0, s2, base);
3295     __ csel(s2, temp0, s2, Assembler::HS);
3296 
3297     __ b(L_combine);
3298 
3299     __ bind(L_nmax);
3300     __ subs(len, len, nmax);
3301     __ sub(count, nmax, 16);
3302     __ br(Assembler::LO, L_by16);
3303 
3304     __ bind(L_nmax_loop);
3305 
3306     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3307 
3308     __ add(s1, s1, temp0, ext::uxtb);
3309     __ ubfx(temp2, temp0, 8, 8);
3310     __ add(s2, s2, s1);
3311     __ add(s1, s1, temp2);
3312     __ ubfx(temp2, temp0, 16, 8);
3313     __ add(s2, s2, s1);
3314     __ add(s1, s1, temp2);
3315     __ ubfx(temp2, temp0, 24, 8);
3316     __ add(s2, s2, s1);
3317     __ add(s1, s1, temp2);
3318     __ ubfx(temp2, temp0, 32, 8);
3319     __ add(s2, s2, s1);
3320     __ add(s1, s1, temp2);
3321     __ ubfx(temp2, temp0, 40, 8);
3322     __ add(s2, s2, s1);
3323     __ add(s1, s1, temp2);
3324     __ ubfx(temp2, temp0, 48, 8);
3325     __ add(s2, s2, s1);
3326     __ add(s1, s1, temp2);
3327     __ add(s2, s2, s1);
3328     __ add(s1, s1, temp0, Assembler::LSR, 56);
3329     __ add(s2, s2, s1);
3330 
3331     __ add(s1, s1, temp1, ext::uxtb);
3332     __ ubfx(temp2, temp1, 8, 8);
3333     __ add(s2, s2, s1);
3334     __ add(s1, s1, temp2);
3335     __ ubfx(temp2, temp1, 16, 8);
3336     __ add(s2, s2, s1);
3337     __ add(s1, s1, temp2);
3338     __ ubfx(temp2, temp1, 24, 8);
3339     __ add(s2, s2, s1);
3340     __ add(s1, s1, temp2);
3341     __ ubfx(temp2, temp1, 32, 8);
3342     __ add(s2, s2, s1);
3343     __ add(s1, s1, temp2);
3344     __ ubfx(temp2, temp1, 40, 8);
3345     __ add(s2, s2, s1);
3346     __ add(s1, s1, temp2);
3347     __ ubfx(temp2, temp1, 48, 8);
3348     __ add(s2, s2, s1);
3349     __ add(s1, s1, temp2);
3350     __ add(s2, s2, s1);
3351     __ add(s1, s1, temp1, Assembler::LSR, 56);
3352     __ add(s2, s2, s1);
3353 
3354     __ subs(count, count, 16);
3355     __ br(Assembler::HS, L_nmax_loop);
3356 
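    // Reduce s1 and s2 mod BASE without a divide, again using
    // 2^16 == 15 (mod 65521). After NMAX bytes s2 can approach 2^32, so the
    // identity is applied twice before the final conditional subtract.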
3357     // s1 = s1 % BASE
3358     __ lsr(temp0, s1, 16);
3359     __ lsl(temp1, temp0, 4);
3360     __ sub(temp1, temp1, temp0);
3361     __ add(temp1, temp1, s1, ext::uxth);
3362 
3363     __ lsr(temp0, temp1, 16);
3364     __ lsl(s1, temp0, 4);
3365     __ sub(s1, s1, temp0);
3366     __ add(s1, s1, temp1, ext::uxth);
3367 
3368     __ subs(temp0, s1, base);
3369     __ csel(s1, temp0, s1, Assembler::HS);
3370 
3371     // s2 = s2 % BASE
3372     __ lsr(temp0, s2, 16);
3373     __ lsl(temp1, temp0, 4);
3374     __ sub(temp1, temp1, temp0);
3375     __ add(temp1, temp1, s2, ext::uxth);
3376 
3377     __ lsr(temp0, temp1, 16);
3378     __ lsl(s2, temp0, 4);
3379     __ sub(s2, s2, temp0);
3380     __ add(s2, s2, temp1, ext::uxth);
3381 
3382     __ subs(temp0, s2, base);
3383     __ csel(s2, temp0, s2, Assembler::HS);
3384 
3385     __ subs(len, len, nmax);
3386     __ sub(count, nmax, 16);
3387     __ br(Assembler::HS, L_nmax_loop);
3388 
3389     __ bind(L_by16);
3390     __ adds(len, len, count);
3391     __ br(Assembler::LO, L_by1);
3392 
3393     __ bind(L_by16_loop);
3394 
3395     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3396 
3397     __ add(s1, s1, temp0, ext::uxtb);
3398     __ ubfx(temp2, temp0, 8, 8);
3399     __ add(s2, s2, s1);
3400     __ add(s1, s1, temp2);
3401     __ ubfx(temp2, temp0, 16, 8);
3402     __ add(s2, s2, s1);
3403     __ add(s1, s1, temp2);
3404     __ ubfx(temp2, temp0, 24, 8);
3405     __ add(s2, s2, s1);
3406     __ add(s1, s1, temp2);
3407     __ ubfx(temp2, temp0, 32, 8);
3408     __ add(s2, s2, s1);
3409     __ add(s1, s1, temp2);
3410     __ ubfx(temp2, temp0, 40, 8);
3411     __ add(s2, s2, s1);
3412     __ add(s1, s1, temp2);
3413     __ ubfx(temp2, temp0, 48, 8);
3414     __ add(s2, s2, s1);
3415     __ add(s1, s1, temp2);
3416     __ add(s2, s2, s1);
3417     __ add(s1, s1, temp0, Assembler::LSR, 56);
3418     __ add(s2, s2, s1);
3419 
3420     __ add(s1, s1, temp1, ext::uxtb);
3421     __ ubfx(temp2, temp1, 8, 8);
3422     __ add(s2, s2, s1);
3423     __ add(s1, s1, temp2);
3424     __ ubfx(temp2, temp1, 16, 8);
3425     __ add(s2, s2, s1);
3426     __ add(s1, s1, temp2);
3427     __ ubfx(temp2, temp1, 24, 8);
3428     __ add(s2, s2, s1);
3429     __ add(s1, s1, temp2);
3430     __ ubfx(temp2, temp1, 32, 8);
3431     __ add(s2, s2, s1);
3432     __ add(s1, s1, temp2);
3433     __ ubfx(temp2, temp1, 40, 8);
3434     __ add(s2, s2, s1);
3435     __ add(s1, s1, temp2);
3436     __ ubfx(temp2, temp1, 48, 8);
3437     __ add(s2, s2, s1);
3438     __ add(s1, s1, temp2);
3439     __ add(s2, s2, s1);
3440     __ add(s1, s1, temp1, Assembler::LSR, 56);
3441     __ add(s2, s2, s1);
3442 
3443     __ subs(len, len, 16);
3444     __ br(Assembler::HS, L_by16_loop);
3445 
3446     __ bind(L_by1);
3447     __ adds(len, len, 15);
3448     __ br(Assembler::LO, L_do_mod);
3449 
3450     __ bind(L_by1_loop);
3451     __ ldrb(temp0, Address(__ post(buff, 1)));
3452     __ add(s1, temp0, s1);
3453     __ add(s2, s2, s1);
3454     __ subs(len, len, 1);
3455     __ br(Assembler::HS, L_by1_loop);
3456 
3457     __ bind(L_do_mod);
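    // Final reduction: the same 2^16 == 15 (mod BASE) trick as above.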
3458     // s1 = s1 % BASE
3459     __ lsr(temp0, s1, 16);
3460     __ lsl(temp1, temp0, 4);
3461     __ sub(temp1, temp1, temp0);
3462     __ add(temp1, temp1, s1, ext::uxth);
3463 
3464     __ lsr(temp0, temp1, 16);
3465     __ lsl(s1, temp0, 4);
3466     __ sub(s1, s1, temp0);
3467     __ add(s1, s1, temp1, ext::uxth);
3468 
3469     __ subs(temp0, s1, base);
3470     __ csel(s1, temp0, s1, Assembler::HS);
3471 
3472     // s2 = s2 % BASE
3473     __ lsr(temp0, s2, 16);
3474     __ lsl(temp1, temp0, 4);
3475     __ sub(temp1, temp1, temp0);
3476     __ add(temp1, temp1, s2, ext::uxth);
3477 
3478     __ lsr(temp0, temp1, 16);
3479     __ lsl(s2, temp0, 4);
3480     __ sub(s2, s2, temp0);
3481     __ add(s2, s2, temp1, ext::uxth);
3482 
3483     __ subs(temp0, s2, base);
3484     __ csel(s2, temp0, s2, Assembler::HS);
3485 
3486     // Combine lower bits and higher bits
3487     __ bind(L_combine);
3488     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3489 
3490     __ ret(lr);
3491 
3492     return start;
3493   }
3494 
3495   /**
3496    *  Arguments:
3497    *
3498    *  Input:
3499    *    c_rarg0   - x address
3500    *    c_rarg1   - x length
3501    *    c_rarg2   - y address
3502    *    c_rarg3   - y length
3503    *    c_rarg4   - z address
3504    *    c_rarg5   - z length
3505    */
3506   address generate_multiplyToLen() {
3507     __ align(CodeEntryAlignment);
3508     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3509 
3510     address start = __ pc();
3511     const Register x     = r0;
3512     const Register xlen  = r1;
3513     const Register y     = r2;
3514     const Register ylen  = r3;
3515     const Register z     = r4;
3516     const Register zlen  = r5;
3517 
3518     const Register tmp1  = r10;
3519     const Register tmp2  = r11;
3520     const Register tmp3  = r12;
3521     const Register tmp4  = r13;
3522     const Register tmp5  = r14;
3523     const Register tmp6  = r15;
3524     const Register tmp7  = r16;
3525 
3526     BLOCK_COMMENT("Entry:");
3527     __ enter(); // required for proper stackwalking of RuntimeStub frame
3528     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3529     __ leave(); // required for proper stackwalking of RuntimeStub frame
3530     __ ret(lr);
3531 
3532     return start;
3533   }
3534 
3535   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3536                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3537                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3538     // Karatsuba multiplication performs a 128*128 -> 256-bit
3539     // multiplication in three 128-bit multiplications and a few
3540     // additions.
3541     //
3542     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3543     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3544     //
3545     // Inputs:
3546     //
3547     // A0 in a.d[0]     (subkey)
3548     // A1 in a.d[1]
3549     // (A1+A0) in a1_xor_a0.d[0]
3550     //
3551     // B0 in b.d[0]     (state)
3552     // B1 in b.d[1]
3553 
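    // A scalar sketch of the same step (illustrative only; clmul64() stands
    // in for a 64x64 -> 128-bit carry-less multiply, and + is XOR in GF(2)):
    //
    //   C = clmul64(a1, b1);           // A1*B1
    //   D = clmul64(a0, b0);           // A0*B0
    //   E = clmul64(a1 ^ a0, b1 ^ b0); // (A1+A0)(B1+B0)
    //   mid = E ^ C ^ D;               // the middle 128 bits
    //   result_hi = C ^ (mid >> 64);
    //   result_lo = D ^ (mid << 64);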
3554     __ ext(tmp1, __ T16B, b, b, 0x08);
3555     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3556     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3557     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3558     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3559 
3560     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3561     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3562     __ eor(tmp2, __ T16B, tmp2, tmp4);
3563     __ eor(tmp2, __ T16B, tmp2, tmp3);
3564 
3565     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3566     __ ins(result_hi, __ D, tmp2, 0, 1);
3567     __ ins(result_lo, __ D, tmp2, 1, 0);
3568   }
3569 
3570   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3571                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3572     const FloatRegister t0 = result;
3573 
3574     // The GCM field polynomial f is z^128 + p(z), where p =
3575     // z^7+z^2+z+1.
3576     //
3577     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3578     //
3579     // so, given that the product we're reducing is
3580     //    a == lo + hi * z^128
3581     // substituting,
3582     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3583     //
3584     // we reduce by multiplying hi by p(z) and subtracting the result
3585     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3586     // bits we can do this with two 64-bit multiplications, lo*p and
3587     // hi*p.
3588 
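    // Concretely, hi.d[1] contributes hi.d[1]*z^192 == hi.d[1]*p(z)*z^64, so
    // its product with p is folded in at a 64-bit offset (into lo.d[1] and
    // hi.d[0]); the updated hi.d[0] then contributes hi.d[0]*p(z), which is
    // folded directly into lo, leaving the reduced 128-bit result.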
3589     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3590     __ ext(t1, __ T16B, t0, z, 8);
3591     __ eor(hi, __ T16B, hi, t1);
3592     __ ext(t1, __ T16B, z, t0, 8);
3593     __ eor(lo, __ T16B, lo, t1);
3594     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3595     __ eor(result, __ T16B, lo, t0);
3596   }
3597 
3598   address generate_has_negatives(address &has_negatives_long) {
3599     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3600     const int large_loop_size = 64;
3601     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
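    // A byte is negative iff its top bit is set, so eight bytes at a time can
    // be tested by ANDing a loaded 64-bit word against this mask.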
3602     int dcache_line = VM_Version::dcache_line_size();
3603 
3604     Register ary1 = r1, len = r2, result = r0;
3605 
3606     __ align(CodeEntryAlignment);
3607     address entry = __ pc();
3608 
3609     __ enter();
3610 
3611   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3612         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3613 
3614   __ cmp(len, 15);
3615   __ br(Assembler::GT, LEN_OVER_15);
3616   // Execution only falls into this code when the pointer is near the end of a
3617   // memory page and we must avoid reading past it
3618   __ add(ary1, ary1, len);
3619   __ subs(len, len, 8);
3620   __ br(Assembler::GT, LEN_OVER_8);
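  // 1..8 bytes remain: load the 8 bytes ending at the end of the array and
  // shift out the low-order bytes that lie before the array's start.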
3621   __ ldr(rscratch2, Address(ary1, -8));
3622   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3623   __ lsrv(rscratch2, rscratch2, rscratch1);
3624   __ tst(rscratch2, UPPER_BIT_MASK);
3625   __ cset(result, Assembler::NE);
3626   __ leave();
3627   __ ret(lr);
3628   __ bind(LEN_OVER_8);
3629   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3630   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3631   __ tst(rscratch2, UPPER_BIT_MASK);
3632   __ br(Assembler::NE, RET_TRUE_NO_POP);
3633   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3634   __ lsrv(rscratch1, rscratch1, rscratch2);
3635   __ tst(rscratch1, UPPER_BIT_MASK);
3636   __ cset(result, Assembler::NE);
3637   __ leave();
3638   __ ret(lr);
3639 
3640   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3641   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3642 
3643   has_negatives_long = __ pc(); // 2nd entry point
3644 
3645   __ enter();
3646 
3647   __ bind(LEN_OVER_15);
3648     __ push(spilled_regs, sp);
3649     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3650     __ cbz(rscratch2, ALIGNED);
3651     __ ldp(tmp6, tmp1, Address(ary1));
3652     __ mov(tmp5, 16);
3653     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3654     __ add(ary1, ary1, rscratch1);
3655     __ sub(len, len, rscratch1);
3656     __ orr(tmp6, tmp6, tmp1);
3657     __ tst(tmp6, UPPER_BIT_MASK);
3658     __ br(Assembler::NE, RET_TRUE);
3659 
3660   __ bind(ALIGNED);
3661     __ cmp(len, large_loop_size);
3662     __ br(Assembler::LT, CHECK_16);
3663     // Perform a 16-byte load here as an early return for the pre-loop: if an
3664     // initially aligned large array has negative values in its first bytes,
3665     // LARGE_LOOP would otherwise do up to 4 loads instead of 1 before noticing,
3666     // which is slower. Arrays with negative bytes further ahead are barely
3667     // affected; in fact they are faster thanks to the early loads and the fewer
3668     // instructions and branches in LARGE_LOOP.
3669     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3670     __ sub(len, len, 16);
3671     __ orr(tmp6, tmp6, tmp1);
3672     __ tst(tmp6, UPPER_BIT_MASK);
3673     __ br(Assembler::NE, RET_TRUE);
3674     __ cmp(len, large_loop_size);
3675     __ br(Assembler::LT, CHECK_16);
3676 
3677     if (SoftwarePrefetchHintDistance >= 0
3678         && SoftwarePrefetchHintDistance >= dcache_line) {
3679       // initial prefetch
3680       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3681     }
3682   __ bind(LARGE_LOOP);
3683     if (SoftwarePrefetchHintDistance >= 0) {
3684       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3685     }
3686     // Issue the load instructions first, since that can save a few CPU/memory
3687     // cycles. Also, instead of four "orr(...); andr(...); cbnz(...)" triples (one
3688     // per ldp), generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which uses
3689     // fewer instructions and branches. The trade-off is that early return is
3690     // disabled, so all 64 bytes are loaded and checked every time.
3691     __ ldp(tmp2, tmp3, Address(ary1));
3692     __ ldp(tmp4, tmp5, Address(ary1, 16));
3693     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3694     __ ldp(tmp6, tmp1, Address(ary1, 48));
3695     __ add(ary1, ary1, large_loop_size);
3696     __ sub(len, len, large_loop_size);
3697     __ orr(tmp2, tmp2, tmp3);
3698     __ orr(tmp4, tmp4, tmp5);
3699     __ orr(rscratch1, rscratch1, rscratch2);
3700     __ orr(tmp6, tmp6, tmp1);
3701     __ orr(tmp2, tmp2, tmp4);
3702     __ orr(rscratch1, rscratch1, tmp6);
3703     __ orr(tmp2, tmp2, rscratch1);
3704     __ tst(tmp2, UPPER_BIT_MASK);
3705     __ br(Assembler::NE, RET_TRUE);
3706     __ cmp(len, large_loop_size);
3707     __ br(Assembler::GE, LARGE_LOOP);
3708 
3709   __ bind(CHECK_16); // small 16-byte load pre-loop
3710     __ cmp(len, 16);
3711     __ br(Assembler::LT, POST_LOOP16);
3712 
3713   __ bind(LOOP16); // small 16-byte load loop
3714     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3715     __ sub(len, len, 16);
3716     __ orr(tmp2, tmp2, tmp3);
3717     __ tst(tmp2, UPPER_BIT_MASK);
3718     __ br(Assembler::NE, RET_TRUE);
3719     __ cmp(len, 16);
3720     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3721 
3722   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3723     __ cmp(len, 8);
3724     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3725     __ ldr(tmp3, Address(__ post(ary1, 8)));
3726     __ sub(len, len, 8);
3727     __ tst(tmp3, UPPER_BIT_MASK);
3728     __ br(Assembler::NE, RET_TRUE);
3729 
3730   __ bind(POST_LOOP16_LOAD_TAIL);
3731     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3732     __ ldr(tmp1, Address(ary1));
3733     __ mov(tmp2, 64);
3734     __ sub(tmp4, tmp2, len, __ LSL, 3);
3735     __ lslv(tmp1, tmp1, tmp4);
3736     __ tst(tmp1, UPPER_BIT_MASK);
3737     __ br(Assembler::NE, RET_TRUE);
3738     // Fallthrough
3739 
3740   __ bind(RET_FALSE);
3741     __ pop(spilled_regs, sp);
3742     __ leave();
3743     __ mov(result, zr);
3744     __ ret(lr);
3745 
3746   __ bind(RET_TRUE);
3747     __ pop(spilled_regs, sp);
3748   __ bind(RET_TRUE_NO_POP);
3749     __ leave();
3750     __ mov(result, 1);
3751     __ ret(lr);
3752 
3753   __ bind(DONE);
3754     __ pop(spilled_regs, sp);
3755     __ leave();
3756     __ ret(lr);
3757     return entry;
3758   }
3759   /**
3760    *  Arguments:
3761    *
3762    *  Input:
3763    *  c_rarg0   - current state address
3764    *  c_rarg1   - H key address
3765    *  c_rarg2   - data address
3766    *  c_rarg3   - number of blocks
3767    *
3768    *  Output:
3769    *  Updated state at c_rarg0
3770    */
3771   address generate_ghash_processBlocks() {
3772     // Bafflingly, GCM uses little-endian for the byte order, but
3773     // big-endian for the bit order.  For example, the polynomial 1 is
3774     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3775     //
3776     // So, we must either reverse the bytes in each word and do
3777     // everything big-endian or reverse the bits in each byte and do
3778     // it little-endian.  On AArch64 it's more idiomatic to reverse
3779     // the bits in each byte (we have an instruction, RBIT, to do
3780     // that) and keep the data in little-endian bit order throughout the
3781     // calculation, bit-reversing the inputs and outputs.
3782 
3783     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3784     __ align(wordSize * 2);
3785     address p = __ pc();
3786     __ emit_int64(0x87);  // The low-order bits of the field
3787                           // polynomial (i.e. p = z^7+z^2+z+1)
3788                           // repeated in the low and high parts of a
3789                           // 128-bit vector
3790     __ emit_int64(0x87);
3791 
3792     __ align(CodeEntryAlignment);
3793     address start = __ pc();
3794 
3795     Register state   = c_rarg0;
3796     Register subkeyH = c_rarg1;
3797     Register data    = c_rarg2;
3798     Register blocks  = c_rarg3;
3799 
3800     FloatRegister vzr = v30;
3801     __ eor(vzr, __ T16B, vzr, vzr); // zero register
3802 
3803     __ ldrq(v0, Address(state));
3804     __ ldrq(v1, Address(subkeyH));
3805 
3806     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
3807     __ rbit(v0, __ T16B, v0);
3808     __ rev64(v1, __ T16B, v1);
3809     __ rbit(v1, __ T16B, v1);
3810 
3811     __ ldrq(v26, p);
3812 
3813     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
3814     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
3815 
3816     {
3817       Label L_ghash_loop;
3818       __ bind(L_ghash_loop);
3819 
3820       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
3821                                                  // reversing each byte
3822       __ rbit(v2, __ T16B, v2);
3823       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
3824 
3825       // Multiply state in v2 by subkey in v1
3826       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
3827                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
3828                      /*temps*/v6, v20, v18, v21);
3829       // Reduce v7:v5 by the field polynomial
3830       ghash_reduce(v0, v5, v7, v26, vzr, v20);
3831 
3832       __ sub(blocks, blocks, 1);
3833       __ cbnz(blocks, L_ghash_loop);
3834     }
3835 
3836     // The bit-reversed result is at this point in v0
3837     __ rev64(v1, __ T16B, v0);
3838     __ rbit(v1, __ T16B, v1);
3839 
3840     __ st1(v1, __ T16B, state);
3841     __ ret(lr);
3842 
3843     return start;
3844   }
3845 
3846   // Continuation point for throwing of implicit exceptions that are
3847   // not handled in the current activation. Fabricates an exception
3848   // oop and initiates normal exception dispatching in this
3849   // frame. Since we need to preserve callee-saved values (currently
3850   // only for C2, but done for C1 as well) we need a callee-saved oop
3851   // map and therefore have to make these stubs into RuntimeStubs
3852   // rather than BufferBlobs.  If the compiler needs all registers to
3853   // be preserved between the fault point and the exception handler
3854   // then it must assume responsibility for that in
3855   // AbstractCompiler::continuation_for_implicit_null_exception or
3856   // continuation_for_implicit_division_by_zero_exception. All other
3857   // implicit exceptions (e.g., NullPointerException or
3858   // AbstractMethodError on entry) are either at call sites or
3859   // otherwise assume that stack unwinding will be initiated, so
3860   // caller saved registers were assumed volatile in the compiler.
3861 
3862 #undef __
3863 #define __ masm->
3864 
3865   address generate_throw_exception(const char* name,
3866                                    address runtime_entry,
3867                                    Register arg1 = noreg,
3868                                    Register arg2 = noreg) {
3869     // Information about frame layout at time of blocking runtime call.
3870     // Note that we only have to preserve callee-saved registers since
3871     // the compilers are responsible for supplying a continuation point
3872     // if they expect all registers to be preserved.
3873     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
3874     enum layout {
3875       rfp_off = 0,
3876       rfp_off2,
3877       return_off,
3878       return_off2,
3879       framesize // inclusive of return address
3880     };
3881 
3882     int insts_size = 512;
3883     int locs_size  = 64;
3884 
3885     CodeBuffer code(name, insts_size, locs_size);
3886     OopMapSet* oop_maps  = new OopMapSet();
3887     MacroAssembler* masm = new MacroAssembler(&code);
3888 
3889     address start = __ pc();
3890 
3891     // This is an inlined and slightly modified version of call_VM
3892     // which has the ability to fetch the return PC out of
3893     // thread-local storage and also sets up last_Java_sp slightly
3894     // differently than the real call_VM
3895 
3896     __ enter(); // Save FP and LR before call
3897 
3898     assert(is_even(framesize/2), "sp not 16-byte aligned");
3899 
3900     // lr and fp are already in place
3901     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
3902 
3903     int frame_complete = __ pc() - start;
3904 
3905     // Set up last_Java_sp and last_Java_fp
3906     address the_pc = __ pc();
3907     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
3908 
3909     // Call runtime
3910     if (arg1 != noreg) {
3911       assert(arg2 != c_rarg1, "clobbered");
3912       __ mov(c_rarg1, arg1);
3913     }
3914     if (arg2 != noreg) {
3915       __ mov(c_rarg2, arg2);
3916     }
3917     __ mov(c_rarg0, rthread);
3918     BLOCK_COMMENT("call runtime_entry");
3919     __ mov(rscratch1, runtime_entry);
3920     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
3921 
3922     // Generate oop map
3923     OopMap* map = new OopMap(framesize, 0);
3924 
3925     oop_maps->add_gc_map(the_pc - start, map);
3926 
3927     __ reset_last_Java_frame(true);
3928     __ maybe_isb();
3929 
3930     __ leave();
3931 
3932     // check for pending exceptions
3933 #ifdef ASSERT
3934     Label L;
3935     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
3936     __ cbnz(rscratch1, L);
3937     __ should_not_reach_here();
3938     __ bind(L);
3939 #endif // ASSERT
3940     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3941 
3942 
3943     // codeBlob framesize is in words (not VMRegImpl::slot_size)
3944     RuntimeStub* stub =
3945       RuntimeStub::new_runtime_stub(name,
3946                                     &code,
3947                                     frame_complete,
3948                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3949                                     oop_maps, false);
3950     return stub->entry_point();
3951   }
3952 
3953   class MontgomeryMultiplyGenerator : public MacroAssembler {
3954 
3955     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3956       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
3957 
3958     RegSet _toSave;
3959     bool _squaring;
3960 
3961   public:
3962     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3963       : MacroAssembler(as->code()), _squaring(squaring) {
3964 
3965       // Register allocation
3966 
3967       Register reg = c_rarg0;
3968       Pa_base = reg;       // Argument registers
3969       if (squaring)
3970         Pb_base = Pa_base;
3971       else
3972         Pb_base = ++reg;
3973       Pn_base = ++reg;
3974       Rlen= ++reg;
3975       inv = ++reg;
3976       Pm_base = ++reg;
3977 
3978                           // Working registers:
3979       Ra =  ++reg;        // The current digit of a, b, n, and m.
3980       Rb =  ++reg;
3981       Rm =  ++reg;
3982       Rn =  ++reg;
3983 
3984       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
3985       Pb =  ++reg;
3986       Pm =  ++reg;
3987       Pn =  ++reg;
3988 
3989       t0 =  ++reg;        // Three registers which form a
3990       t1 =  ++reg;        // triple-precision accumulator.
3991       t2 =  ++reg;
3992 
3993       Ri =  ++reg;        // Inner and outer loop indexes.
3994       Rj =  ++reg;
3995 
3996       Rhi_ab = ++reg;     // Product registers: low and high parts
3997       Rlo_ab = ++reg;     // of a*b and m*n.
3998       Rhi_mn = ++reg;
3999       Rlo_mn = ++reg;
4000 
4001       // r19 and up are callee-saved.
4002       _toSave = RegSet::range(r19, reg) + Pm_base;
4003     }
4004 
4005   private:
4006     void save_regs() {
4007       push(_toSave, sp);
4008     }
4009 
4010     void restore_regs() {
4011       pop(_toSave, sp);
4012     }
4013 
4014     template <typename T>
4015     void unroll_2(Register count, T block) {
4016       Label loop, end, odd;
4017       tbnz(count, 0, odd);
4018       cbz(count, end);
4019       align(16);
4020       bind(loop);
4021       (this->*block)();
4022       bind(odd);
4023       (this->*block)();
4024       subs(count, count, 2);
4025       br(Assembler::GT, loop);
4026       bind(end);
4027     }
4028 
4029     template <typename T>
4030     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4031       Label loop, end, odd;
4032       tbnz(count, 0, odd);
4033       cbz(count, end);
4034       align(16);
4035       bind(loop);
4036       (this->*block)(d, s, tmp);
4037       bind(odd);
4038       (this->*block)(d, s, tmp);
4039       subs(count, count, 2);
4040       br(Assembler::GT, loop);
4041       bind(end);
4042     }
4043 
4044     void pre1(RegisterOrConstant i) {
4045       block_comment("pre1");
4046       // Pa = Pa_base;
4047       // Pb = Pb_base + i;
4048       // Pm = Pm_base;
4049       // Pn = Pn_base + i;
4050       // Ra = *Pa;
4051       // Rb = *Pb;
4052       // Rm = *Pm;
4053       // Rn = *Pn;
4054       ldr(Ra, Address(Pa_base));
4055       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4056       ldr(Rm, Address(Pm_base));
4057       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4058       lea(Pa, Address(Pa_base));
4059       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4060       lea(Pm, Address(Pm_base));
4061       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4062 
4063       // Zero the m*n result.
4064       mov(Rhi_mn, zr);
4065       mov(Rlo_mn, zr);
4066     }
4067 
4068     // The core multiply-accumulate step of a Montgomery
4069     // multiplication.  The idea is to schedule operations as a
4070     // pipeline so that instructions with long latencies (loads and
4071     // multiplies) have time to complete before their results are
4072     // used.  This most benefits in-order implementations of the
4073     // architecture but out-of-order ones also benefit.
4074     void step() {
4075       block_comment("step");
4076       // MACC(Ra, Rb, t0, t1, t2);
4077       // Ra = *++Pa;
4078       // Rb = *--Pb;
4079       umulh(Rhi_ab, Ra, Rb);
4080       mul(Rlo_ab, Ra, Rb);
4081       ldr(Ra, pre(Pa, wordSize));
4082       ldr(Rb, pre(Pb, -wordSize));
4083       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
4084                                        // previous iteration.
4085       // MACC(Rm, Rn, t0, t1, t2);
4086       // Rm = *++Pm;
4087       // Rn = *--Pn;
4088       umulh(Rhi_mn, Rm, Rn);
4089       mul(Rlo_mn, Rm, Rn);
4090       ldr(Rm, pre(Pm, wordSize));
4091       ldr(Rn, pre(Pn, -wordSize));
4092       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4093     }
4094 
4095     void post1() {
4096       block_comment("post1");
4097 
4098       // MACC(Ra, Rb, t0, t1, t2);
4099       // Ra = *++Pa;
4100       // Rb = *--Pb;
4101       umulh(Rhi_ab, Ra, Rb);
4102       mul(Rlo_ab, Ra, Rb);
4103       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4104       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4105 
4106       // *Pm = Rm = t0 * inv;
4107       mul(Rm, t0, inv);
4108       str(Rm, Address(Pm));
4109 
4110       // MACC(Rm, Rn, t0, t1, t2);
4111       // t0 = t1; t1 = t2; t2 = 0;
4112       umulh(Rhi_mn, Rm, Rn);
4113 
4114 #ifndef PRODUCT
4115       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4116       {
4117         mul(Rlo_mn, Rm, Rn);
4118         add(Rlo_mn, t0, Rlo_mn);
4119         Label ok;
4120         cbz(Rlo_mn, ok); {
4121           stop("broken Montgomery multiply");
4122         } bind(ok);
4123       }
4124 #endif
4125       // We have very carefully set things up so that
4126       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4127       // the lower half of Rm * Rn because we know the result already:
4128       // it must be -t0.  t0 + (-t0) must generate a carry iff
4129       // t0 != 0.  So, rather than do a mul and an adds we just set
4130       // the carry flag iff t0 is nonzero.
4131       //
4132       // mul(Rlo_mn, Rm, Rn);
4133       // adds(zr, t0, Rlo_mn);
4134       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4135       adcs(t0, t1, Rhi_mn);
4136       adc(t1, t2, zr);
4137       mov(t2, zr);
4138     }
4139 
4140     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
4141       block_comment("pre2");
4142       // Pa = Pa_base + i-len;
4143       // Pb = Pb_base + len;
4144       // Pm = Pm_base + i-len;
4145       // Pn = Pn_base + len;
4146 
4147       if (i.is_register()) {
4148         sub(Rj, i.as_register(), len);
4149       } else {
4150         mov(Rj, i.as_constant());
4151         sub(Rj, Rj, len);
4152       }
4153       // Rj == i-len
4154 
4155       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
4156       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
4157       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4158       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
4159 
4160       // Ra = *++Pa;
4161       // Rb = *--Pb;
4162       // Rm = *++Pm;
4163       // Rn = *--Pn;
4164       ldr(Ra, pre(Pa, wordSize));
4165       ldr(Rb, pre(Pb, -wordSize));
4166       ldr(Rm, pre(Pm, wordSize));
4167       ldr(Rn, pre(Pn, -wordSize));
4168 
4169       mov(Rhi_mn, zr);
4170       mov(Rlo_mn, zr);
4171     }
4172 
4173     void post2(RegisterOrConstant i, RegisterOrConstant len) {
4174       block_comment("post2");
4175       if (i.is_constant()) {
4176         mov(Rj, i.as_constant()-len.as_constant());
4177       } else {
4178         sub(Rj, i.as_register(), len);
4179       }
4180 
4181       adds(t0, t0, Rlo_mn); // The pending m*n, low part
4182 
4183       // As soon as we know the least significant digit of our result,
4184       // store it.
4185       // Pm_base[i-len] = t0;
4186       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4187 
4188       // t0 = t1; t1 = t2; t2 = 0;
4189       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
4190       adc(t1, t2, zr);
4191       mov(t2, zr);
4192     }
4193 
4194     // A carry in t0 after Montgomery multiplication means that we
4195     // should subtract multiples of n from our result in m.  We'll
4196     // keep doing that until there is no carry.
4197     void normalize(RegisterOrConstant len) {
4198       block_comment("normalize");
4199       // while (t0)
4200       //   t0 = sub(Pm_base, Pn_base, t0, len);
4201       Label loop, post, again;
4202       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
4203       cbz(t0, post); {
4204         bind(again); {
4205           mov(i, zr);
4206           mov(cnt, len);
4207           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4208           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4209           subs(zr, zr, zr); // set carry flag, i.e. no borrow
4210           align(16);
4211           bind(loop); {
4212             sbcs(Rm, Rm, Rn);
4213             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4214             add(i, i, 1);
4215             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4216             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4217             sub(cnt, cnt, 1);
4218           } cbnz(cnt, loop);
4219           sbc(t0, t0, zr);
4220         } cbnz(t0, again);
4221       } bind(post);
4222     }
4223 
4224     // Move memory at s to d, reversing words.
4225     //    Increments d to end of copied memory
4226     //    Destroys tmp1, tmp2
4227     //    Preserves len
4228     //    Leaves s pointing to the address which was in d at start
4229     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
4230       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
4231 
4232       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
4233       mov(tmp1, len);
4234       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
4235       sub(s, d, len, ext::uxtw, LogBytesPerWord);
4236     }
4237     // where
4238     void reverse1(Register d, Register s, Register tmp) {
4239       ldr(tmp, pre(s, -wordSize));
4240       ror(tmp, tmp, 32);
4241       str(tmp, post(d, wordSize));
4242     }
4243 
4244     void step_squaring() {
4245       // An extra ACC
4246       step();
4247       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4248     }
4249 
4250     void last_squaring(RegisterOrConstant i) {
4251       Label dont;
4252       // if ((i & 1) == 0) {
4253       tbnz(i.as_register(), 0, dont); {
4254         // MACC(Ra, Rb, t0, t1, t2);
4255         // Ra = *++Pa;
4256         // Rb = *--Pb;
4257         umulh(Rhi_ab, Ra, Rb);
4258         mul(Rlo_ab, Ra, Rb);
4259         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4260       } bind(dont);
4261     }
4262 
4263     void extra_step_squaring() {
4264       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4265 
4266       // MACC(Rm, Rn, t0, t1, t2);
4267       // Rm = *++Pm;
4268       // Rn = *--Pn;
4269       umulh(Rhi_mn, Rm, Rn);
4270       mul(Rlo_mn, Rm, Rn);
4271       ldr(Rm, pre(Pm, wordSize));
4272       ldr(Rn, pre(Pn, -wordSize));
4273     }
4274 
4275     void post1_squaring() {
4276       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4277 
4278       // *Pm = Rm = t0 * inv;
4279       mul(Rm, t0, inv);
4280       str(Rm, Address(Pm));
4281 
4282       // MACC(Rm, Rn, t0, t1, t2);
4283       // t0 = t1; t1 = t2; t2 = 0;
4284       umulh(Rhi_mn, Rm, Rn);
4285 
4286 #ifndef PRODUCT
4287       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4288       {
4289         mul(Rlo_mn, Rm, Rn);
4290         add(Rlo_mn, t0, Rlo_mn);
4291         Label ok;
4292         cbz(Rlo_mn, ok); {
4293           stop("broken Montgomery multiply");
4294         } bind(ok);
4295       }
4296 #endif
4297       // We have very carefully set things up so that
4298       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4299       // the lower half of Rm * Rn because we know the result already:
4300       // it must be -t0.  t0 + (-t0) must generate a carry iff
4301       // t0 != 0.  So, rather than do a mul and an adds we just set
4302       // the carry flag iff t0 is nonzero.
4303       //
4304       // mul(Rlo_mn, Rm, Rn);
4305       // adds(zr, t0, Rlo_mn);
4306       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4307       adcs(t0, t1, Rhi_mn);
4308       adc(t1, t2, zr);
4309       mov(t2, zr);
4310     }
4311 
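    // Add the 128-bit product Rhi:Rlo into the triple-precision accumulator
    // t2:t1:t0, propagating carries.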
4312     void acc(Register Rhi, Register Rlo,
4313              Register t0, Register t1, Register t2) {
4314       adds(t0, t0, Rlo);
4315       adcs(t1, t1, Rhi);
4316       adc(t2, t2, zr);
4317     }
4318 
4319   public:
4320     /**
4321      * Fast Montgomery multiplication.  The derivation of the
4322      * algorithm is in A Cryptographic Library for the Motorola
4323      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
4324      *
4325      * Arguments:
4326      *
4327      * Inputs for multiplication:
4328      *   c_rarg0   - int array elements a
4329      *   c_rarg1   - int array elements b
4330      *   c_rarg2   - int array elements n (the modulus)
4331      *   c_rarg3   - int length
4332      *   c_rarg4   - int inv
4333      *   c_rarg5   - int array elements m (the result)
4334      *
4335      * Inputs for squaring:
4336      *   c_rarg0   - int array elements a
4337      *   c_rarg1   - int array elements n (the modulus)
4338      *   c_rarg2   - int length
4339      *   c_rarg3   - int inv
4340      *   c_rarg4   - int array elements m (the result)
4341      *
4342      */
4343     address generate_multiply() {
4344       Label argh, nothing;
4345       bind(argh);
4346       stop("MontgomeryMultiply total_allocation must be <= 8192");
4347 
4348       align(CodeEntryAlignment);
4349       address entry = pc();
4350 
4351       cbzw(Rlen, nothing);
4352 
4353       enter();
4354 
4355       // Make room.
4356       cmpw(Rlen, 512);
4357       br(Assembler::HI, argh);
4358       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4359       andr(sp, Ra, -2 * wordSize);
4360 
4361       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4362 
4363       {
4364         // Copy input args, reversing as we go.  We use Ra as a
4365         // temporary variable.
4366         reverse(Ra, Pa_base, Rlen, t0, t1);
4367         if (!_squaring)
4368           reverse(Ra, Pb_base, Rlen, t0, t1);
4369         reverse(Ra, Pn_base, Rlen, t0, t1);
4370       }
4371 
4372       // Push all call-saved registers and also Pm_base which we'll need
4373       // at the end.
4374       save_regs();
4375 
4376 #ifndef PRODUCT
4377       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
4378       {
4379         ldr(Rn, Address(Pn_base, 0));
4380         mul(Rlo_mn, Rn, inv);
4381         cmp(Rlo_mn, -1);
4382         Label ok;
4383         br(EQ, ok); {
4384           stop("broken inverse in Montgomery multiply");
4385         } bind(ok);
4386       }
4387 #endif
4388 
4389       mov(Pm_base, Ra);
4390 
4391       mov(t0, zr);
4392       mov(t1, zr);
4393       mov(t2, zr);
4394 
4395       block_comment("for (int i = 0; i < len; i++) {");
4396       mov(Ri, zr); {
4397         Label loop, end;
4398         cmpw(Ri, Rlen);
4399         br(Assembler::GE, end);
4400 
4401         bind(loop);
4402         pre1(Ri);
4403 
4404         block_comment("  for (j = i; j; j--) {"); {
4405           movw(Rj, Ri);
4406           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4407         } block_comment("  } // j");
4408 
4409         post1();
4410         addw(Ri, Ri, 1);
4411         cmpw(Ri, Rlen);
4412         br(Assembler::LT, loop);
4413         bind(end);
4414         block_comment("} // i");
4415       }
4416 
4417       block_comment("for (int i = len; i < 2*len; i++) {");
4418       mov(Ri, Rlen); {
4419         Label loop, end;
4420         cmpw(Ri, Rlen, Assembler::LSL, 1);
4421         br(Assembler::GE, end);
4422 
4423         bind(loop);
4424         pre2(Ri, Rlen);
4425 
4426         block_comment("  for (j = len*2-i-1; j; j--) {"); {
4427           lslw(Rj, Rlen, 1);
4428           subw(Rj, Rj, Ri);
4429           subw(Rj, Rj, 1);
4430           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4431         } block_comment("  } // j");
4432 
4433         post2(Ri, Rlen);
4434         addw(Ri, Ri, 1);
4435         cmpw(Ri, Rlen, Assembler::LSL, 1);
4436         br(Assembler::LT, loop);
4437         bind(end);
4438       }
4439       block_comment("} // i");
4440 
4441       normalize(Rlen);
4442 
4443       mov(Ra, Pm_base);  // Save Pm_base in Ra
4444       restore_regs();  // Restore caller's Pm_base
4445 
4446       // Copy our result into caller's Pm_base
4447       reverse(Pm_base, Ra, Rlen, t0, t1);
4448 
4449       leave();
4450       bind(nothing);
4451       ret(lr);
4452 
4453       return entry;
4454     }
4455     // In C, approximately:
4456 
4457     // void
4458     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
4459     //                     unsigned long Pn_base[], unsigned long Pm_base[],
4460     //                     unsigned long inv, int len) {
4461     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4462     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4463     //   unsigned long Ra, Rb, Rn, Rm;
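    //   // MACC(A, B, t0, t1, t2) is shorthand in this pseudocode for a
    //   // 64x64->128-bit multiply-accumulate into the triple-precision
    //   // accumulator t2:t1:t0 (cf. the step() and acc() helpers above);
    //   // approximately, assuming __int128 is available:
    //   //   unsigned __int128 p = (unsigned __int128)A * B + t0;
    //   //   t0 = (unsigned long)p;
    //   //   p = (p >> 64) + t1;
    //   //   t1 = (unsigned long)p;
    //   //   t2 += (unsigned long)(p >> 64);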
4464 
4465     //   int i;
4466 
4467     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4468 
4469     //   for (i = 0; i < len; i++) {
4470     //     int j;
4471 
4472     //     Pa = Pa_base;
4473     //     Pb = Pb_base + i;
4474     //     Pm = Pm_base;
4475     //     Pn = Pn_base + i;
4476 
4477     //     Ra = *Pa;
4478     //     Rb = *Pb;
4479     //     Rm = *Pm;
4480     //     Rn = *Pn;
4481 
4482     //     int iters = i;
4483     //     for (j = 0; iters--; j++) {
4484     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4485     //       MACC(Ra, Rb, t0, t1, t2);
4486     //       Ra = *++Pa;
4487     //       Rb = *--Pb;
4488     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4489     //       MACC(Rm, Rn, t0, t1, t2);
4490     //       Rm = *++Pm;
4491     //       Rn = *--Pn;
4492     //     }
4493 
4494     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
4495     //     MACC(Ra, Rb, t0, t1, t2);
4496     //     *Pm = Rm = t0 * inv;
4497     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4498     //     MACC(Rm, Rn, t0, t1, t2);
4499 
4500     //     assert(t0 == 0, "broken Montgomery multiply");
4501 
4502     //     t0 = t1; t1 = t2; t2 = 0;
4503     //   }
4504 
4505     //   for (i = len; i < 2*len; i++) {
4506     //     int j;
4507 
4508     //     Pa = Pa_base + i-len;
4509     //     Pb = Pb_base + len;
4510     //     Pm = Pm_base + i-len;
4511     //     Pn = Pn_base + len;
4512 
4513     //     Ra = *++Pa;
4514     //     Rb = *--Pb;
4515     //     Rm = *++Pm;
4516     //     Rn = *--Pn;
4517 
4518     //     int iters = len*2-i-1;
4519     //     for (j = i-len+1; iters--; j++) {
4520     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4521     //       MACC(Ra, Rb, t0, t1, t2);
4522     //       Ra = *++Pa;
4523     //       Rb = *--Pb;
4524     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4525     //       MACC(Rm, Rn, t0, t1, t2);
4526     //       Rm = *++Pm;
4527     //       Rn = *--Pn;
4528     //     }
4529 
4530     //     Pm_base[i-len] = t0;
4531     //     t0 = t1; t1 = t2; t2 = 0;
4532     //   }
4533 
4534     //   while (t0)
4535     //     t0 = sub(Pm_base, Pn_base, t0, len);
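    //   // Final reduction: while the leftover carry word t0 is nonzero,
    //   // subtract the modulus n from m.  sub() is pseudocode for a
    //   // multi-word subtract of Pn_base from Pm_base that returns the
    //   // updated carry.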
4536     // }
4537 
4538     /**
     * Fast Montgomery squaring.  This uses asymptotically 25% fewer
     * multiplies than Montgomery multiplication, so it should be up to
     * 25% faster: a full multiply needs roughly len^2 a*b products plus
     * len^2 m*n products, while squaring computes each off-diagonal a*a
     * product once and doubles it, saving about half of the first len^2
     * but none of the second (roughly 1.5*len^2 instead of 2*len^2).
     * However, its loop control is more complex and it may actually run
     * slower on some machines.
4543      *
4544      * Arguments:
4545      *
4546      * Inputs:
4547      *   c_rarg0   - int array elements a
4548      *   c_rarg1   - int array elements n (the modulus)
4549      *   c_rarg2   - int length
4550      *   c_rarg3   - int inv
4551      *   c_rarg4   - int array elements m (the result)
4552      *
4553      */
4554     address generate_square() {
4555       Label argh;
4556       bind(argh);
4557       stop("MontgomeryMultiply total_allocation must be <= 8192");
4558 
4559       align(CodeEntryAlignment);
4560       address entry = pc();
4561 
4562       enter();
4563 
4564       // Make room.
4565       cmpw(Rlen, 512);
4566       br(Assembler::HI, argh);
4567       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4568       andr(sp, Ra, -2 * wordSize);
4569 
4570       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4571 
4572       {
4573         // Copy input args, reversing as we go.  We use Ra as a
4574         // temporary variable.
4575         reverse(Ra, Pa_base, Rlen, t0, t1);
4576         reverse(Ra, Pn_base, Rlen, t0, t1);
4577       }
4578 
4579       // Push all call-saved registers and also Pm_base which we'll need
4580       // at the end.
4581       save_regs();
4582 
4583       mov(Pm_base, Ra);
4584 
4585       mov(t0, zr);
4586       mov(t1, zr);
4587       mov(t2, zr);
4588 
4589       block_comment("for (int i = 0; i < len; i++) {");
4590       mov(Ri, zr); {
4591         Label loop, end;
4592         bind(loop);
4593         cmp(Ri, Rlen);
4594         br(Assembler::GE, end);
4595 
4596         pre1(Ri);
4597 
4598         block_comment("for (j = (i+1)/2; j; j--) {"); {
4599           add(Rj, Ri, 1);
4600           lsr(Rj, Rj, 1);
4601           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4602         } block_comment("  } // j");
4603 
4604         last_squaring(Ri);
4605 
4606         block_comment("  for (j = i/2; j; j--) {"); {
4607           lsr(Rj, Ri, 1);
4608           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4609         } block_comment("  } // j");
4610 
4611         post1_squaring();
4612         add(Ri, Ri, 1);
4613         cmp(Ri, Rlen);
4614         br(Assembler::LT, loop);
4615 
4616         bind(end);
4617         block_comment("} // i");
4618       }
4619 
4620       block_comment("for (int i = len; i < 2*len; i++) {");
4621       mov(Ri, Rlen); {
4622         Label loop, end;
4623         bind(loop);
4624         cmp(Ri, Rlen, Assembler::LSL, 1);
4625         br(Assembler::GE, end);
4626 
4627         pre2(Ri, Rlen);
4628 
4629         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4630           lsl(Rj, Rlen, 1);
4631           sub(Rj, Rj, Ri);
4632           sub(Rj, Rj, 1);
4633           lsr(Rj, Rj, 1);
4634           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4635         } block_comment("  } // j");
4636 
4637         last_squaring(Ri);
4638 
4639         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4640           lsl(Rj, Rlen, 1);
4641           sub(Rj, Rj, Ri);
4642           lsr(Rj, Rj, 1);
4643           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4644         } block_comment("  } // j");
4645 
4646         post2(Ri, Rlen);
4647         add(Ri, Ri, 1);
4648         cmp(Ri, Rlen, Assembler::LSL, 1);
4649 
4650         br(Assembler::LT, loop);
4651         bind(end);
4652         block_comment("} // i");
4653       }
4654 
4655       normalize(Rlen);
4656 
4657       mov(Ra, Pm_base);  // Save Pm_base in Ra
4658       restore_regs();  // Restore caller's Pm_base
4659 
4660       // Copy our result into caller's Pm_base
4661       reverse(Pm_base, Ra, Rlen, t0, t1);
4662 
4663       leave();
4664       ret(lr);
4665 
4666       return entry;
4667     }
4668     // In C, approximately:
4669 
4670     // void
4671     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4672     //                   unsigned long Pm_base[], unsigned long inv, int len) {
4673     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4674     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4675     //   unsigned long Ra, Rb, Rn, Rm;
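    //   // MACC2(A, B, t0, t1, t2) accumulates the product A*B twice
    //   // (i.e. adds 2*A*B into t2:t1:t0): when squaring, each
    //   // off-diagonal term a[j]*a[i-j] occurs twice, so it is computed
    //   // once and doubled.  MACC() is the single accumulate described
    //   // in the montgomery_multiply() pseudocode above.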
4676 
4677     //   int i;
4678 
4679     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4680 
4681     //   for (i = 0; i < len; i++) {
4682     //     int j;
4683 
4684     //     Pa = Pa_base;
4685     //     Pb = Pa_base + i;
4686     //     Pm = Pm_base;
4687     //     Pn = Pn_base + i;
4688 
4689     //     Ra = *Pa;
4690     //     Rb = *Pb;
4691     //     Rm = *Pm;
4692     //     Rn = *Pn;
4693 
4694     //     int iters = (i+1)/2;
4695     //     for (j = 0; iters--; j++) {
4696     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4697     //       MACC2(Ra, Rb, t0, t1, t2);
4698     //       Ra = *++Pa;
4699     //       Rb = *--Pb;
4700     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4701     //       MACC(Rm, Rn, t0, t1, t2);
4702     //       Rm = *++Pm;
4703     //       Rn = *--Pn;
4704     //     }
4705     //     if ((i & 1) == 0) {
4706     //       assert(Ra == Pa_base[j], "must be");
4707     //       MACC(Ra, Ra, t0, t1, t2);
4708     //     }
4709     //     iters = i/2;
4710     //     assert(iters == i-j, "must be");
4711     //     for (; iters--; j++) {
4712     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4713     //       MACC(Rm, Rn, t0, t1, t2);
4714     //       Rm = *++Pm;
4715     //       Rn = *--Pn;
4716     //     }
4717 
4718     //     *Pm = Rm = t0 * inv;
4719     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4720     //     MACC(Rm, Rn, t0, t1, t2);
4721 
4722     //     assert(t0 == 0, "broken Montgomery multiply");
4723 
4724     //     t0 = t1; t1 = t2; t2 = 0;
4725     //   }
4726 
4727     //   for (i = len; i < 2*len; i++) {
4728     //     int start = i-len+1;
4729     //     int end = start + (len - start)/2;
4730     //     int j;
4731 
4732     //     Pa = Pa_base + i-len;
4733     //     Pb = Pa_base + len;
4734     //     Pm = Pm_base + i-len;
4735     //     Pn = Pn_base + len;
4736 
4737     //     Ra = *++Pa;
4738     //     Rb = *--Pb;
4739     //     Rm = *++Pm;
4740     //     Rn = *--Pn;
4741 
4742     //     int iters = (2*len-i-1)/2;
4743     //     assert(iters == end-start, "must be");
4744     //     for (j = start; iters--; j++) {
4745     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4746     //       MACC2(Ra, Rb, t0, t1, t2);
4747     //       Ra = *++Pa;
4748     //       Rb = *--Pb;
4749     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4750     //       MACC(Rm, Rn, t0, t1, t2);
4751     //       Rm = *++Pm;
4752     //       Rn = *--Pn;
4753     //     }
4754     //     if ((i & 1) == 0) {
4755     //       assert(Ra == Pa_base[j], "must be");
4756     //       MACC(Ra, Ra, t0, t1, t2);
4757     //     }
4758     //     iters =  (2*len-i)/2;
4759     //     assert(iters == len-j, "must be");
4760     //     for (; iters--; j++) {
4761     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4762     //       MACC(Rm, Rn, t0, t1, t2);
4763     //       Rm = *++Pm;
4764     //       Rn = *--Pn;
4765     //     }
4766     //     Pm_base[i-len] = t0;
4767     //     t0 = t1; t1 = t2; t2 = 0;
4768     //   }
4769 
4770     //   while (t0)
4771     //     t0 = sub(Pm_base, Pn_base, t0, len);
4772     // }
4773   };
4774 
4775 
4776   // Initialization
4777   void generate_initial() {
    // Generates the initial stubs and initializes the entry points.
4779 
    // Entry points that exist on all platforms.  Note: this is code
    // that could be shared among different platforms - however, the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.
4785 
4786     StubRoutines::_forward_exception_entry = generate_forward_exception();
4787 
4788     StubRoutines::_call_stub_entry =
4789       generate_call_stub(StubRoutines::_call_stub_return_address);
4790 
    // This entry is referenced by megamorphic calls.
4792     StubRoutines::_catch_exception_entry = generate_catch_exception();
4793 
4794     // Build this early so it's available for the interpreter.
4795     StubRoutines::_throw_StackOverflowError_entry =
4796       generate_throw_exception("StackOverflowError throw_exception",
4797                                CAST_FROM_FN_PTR(address,
4798                                                 SharedRuntime::throw_StackOverflowError));
4799     StubRoutines::_throw_delayed_StackOverflowError_entry =
4800       generate_throw_exception("delayed StackOverflowError throw_exception",
4801                                CAST_FROM_FN_PTR(address,
4802                                                 SharedRuntime::throw_delayed_StackOverflowError));
4803     if (UseCRC32Intrinsics) {
      // Set the table address before generating the stub, which uses it.
4805       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4806       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4807     }
4808   }
4809 
4810   void generate_all() {
4811     // support for verify_oop (must happen after universe_init)
4812     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
4813     StubRoutines::_throw_AbstractMethodError_entry =
4814       generate_throw_exception("AbstractMethodError throw_exception",
4815                                CAST_FROM_FN_PTR(address,
4816                                                 SharedRuntime::
4817                                                 throw_AbstractMethodError));
4818 
4819     StubRoutines::_throw_IncompatibleClassChangeError_entry =
4820       generate_throw_exception("IncompatibleClassChangeError throw_exception",
4821                                CAST_FROM_FN_PTR(address,
4822                                                 SharedRuntime::
4823                                                 throw_IncompatibleClassChangeError));
4824 
4825     StubRoutines::_throw_NullPointerException_at_call_entry =
4826       generate_throw_exception("NullPointerException at call throw_exception",
4827                                CAST_FROM_FN_PTR(address,
4828                                                 SharedRuntime::
4829                                                 throw_NullPointerException_at_call));
4830 
4831     // arraycopy stubs used by compilers
4832     generate_arraycopy_stubs();
4833 
    // has_negatives stub for large arrays.
4835     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
4836 
4837     if (UseMultiplyToLenIntrinsic) {
4838       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4839     }
4840 
4841     if (UseMontgomeryMultiplyIntrinsic) {
4842       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4843       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4844       StubRoutines::_montgomeryMultiply = g.generate_multiply();
4845     }
4846 
4847     if (UseMontgomerySquareIntrinsic) {
4848       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4849       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4850       // We use generate_multiply() rather than generate_square()
4851       // because it's faster for the sizes of modulus we care about.
4852       StubRoutines::_montgomerySquare = g.generate_multiply();
4853     }
4854 
4855 #ifndef BUILTIN_SIM
4856     // generate GHASH intrinsics code
4857     if (UseGHASHIntrinsics) {
4858       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
4859     }
4860 
4861     if (UseAESIntrinsics) {
4862       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4863       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4864       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4865       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
4866     }
4867 
4868     if (UseSHA1Intrinsics) {
4869       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
4870       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
4871     }
4872     if (UseSHA256Intrinsics) {
4873       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
4874       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
4875     }
4876 
4877     if (UseCRC32CIntrinsics) {
4878       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
4879     }
4880 
4881     // generate Adler32 intrinsics code
4882     if (UseAdler32Intrinsics) {
4883       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
4884     }
4885 
4886     // Safefetch stubs.
4887     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
4888                                                        &StubRoutines::_safefetch32_fault_pc,
4889                                                        &StubRoutines::_safefetch32_continuation_pc);
4890     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
4891                                                        &StubRoutines::_safefetchN_fault_pc,
4892                                                        &StubRoutines::_safefetchN_continuation_pc);
4893 #endif
4894     StubRoutines::aarch64::set_completed();
4895   }
4896 
4897  public:
4898   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
4899     if (all) {
4900       generate_all();
4901     } else {
4902       generate_initial();
4903     }
4904   }
4905 }; // end class declaration
4906 
4907 void StubGenerator_generate(CodeBuffer* code, bool all) {
4908   StubGenerator g(code, all);
4909 }