1 /*
   2  * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "interpreter/interpreter.hpp"
  30 #include "nativeInst_aarch64.hpp"
  31 #include "oops/instanceOop.hpp"
  32 #include "oops/method.hpp"
  33 #include "oops/objArrayKlass.hpp"
  34 #include "oops/oop.inline.hpp"
  35 #include "prims/methodHandles.hpp"
  36 #include "runtime/frame.inline.hpp"
  37 #include "runtime/handles.inline.hpp"
  38 #include "runtime/sharedRuntime.hpp"
  39 #include "runtime/stubCodeGenerator.hpp"
  40 #include "runtime/stubRoutines.hpp"
  41 #include "runtime/thread.inline.hpp"
  42 #ifdef COMPILER2
  43 #include "opto/runtime.hpp"
  44 #endif
  45 
  46 #ifdef BUILTIN_SIM
  47 #include "../../../../../../simulator/simulator.hpp"
  48 #endif
  49 
  50 // Declaration and definition of StubGenerator (no .hpp file).
  51 // For a more detailed description of the stub routine structure
  52 // see the comment in stubRoutines.hpp
  53 
  54 #undef __
  55 #define __ _masm->
  56 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
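// TIMES_OOP scales an array index by the heap-oop size: a sign-extended
// 32-bit index shifted left by 2 (compressed oops, 4-byte narrowOop) or
// by 3 (uncompressed, 8-byte oop), for use in scaled-index addressing.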
  57 
  58 #ifdef PRODUCT
  59 #define BLOCK_COMMENT(str) /* nothing */
  60 #else
  61 #define BLOCK_COMMENT(str) __ block_comment(str)
  62 #endif
  63 
  64 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  65 
  66 // Stub Code definitions
  67 
  68 class StubGenerator: public StubCodeGenerator {
  69  private:
  70 
  71 #ifdef PRODUCT
  72 #define inc_counter_np(counter) ((void)0)
  73 #else
  74   void inc_counter_np_(int& counter) {
  75     __ lea(rscratch2, ExternalAddress((address)&counter));
  76     __ ldrw(rscratch1, Address(rscratch2));
  77     __ addw(rscratch1, rscratch1, 1);
  78     __ strw(rscratch1, Address(rscratch2));
  79   }
  80 #define inc_counter_np(counter) \
  81   BLOCK_COMMENT("inc_counter " #counter); \
  82   inc_counter_np_(counter);
  83 #endif
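  // Note: the non-product counter bump above is a plain load/add/store with
  // no atomic update, so concurrent threads may lose increments; that is
  // acceptable for these statistics-only counters.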
  84 
  85   // Call stubs are used to call Java from C
  86   //
  87   // Arguments:
  88   //    c_rarg0:   call wrapper address                   address
  89   //    c_rarg1:   result                                 address
  90   //    c_rarg2:   result type                            BasicType
  91   //    c_rarg3:   method                                 Method*
  92   //    c_rarg4:   (interpreter) entry point              address
  93   //    c_rarg5:   parameters                             intptr_t*
  94   //    c_rarg6:   parameter size (in words)              int
  95   //    c_rarg7:   thread                                 Thread*
  96   //
  97   // There is no return from the stub itself as any Java result
  98   // is written to result
  99   //
 100   // we save r30 (lr) as the return PC at the base of the frame and
  101   // link r29 (fp) below it as the frame pointer, then install sp (r31)
 102   // into fp.
 103   //
 104   // we save r0-r7, which accounts for all the c arguments.
 105   //
 106   // TODO: strictly do we need to save them all? they are treated as
 107   // volatile by C so could we omit saving the ones we are going to
 108   // place in global registers (thread? method?) or those we only use
 109   // during setup of the Java call?
 110   //
 111   // we don't need to save r8 which C uses as an indirect result location
 112   // return register.
 113   //
 114   // we don't need to save r9-r15 which both C and Java treat as
 115   // volatile
 116   //
 117   // we don't need to save r16-18 because Java does not use them
 118   //
 119   // we save r19-r28 which Java uses as scratch registers and C
 120   // expects to be callee-save
 121   //
 122   // we save the bottom 64 bits of each value stored in v8-v15; it is
 123   // the responsibility of the caller to preserve larger values.
 124   //
 125   // so the stub frame looks like this when we enter Java code
 126   //
 127   //     [ return_from_Java     ] <--- sp
 128   //     [ argument word n      ]
 129   //      ...
 130   // -27 [ argument word 1      ]
 131   // -26 [ saved v15            ] <--- sp_after_call
 132   // -25 [ saved v14            ]
 133   // -24 [ saved v13            ]
 134   // -23 [ saved v12            ]
 135   // -22 [ saved v11            ]
 136   // -21 [ saved v10            ]
 137   // -20 [ saved v9             ]
 138   // -19 [ saved v8             ]
 139   // -18 [ saved r28            ]
 140   // -17 [ saved r27            ]
 141   // -16 [ saved r26            ]
 142   // -15 [ saved r25            ]
 143   // -14 [ saved r24            ]
 144   // -13 [ saved r23            ]
 145   // -12 [ saved r22            ]
 146   // -11 [ saved r21            ]
 147   // -10 [ saved r20            ]
 148   //  -9 [ saved r19            ]
 149   //  -8 [ call wrapper    (r0) ]
 150   //  -7 [ result          (r1) ]
 151   //  -6 [ result type     (r2) ]
 152   //  -5 [ method          (r3) ]
 153   //  -4 [ entry point     (r4) ]
 154   //  -3 [ parameters      (r5) ]
 155   //  -2 [ parameter size  (r6) ]
  156   //  -1 [ thread          (r7) ]
 157   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 158   //   1 [ saved lr       (r30) ]
 159 
 160   // Call stub stack layout word offsets from fp
 161   enum call_stub_layout {
 162     sp_after_call_off = -26,
 163 
 164     d15_off            = -26,
 165     d13_off            = -24,
 166     d11_off            = -22,
 167     d9_off             = -20,
 168 
 169     r28_off            = -18,
 170     r26_off            = -16,
 171     r24_off            = -14,
 172     r22_off            = -12,
 173     r20_off            = -10,
 174     call_wrapper_off   =  -8,
 175     result_off         =  -7,
 176     result_type_off    =  -6,
 177     method_off         =  -5,
 178     entry_point_off    =  -4,
 179     parameter_size_off =  -2,
 180     thread_off         =  -1,
 181     fp_f               =   0,
 182     retaddr_off        =   1,
 183   };
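  // Several save slots have no named offset above because values are saved
  // and restored in pairs with stp/stpd: e.g. r19 is stored one word above
  // r20_save, and the parameters pointer one word above entry_point_off, so
  // the odd offsets are addressed implicitly.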
 184 
 185   address generate_call_stub(address& return_address) {
 186     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 187            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 188            "adjust this code");
 189 
 190     StubCodeMark mark(this, "StubRoutines", "call_stub");
 191     address start = __ pc();
 192 
 193     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 194 
 195     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 196     const Address result        (rfp, result_off         * wordSize);
 197     const Address result_type   (rfp, result_type_off    * wordSize);
 198     const Address method        (rfp, method_off         * wordSize);
 199     const Address entry_point   (rfp, entry_point_off    * wordSize);
 200     const Address parameter_size(rfp, parameter_size_off * wordSize);
 201 
 202     const Address thread        (rfp, thread_off         * wordSize);
 203 
 204     const Address d15_save      (rfp, d15_off * wordSize);
 205     const Address d13_save      (rfp, d13_off * wordSize);
 206     const Address d11_save      (rfp, d11_off * wordSize);
 207     const Address d9_save       (rfp, d9_off * wordSize);
 208 
 209     const Address r28_save      (rfp, r28_off * wordSize);
 210     const Address r26_save      (rfp, r26_off * wordSize);
 211     const Address r24_save      (rfp, r24_off * wordSize);
 212     const Address r22_save      (rfp, r22_off * wordSize);
 213     const Address r20_save      (rfp, r20_off * wordSize);
 214 
 215     // stub code
 216 
 217     // we need a C prolog to bootstrap the x86 caller into the sim
 218     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 219 
 220     address aarch64_entry = __ pc();
 221 
 222 #ifdef BUILTIN_SIM
 223     // Save sender's SP for stack traces.
 224     __ mov(rscratch1, sp);
 225     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 226 #endif
 227     // set up frame and move sp to end of save area
 228     __ enter();
 229     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 230 
 231     // save register parameters and Java scratch/global registers
 232     // n.b. we save thread even though it gets installed in
 233     // rthread because we want to sanity check rthread later
 234     __ str(c_rarg7,  thread);
 235     __ strw(c_rarg6, parameter_size);
 236     __ stp(c_rarg4, c_rarg5,  entry_point);
 237     __ stp(c_rarg2, c_rarg3,  result_type);
 238     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 239 
 240     __ stp(r20, r19,   r20_save);
 241     __ stp(r22, r21,   r22_save);
 242     __ stp(r24, r23,   r24_save);
 243     __ stp(r26, r25,   r26_save);
 244     __ stp(r28, r27,   r28_save);
 245 
 246     __ stpd(v9,  v8,   d9_save);
 247     __ stpd(v11, v10,  d11_save);
 248     __ stpd(v13, v12,  d13_save);
 249     __ stpd(v15, v14,  d15_save);
 250 
 251     // install Java thread in global register now we have saved
 252     // whatever value it held
 253     __ mov(rthread, c_rarg7);
 254     // And method
 255     __ mov(rmethod, c_rarg3);
 256 
 257     // set up the heapbase register
 258     __ reinit_heapbase();
 259 
 260 #ifdef ASSERT
 261     // make sure we have no pending exceptions
 262     {
 263       Label L;
 264       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 265       __ cmp(rscratch1, (unsigned)NULL_WORD);
 266       __ br(Assembler::EQ, L);
 267       __ stop("StubRoutines::call_stub: entered with pending exception");
 268       __ BIND(L);
 269     }
 270 #endif
 271     // pass parameters if any
 272     __ mov(esp, sp);
 273     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 274     __ andr(sp, rscratch1, -2 * wordSize);
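    // sp must stay 16-byte aligned on AArch64, so the outgoing parameter area
    // is carved out by rounding (sp - 8 * parameter_count) down to a 16-byte
    // boundary: effectively sp = (sp - count * wordSize) & ~15.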
 275 
 276     BLOCK_COMMENT("pass parameters if any");
 277     Label parameters_done;
 278     // parameter count is still in c_rarg6
 279     // and parameter pointer identifying param 1 is in c_rarg5
 280     __ cbzw(c_rarg6, parameters_done);
 281 
 282     address loop = __ pc();
 283     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 284     __ subsw(c_rarg6, c_rarg6, 1);
 285     __ push(rscratch1);
 286     __ br(Assembler::GT, loop);
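    // The loop above is roughly equivalent to this C sketch (illustrative only):
    //   do { push(*parameters++); } while (--parameter_count > 0);
    // parameter 1 is pushed first and therefore ends up at the highest address.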
 287 
 288     __ BIND(parameters_done);
 289 
 290     // call Java entry -- passing Method*, and current sp
 291     //      rmethod: Method*
 292     //      r13: sender sp
 293     BLOCK_COMMENT("call Java function");
 294     __ mov(r13, sp);
 295     __ blr(c_rarg4);
 296 
 297     // tell the simulator we have returned to the stub
 298 
 299     // we do this here because the notify will already have been done
 300     // if we get to the next instruction via an exception
 301     //
 302     // n.b. adding this instruction here affects the calculation of
 303     // whether or not a routine returns to the call stub (used when
 304     // doing stack walks) since the normal test is to check the return
 305     // pc against the address saved below. so we may need to allow for
 306     // this extra instruction in the check.
 307 
 308     if (NotifySimulator) {
 309       __ notify(Assembler::method_reentry);
 310     }
 311     // save current address for use by exception handling code
 312 
 313     return_address = __ pc();
 314 
 315     // store result depending on type (everything that is not
 316     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 317     // n.b. this assumes Java returns an integral result in r0
 318     // and a floating result in j_farg0
 319     __ ldr(j_rarg2, result);
 320     Label is_long, is_float, is_double, exit;
 321     __ ldr(j_rarg1, result_type);
 322     __ cmp(j_rarg1, T_OBJECT);
 323     __ br(Assembler::EQ, is_long);
 324     __ cmp(j_rarg1, T_LONG);
 325     __ br(Assembler::EQ, is_long);
 326     __ cmp(j_rarg1, T_FLOAT);
 327     __ br(Assembler::EQ, is_float);
 328     __ cmp(j_rarg1, T_DOUBLE);
 329     __ br(Assembler::EQ, is_double);
 330 
 331     // handle T_INT case
 332     __ strw(r0, Address(j_rarg2));
 333 
 334     __ BIND(exit);
 335 
 336     // pop parameters
 337     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 338 
 339 #ifdef ASSERT
 340     // verify that threads correspond
 341     {
 342       Label L, S;
 343       __ ldr(rscratch1, thread);
 344       __ cmp(rthread, rscratch1);
 345       __ br(Assembler::NE, S);
 346       __ get_thread(rscratch1);
 347       __ cmp(rthread, rscratch1);
 348       __ br(Assembler::EQ, L);
 349       __ BIND(S);
 350       __ stop("StubRoutines::call_stub: threads must correspond");
 351       __ BIND(L);
 352     }
 353 #endif
 354 
 355     // restore callee-save registers
 356     __ ldpd(v15, v14,  d15_save);
 357     __ ldpd(v13, v12,  d13_save);
 358     __ ldpd(v11, v10,  d11_save);
 359     __ ldpd(v9,  v8,   d9_save);
 360 
 361     __ ldp(r28, r27,   r28_save);
 362     __ ldp(r26, r25,   r26_save);
 363     __ ldp(r24, r23,   r24_save);
 364     __ ldp(r22, r21,   r22_save);
 365     __ ldp(r20, r19,   r20_save);
 366 
 367     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 368     __ ldrw(c_rarg2, result_type);
 369     __ ldr(c_rarg3,  method);
 370     __ ldp(c_rarg4, c_rarg5,  entry_point);
 371     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 372 
 373 #ifndef PRODUCT
 374     // tell the simulator we are about to end Java execution
 375     if (NotifySimulator) {
 376       __ notify(Assembler::method_exit);
 377     }
 378 #endif
 379     // leave frame and return to caller
 380     __ leave();
 381     __ ret(lr);
 382 
 383     // handle return types different from T_INT
 384 
 385     __ BIND(is_long);
 386     __ str(r0, Address(j_rarg2, 0));
 387     __ br(Assembler::AL, exit);
 388 
 389     __ BIND(is_float);
 390     __ strs(j_farg0, Address(j_rarg2, 0));
 391     __ br(Assembler::AL, exit);
 392 
 393     __ BIND(is_double);
 394     __ strd(j_farg0, Address(j_rarg2, 0));
 395     __ br(Assembler::AL, exit);
 396 
 397     return start;
 398   }
 399 
 400   // Return point for a Java call if there's an exception thrown in
 401   // Java code.  The exception is caught and transformed into a
 402   // pending exception stored in JavaThread that can be tested from
 403   // within the VM.
 404   //
 405   // Note: Usually the parameters are removed by the callee. In case
 406   // of an exception crossing an activation frame boundary, that is
  407   // not the case if the callee is compiled code => need to set up the
  408   // sp.
 409   //
 410   // r0: exception oop
 411 
 412   // NOTE: this is used as a target from the signal handler so it
 413   // needs an x86 prolog which returns into the current simulator
 414   // executing the generated catch_exception code. so the prolog
 415   // needs to install rax in a sim register and adjust the sim's
 416   // restart pc to enter the generated code at the start position
 417   // then return from native to simulated execution.
 418 
 419   address generate_catch_exception() {
 420     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 421     address start = __ pc();
 422 
 423     // same as in generate_call_stub():
 424     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 425     const Address thread        (rfp, thread_off         * wordSize);
 426 
 427 #ifdef ASSERT
 428     // verify that threads correspond
 429     {
 430       Label L, S;
 431       __ ldr(rscratch1, thread);
 432       __ cmp(rthread, rscratch1);
 433       __ br(Assembler::NE, S);
 434       __ get_thread(rscratch1);
 435       __ cmp(rthread, rscratch1);
 436       __ br(Assembler::EQ, L);
 437       __ bind(S);
 438       __ stop("StubRoutines::catch_exception: threads must correspond");
 439       __ bind(L);
 440     }
 441 #endif
 442 
 443     // set pending exception
 444     __ verify_oop(r0);
 445 
 446     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 447     __ mov(rscratch1, (address)__FILE__);
 448     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 449     __ movw(rscratch1, (int)__LINE__);
 450     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 451 
 452     // complete return to VM
 453     assert(StubRoutines::_call_stub_return_address != NULL,
 454            "_call_stub_return_address must have been generated before");
 455     __ b(StubRoutines::_call_stub_return_address);
 456 
 457     return start;
 458   }
 459 
 460   // Continuation point for runtime calls returning with a pending
 461   // exception.  The pending exception check happened in the runtime
 462   // or native call stub.  The pending exception in Thread is
 463   // converted into a Java-level exception.
 464   //
 465   // Contract with Java-level exception handlers:
 466   // r0: exception
 467   // r3: throwing pc
 468   //
 469   // NOTE: At entry of this stub, exception-pc must be in LR !!
 470 
 471   // NOTE: this is always used as a jump target within generated code
  472   // so it just needs to be generated code with no x86 prolog
 473 
 474   address generate_forward_exception() {
 475     StubCodeMark mark(this, "StubRoutines", "forward exception");
 476     address start = __ pc();
 477 
 478     // Upon entry, LR points to the return address returning into
 479     // Java (interpreted or compiled) code; i.e., the return address
 480     // becomes the throwing pc.
 481     //
 482     // Arguments pushed before the runtime call are still on the stack
 483     // but the exception handler will reset the stack pointer ->
 484     // ignore them.  A potential result in registers can be ignored as
 485     // well.
 486 
 487 #ifdef ASSERT
 488     // make sure this code is only executed if there is a pending exception
 489     {
 490       Label L;
 491       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 492       __ cbnz(rscratch1, L);
 493       __ stop("StubRoutines::forward exception: no pending exception (1)");
 494       __ bind(L);
 495     }
 496 #endif
 497 
 498     // compute exception handler into r19
 499 
 500     // call the VM to find the handler address associated with the
 501     // caller address. pass thread in r0 and caller pc (ret address)
 502     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 503     // the stack.
 504     __ mov(c_rarg1, lr);
 505     // lr will be trashed by the VM call so we move it to R19
 506     // (callee-saved) because we also need to pass it to the handler
 507     // returned by this call.
 508     __ mov(r19, lr);
 509     BLOCK_COMMENT("call exception_handler_for_return_address");
 510     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 511                          SharedRuntime::exception_handler_for_return_address),
 512                     rthread, c_rarg1);
 513     // we should not really care that lr is no longer the callee
 514     // address. we saved the value the handler needs in r19 so we can
 515     // just copy it to r3. however, the C2 handler will push its own
  516     // frame and then call into the VM, and the VM code asserts that
 517     // the PC for the frame above the handler belongs to a compiled
 518     // Java method. So, we restore lr here to satisfy that assert.
 519     __ mov(lr, r19);
 520     // setup r0 & r3 & clear pending exception
 521     __ mov(r3, r19);
 522     __ mov(r19, r0);
 523     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 524     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 525 
 526 #ifdef ASSERT
 527     // make sure exception is set
 528     {
 529       Label L;
 530       __ cbnz(r0, L);
 531       __ stop("StubRoutines::forward exception: no pending exception (2)");
 532       __ bind(L);
 533     }
 534 #endif
 535 
 536     // continue at exception handler
 537     // r0: exception
 538     // r3: throwing pc
 539     // r19: exception handler
 540     __ verify_oop(r0);
 541     __ br(r19);
 542 
 543     return start;
 544   }
 545 
 546   // Non-destructive plausibility checks for oops
 547   //
 548   // Arguments:
 549   //    r0: oop to verify
 550   //    rscratch1: error message
 551   //
 552   // Stack after saving c_rarg3:
 553   //    [tos + 0]: saved c_rarg3
 554   //    [tos + 1]: saved c_rarg2
 555   //    [tos + 2]: saved lr
 556   //    [tos + 3]: saved rscratch2
 557   //    [tos + 4]: saved r0
 558   //    [tos + 5]: saved rscratch1
 559   address generate_verify_oop() {
 560 
 561     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 562     address start = __ pc();
 563 
 564     Label exit, error;
 565 
 566     // save c_rarg2 and c_rarg3
 567     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 568 
 569     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 570     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 571     __ ldr(c_rarg3, Address(c_rarg2));
 572     __ add(c_rarg3, c_rarg3, 1);
 573     __ str(c_rarg3, Address(c_rarg2));
 574 
 575     // object is in r0
 576     // make sure object is 'reasonable'
 577     __ cbz(r0, exit); // if obj is NULL it is OK
 578 
 579     // Check if the oop is in the right area of memory
 580     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 581     __ andr(c_rarg2, r0, c_rarg3);
 582     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 583 
 584     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 585     // instruction here because the flags register is live.
 586     __ eor(c_rarg2, c_rarg2, c_rarg3);
 587     __ cbnz(c_rarg2, error);
 588 
  589     // make sure klass is 'reasonable', i.e. not zero.
 590     __ load_klass(r0, r0);  // get klass
 591     __ cbz(r0, error);      // if klass is NULL it is broken
 592 
 593     // return if everything seems ok
 594     __ bind(exit);
 595 
 596     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 597     __ ret(lr);
 598 
 599     // handle errors
 600     __ bind(error);
 601     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 602 
 603     __ push(RegSet::range(r0, r29), sp);
 604     // debug(char* msg, int64_t pc, int64_t regs[])
 605     __ mov(c_rarg0, rscratch1);      // pass address of error message
 606     __ mov(c_rarg1, lr);             // pass return address
 607     __ mov(c_rarg2, sp);             // pass address of regs on stack
 608 #ifndef PRODUCT
 609     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 610 #endif
 611     BLOCK_COMMENT("call MacroAssembler::debug");
 612     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 613     __ blrt(rscratch1, 3, 0, 1);
 614 
 615     return start;
 616   }
 617 
 618   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
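  // array_overlap_test simply branches to L_no_overlap: the conjoint copy
  // stubs in this file perform the overlap test inline (see the sub/cmp/br
  // sequence in generate_conjoint_copy), so no further check is needed here.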
 619 
 620   // Generate code for an array write pre barrier
 621   //
 622   //     addr    -  starting address
 623   //     count   -  element count
 624   //     tmp     - scratch register
 625   //
 626   //     Destroy no registers except rscratch1 and rscratch2
 627   //
 628   void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
 629     BarrierSet* bs = Universe::heap()->barrier_set();
 630     switch (bs->kind()) {
 631     case BarrierSet::G1SATBCTLogging:
  632       // With G1, don't generate the call if we statically know that the target is uninitialized
 633       if (!dest_uninitialized) {
 634         __ push_call_clobbered_registers();
 635         if (count == c_rarg0) {
 636           if (addr == c_rarg1) {
 637             // exactly backwards!!
 638             __ mov(rscratch1, c_rarg0);
 639             __ mov(c_rarg0, c_rarg1);
 640             __ mov(c_rarg1, rscratch1);
 641           } else {
 642             __ mov(c_rarg1, count);
 643             __ mov(c_rarg0, addr);
 644           }
 645         } else {
 646           __ mov(c_rarg0, addr);
 647           __ mov(c_rarg1, count);
 648         }
 649         __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
 650         __ pop_call_clobbered_registers();
 651         break;
 652       case BarrierSet::CardTableForRS:
 653       case BarrierSet::CardTableExtension:
 654       case BarrierSet::ModRef:
 655         break;
 656       default:
 657         ShouldNotReachHere();
 658 
 659       }
 660     }
 661   }
 662 
 663   //
 664   // Generate code for an array write post barrier
 665   //
 666   //  Input:
 667   //     start    - register containing starting address of destination array
 668   //     end      - register containing ending address of destination array
 669   //     scratch  - scratch register
 670   //
 671   //  The input registers are overwritten.
 672   //  The ending address is inclusive.
 673   void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
 674     assert_different_registers(start, end, scratch);
 675     BarrierSet* bs = Universe::heap()->barrier_set();
 676     switch (bs->kind()) {
 677       case BarrierSet::G1SATBCTLogging:
 678 
 679         {
 680           __ push_call_clobbered_registers();
 681           // must compute element count unless barrier set interface is changed (other platforms supply count)
 682           assert_different_registers(start, end, scratch);
 683           __ lea(scratch, Address(end, BytesPerHeapOop));
 684           __ sub(scratch, scratch, start);               // subtract start to get #bytes
 685           __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
 686           __ mov(c_rarg0, start);
 687           __ mov(c_rarg1, scratch);
 688           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
 689           __ pop_call_clobbered_registers();
 690         }
 691         break;
 692       case BarrierSet::CardTableForRS:
 693       case BarrierSet::CardTableExtension:
 694         {
 695           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
 696           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
 697 
 698           Label L_loop;
 699 
 700            __ lsr(start, start, CardTableModRefBS::card_shift);
 701            __ lsr(end, end, CardTableModRefBS::card_shift);
 702            __ sub(end, end, start); // number of bytes to copy
 703 
 704           const Register count = end; // 'end' register contains bytes count now
 705           __ load_byte_map_base(scratch);
 706           __ add(start, start, scratch);
 707           if (UseConcMarkSweepGC) {
 708             __ membar(__ StoreStore);
 709           }
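          // Dirty every card covering [start, end]: start and end have already
          // been converted to card indexes (address >> card_shift) and start has
          // been rebased on the card table; storing zr writes the dirty value (0)
          // into each card byte, walking from start + count down to start.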
 710           __ BIND(L_loop);
 711           __ strb(zr, Address(start, count));
 712           __ subs(count, count, 1);
 713           __ br(Assembler::GE, L_loop);
 714         }
 715         break;
 716       default:
 717         ShouldNotReachHere();
 718 
 719     }
 720   }
 721 
 722   address generate_zero_longs(Register base, Register cnt) {
 723     Register tmp = rscratch1;
 724     Register tmp2 = rscratch2;
 725     int zva_length = VM_Version::zva_length();
 726     Label initial_table_end, loop_zva;
 727     Label fini;
 728 
 729     __ align(CodeEntryAlignment);
 730     StubCodeMark mark(this, "StubRoutines", "zero_longs");
 731     address start = __ pc();
 732 
 733     // Base must be 16 byte aligned. If not, just return and let the caller handle it
 734     __ tst(base, 0x0f);
 735     __ br(Assembler::NE, fini);
 736     // Align base with ZVA length.
 737     __ neg(tmp, base);
 738     __ andr(tmp, tmp, zva_length - 1);
 739 
 740     // tmp: the number of bytes to be filled to align the base with ZVA length.
 741     __ add(base, base, tmp);
 742     __ sub(cnt, cnt, tmp, Assembler::ASR, 3);
 743     __ adr(tmp2, initial_table_end);
 744     __ sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
 745     __ br(tmp2);
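    // Computed branch into the stp table below: each stp is one 4-byte
    // instruction that zeroes 16 bytes, so stepping back (tmp >> 2) bytes from
    // initial_table_end executes exactly tmp/16 stores -- just enough to zero
    // the tmp leading bytes and leave base aligned to the ZVA block size.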
 746 
 747     for (int i = -zva_length + 16; i < 0; i += 16)
 748       __ stp(zr, zr, Address(base, i));
 749     __ bind(initial_table_end);
 750 
 751     __ sub(cnt, cnt, zva_length >> 3);
 752     __ bind(loop_zva);
 753     __ dc(Assembler::ZVA, base);
 754     __ subs(cnt, cnt, zva_length >> 3);
 755     __ add(base, base, zva_length);
 756     __ br(Assembler::GE, loop_zva);
 757     __ add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
 758     __ bind(fini);
 759     __ ret(lr);
 760 
 761     return start;
 762   }
 763 
 764   typedef enum {
 765     copy_forwards = 1,
 766     copy_backwards = -1
 767   } copy_direction;
 768 
 769   // Bulk copy of blocks of 8 words.
 770   //
 771   // count is a count of words.
 772   //
 773   // Precondition: count >= 8
 774   //
 775   // Postconditions:
 776   //
 777   // The least significant bit of count contains the remaining count
 778   // of words to copy.  The rest of count is trash.
 779   //
 780   // s and d are adjusted to point to the remaining words to copy
 781   //
 782   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 783                            copy_direction direction) {
 784     int unit = wordSize * direction;
 785     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 786 
 787     int offset;
 788     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 789       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 790     const Register stride = r13;
 791 
 792     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 793     assert_different_registers(s, d, count, rscratch1);
 794 
 795     Label again, drain;
 796     const char *stub_name;
 797     if (direction == copy_forwards)
 798       stub_name = "forward_copy_longs";
 799     else
 800       stub_name = "backward_copy_longs";
 801     StubCodeMark mark(this, "StubRoutines", stub_name);
 802     __ align(CodeEntryAlignment);
 803     __ bind(start);
 804 
 805     Label unaligned_copy_long;
 806     if (AvoidUnalignedAccesses) {
 807       __ tbnz(d, 3, unaligned_copy_long);
 808     }
 809 
 810     if (direction == copy_forwards) {
 811       __ sub(s, s, bias);
 812       __ sub(d, d, bias);
 813     }
 814 
 815 #ifdef ASSERT
 816     // Make sure we are never given < 8 words
 817     {
 818       Label L;
 819       __ cmp(count, 8);
 820       __ br(Assembler::GE, L);
 821       __ stop("generate_copy_longs called with < 8 words");
 822       __ bind(L);
 823     }
 824 #endif
 825 
 826     // Fill 8 registers
 827     if (UseSIMDForMemoryOps) {
 828       __ ldpq(v0, v1, Address(s, 4 * unit));
 829       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 830     } else {
 831       __ ldp(t0, t1, Address(s, 2 * unit));
 832       __ ldp(t2, t3, Address(s, 4 * unit));
 833       __ ldp(t4, t5, Address(s, 6 * unit));
 834       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 835     }
 836 
 837     __ subs(count, count, 16);
 838     __ br(Assembler::LO, drain);
 839 
 840     int prefetch = PrefetchCopyIntervalInBytes;
 841     bool use_stride = false;
 842     if (direction == copy_backwards) {
 843        use_stride = prefetch > 256;
 844        prefetch = -prefetch;
 845        if (use_stride) __ mov(stride, prefetch);
 846     }
 847 
 848     __ bind(again);
 849 
 850     if (PrefetchCopyIntervalInBytes > 0)
 851       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 852 
 853     if (UseSIMDForMemoryOps) {
 854       __ stpq(v0, v1, Address(d, 4 * unit));
 855       __ ldpq(v0, v1, Address(s, 4 * unit));
 856       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 857       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 858     } else {
 859       __ stp(t0, t1, Address(d, 2 * unit));
 860       __ ldp(t0, t1, Address(s, 2 * unit));
 861       __ stp(t2, t3, Address(d, 4 * unit));
 862       __ ldp(t2, t3, Address(s, 4 * unit));
 863       __ stp(t4, t5, Address(d, 6 * unit));
 864       __ ldp(t4, t5, Address(s, 6 * unit));
 865       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 866       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 867     }
 868 
 869     __ subs(count, count, 8);
 870     __ br(Assembler::HS, again);
 871 
 872     // Drain
 873     __ bind(drain);
 874     if (UseSIMDForMemoryOps) {
 875       __ stpq(v0, v1, Address(d, 4 * unit));
 876       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 877     } else {
 878       __ stp(t0, t1, Address(d, 2 * unit));
 879       __ stp(t2, t3, Address(d, 4 * unit));
 880       __ stp(t4, t5, Address(d, 6 * unit));
 881       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 882     }
 883 
 884     {
 885       Label L1, L2;
 886       __ tbz(count, exact_log2(4), L1);
 887       if (UseSIMDForMemoryOps) {
 888         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 889         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 890       } else {
 891         __ ldp(t0, t1, Address(s, 2 * unit));
 892         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 893         __ stp(t0, t1, Address(d, 2 * unit));
 894         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 895       }
 896       __ bind(L1);
 897 
 898       if (direction == copy_forwards) {
 899         __ add(s, s, bias);
 900         __ add(d, d, bias);
 901       }
 902 
 903       __ tbz(count, 1, L2);
 904       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 905       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 906       __ bind(L2);
 907     }
 908 
 909     __ ret(lr);
 910 
 911     if (AvoidUnalignedAccesses) {
 912       Label drain, again;
 913       // Register order for storing. Order is different for backward copy.
 914 
 915       __ bind(unaligned_copy_long);
 916 
 917       // source address is even aligned, target odd aligned
 918       //
 919       // when forward copying word pairs we read long pairs at offsets
 920       // {0, 2, 4, 6} (in long words). when backwards copying we read
 921       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 922       // address by -2 in the forwards case so we can compute the
 923       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 924       // or -1.
 925       //
 926       // when forward copying we need to store 1 word, 3 pairs and
 927       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 928       // zero offset we adjust the destination by -1 which means we
 929       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 930       //
 931       // When backwards copying we need to store 1 word, 3 pairs and
 932       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 933       // offsets {1, 3, 5, 7, 8} * unit.
 934 
 935       if (direction == copy_forwards) {
 936         __ sub(s, s, 16);
 937         __ sub(d, d, 8);
 938       }
 939 
 940       // Fill 8 registers
 941       //
 942       // for forwards copy s was offset by -16 from the original input
 943       // value of s so the register contents are at these offsets
 944       // relative to the 64 byte block addressed by that original input
 945       // and so on for each successive 64 byte block when s is updated
 946       //
 947       // t0 at offset 0,  t1 at offset 8
 948       // t2 at offset 16, t3 at offset 24
 949       // t4 at offset 32, t5 at offset 40
 950       // t6 at offset 48, t7 at offset 56
 951 
 952       // for backwards copy s was not offset so the register contents
 953       // are at these offsets into the preceding 64 byte block
 954       // relative to that original input and so on for each successive
 955       // preceding 64 byte block when s is updated. this explains the
 956       // slightly counter-intuitive looking pattern of register usage
 957       // in the stp instructions for backwards copy.
 958       //
 959       // t0 at offset -16, t1 at offset -8
 960       // t2 at offset -32, t3 at offset -24
 961       // t4 at offset -48, t5 at offset -40
 962       // t6 at offset -64, t7 at offset -56
 963 
 964       __ ldp(t0, t1, Address(s, 2 * unit));
 965       __ ldp(t2, t3, Address(s, 4 * unit));
 966       __ ldp(t4, t5, Address(s, 6 * unit));
 967       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 968 
 969       __ subs(count, count, 16);
 970       __ br(Assembler::LO, drain);
 971 
 972       int prefetch = PrefetchCopyIntervalInBytes;
 973       bool use_stride = false;
 974       if (direction == copy_backwards) {
 975          use_stride = prefetch > 256;
 976          prefetch = -prefetch;
 977          if (use_stride) __ mov(stride, prefetch);
 978       }
 979 
 980       __ bind(again);
 981 
 982       if (PrefetchCopyIntervalInBytes > 0)
 983         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 984 
 985       if (direction == copy_forwards) {
 986        // allowing for the offset of -8 the store instructions place
 987        // registers into the target 64 byte block at the following
 988        // offsets
 989        //
 990        // t0 at offset 0
 991        // t1 at offset 8,  t2 at offset 16
 992        // t3 at offset 24, t4 at offset 32
 993        // t5 at offset 40, t6 at offset 48
 994        // t7 at offset 56
 995 
 996         __ str(t0, Address(d, 1 * unit));
 997         __ stp(t1, t2, Address(d, 2 * unit));
 998         __ ldp(t0, t1, Address(s, 2 * unit));
 999         __ stp(t3, t4, Address(d, 4 * unit));
1000         __ ldp(t2, t3, Address(s, 4 * unit));
1001         __ stp(t5, t6, Address(d, 6 * unit));
1002         __ ldp(t4, t5, Address(s, 6 * unit));
1003         __ str(t7, Address(__ pre(d, 8 * unit)));
1004         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1005       } else {
1006        // d was not offset when we started so the registers are
1007        // written into the 64 byte block preceding d with the following
1008        // offsets
1009        //
1010        // t1 at offset -8
1011        // t3 at offset -24, t0 at offset -16
1012        // t5 at offset -40, t2 at offset -32
1013        // t7 at offset -56, t4 at offset -48
1014        //                   t6 at offset -64
1015        //
1016        // note that this matches the offsets previously noted for the
1017        // loads
1018 
1019         __ str(t1, Address(d, 1 * unit));
1020         __ stp(t3, t0, Address(d, 3 * unit));
1021         __ ldp(t0, t1, Address(s, 2 * unit));
1022         __ stp(t5, t2, Address(d, 5 * unit));
1023         __ ldp(t2, t3, Address(s, 4 * unit));
1024         __ stp(t7, t4, Address(d, 7 * unit));
1025         __ ldp(t4, t5, Address(s, 6 * unit));
1026         __ str(t6, Address(__ pre(d, 8 * unit)));
1027         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1028       }
1029 
1030       __ subs(count, count, 8);
1031       __ br(Assembler::HS, again);
1032 
1033       // Drain
1034       //
1035       // this uses the same pattern of offsets and register arguments
1036       // as above
1037       __ bind(drain);
1038       if (direction == copy_forwards) {
1039         __ str(t0, Address(d, 1 * unit));
1040         __ stp(t1, t2, Address(d, 2 * unit));
1041         __ stp(t3, t4, Address(d, 4 * unit));
1042         __ stp(t5, t6, Address(d, 6 * unit));
1043         __ str(t7, Address(__ pre(d, 8 * unit)));
1044       } else {
1045         __ str(t1, Address(d, 1 * unit));
1046         __ stp(t3, t0, Address(d, 3 * unit));
1047         __ stp(t5, t2, Address(d, 5 * unit));
1048         __ stp(t7, t4, Address(d, 7 * unit));
1049         __ str(t6, Address(__ pre(d, 8 * unit)));
1050       }
1051       // now we need to copy any remaining part block which may
1052       // include a 4 word subblock and/or a 2 word subblock.
1053       // bits 2 and 1 in the count are the tell-tale for whether we
1054       // have each such subblock
1055       {
1056         Label L1, L2;
1057         __ tbz(count, exact_log2(4), L1);
1058        // this is the same as above but copying only 4 longs hence
1059        // with only one intervening stp between the str instructions
1060        // but note that the offsets and registers still follow the
1061        // same pattern
1062         __ ldp(t0, t1, Address(s, 2 * unit));
1063         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1064         if (direction == copy_forwards) {
1065           __ str(t0, Address(d, 1 * unit));
1066           __ stp(t1, t2, Address(d, 2 * unit));
1067           __ str(t3, Address(__ pre(d, 4 * unit)));
1068         } else {
1069           __ str(t1, Address(d, 1 * unit));
1070           __ stp(t3, t0, Address(d, 3 * unit));
1071           __ str(t2, Address(__ pre(d, 4 * unit)));
1072         }
1073         __ bind(L1);
1074 
1075         __ tbz(count, 1, L2);
1076        // this is the same as above but copying only 2 longs hence
1077        // there is no intervening stp between the str instructions
1078        // but note that the offset and register patterns are still
1079        // the same
1080         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1081         if (direction == copy_forwards) {
1082           __ str(t0, Address(d, 1 * unit));
1083           __ str(t1, Address(__ pre(d, 2 * unit)));
1084         } else {
1085           __ str(t1, Address(d, 1 * unit));
1086           __ str(t0, Address(__ pre(d, 2 * unit)));
1087         }
1088         __ bind(L2);
1089 
1090        // for forwards copy we need to re-adjust the offsets we
1091        // applied so that s and d follow the last words written
1092 
1093        if (direction == copy_forwards) {
1094          __ add(s, s, 16);
1095          __ add(d, d, 8);
1096        }
1097 
1098       }
1099 
1100       __ ret(lr);
1101       }
1102   }
1103 
1104   // Small copy: less than 16 bytes.
1105   //
1106   // NB: Ignores all of the bits of count which represent more than 15
1107   // bytes, so a caller doesn't have to mask them.
1108 
1109   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1110     bool is_backwards = step < 0;
1111     size_t granularity = uabs(step);
1112     int direction = is_backwards ? -1 : 1;
1113     int unit = wordSize * direction;
1114 
1115     Label Lpair, Lword, Lint, Lshort, Lbyte;
1116 
1117     assert(granularity
1118            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1119 
1120     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1121 
1122     // ??? I don't know if this bit-test-and-branch is the right thing
1123     // to do.  It does a lot of jumping, resulting in several
1124     // mispredicted branches.  It might make more sense to do this
1125     // with something like Duff's device with a single computed branch.
1126 
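    // Each tbz below tests one bit of the element count: bit (3 - log2(granularity))
    // selects an 8-byte chunk, the next lower bit a 4-byte chunk, then 2 bytes,
    // then 1 byte, so at most four single-unit copies handle any tail of 0..15 bytes.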
1127     __ tbz(count, 3 - exact_log2(granularity), Lword);
1128     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1129     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1130     __ bind(Lword);
1131 
1132     if (granularity <= sizeof (jint)) {
1133       __ tbz(count, 2 - exact_log2(granularity), Lint);
1134       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1135       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1136       __ bind(Lint);
1137     }
1138 
1139     if (granularity <= sizeof (jshort)) {
1140       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1141       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1142       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1143       __ bind(Lshort);
1144     }
1145 
1146     if (granularity <= sizeof (jbyte)) {
1147       __ tbz(count, 0, Lbyte);
1148       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1149       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1150       __ bind(Lbyte);
1151     }
1152   }
1153 
1154   Label copy_f, copy_b;
1155 
1156   // All-singing all-dancing memory copy.
1157   //
1158   // Copy count units of memory from s to d.  The size of a unit is
1159   // step, which can be positive or negative depending on the direction
1160   // of copy.  If is_aligned is false, we align the source address.
1161   //
1162 
1163   void copy_memory(bool is_aligned, Register s, Register d,
1164                    Register count, Register tmp, int step) {
1165     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1166     bool is_backwards = step < 0;
1167     int granularity = uabs(step);
1168     const Register t0 = r3, t1 = r4;
1169 
1170     // Copies of <= 80 bytes (96 with SIMD) are done inline. Direction doesn't matter because we always
1171     // load all the data before writing anything
1172     Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
1173     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1174     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1175     const Register send = r17, dend = r18;
1176 
1177     if (PrefetchCopyIntervalInBytes > 0)
1178       __ prfm(Address(s, 0), PLDL1KEEP);
1179     __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
1180     __ br(Assembler::HI, copy_big);
1181 
1182     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1183     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
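    // send/dend point just past the last source/destination byte.  The small
    // cases below copy from both ends with loads/stores that may overlap in the
    // middle, e.g. for 8..16 bytes (illustrative sketch):
    //   t0 = *(u64*)s; t1 = *(u64*)(send - 8); *(u64*)d = t0; *(u64*)(dend - 8) = t1;
    // so each size bracket needs no further branching on the exact count.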
1184 
1185     __ cmp(count, 16/granularity);
1186     __ br(Assembler::LS, copy16);
1187 
1188     __ cmp(count, 64/granularity);
1189     __ br(Assembler::HI, copy80);
1190 
1191     __ cmp(count, 32/granularity);
1192     __ br(Assembler::LS, copy32);
1193 
1194     // 33..64 bytes
1195     if (UseSIMDForMemoryOps) {
1196       __ ldpq(v0, v1, Address(s, 0));
1197       __ ldpq(v2, v3, Address(send, -32));
1198       __ stpq(v0, v1, Address(d, 0));
1199       __ stpq(v2, v3, Address(dend, -32));
1200     } else {
1201       __ ldp(t0, t1, Address(s, 0));
1202       __ ldp(t2, t3, Address(s, 16));
1203       __ ldp(t4, t5, Address(send, -32));
1204       __ ldp(t6, t7, Address(send, -16));
1205 
1206       __ stp(t0, t1, Address(d, 0));
1207       __ stp(t2, t3, Address(d, 16));
1208       __ stp(t4, t5, Address(dend, -32));
1209       __ stp(t6, t7, Address(dend, -16));
1210     }
1211     __ b(finish);
1212 
1213     // 17..32 bytes
1214     __ bind(copy32);
1215     __ ldp(t0, t1, Address(s, 0));
1216     __ ldp(t2, t3, Address(send, -16));
1217     __ stp(t0, t1, Address(d, 0));
1218     __ stp(t2, t3, Address(dend, -16));
1219     __ b(finish);
1220 
1221     // 65..80/96 bytes
1222     // (96 bytes if SIMD because we do 32 bytes per instruction)
1223     __ bind(copy80);
1224     if (UseSIMDForMemoryOps) {
1225       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1226       __ ldpq(v4, v5, Address(send, -32));
1227       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1228       __ stpq(v4, v5, Address(dend, -32));
1229     } else {
1230       __ ldp(t0, t1, Address(s, 0));
1231       __ ldp(t2, t3, Address(s, 16));
1232       __ ldp(t4, t5, Address(s, 32));
1233       __ ldp(t6, t7, Address(s, 48));
1234       __ ldp(t8, t9, Address(send, -16));
1235 
1236       __ stp(t0, t1, Address(d, 0));
1237       __ stp(t2, t3, Address(d, 16));
1238       __ stp(t4, t5, Address(d, 32));
1239       __ stp(t6, t7, Address(d, 48));
1240       __ stp(t8, t9, Address(dend, -16));
1241     }
1242     __ b(finish);
1243 
1244     // 0..16 bytes
1245     __ bind(copy16);
1246     __ cmp(count, 8/granularity);
1247     __ br(Assembler::LO, copy8);
1248 
1249     // 8..16 bytes
1250     __ ldr(t0, Address(s, 0));
1251     __ ldr(t1, Address(send, -8));
1252     __ str(t0, Address(d, 0));
1253     __ str(t1, Address(dend, -8));
1254     __ b(finish);
1255 
1256     if (granularity < 8) {
1257       // 4..7 bytes
1258       __ bind(copy8);
1259       __ tbz(count, 2 - exact_log2(granularity), copy4);
1260       __ ldrw(t0, Address(s, 0));
1261       __ ldrw(t1, Address(send, -4));
1262       __ strw(t0, Address(d, 0));
1263       __ strw(t1, Address(dend, -4));
1264       __ b(finish);
1265       if (granularity < 4) {
1266         // 0..3 bytes
1267         __ bind(copy4);
1268         __ cbz(count, finish); // get rid of 0 case
1269         if (granularity == 2) {
1270           __ ldrh(t0, Address(s, 0));
1271           __ strh(t0, Address(d, 0));
1272         } else { // granularity == 1
1273           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1274           // the first and last byte.
1275           // Handle the 3 byte case by loading and storing base + count/2
1276           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1277           // This does mean in the 1 byte case we load/store the same
1278           // byte 3 times.
1279           __ lsr(count, count, 1);
1280           __ ldrb(t0, Address(s, 0));
1281           __ ldrb(t1, Address(send, -1));
1282           __ ldrb(t2, Address(s, count));
1283           __ strb(t0, Address(d, 0));
1284           __ strb(t1, Address(dend, -1));
1285           __ strb(t2, Address(d, count));
1286         }
1287         __ b(finish);
1288       }
1289     }
1290 
1291     __ bind(copy_big);
1292     if (is_backwards) {
1293       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1294       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1295     }
1296 
1297     // Now we've got the small case out of the way, we can align the
1298     // source address on a 2-word boundary.
1299 
1300     Label aligned;
1301 
1302     if (is_aligned) {
1303       // We may have to adjust by 1 word to get s 2-word-aligned.
1304       __ tbz(s, exact_log2(wordSize), aligned);
1305       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1306       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1307       __ sub(count, count, wordSize/granularity);
1308     } else {
1309       if (is_backwards) {
1310         __ andr(rscratch2, s, 2 * wordSize - 1);
1311       } else {
1312         __ neg(rscratch2, s);
1313         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1314       }
1315       // rscratch2 is the byte adjustment needed to align s.
1316       __ cbz(rscratch2, aligned);
1317       int shift = exact_log2(granularity);
1318       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1319       __ sub(count, count, rscratch2);
1320 
1321 #if 0
1322       // ?? This code is only correct for a disjoint copy.  It may or
1323       // may not make sense to use it in that case.
1324 
1325       // Copy the first pair; s and d may not be aligned.
1326       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1327       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1328 
1329       // Align s and d, adjust count
1330       if (is_backwards) {
1331         __ sub(s, s, rscratch2);
1332         __ sub(d, d, rscratch2);
1333       } else {
1334         __ add(s, s, rscratch2);
1335         __ add(d, d, rscratch2);
1336       }
1337 #else
1338       copy_memory_small(s, d, rscratch2, rscratch1, step);
1339 #endif
1340     }
1341 
1342     __ bind(aligned);
1343 
1344     // s is now 2-word-aligned.
1345 
1346     // We have a count of units and some trailing bytes.  Adjust the
1347     // count and do a bulk copy of words.
1348     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1349     if (direction == copy_forwards)
1350       __ bl(copy_f);
1351     else
1352       __ bl(copy_b);
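    // copy_f/copy_b are the shared 8-words-per-iteration bulk copy blobs emitted
    // by generate_copy_longs; the word count for them was computed into rscratch2
    // above.  They return with ret(lr), so they are reached with bl; lr is
    // restored again by the enclosing stub's leave().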
1353 
1354     // And the tail.
1355     copy_memory_small(s, d, count, tmp, step);
1356 
1357     if (granularity >= 8) __ bind(copy8);
1358     if (granularity >= 4) __ bind(copy4);
1359     __ bind(finish);
1360   }
1361 
1362 
1363   void clobber_registers() {
1364 #ifdef ASSERT
1365     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1366     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1367     for (Register r = r3; r <= r18; r++)
1368       if (r != rscratch1) __ mov(r, rscratch1);
1369 #endif
1370   }
1371 
1372   // Scan over array at a for count oops, verifying each one.
1373   // Preserves a and count, clobbers rscratch1 and rscratch2.
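  // NB: in the compressed-oop case the loop below loads the narrowOop into r16
  // and then decodes 'temp', so callers are expected to pass temp == r16 (as the
  // copy stubs in this file do).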
1374   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1375     Label loop, end;
1376     __ mov(rscratch1, a);
1377     __ mov(rscratch2, zr);
1378     __ bind(loop);
1379     __ cmp(rscratch2, count);
1380     __ br(Assembler::HS, end);
1381     if (size == (size_t)wordSize) {
1382       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1383       __ verify_oop(temp);
1384     } else {
1385       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1386       __ decode_heap_oop(temp); // calls verify_oop
1387     }
1388     __ add(rscratch2, rscratch2, size);
1389     __ b(loop);
1390     __ bind(end);
1391   }
1392 
1393   // Arguments:
1394   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1395   //             ignored
1396   //   is_oop  - true => oop array, so generate store check code
1397   //   name    - stub name string
1398   //
1399   // Inputs:
1400   //   c_rarg0   - source array address
1401   //   c_rarg1   - destination array address
1402   //   c_rarg2   - element count, treated as ssize_t, can be zero
1403   //
1404   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1405   // the hardware handle it.  The two dwords within qwords that span
1406   // cache line boundaries will still be loaded and stored atomically.
1407   //
1408   // Side Effects:
1409   //   disjoint_int_copy_entry is set to the no-overlap entry point
1410   //   used by generate_conjoint_int_oop_copy().
1411   //
1412   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1413                                   const char *name, bool dest_uninitialized = false) {
1414     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1415     __ align(CodeEntryAlignment);
1416     StubCodeMark mark(this, "StubRoutines", name);
1417     address start = __ pc();
1418     __ enter();
1419 
1420     if (entry != NULL) {
1421       *entry = __ pc();
1422       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1423       BLOCK_COMMENT("Entry:");
1424     }
1425 
1426     if (is_oop) {
1427       __ push(RegSet::of(d, count), sp);
1428       // no registers are destroyed by this call
1429       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1430     }
1431     copy_memory(aligned, s, d, count, rscratch1, size);
1432     if (is_oop) {
1433       __ pop(RegSet::of(d, count), sp);
1434       if (VerifyOops)
1435         verify_oop_array(size, d, count, r16);
1436       __ sub(count, count, 1); // make an inclusive end pointer
1437       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1438       gen_write_ref_array_post_barrier(d, count, rscratch1);
1439     }
1440     __ leave();
1441     __ mov(r0, zr); // return 0
1442     __ ret(lr);
1443 #ifdef BUILTIN_SIM
1444     {
1445       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1446       sim->notifyCompile(const_cast<char*>(name), start);
1447     }
1448 #endif
1449     return start;
1450   }
1451 
1452   // Arguments:
1453   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1454   //             ignored
1455   //   is_oop  - true => oop array, so generate store check code
1456   //   name    - stub name string
1457   //
1458   // Inputs:
1459   //   c_rarg0   - source array address
1460   //   c_rarg1   - destination array address
1461   //   c_rarg2   - element count, treated as ssize_t, can be zero
1462   //
1463   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1464   // the hardware handle it.  The two dwords within qwords that span
1465   // cache line boundaries will still be loaded and stored atomically.
1466   //
1467   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1468                                  address *entry, const char *name,
1469                                  bool dest_uninitialized = false) {
1470     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1471 
1472     StubCodeMark mark(this, "StubRoutines", name);
1473     address start = __ pc();
1474     __ enter();
1475 
1476     if (entry != NULL) {
1477       *entry = __ pc();
1478       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1479       BLOCK_COMMENT("Entry:");
1480     }
1481 
1482     // use fwd copy when (d-s) above_equal (count*size)
1483     __ sub(rscratch1, d, s);
1484     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1485     __ br(Assembler::HS, nooverlap_target);
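         // The unsigned comparison above implements, roughly:
         //   if ((uintptr_t)(d - s) >= (uintptr_t)count << log2(size))
         //     goto nooverlap_target;            // forward copy is safe
         // A forward copy is only unsafe when d lies inside [s, s + count*size);
         // when d < s the subtraction wraps to a large unsigned value, so that
         // case also takes the forward-copy path.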
1486 
1487     if (is_oop) {
1488       __ push(RegSet::of(d, count), sp);
1489       // no registers are destroyed by this call
1490       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1491     }
1492     copy_memory(aligned, s, d, count, rscratch1, -size);
1493     if (is_oop) {
1494       __ pop(RegSet::of(d, count), sp);
1495       if (VerifyOops)
1496         verify_oop_array(size, d, count, r16);
1497       __ sub(count, count, 1); // make an inclusive end pointer
1498       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1499       gen_write_ref_array_post_barrier(d, count, rscratch1);
1500     }
1501     __ leave();
1502     __ mov(r0, zr); // return 0
1503     __ ret(lr);
1504 #ifdef BUILTIN_SIM
1505     {
1506       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1507       sim->notifyCompile(const_cast<char*>(name), start);
1508     }
1509 #endif
1510     return start;
1511   }
1512 
1513   // Arguments:
1514   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1515   //             ignored
1516   //   name    - stub name string
1517   //
1518   // Inputs:
1519   //   c_rarg0   - source array address
1520   //   c_rarg1   - destination array address
1521   //   c_rarg2   - element count, treated as ssize_t, can be zero
1522   //
1523   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1524   // we let the hardware handle it.  The one to eight bytes within words,
1525   // dwords or qwords that span cache line boundaries will still be loaded
1526   // and stored atomically.
1527   //
1528   // Side Effects:
1529   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1537   //   used by generate_conjoint_byte_copy().
1538   //
1539   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1540     const bool not_oop = false;
1541     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1542   }
1543 
1544   // Arguments:
1545   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1546   //             ignored
1547   //   name    - stub name string
1548   //
1549   // Inputs:
1550   //   c_rarg0   - source array address
1551   //   c_rarg1   - destination array address
1552   //   c_rarg2   - element count, treated as ssize_t, can be zero
1553   //
1554   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1555   // we let the hardware handle it.  The one to eight bytes within words,
1556   // dwords or qwords that span cache line boundaries will still be loaded
1557   // and stored atomically.
1558   //
1559   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1560                                       address* entry, const char *name) {
1561     const bool not_oop = false;
1562     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1563   }
1564 
1565   // Arguments:
1566   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1567   //             ignored
1568   //   name    - stub name string
1569   //
1570   // Inputs:
1571   //   c_rarg0   - source array address
1572   //   c_rarg1   - destination array address
1573   //   c_rarg2   - element count, treated as ssize_t, can be zero
1574   //
1575   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1576   // let the hardware handle it.  The two or four words within dwords
1577   // or qwords that span cache line boundaries will still be loaded
1578   // and stored atomically.
1579   //
1580   // Side Effects:
1581   //   disjoint_short_copy_entry is set to the no-overlap entry point
1582   //   used by generate_conjoint_short_copy().
1583   //
1584   address generate_disjoint_short_copy(bool aligned,
1585                                        address* entry, const char *name) {
1586     const bool not_oop = false;
1587     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1588   }
1589 
1590   // Arguments:
1591   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1592   //             ignored
1593   //   name    - stub name string
1594   //
1595   // Inputs:
1596   //   c_rarg0   - source array address
1597   //   c_rarg1   - destination array address
1598   //   c_rarg2   - element count, treated as ssize_t, can be zero
1599   //
1600   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1601   // let the hardware handle it.  The two or four words within dwords
1602   // or qwords that span cache line boundaries will still be loaded
1603   // and stored atomically.
1604   //
1605   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1606                                        address *entry, const char *name) {
1607     const bool not_oop = false;
1608     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1609   }
1610 
1611   // Arguments:
1612   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1613   //             ignored
1614   //   name    - stub name string
1615   //
1616   // Inputs:
1617   //   c_rarg0   - source array address
1618   //   c_rarg1   - destination array address
1619   //   c_rarg2   - element count, treated as ssize_t, can be zero
1620   //
1621   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1622   // the hardware handle it.  The two dwords within qwords that span
1623   // cache line boundaries will still be loaded and stored atomically.
1624   //
1625   // Side Effects:
1626   //   disjoint_int_copy_entry is set to the no-overlap entry point
1627   //   used by generate_conjoint_int_oop_copy().
1628   //
1629   address generate_disjoint_int_copy(bool aligned, address *entry,
1630                                          const char *name, bool dest_uninitialized = false) {
1631     const bool not_oop = false;
1632     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1633   }
1634 
1635   // Arguments:
1636   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1637   //             ignored
1638   //   name    - stub name string
1639   //
1640   // Inputs:
1641   //   c_rarg0   - source array address
1642   //   c_rarg1   - destination array address
1643   //   c_rarg2   - element count, treated as ssize_t, can be zero
1644   //
1645   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1646   // the hardware handle it.  The two dwords within qwords that span
1647   // cache line boundaries will still be loaded and stored atomically.
1648   //
1649   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1650                                      address *entry, const char *name,
1651                                      bool dest_uninitialized = false) {
1652     const bool not_oop = false;
1653     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1654   }
1655 
1656 
1657   // Arguments:
1658   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1659   //             ignored
1660   //   name    - stub name string
1661   //
1662   // Inputs:
1663   //   c_rarg0   - source array address
1664   //   c_rarg1   - destination array address
1665   //   c_rarg2   - element count, treated as size_t, can be zero
1666   //
1667   // Side Effects:
1668   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1669   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1670   //
1671   address generate_disjoint_long_copy(bool aligned, address *entry,
1672                                           const char *name, bool dest_uninitialized = false) {
1673     const bool not_oop = false;
1674     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1675   }
1676 
1677   // Arguments:
1678   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1679   //             ignored
1680   //   name    - stub name string
1681   //
1682   // Inputs:
1683   //   c_rarg0   - source array address
1684   //   c_rarg1   - destination array address
1685   //   c_rarg2   - element count, treated as size_t, can be zero
1686   //
1687   address generate_conjoint_long_copy(bool aligned,
1688                                       address nooverlap_target, address *entry,
1689                                       const char *name, bool dest_uninitialized = false) {
1690     const bool not_oop = false;
1691     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1692   }
1693 
1694   // Arguments:
1695   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1696   //             ignored
1697   //   name    - stub name string
1698   //
1699   // Inputs:
1700   //   c_rarg0   - source array address
1701   //   c_rarg1   - destination array address
1702   //   c_rarg2   - element count, treated as size_t, can be zero
1703   //
1704   // Side Effects:
1705   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1706   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1707   //
1708   address generate_disjoint_oop_copy(bool aligned, address *entry,
1709                                      const char *name, bool dest_uninitialized) {
1710     const bool is_oop = true;
1711     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1712     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1713   }
1714 
1715   // Arguments:
1716   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1717   //             ignored
1718   //   name    - stub name string
1719   //
1720   // Inputs:
1721   //   c_rarg0   - source array address
1722   //   c_rarg1   - destination array address
1723   //   c_rarg2   - element count, treated as size_t, can be zero
1724   //
1725   address generate_conjoint_oop_copy(bool aligned,
1726                                      address nooverlap_target, address *entry,
1727                                      const char *name, bool dest_uninitialized) {
1728     const bool is_oop = true;
1729     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1730     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1731                                   name, dest_uninitialized);
1732   }
1733 
1734 
1735   // Helper for generating a dynamic type check.
1736   // Smashes rscratch1.
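       // The generated code is roughly equivalent to
       //   if (sub_klass->is_subtype_of(super_klass)) goto L_success;
       //   // otherwise fall through to the caller's failure path
       // using the usual fast-path (cached super check offset) and slow-path
       // (supertype table walk) checks.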
1737   void generate_type_check(Register sub_klass,
1738                            Register super_check_offset,
1739                            Register super_klass,
1740                            Label& L_success) {
1741     assert_different_registers(sub_klass, super_check_offset, super_klass);
1742 
1743     BLOCK_COMMENT("type_check:");
1744 
1745     Label L_miss;
1746 
1747     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1748                                      super_check_offset);
1749     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1750 
1751     // Fall through on failure!
1752     __ BIND(L_miss);
1753   }
1754 
1755   //
1756   //  Generate checkcasting array copy stub
1757   //
1758   //  Input:
1759   //    c_rarg0   - source array address
1760   //    c_rarg1   - destination array address
1761   //    c_rarg2   - element count, treated as ssize_t, can be zero
1762   //    c_rarg3   - size_t ckoff (super_check_offset)
1763   //    c_rarg4   - oop ckval (super_klass)
1764   //
1765   //  Output:
1766   //    r0 ==  0  -  success
1767   //    r0 == -1^K - failure, where K is partial transfer count
1768   //
1769   address generate_checkcast_copy(const char *name, address *entry,
1770                                   bool dest_uninitialized = false) {
1771 
1772     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1773 
1774     // Input registers (after setup_arg_regs)
1775     const Register from        = c_rarg0;   // source array address
1776     const Register to          = c_rarg1;   // destination array address
1777     const Register count       = c_rarg2;   // elements count
1778     const Register ckoff       = c_rarg3;   // super_check_offset
1779     const Register ckval       = c_rarg4;   // super_klass
1780 
1781     // Registers used as temps (r18, r19, r20 are save-on-entry)
1782     const Register count_save  = r21;       // orig elements count
1783     const Register start_to    = r20;       // destination array start address
1784     const Register copied_oop  = r18;       // actual oop copied
1785     const Register r19_klass   = r19;       // oop._klass
1786 
1787     //---------------------------------------------------------------
1788     // Assembler stub will be used for this call to arraycopy
1789     // if the two arrays are subtypes of Object[] but the
1790     // destination array type is not equal to or a supertype
1791     // of the source type.  Each element must be separately
1792     // checked.
1793 
1794     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1795                                copied_oop, r19_klass, count_save);
1796 
1797     __ align(CodeEntryAlignment);
1798     StubCodeMark mark(this, "StubRoutines", name);
1799     address start = __ pc();
1800 
1801     __ enter(); // required for proper stackwalking of RuntimeStub frame
1802 
1803 #ifdef ASSERT
1804     // caller guarantees that the arrays really are different
1805     // otherwise, we would have to make conjoint checks
1806     { Label L;
1807       array_overlap_test(L, TIMES_OOP);
1808       __ stop("checkcast_copy within a single array");
1809       __ bind(L);
1810     }
1811 #endif //ASSERT
1812 
1813     // Caller of this entry point must set up the argument registers.
1814     if (entry != NULL) {
1815       *entry = __ pc();
1816       BLOCK_COMMENT("Entry:");
1817     }
1818 
1819     // Empty array:  Nothing to do.
1820     __ cbz(count, L_done);
1821 
1822     __ push(RegSet::of(r18, r19, r20, r21), sp);
1823 
1824 #ifdef ASSERT
1825     BLOCK_COMMENT("assert consistent ckoff/ckval");
1826     // The ckoff and ckval must be mutually consistent,
1827     // even though caller generates both.
1828     { Label L;
1829       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1830       __ ldrw(start_to, Address(ckval, sco_offset));
1831       __ cmpw(ckoff, start_to);
1832       __ br(Assembler::EQ, L);
1833       __ stop("super_check_offset inconsistent");
1834       __ bind(L);
1835     }
1836 #endif //ASSERT
1837 
1838     gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1839 
1840     // save the original count
1841     __ mov(count_save, count);
1842 
1843     // Copy from low to high addresses
1844     __ mov(start_to, to);              // Save destination array start address
1845     __ b(L_load_element);
1846 
1847     // ======== begin loop ========
1848     // (Loop is rotated; its entry is L_load_element.)
1849     // Loop control:
1850     //   for (; count != 0; count--) {
1851     //     copied_oop = load_heap_oop(from++);
1852     //     ... generate_type_check ...;
1853     //     store_heap_oop(to++, copied_oop);
1854     //   }
1855     __ align(OptoLoopAlignment);
1856 
1857     __ BIND(L_store_element);
1858     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1859     __ sub(count, count, 1);
1860     __ cbz(count, L_do_card_marks);
1861 
1862     // ======== loop entry is here ========
1863     __ BIND(L_load_element);
1864     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1865     __ cbz(copied_oop, L_store_element);
1866 
1867     __ load_klass(r19_klass, copied_oop);// query the object klass
1868     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1869     // ======== end loop ========
1870 
1871     // It was a real error; we must depend on the caller to finish the job.
1872     // Register count = remaining oops, count_orig = total oops.
1873     // Emit GC store barriers for the oops we have copied and report
1874     // their number to the caller.
1875 
1876     __ subs(count, count_save, count);     // K = partially copied oop count
1877     __ eon(count, count, zr);                   // report (-1^K) to caller
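         // eon with zr is a bitwise NOT, so count now holds ~K == -1 ^ K, the
         // failure value documented above; when K == 0 no oops were stored and
         // the card marks below can be skipped.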
1878     __ br(Assembler::EQ, L_done_pop);
1879 
1880     __ BIND(L_do_card_marks);
1881     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1882     gen_write_ref_array_post_barrier(start_to, to, rscratch1);
1883 
1884     __ bind(L_done_pop);
1885     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1886     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1887 
1888     __ bind(L_done);
1889     __ mov(r0, count);
1890     __ leave();
1891     __ ret(lr);
1892 
1893     return start;
1894   }
1895 
1896   // Perform range checks on the proposed arraycopy.
1897   // Kills temp, but nothing else.
1898   // Also, clean the sign bits of src_pos and dst_pos.
1899   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1900                               Register src_pos, // source position (c_rarg1)
1901                               Register dst,     // destination array oop (c_rarg2)
1902                               Register dst_pos, // destination position (c_rarg3)
1903                               Register length,
1904                               Register temp,
1905                               Label& L_failed) {
1906     BLOCK_COMMENT("arraycopy_range_checks:");
1907 
1908     assert_different_registers(rscratch1, temp);
1909 
1910     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1911     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1912     __ addw(temp, length, src_pos);
1913     __ cmpw(temp, rscratch1);
1914     __ br(Assembler::HI, L_failed);
1915 
1916     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1917     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1918     __ addw(temp, length, dst_pos);
1919     __ cmpw(temp, rscratch1);
1920     __ br(Assembler::HI, L_failed);
1921 
1922     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1923     __ movw(src_pos, src_pos);
1924     __ movw(dst_pos, dst_pos);
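         // movw with identical source and destination zero-extends the low 32
         // bits, discarding any stale upper bits so the positions are safe to
         // use in 64-bit scaled addressing below.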
1925 
1926     BLOCK_COMMENT("arraycopy_range_checks done");
1927   }
1928 
1929   // These stubs get called from some dumb test routine.
1930   // I'll write them properly when they're called from
1931   // something that's actually doing something.
1932   static void fake_arraycopy_stub(address src, address dst, int count) {
1933     assert(count == 0, "huh?");
1934   }
1935 
1936 
1937   //
1938   //  Generate 'unsafe' array copy stub
1939   //  Though just as safe as the other stubs, it takes an unscaled
1940   //  size_t argument instead of an element count.
1941   //
1942   //  Input:
1943   //    c_rarg0   - source array address
1944   //    c_rarg1   - destination array address
1945   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1946   //
1947   // Examines the alignment of the operands and dispatches
1948   // to a long, int, short, or byte copy loop.
1949   //
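       // A C-level sketch of the dispatch below (illustrative only):
       //
       //   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
       //   if      ((bits & (BytesPerLong - 1)) == 0) { count >>= LogBytesPerLong;  /* long copy  */ }
       //   else if ((bits & (BytesPerInt  - 1)) == 0) { count >>= LogBytesPerInt;   /* int copy   */ }
       //   else if ((bits & 1) == 0)                  { count >>= LogBytesPerShort; /* short copy */ }
       //   else                                       { /* byte copy */ }
       //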
1950   address generate_unsafe_copy(const char *name,
1951                                address byte_copy_entry,
1952                                address short_copy_entry,
1953                                address int_copy_entry,
1954                                address long_copy_entry) {
1955     Label L_long_aligned, L_int_aligned, L_short_aligned;
1956     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1957 
1958     __ align(CodeEntryAlignment);
1959     StubCodeMark mark(this, "StubRoutines", name);
1960     address start = __ pc();
1961     __ enter(); // required for proper stackwalking of RuntimeStub frame
1962 
1963     // bump this on entry, not on exit:
1964     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1965 
1966     __ orr(rscratch1, s, d);
1967     __ orr(rscratch1, rscratch1, count);
1968 
1969     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1970     __ cbz(rscratch1, L_long_aligned);
1971     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1972     __ cbz(rscratch1, L_int_aligned);
1973     __ tbz(rscratch1, 0, L_short_aligned);
1974     __ b(RuntimeAddress(byte_copy_entry));
1975 
1976     __ BIND(L_short_aligned);
1977     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1978     __ b(RuntimeAddress(short_copy_entry));
1979     __ BIND(L_int_aligned);
1980     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1981     __ b(RuntimeAddress(int_copy_entry));
1982     __ BIND(L_long_aligned);
1983     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1984     __ b(RuntimeAddress(long_copy_entry));
1985 
1986     return start;
1987   }
1988 
1989   //
1990   //  Generate generic array copy stubs
1991   //
1992   //  Input:
1993   //    c_rarg0    -  src oop
1994   //    c_rarg1    -  src_pos (32-bits)
1995   //    c_rarg2    -  dst oop
1996   //    c_rarg3    -  dst_pos (32-bits)
1997   //    c_rarg4    -  element count (32-bits)
1998   //
1999   //  Output:
2000   //    r0 ==  0  -  success
2001   //    r0 == -1^K - failure, where K is partial transfer count
2002   //
2003   address generate_generic_copy(const char *name,
2004                                 address byte_copy_entry, address short_copy_entry,
2005                                 address int_copy_entry, address oop_copy_entry,
2006                                 address long_copy_entry, address checkcast_copy_entry) {
2007 
2008     Label L_failed, L_failed_0, L_objArray;
2009     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2010 
2011     // Input registers
2012     const Register src        = c_rarg0;  // source array oop
2013     const Register src_pos    = c_rarg1;  // source position
2014     const Register dst        = c_rarg2;  // destination array oop
2015     const Register dst_pos    = c_rarg3;  // destination position
2016     const Register length     = c_rarg4;
2017 
2018     StubCodeMark mark(this, "StubRoutines", name);
2019 
2020     __ align(CodeEntryAlignment);
2021     address start = __ pc();
2022 
2023     __ enter(); // required for proper stackwalking of RuntimeStub frame
2024 
2025     // bump this on entry, not on exit:
2026     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2027 
2028     //-----------------------------------------------------------------------
2029     // Assembler stub will be used for this call to arraycopy
2030     // if the following conditions are met:
2031     //
2032     // (1) src and dst must not be null.
2033     // (2) src_pos must not be negative.
2034     // (3) dst_pos must not be negative.
2035     // (4) length  must not be negative.
2036     // (5) src klass and dst klass should be the same and not NULL.
2037     // (6) src and dst should be arrays.
2038     // (7) src_pos + length must not exceed length of src.
2039     // (8) dst_pos + length must not exceed length of dst.
2040     //
2041 
2042     //  if (src == NULL) return -1;
2043     __ cbz(src, L_failed);
2044 
2045     //  if (src_pos < 0) return -1;
2046     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2047 
2048     //  if (dst == NULL) return -1;
2049     __ cbz(dst, L_failed);
2050 
2051     //  if (dst_pos < 0) return -1;
2052     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2053 
2054     // registers used as temp
2055     const Register scratch_length    = r16; // elements count to copy
2056     const Register scratch_src_klass = r17; // array klass
2057     const Register lh                = r18; // layout helper
2058 
2059     //  if (length < 0) return -1;
2060     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2061     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2062 
2063     __ load_klass(scratch_src_klass, src);
2064 #ifdef ASSERT
2065     //  assert(src->klass() != NULL);
2066     {
2067       BLOCK_COMMENT("assert klasses not null {");
2068       Label L1, L2;
2069       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2070       __ bind(L1);
2071       __ stop("broken null klass");
2072       __ bind(L2);
2073       __ load_klass(rscratch1, dst);
2074       __ cbz(rscratch1, L1);     // this would be broken also
2075       BLOCK_COMMENT("} assert klasses not null done");
2076     }
2077 #endif
2078 
2079     // Load layout helper (32-bits)
2080     //
2081     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2082     // 32        30    24            16              8     2                 0
2083     //
2084     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2085     //
2086 
2087     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2088 
2089     // Handle objArrays completely differently...
2090     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2091     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2092     __ movw(rscratch1, objArray_lh);
2093     __ eorw(rscratch2, lh, rscratch1);
2094     __ cbzw(rscratch2, L_objArray);
2095 
2096     //  if (src->klass() != dst->klass()) return -1;
2097     __ load_klass(rscratch2, dst);
2098     __ eor(rscratch2, rscratch2, scratch_src_klass);
2099     __ cbnz(rscratch2, L_failed);
2100 
2101     //  if (!src->is_Array()) return -1;
2102     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2103 
2104     // At this point, it is known to be a typeArray (array_tag 0x3).
2105 #ifdef ASSERT
2106     {
2107       BLOCK_COMMENT("assert primitive array {");
2108       Label L;
2109       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2110       __ cmpw(lh, rscratch2);
2111       __ br(Assembler::GE, L);
2112       __ stop("must be a primitive array");
2113       __ bind(L);
2114       BLOCK_COMMENT("} assert primitive array done");
2115     }
2116 #endif
2117 
2118     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2119                            rscratch2, L_failed);
2120 
2121     // TypeArrayKlass
2122     //
2123     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2124     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2125     //
2126 
2127     const Register rscratch1_offset = rscratch1;    // array offset
2128     const Register r18_elsize = lh; // element size
2129 
2130     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2131            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2132     __ add(src, src, rscratch1_offset);           // src array offset
2133     __ add(dst, dst, rscratch1_offset);           // dst array offset
2134     BLOCK_COMMENT("choose copy loop based on element size");
2135 
2136     // next registers should be set before the jump to corresponding stub
2137     const Register from     = c_rarg0;  // source array address
2138     const Register to       = c_rarg1;  // destination array address
2139     const Register count    = c_rarg2;  // elements count
2140 
2141     // 'from', 'to' and 'count' must be set in this order because they
2142     // alias 'src', 'src_pos' and 'dst', which are still needed as inputs.
2143 
2144     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2145 
2146     // The possible values of elsize are 0-3, i.e. exact_log2(element
2147     // size in bytes).  We do a simple bitwise binary search.
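         // Concretely, the two low bits of log2(element size) are tested:
         //   bit 1 clear, bit 0 clear -> 1-byte elements (byte copy)
         //   bit 1 clear, bit 0 set   -> 2-byte elements (short copy)
         //   bit 1 set,   bit 0 clear -> 4-byte elements (int copy)
         //   bit 1 set,   bit 0 set   -> 8-byte elements (long copy)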
2148   __ BIND(L_copy_bytes);
2149     __ tbnz(r18_elsize, 1, L_copy_ints);
2150     __ tbnz(r18_elsize, 0, L_copy_shorts);
2151     __ lea(from, Address(src, src_pos));// src_addr
2152     __ lea(to,   Address(dst, dst_pos));// dst_addr
2153     __ movw(count, scratch_length); // length
2154     __ b(RuntimeAddress(byte_copy_entry));
2155 
2156   __ BIND(L_copy_shorts);
2157     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2158     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2159     __ movw(count, scratch_length); // length
2160     __ b(RuntimeAddress(short_copy_entry));
2161 
2162   __ BIND(L_copy_ints);
2163     __ tbnz(r18_elsize, 0, L_copy_longs);
2164     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2165     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2166     __ movw(count, scratch_length); // length
2167     __ b(RuntimeAddress(int_copy_entry));
2168 
2169   __ BIND(L_copy_longs);
2170 #ifdef ASSERT
2171     {
2172       BLOCK_COMMENT("assert long copy {");
2173       Label L;
2174       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2175       __ cmpw(r18_elsize, LogBytesPerLong);
2176       __ br(Assembler::EQ, L);
2177       __ stop("must be long copy, but elsize is wrong");
2178       __ bind(L);
2179       BLOCK_COMMENT("} assert long copy done");
2180     }
2181 #endif
2182     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2183     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2184     __ movw(count, scratch_length); // length
2185     __ b(RuntimeAddress(long_copy_entry));
2186 
2187     // ObjArrayKlass
2188   __ BIND(L_objArray);
2189     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2190 
2191     Label L_plain_copy, L_checkcast_copy;
2192     //  test array classes for subtyping
2193     __ load_klass(r18, dst);
2194     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2195     __ br(Assembler::NE, L_checkcast_copy);
2196 
2197     // Identically typed arrays can be copied without element-wise checks.
2198     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2199                            rscratch2, L_failed);
2200 
2201     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2202     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2203     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2204     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2205     __ movw(count, scratch_length); // length
2206   __ BIND(L_plain_copy);
2207     __ b(RuntimeAddress(oop_copy_entry));
2208 
2209   __ BIND(L_checkcast_copy);
2210     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2211     {
2212       // Before looking at dst.length, make sure dst is also an objArray.
2213       __ ldrw(rscratch1, Address(r18, lh_offset));
2214       __ movw(rscratch2, objArray_lh);
2215       __ eorw(rscratch1, rscratch1, rscratch2);
2216       __ cbnzw(rscratch1, L_failed);
2217 
2218       // It is safe to examine both src.length and dst.length.
2219       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2220                              r18, L_failed);
2221 
2222       const Register rscratch2_dst_klass = rscratch2;
2223       __ load_klass(rscratch2_dst_klass, dst); // reload
2224 
2225       // Marshal the base address arguments now, freeing registers.
2226       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2227       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2228       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2229       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2230       __ movw(count, length);           // length (reloaded)
2231       Register sco_temp = c_rarg3;      // this register is free now
2232       assert_different_registers(from, to, count, sco_temp,
2233                                  rscratch2_dst_klass, scratch_src_klass);
2234       // assert_clean_int(count, sco_temp);
2235 
2236       // Generate the type check.
2237       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2238       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2239       // assert_clean_int(sco_temp, r18);
2240       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2241 
2242       // Fetch destination element klass from the ObjArrayKlass header.
2243       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2244       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2245       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2246 
2247       // the checkcast_copy loop needs two extra arguments:
2248       assert(c_rarg3 == sco_temp, "#3 already in place");
2249       // Set up arguments for checkcast_copy_entry.
2250       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2251       __ b(RuntimeAddress(checkcast_copy_entry));
2252     }
2253 
2254   __ BIND(L_failed);
2255     __ mov(r0, -1);
2256     __ leave();   // required for proper stackwalking of RuntimeStub frame
2257     __ ret(lr);
2258 
2259     return start;
2260   }
2261 
2262   //
2263   // Generate stub for array fill. If "aligned" is true, the
2264   // "to" address is assumed to be heapword aligned.
2265   //
2266   // Arguments for generated stub:
2267   //   to:    c_rarg0
2268   //   value: c_rarg1
2269   //   count: c_rarg2 treated as signed
2270   //
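       // The generated stub is roughly equivalent to this sketch (illustrative
       // only; the real code aligns the destination and stores whole words):
       //
       //   void fill(T *to, T value, int count) {
       //     for (int i = 0; i < count; i++)
       //       to[i] = value;
       //   }
       //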
2271   address generate_fill(BasicType t, bool aligned, const char *name) {
2272     __ align(CodeEntryAlignment);
2273     StubCodeMark mark(this, "StubRoutines", name);
2274     address start = __ pc();
2275 
2276     BLOCK_COMMENT("Entry:");
2277 
2278     const Register to        = c_rarg0;  // destination array address
2279     const Register value     = c_rarg1;  // value
2280     const Register count     = c_rarg2;  // elements count
2281 
2282     const Register bz_base = r10;        // base for block_zero routine
2283     const Register cnt_words = r11;      // temp register
2284 
2285     __ enter();
2286 
2287     Label L_fill_elements, L_exit1;
2288 
2289     int shift = -1;
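         // Replicate the fill value across a 32-bit word (and, later, a full
         // 64-bit word) so the main loop can store whole words at a time;
         // e.g. a byte value of 0xAB becomes 0xABABABAB.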
2290     switch (t) {
2291       case T_BYTE:
2292         shift = 0;
2293         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2294         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2295         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2296         __ br(Assembler::LO, L_fill_elements);
2297         break;
2298       case T_SHORT:
2299         shift = 1;
2300         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2301         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2302         __ br(Assembler::LO, L_fill_elements);
2303         break;
2304       case T_INT:
2305         shift = 2;
2306         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2307         __ br(Assembler::LO, L_fill_elements);
2308         break;
2309       default: ShouldNotReachHere();
2310     }
2311 
2312     // Align the destination address to an 8-byte boundary.
2313     Label L_skip_align1, L_skip_align2, L_skip_align4;
2314     if (!aligned) {
2315       switch (t) {
2316         case T_BYTE:
2317           // One byte misalignment happens only for byte arrays.
2318           __ tbz(to, 0, L_skip_align1);
2319           __ strb(value, Address(__ post(to, 1)));
2320           __ subw(count, count, 1);
2321           __ bind(L_skip_align1);
2322           // Fallthrough
2323         case T_SHORT:
2324           // Two bytes misalignment happens only for byte and short (char) arrays.
2325           __ tbz(to, 1, L_skip_align2);
2326           __ strh(value, Address(__ post(to, 2)));
2327           __ subw(count, count, 2 >> shift);
2328           __ bind(L_skip_align2);
2329           // Fallthrough
2330         case T_INT:
2331           // Align to 8 bytes, we know we are 4 byte aligned to start.
2332           __ tbz(to, 2, L_skip_align4);
2333           __ strw(value, Address(__ post(to, 4)));
2334           __ subw(count, count, 4 >> shift);
2335           __ bind(L_skip_align4);
2336           break;
2337         default: ShouldNotReachHere();
2338       }
2339     }
2340 
2341     //
2342     //  Fill large chunks
2343     //
2344     __ lsrw(cnt_words, count, 3 - shift); // number of words
2345     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2346     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2347     if (UseBlockZeroing) {
2348       Label non_block_zeroing, rest;
2349       Register tmp = rscratch1;
2350       // count >= BlockZeroingLowLimit && value == 0
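           // subs sets GE when cnt_words >= BlockZeroingLowLimit/8 (in words);
           // ccmp then compares value against 0 only if GE held, otherwise it
           // forces the flags to NE, so the br(NE) below falls back to
           // fill_words unless both conditions are satisfied.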
2351       __ subs(tmp, cnt_words, BlockZeroingLowLimit >> 3);
2352       __ ccmp(value, 0 /* comparing value */, 0 /* NZCV */, Assembler::GE);
2353       __ br(Assembler::NE, non_block_zeroing);
2354       __ mov(bz_base, to);
2355       __ block_zero(bz_base, cnt_words, true);
2356       __ mov(to, bz_base);
2357       __ b(rest);
2358       __ bind(non_block_zeroing);
2359       __ fill_words(to, cnt_words, value);
2360       __ bind(rest);
2361     }
2362     else {
2363       __ fill_words(to, cnt_words, value);
2364     }
2365 
2366     // Remaining count is less than 8 bytes. Fill it by a single store.
2367     // Note that the total length is no less than 8 bytes.
2368     if (t == T_BYTE || t == T_SHORT) {
2369       Label L_exit1;
2370       __ cbzw(count, L_exit1);
2371       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2372       __ str(value, Address(to, -8));    // overwrite some elements
2373       __ bind(L_exit1);
2374       __ leave();
2375       __ ret(lr);
2376     }
2377 
2378     // Handle copies less than 8 bytes.
2379     Label L_fill_2, L_fill_4, L_exit2;
2380     __ bind(L_fill_elements);
2381     switch (t) {
2382       case T_BYTE:
2383         __ tbz(count, 0, L_fill_2);
2384         __ strb(value, Address(__ post(to, 1)));
2385         __ bind(L_fill_2);
2386         __ tbz(count, 1, L_fill_4);
2387         __ strh(value, Address(__ post(to, 2)));
2388         __ bind(L_fill_4);
2389         __ tbz(count, 2, L_exit2);
2390         __ strw(value, Address(to));
2391         break;
2392       case T_SHORT:
2393         __ tbz(count, 0, L_fill_4);
2394         __ strh(value, Address(__ post(to, 2)));
2395         __ bind(L_fill_4);
2396         __ tbz(count, 1, L_exit2);
2397         __ strw(value, Address(to));
2398         break;
2399       case T_INT:
2400         __ cbzw(count, L_exit2);
2401         __ strw(value, Address(to));
2402         break;
2403       default: ShouldNotReachHere();
2404     }
2405     __ bind(L_exit2);
2406     __ leave();
2407     __ ret(lr);
2408     return start;
2409   }
2410 
2411   void generate_arraycopy_stubs() {
2412     address entry;
2413     address entry_jbyte_arraycopy;
2414     address entry_jshort_arraycopy;
2415     address entry_jint_arraycopy;
2416     address entry_oop_arraycopy;
2417     address entry_jlong_arraycopy;
2418     address entry_checkcast_arraycopy;
2419 
2420     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2421     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2422 
2423     StubRoutines::aarch64::_zero_longs = generate_zero_longs(r10, r11);
2424 
2425     //*** jbyte
2426     // Always need aligned and unaligned versions
2427     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2428                                                                                   "jbyte_disjoint_arraycopy");
2429     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2430                                                                                   &entry_jbyte_arraycopy,
2431                                                                                   "jbyte_arraycopy");
2432     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2433                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2434     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2435                                                                                   "arrayof_jbyte_arraycopy");
2436 
2437     //*** jshort
2438     // Always need aligned and unaligned versions
2439     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2440                                                                                     "jshort_disjoint_arraycopy");
2441     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2442                                                                                     &entry_jshort_arraycopy,
2443                                                                                     "jshort_arraycopy");
2444     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2445                                                                                     "arrayof_jshort_disjoint_arraycopy");
2446     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2447                                                                                     "arrayof_jshort_arraycopy");
2448 
2449     //*** jint
2450     // Aligned versions
2451     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2452                                                                                 "arrayof_jint_disjoint_arraycopy");
2453     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2454                                                                                 "arrayof_jint_arraycopy");
2455     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2456     // entry_jint_arraycopy always points to the unaligned version
2457     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2458                                                                                 "jint_disjoint_arraycopy");
2459     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2460                                                                                 &entry_jint_arraycopy,
2461                                                                                 "jint_arraycopy");
2462 
2463     //*** jlong
2464     // It is always aligned
2465     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2466                                                                                   "arrayof_jlong_disjoint_arraycopy");
2467     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2468                                                                                   "arrayof_jlong_arraycopy");
2469     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2470     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2471 
2472     //*** oops
2473     {
2474       // With compressed oops we need unaligned versions; notice that
2475       // we overwrite entry_oop_arraycopy.
2476       bool aligned = !UseCompressedOops;
2477 
2478       StubRoutines::_arrayof_oop_disjoint_arraycopy
2479         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2480                                      /*dest_uninitialized*/false);
2481       StubRoutines::_arrayof_oop_arraycopy
2482         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2483                                      /*dest_uninitialized*/false);
2484       // Aligned versions without pre-barriers
2485       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2486         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2487                                      /*dest_uninitialized*/true);
2488       StubRoutines::_arrayof_oop_arraycopy_uninit
2489         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2490                                      /*dest_uninitialized*/true);
2491     }
2492 
2493     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2494     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2495     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2496     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2497 
2498     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2499     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2500                                                                         /*dest_uninitialized*/true);
2501 
2502     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2503                                                               entry_jbyte_arraycopy,
2504                                                               entry_jshort_arraycopy,
2505                                                               entry_jint_arraycopy,
2506                                                               entry_jlong_arraycopy);
2507 
2508     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2509                                                                entry_jbyte_arraycopy,
2510                                                                entry_jshort_arraycopy,
2511                                                                entry_jint_arraycopy,
2512                                                                entry_oop_arraycopy,
2513                                                                entry_jlong_arraycopy,
2514                                                                entry_checkcast_arraycopy);
2515 
2516     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2517     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2518     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2519     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2520     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2521     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2522   }
2523 
2524   void generate_math_stubs() { Unimplemented(); }
2525 
2526   // Arguments:
2527   //
2528   // Inputs:
2529   //   c_rarg0   - source byte array address
2530   //   c_rarg1   - destination byte array address
2531   //   c_rarg2   - K (key) in little endian int array
2532   //
2533   address generate_aescrypt_encryptBlock() {
2534     __ align(CodeEntryAlignment);
2535     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2536 
2537     Label L_doLast;
2538 
2539     const Register from        = c_rarg0;  // source array address
2540     const Register to          = c_rarg1;  // destination array address
2541     const Register key         = c_rarg2;  // key array address
2542     const Register keylen      = rscratch1;
2543 
2544     address start = __ pc();
2545     __ enter();
2546 
2547     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
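         // 'key' points at the first int of the expanded key schedule, so back
         // up by the element base offset to read the array length.  The
         // schedule holds 44, 52 or 60 ints for 128-, 192- and 256-bit keys
         // respectively, which is what the comparisons against 44 and 52 below
         // distinguish.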
2548 
2549     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2550 
2551     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2552     __ rev32(v1, __ T16B, v1);
2553     __ rev32(v2, __ T16B, v2);
2554     __ rev32(v3, __ T16B, v3);
2555     __ rev32(v4, __ T16B, v4);
2556     __ aese(v0, v1);
2557     __ aesmc(v0, v0);
2558     __ aese(v0, v2);
2559     __ aesmc(v0, v0);
2560     __ aese(v0, v3);
2561     __ aesmc(v0, v0);
2562     __ aese(v0, v4);
2563     __ aesmc(v0, v0);
2564 
2565     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2566     __ rev32(v1, __ T16B, v1);
2567     __ rev32(v2, __ T16B, v2);
2568     __ rev32(v3, __ T16B, v3);
2569     __ rev32(v4, __ T16B, v4);
2570     __ aese(v0, v1);
2571     __ aesmc(v0, v0);
2572     __ aese(v0, v2);
2573     __ aesmc(v0, v0);
2574     __ aese(v0, v3);
2575     __ aesmc(v0, v0);
2576     __ aese(v0, v4);
2577     __ aesmc(v0, v0);
2578 
2579     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2580     __ rev32(v1, __ T16B, v1);
2581     __ rev32(v2, __ T16B, v2);
2582 
2583     __ cmpw(keylen, 44);
2584     __ br(Assembler::EQ, L_doLast);
2585 
2586     __ aese(v0, v1);
2587     __ aesmc(v0, v0);
2588     __ aese(v0, v2);
2589     __ aesmc(v0, v0);
2590 
2591     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2592     __ rev32(v1, __ T16B, v1);
2593     __ rev32(v2, __ T16B, v2);
2594 
2595     __ cmpw(keylen, 52);
2596     __ br(Assembler::EQ, L_doLast);
2597 
2598     __ aese(v0, v1);
2599     __ aesmc(v0, v0);
2600     __ aese(v0, v2);
2601     __ aesmc(v0, v0);
2602 
2603     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2604     __ rev32(v1, __ T16B, v1);
2605     __ rev32(v2, __ T16B, v2);
2606 
2607     __ BIND(L_doLast);
2608 
2609     __ aese(v0, v1);
2610     __ aesmc(v0, v0);
2611     __ aese(v0, v2);
2612 
2613     __ ld1(v1, __ T16B, key);
2614     __ rev32(v1, __ T16B, v1);
2615     __ eor(v0, __ T16B, v0, v1);
2616 
2617     __ st1(v0, __ T16B, to);
2618 
2619     __ mov(r0, 0);
2620 
2621     __ leave();
2622     __ ret(lr);
2623 
2624     return start;
2625   }
2626 
2627   // Arguments:
2628   //
2629   // Inputs:
2630   //   c_rarg0   - source byte array address
2631   //   c_rarg1   - destination byte array address
2632   //   c_rarg2   - K (key) in little endian int array
2633   //
2634   address generate_aescrypt_decryptBlock() {
2635     assert(UseAES, "need AES instructions");
2636     __ align(CodeEntryAlignment);
2637     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2638     Label L_doLast;
2639 
2640     const Register from        = c_rarg0;  // source array address
2641     const Register to          = c_rarg1;  // destination array address
2642     const Register key         = c_rarg2;  // key array address
2643     const Register keylen      = rscratch1;
2644 
2645     address start = __ pc();
2646     __ enter(); // required for proper stackwalking of RuntimeStub frame
2647 
2648     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2649 
2650     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2651 
2652     __ ld1(v5, __ T16B, __ post(key, 16));
2653     __ rev32(v5, __ T16B, v5);
2654 
2655     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2656     __ rev32(v1, __ T16B, v1);
2657     __ rev32(v2, __ T16B, v2);
2658     __ rev32(v3, __ T16B, v3);
2659     __ rev32(v4, __ T16B, v4);
2660     __ aesd(v0, v1);
2661     __ aesimc(v0, v0);
2662     __ aesd(v0, v2);
2663     __ aesimc(v0, v0);
2664     __ aesd(v0, v3);
2665     __ aesimc(v0, v0);
2666     __ aesd(v0, v4);
2667     __ aesimc(v0, v0);
2668 
2669     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2670     __ rev32(v1, __ T16B, v1);
2671     __ rev32(v2, __ T16B, v2);
2672     __ rev32(v3, __ T16B, v3);
2673     __ rev32(v4, __ T16B, v4);
2674     __ aesd(v0, v1);
2675     __ aesimc(v0, v0);
2676     __ aesd(v0, v2);
2677     __ aesimc(v0, v0);
2678     __ aesd(v0, v3);
2679     __ aesimc(v0, v0);
2680     __ aesd(v0, v4);
2681     __ aesimc(v0, v0);
2682 
2683     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2684     __ rev32(v1, __ T16B, v1);
2685     __ rev32(v2, __ T16B, v2);
2686 
2687     __ cmpw(keylen, 44);
2688     __ br(Assembler::EQ, L_doLast);
2689 
2690     __ aesd(v0, v1);
2691     __ aesimc(v0, v0);
2692     __ aesd(v0, v2);
2693     __ aesimc(v0, v0);
2694 
2695     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2696     __ rev32(v1, __ T16B, v1);
2697     __ rev32(v2, __ T16B, v2);
2698 
2699     __ cmpw(keylen, 52);
2700     __ br(Assembler::EQ, L_doLast);
2701 
2702     __ aesd(v0, v1);
2703     __ aesimc(v0, v0);
2704     __ aesd(v0, v2);
2705     __ aesimc(v0, v0);
2706 
2707     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2708     __ rev32(v1, __ T16B, v1);
2709     __ rev32(v2, __ T16B, v2);
2710 
2711     __ BIND(L_doLast);
2712 
2713     __ aesd(v0, v1);
2714     __ aesimc(v0, v0);
2715     __ aesd(v0, v2);
2716 
2717     __ eor(v0, __ T16B, v0, v5);
2718 
2719     __ st1(v0, __ T16B, to);
2720 
2721     __ mov(r0, 0);
2722 
2723     __ leave();
2724     __ ret(lr);
2725 
2726     return start;
2727   }
2728 
2729   // Arguments:
2730   //
2731   // Inputs:
2732   //   c_rarg0   - source byte array address
2733   //   c_rarg1   - destination byte array address
2734   //   c_rarg2   - K (key) in little endian int array
2735   //   c_rarg3   - r vector byte array address
2736   //   c_rarg4   - input length
2737   //
2738   // Output:
2739   //   r0        - input length
2740   //
2741   address generate_cipherBlockChaining_encryptAESCrypt() {
2742     assert(UseAES, "need AES instructions");
2743     __ align(CodeEntryAlignment);
2744     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2745 
2746     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2747 
2748     const Register from        = c_rarg0;  // source array address
2749     const Register to          = c_rarg1;  // destination array address
2750     const Register key         = c_rarg2;  // key array address
2751     const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector array
2752                                            // address and left holding the last encryption block
2753     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2754     const Register keylen      = rscratch1;
2755 
2756     address start = __ pc();
2757 
2758       __ enter();
2759 
2760       __ movw(rscratch2, len_reg);
2761 
2762       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2763 
2764       __ ld1(v0, __ T16B, rvec);
2765 
2766       __ cmpw(keylen, 52);
2767       __ br(Assembler::CC, L_loadkeys_44);
2768       __ br(Assembler::EQ, L_loadkeys_52);
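           // Fall-through key loading: 256-bit keys load two extra round keys
           // here, 192-bit keys enter at L_loadkeys_52, and 128-bit keys at
           // L_loadkeys_44, so all key sizes share the tail of the schedule
           // held in v21..v31.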
2769 
2770       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2771       __ rev32(v17, __ T16B, v17);
2772       __ rev32(v18, __ T16B, v18);
2773     __ BIND(L_loadkeys_52);
2774       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2775       __ rev32(v19, __ T16B, v19);
2776       __ rev32(v20, __ T16B, v20);
2777     __ BIND(L_loadkeys_44);
2778       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2779       __ rev32(v21, __ T16B, v21);
2780       __ rev32(v22, __ T16B, v22);
2781       __ rev32(v23, __ T16B, v23);
2782       __ rev32(v24, __ T16B, v24);
2783       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2784       __ rev32(v25, __ T16B, v25);
2785       __ rev32(v26, __ T16B, v26);
2786       __ rev32(v27, __ T16B, v27);
2787       __ rev32(v28, __ T16B, v28);
2788       __ ld1(v29, v30, v31, __ T16B, key);
2789       __ rev32(v29, __ T16B, v29);
2790       __ rev32(v30, __ T16B, v30);
2791       __ rev32(v31, __ T16B, v31);
2792 
2793     __ BIND(L_aes_loop);
2794       __ ld1(v1, __ T16B, __ post(from, 16));
2795       __ eor(v0, __ T16B, v0, v1);
2796 
2797       __ br(Assembler::CC, L_rounds_44);
2798       __ br(Assembler::EQ, L_rounds_52);
2799 
2800       __ aese(v0, v17); __ aesmc(v0, v0);
2801       __ aese(v0, v18); __ aesmc(v0, v0);
2802     __ BIND(L_rounds_52);
2803       __ aese(v0, v19); __ aesmc(v0, v0);
2804       __ aese(v0, v20); __ aesmc(v0, v0);
2805     __ BIND(L_rounds_44);
2806       __ aese(v0, v21); __ aesmc(v0, v0);
2807       __ aese(v0, v22); __ aesmc(v0, v0);
2808       __ aese(v0, v23); __ aesmc(v0, v0);
2809       __ aese(v0, v24); __ aesmc(v0, v0);
2810       __ aese(v0, v25); __ aesmc(v0, v0);
2811       __ aese(v0, v26); __ aesmc(v0, v0);
2812       __ aese(v0, v27); __ aesmc(v0, v0);
2813       __ aese(v0, v28); __ aesmc(v0, v0);
2814       __ aese(v0, v29); __ aesmc(v0, v0);
2815       __ aese(v0, v30);
2816       __ eor(v0, __ T16B, v0, v31);
2817 
2818       __ st1(v0, __ T16B, __ post(to, 16));
2819 
2820       __ subw(len_reg, len_reg, 16);
2821       __ cbnzw(len_reg, L_aes_loop);
2822 
2823       __ st1(v0, __ T16B, rvec);
2824 
2825       __ mov(r0, rscratch2);
2826 
2827       __ leave();
2828       __ ret(lr);
2829 
2830       return start;
2831   }
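       // In C, approximately (a sketch of what the stub above computes,
       // not the exact library code; AES_encrypt here stands for a
       // single 16-byte block encryption with the expanded key):
       //
       //   int cbc_encrypt(unsigned char *from, unsigned char *to, int *key,
       //                   unsigned char *rvec, int len) {
       //     unsigned char r[16];
       //     memcpy(r, rvec, 16);
       //     for (int i = 0; i < len; i += 16) {
       //       for (int j = 0; j < 16; j++) r[j] ^= from[i + j];
       //       AES_encrypt(r, r, key);
       //       memcpy(to + i, r, 16);
       //     }
       //     memcpy(rvec, r, 16);   // chain into the next call
       //     return len;
       //   }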
2832 
2833   // Arguments:
2834   //
2835   // Inputs:
2836   //   c_rarg0   - source byte array address
2837   //   c_rarg1   - destination byte array address
2838   //   c_rarg2   - K (key) in little endian int array
2839   //   c_rarg3   - r vector byte array address
2840   //   c_rarg4   - input length
2841   //
2842   // Output:
2843   //   r0        - input length
2844   //
2845   address generate_cipherBlockChaining_decryptAESCrypt() {
2846     assert(UseAES, "need AES instruction support");
2847     __ align(CodeEntryAlignment);
2848     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2849 
2850     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2851 
2852     const Register from        = c_rarg0;  // source array address
2853     const Register to          = c_rarg1;  // destination array address
2854     const Register key         = c_rarg2;  // key array address
2855     const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector array address,
2856                                            // and left holding the last ciphertext block on exit
2857     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2858     const Register keylen      = rscratch1;
2859 
2860     address start = __ pc();
2861 
2862       __ enter();
2863 
2864       __ movw(rscratch2, len_reg);
2865 
2866       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2867 
2868       __ ld1(v2, __ T16B, rvec);
2869 
2870       __ ld1(v31, __ T16B, __ post(key, 16));
2871       __ rev32(v31, __ T16B, v31);
2872 
2873       __ cmpw(keylen, 52);
2874       __ br(Assembler::CC, L_loadkeys_44);
2875       __ br(Assembler::EQ, L_loadkeys_52);
2876 
2877       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2878       __ rev32(v17, __ T16B, v17);
2879       __ rev32(v18, __ T16B, v18);
2880     __ BIND(L_loadkeys_52);
2881       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2882       __ rev32(v19, __ T16B, v19);
2883       __ rev32(v20, __ T16B, v20);
2884     __ BIND(L_loadkeys_44);
2885       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2886       __ rev32(v21, __ T16B, v21);
2887       __ rev32(v22, __ T16B, v22);
2888       __ rev32(v23, __ T16B, v23);
2889       __ rev32(v24, __ T16B, v24);
2890       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2891       __ rev32(v25, __ T16B, v25);
2892       __ rev32(v26, __ T16B, v26);
2893       __ rev32(v27, __ T16B, v27);
2894       __ rev32(v28, __ T16B, v28);
2895       __ ld1(v29, v30, __ T16B, key);
2896       __ rev32(v29, __ T16B, v29);
2897       __ rev32(v30, __ T16B, v30);
2898 
2899     __ BIND(L_aes_loop);
2900       __ ld1(v0, __ T16B, __ post(from, 16));
2901       __ orr(v1, __ T16B, v0, v0);
2902 
2903       __ br(Assembler::CC, L_rounds_44);
2904       __ br(Assembler::EQ, L_rounds_52);
2905 
2906       __ aesd(v0, v17); __ aesimc(v0, v0);
2907       __ aesd(v0, v18); __ aesimc(v0, v0);
2908     __ BIND(L_rounds_52);
2909       __ aesd(v0, v19); __ aesimc(v0, v0);
2910       __ aesd(v0, v20); __ aesimc(v0, v0);
2911     __ BIND(L_rounds_44);
2912       __ aesd(v0, v21); __ aesimc(v0, v0);
2913       __ aesd(v0, v22); __ aesimc(v0, v0);
2914       __ aesd(v0, v23); __ aesimc(v0, v0);
2915       __ aesd(v0, v24); __ aesimc(v0, v0);
2916       __ aesd(v0, v25); __ aesimc(v0, v0);
2917       __ aesd(v0, v26); __ aesimc(v0, v0);
2918       __ aesd(v0, v27); __ aesimc(v0, v0);
2919       __ aesd(v0, v28); __ aesimc(v0, v0);
2920       __ aesd(v0, v29); __ aesimc(v0, v0);
2921       __ aesd(v0, v30);
2922       __ eor(v0, __ T16B, v0, v31);
2923       __ eor(v0, __ T16B, v0, v2);
2924 
2925       __ st1(v0, __ T16B, __ post(to, 16));
2926       __ orr(v2, __ T16B, v1, v1);
2927 
2928       __ subw(len_reg, len_reg, 16);
2929       __ cbnzw(len_reg, L_aes_loop);
2930 
2931       __ st1(v2, __ T16B, rvec);
2932 
2933       __ mov(r0, rscratch2);
2934 
2935       __ leave();
2936       __ ret(lr);
2937 
2938     return start;
2939   }
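       // In C, approximately (a sketch, with AES_decrypt standing for a
       // single 16-byte block decryption; the stub keeps a copy of each
       // ciphertext block (v1/v2 above) because it becomes the chaining
       // value for the next block):
       //
       //   int cbc_decrypt(unsigned char *from, unsigned char *to, int *key,
       //                   unsigned char *rvec, int len) {
       //     unsigned char r[16], c[16];
       //     memcpy(r, rvec, 16);
       //     for (int i = 0; i < len; i += 16) {
       //       memcpy(c, from + i, 16);            // save ciphertext block
       //       AES_decrypt(from + i, to + i, key);
       //       for (int j = 0; j < 16; j++) to[i + j] ^= r[j];
       //       memcpy(r, c, 16);                   // chain ciphertext forward
       //     }
       //     memcpy(rvec, r, 16);
       //     return len;
       //   }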
2940 
2941   // Arguments:
2942   //
2943   // Inputs:
2944   //   c_rarg0   - byte[]  source+offset
2945   //   c_rarg1   - int[]   SHA.state
2946   //   c_rarg2   - int     offset
2947   //   c_rarg3   - int     limit
2948   //
2949   address generate_sha1_implCompress(bool multi_block, const char *name) {
2950     __ align(CodeEntryAlignment);
2951     StubCodeMark mark(this, "StubRoutines", name);
2952     address start = __ pc();
2953 
2954     Register buf   = c_rarg0;
2955     Register state = c_rarg1;
2956     Register ofs   = c_rarg2;
2957     Register limit = c_rarg3;
2958 
2959     Label keys;
2960     Label sha1_loop;
2961 
2962     // load the keys into v0..v3
2963     __ adr(rscratch1, keys);
2964     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2965     // load the 5-word state into v6, v7
2966     __ ldrq(v6, Address(state, 0));
2967     __ ldrs(v7, Address(state, 16));
2968 
2969 
2970     __ BIND(sha1_loop);
2971     // load 64 bytes of data into v16..v19
2972     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2973     __ rev32(v16, __ T16B, v16);
2974     __ rev32(v17, __ T16B, v17);
2975     __ rev32(v18, __ T16B, v18);
2976     __ rev32(v19, __ T16B, v19);
2977 
2978     // do the sha1
2979     __ addv(v4, __ T4S, v16, v0);
2980     __ orr(v20, __ T16B, v6, v6);
2981 
2982     FloatRegister d0 = v16;
2983     FloatRegister d1 = v17;
2984     FloatRegister d2 = v18;
2985     FloatRegister d3 = v19;
2986 
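         // Each iteration of this loop consumes four message words and
         // performs four of the 80 SHA-1 rounds.  The schedule-plus-constant
         // value for the *next* iteration (tmp1) is computed while the
         // sha1c/sha1p/sha1m of the current iteration executes, which is
         // why the key selection boundaries (4/9/14) sit one iteration
         // ahead of the usual 5/10/15 round-group boundaries.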
2987     for (int round = 0; round < 20; round++) {
2988       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2989       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2990       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2991       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2992       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2993 
2994       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2995       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2996       __ sha1h(tmp2, __ T4S, v20);
2997       if (round < 5)
2998         __ sha1c(v20, __ T4S, tmp3, tmp4);
2999       else if (round < 10 || round >= 15)
3000         __ sha1p(v20, __ T4S, tmp3, tmp4);
3001       else
3002         __ sha1m(v20, __ T4S, tmp3, tmp4);
3003       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3004 
3005       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3006     }
3007 
3008     __ addv(v7, __ T2S, v7, v21);
3009     __ addv(v6, __ T4S, v6, v20);
3010 
3011     if (multi_block) {
3012       __ add(ofs, ofs, 64);
3013       __ cmp(ofs, limit);
3014       __ br(Assembler::LE, sha1_loop);
3015       __ mov(c_rarg0, ofs); // return ofs
3016     }
3017 
3018     __ strq(v6, Address(state, 0));
3019     __ strs(v7, Address(state, 16));
3020 
3021     __ ret(lr);
3022 
3023     __ bind(keys);
3024     __ emit_int32(0x5a827999);
3025     __ emit_int32(0x6ed9eba1);
3026     __ emit_int32(0x8f1bbcdc);
3027     __ emit_int32(0xca62c1d6);
3028 
3029     return start;
3030   }
3031 
3032 
3033   // Arguments:
3034   //
3035   // Inputs:
3036   //   c_rarg0   - byte[]  source+offset
3037   //   c_rarg1   - int[]   SHA.state
3038   //   c_rarg2   - int     offset
3039   //   c_rarg3   - int     limit
3040   //
3041   address generate_sha256_implCompress(bool multi_block, const char *name) {
3042     static const uint32_t round_consts[64] = {
3043       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3044       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3045       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3046       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3047       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3048       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3049       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3050       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3051       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3052       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3053       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3054       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3055       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3056       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3057       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3058       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3059     };
3060     __ align(CodeEntryAlignment);
3061     StubCodeMark mark(this, "StubRoutines", name);
3062     address start = __ pc();
3063 
3064     Register buf   = c_rarg0;
3065     Register state = c_rarg1;
3066     Register ofs   = c_rarg2;
3067     Register limit = c_rarg3;
3068 
3069     Label sha1_loop;
3070 
3071     __ stpd(v8, v9, __ pre(sp, -32));
3072     __ stpd(v10, v11, Address(sp, 16));
3073 
3074 // dga == v0
3075 // dgb == v1
3076 // dg0 == v2
3077 // dg1 == v3
3078 // dg2 == v4
3079 // t0 == v6
3080 // t1 == v7
3081 
3082     // load the 16 round-constant vectors into v16..v31
3083     __ lea(rscratch1, ExternalAddress((address)round_consts));
3084     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3085     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3086     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3087     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3088 
3089     // load the 8-word (256-bit) state
3090     __ ldpq(v0, v1, state);
3091 
3092     __ BIND(sha1_loop);
3093     // load 64 bytes of data into v8..v11
3094     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3095     __ rev32(v8, __ T16B, v8);
3096     __ rev32(v9, __ T16B, v9);
3097     __ rev32(v10, __ T16B, v10);
3098     __ rev32(v11, __ T16B, v11);
3099 
3100     __ addv(v6, __ T4S, v8, v16);
3101     __ orr(v2, __ T16B, v0, v0);
3102     __ orr(v3, __ T16B, v1, v1);
3103 
3104     FloatRegister d0 = v8;
3105     FloatRegister d1 = v9;
3106     FloatRegister d2 = v10;
3107     FloatRegister d3 = v11;
3108 
3109 
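         // Each iteration consumes four message words and performs four of
         // the 64 SHA-256 rounds.  v6 and v7 alternate as the W+K input:
         // while sha256h/sha256h2 use the current value, the addv above
         // them precomputes W+K for the next iteration (hence the
         // round < 15 guard), and sha256su0/sha256su1 extend the message
         // schedule only while further W values are still needed
         // (round < 12).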
3110     for (int round = 0; round < 16; round++) {
3111       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3112       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3113       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3114       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3115 
3116       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3117        __ orr(v4, __ T16B, v2, v2);
3118       if (round < 15)
3119         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3120       __ sha256h(v2, __ T4S, v3, tmp2);
3121       __ sha256h2(v3, __ T4S, v4, tmp2);
3122       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3123 
3124       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3125     }
3126 
3127     __ addv(v0, __ T4S, v0, v2);
3128     __ addv(v1, __ T4S, v1, v3);
3129 
3130     if (multi_block) {
3131       __ add(ofs, ofs, 64);
3132       __ cmp(ofs, limit);
3133       __ br(Assembler::LE, sha1_loop);
3134       __ mov(c_rarg0, ofs); // return ofs
3135     }
3136 
3137     __ ldpd(v10, v11, Address(sp, 16));
3138     __ ldpd(v8, v9, __ post(sp, 32));
3139 
3140     __ stpq(v0, v1, state);
3141 
3142     __ ret(lr);
3143 
3144     return start;
3145   }
3146 
3147 #ifndef BUILTIN_SIM
3148   // Safefetch stubs.
3149   void generate_safefetch(const char* name, int size, address* entry,
3150                           address* fault_pc, address* continuation_pc) {
3151     // safefetch signatures:
3152     //   int      SafeFetch32(int*      adr, int      errValue);
3153     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3154     //
3155     // arguments:
3156     //   c_rarg0 = adr
3157     //   c_rarg1 = errValue
3158     //
3159     // result:
3160     //   r0       = *adr or errValue
3161 
3162     StubCodeMark mark(this, "StubRoutines", name);
3163 
3164     // Entry point, pc or function descriptor.
3165     *entry = __ pc();
3166 
3167     // Load *adr into c_rarg1, may fault.
3168     *fault_pc = __ pc();
3169     switch (size) {
3170       case 4:
3171         // int32_t
3172         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3173         break;
3174       case 8:
3175         // int64_t
3176         __ ldr(c_rarg1, Address(c_rarg0, 0));
3177         break;
3178       default:
3179         ShouldNotReachHere();
3180     }
3181 
3182     // return errValue or *adr
3183     *continuation_pc = __ pc();
3184     __ mov(r0, c_rarg1);
3185     __ ret(lr);
3186   }
3187 #endif
3188 
3189   /**
3190    *  Arguments:
3191    *
3192    * Inputs:
3193    *   c_rarg0   - int crc
3194    *   c_rarg1   - byte* buf
3195    *   c_rarg2   - int length
3196    *
3197    * Output:
3198    *       r0    - int crc result
3199    */
3200   address generate_updateBytesCRC32() {
3201     assert(UseCRC32Intrinsics, "what are we doing here?");
3202 
3203     __ align(CodeEntryAlignment);
3204     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3205 
3206     address start = __ pc();
3207 
3208     const Register crc   = c_rarg0;  // crc
3209     const Register buf   = c_rarg1;  // source java byte array address
3210     const Register len   = c_rarg2;  // length
3211     const Register table0 = c_rarg3; // crc_table address
3212     const Register table1 = c_rarg4;
3213     const Register table2 = c_rarg5;
3214     const Register table3 = c_rarg6;
3215     const Register tmp3 = c_rarg7;
3216 
3217     BLOCK_COMMENT("Entry:");
3218     __ enter(); // required for proper stackwalking of RuntimeStub frame
3219 
3220     __ kernel_crc32(crc, buf, len,
3221               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3222 
3223     __ leave(); // required for proper stackwalking of RuntimeStub frame
3224     __ ret(lr);
3225 
3226     return start;
3227   }
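       // In C, approximately (a minimal bit-at-a-time sketch of the
       // checksum this stub computes via the table-driven kernel_crc32;
       // the polynomial is the standard reflected CRC-32 polynomial
       // 0xEDB88320 used by java.util.zip.CRC32):
       //
       //   unsigned int crc32(unsigned int crc, unsigned char *buf, int len) {
       //     crc = ~crc;
       //     for (int i = 0; i < len; i++) {
       //       crc ^= buf[i];
       //       for (int k = 0; k < 8; k++)
       //         crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1)));
       //     }
       //     return ~crc;
       //   }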
3228 
3229   /**
3230    *  Arguments:
3231    *
3232    * Inputs:
3233    *   c_rarg0   - int crc
3234    *   c_rarg1   - byte* buf
3235    *   c_rarg2   - int length
3236    *   c_rarg3   - int* table
3237    *
3238    * Output:
3239    *       r0   - int crc result
3240    */
3241   address generate_updateBytesCRC32C() {
3242     assert(UseCRC32CIntrinsics, "what are we doing here?");
3243 
3244     __ align(CodeEntryAlignment);
3245     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3246 
3247     address start = __ pc();
3248 
3249     const Register crc   = c_rarg0;  // crc
3250     const Register buf   = c_rarg1;  // source java byte array address
3251     const Register len   = c_rarg2;  // length
3252     const Register table0 = c_rarg3; // crc_table address
3253     const Register table1 = c_rarg4;
3254     const Register table2 = c_rarg5;
3255     const Register table3 = c_rarg6;
3256     const Register tmp3 = c_rarg7;
3257 
3258     BLOCK_COMMENT("Entry:");
3259     __ enter(); // required for proper stackwalking of RuntimeStub frame
3260 
3261     __ kernel_crc32c(crc, buf, len,
3262               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3263 
3264     __ leave(); // required for proper stackwalking of RuntimeStub frame
3265     __ ret(lr);
3266 
3267     return start;
3268   }
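       // CRC-32C has the same structure but uses the Castagnoli generator
       // polynomial (0x1EDC6F41, i.e. 0x82F63B78 in reflected form), which
       // is the difference kernel_crc32c accounts for.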
3269 
3270   /**
3271    *  Arguments:
3272    *
3273    *  Inputs:
3274    *   c_rarg0   - int   adler
3275    *   c_rarg1   - byte* buff
3276    *   c_rarg2   - int   len
3277    *
3278    * Output:
3279    *   c_rarg0   - int adler result
3280    */
3281   address generate_updateBytesAdler32() {
3282     __ align(CodeEntryAlignment);
3283     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3284     address start = __ pc();
3285 
3286     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3287 
3288     // Aliases
3289     Register adler  = c_rarg0;
3290     Register s1     = c_rarg0;
3291     Register s2     = c_rarg3;
3292     Register buff   = c_rarg1;
3293     Register len    = c_rarg2;
3294     Register nmax  = r4;
3295     Register base = r5;
3296     Register count = r6;
3297     Register temp0 = rscratch1;
3298     Register temp1 = rscratch2;
3299     Register temp2 = r7;
3300 
3301     // Max number of bytes we can process before having to take the mod
3302     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3303     unsigned long BASE = 0xfff1;
3304     unsigned long NMAX = 0x15B0;
3305 
3306     __ mov(base, BASE);
3307     __ mov(nmax, NMAX);
3308 
3309     // s1 is initialized to the lower 16 bits of adler
3310     // s2 is initialized to the upper 16 bits of adler
3311     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3312     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3313 
3314     // The pipelined loop needs at least 16 elements per iteration.
3315     // It checks this itself, but branching straight to the cleanup loop for short inputs is cheaper.
3316     __ cmp(len, 16);
3317     __ br(Assembler::HS, L_nmax);
3318     __ cbz(len, L_combine);
3319 
3320     __ bind(L_simple_by1_loop);
3321     __ ldrb(temp0, Address(__ post(buff, 1)));
3322     __ add(s1, s1, temp0);
3323     __ add(s2, s2, s1);
3324     __ subs(len, len, 1);
3325     __ br(Assembler::HI, L_simple_by1_loop);
3326 
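         // Reduce s1 and s2 modulo BASE (= 65521) without a division.
         // s1 is already less than 2*BASE here, so a conditional subtract
         // suffices.  For s2 we use 2^16 == 15 (mod 65521), i.e.
         //   x == (x >> 16) * 15 + (x & 0xffff)  (mod 65521)
         // with *15 computed as (t << 4) - t; one folding step leaves
         // s2 below 2*BASE, and a conditional subtract finishes the job.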
3327     // s1 = s1 % BASE
3328     __ subs(temp0, s1, base);
3329     __ csel(s1, temp0, s1, Assembler::HS);
3330 
3331     // s2 = s2 % BASE
3332     __ lsr(temp0, s2, 16);
3333     __ lsl(temp1, temp0, 4);
3334     __ sub(temp1, temp1, temp0);
3335     __ add(s2, temp1, s2, ext::uxth);
3336 
3337     __ subs(temp0, s2, base);
3338     __ csel(s2, temp0, s2, Assembler::HS);
3339 
3340     __ b(L_combine);
3341 
3342     __ bind(L_nmax);
3343     __ subs(len, len, nmax);
3344     __ sub(count, nmax, 16);
3345     __ br(Assembler::LO, L_by16);
3346 
3347     __ bind(L_nmax_loop);
3348 
3349     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3350 
3351     __ add(s1, s1, temp0, ext::uxtb);
3352     __ ubfx(temp2, temp0, 8, 8);
3353     __ add(s2, s2, s1);
3354     __ add(s1, s1, temp2);
3355     __ ubfx(temp2, temp0, 16, 8);
3356     __ add(s2, s2, s1);
3357     __ add(s1, s1, temp2);
3358     __ ubfx(temp2, temp0, 24, 8);
3359     __ add(s2, s2, s1);
3360     __ add(s1, s1, temp2);
3361     __ ubfx(temp2, temp0, 32, 8);
3362     __ add(s2, s2, s1);
3363     __ add(s1, s1, temp2);
3364     __ ubfx(temp2, temp0, 40, 8);
3365     __ add(s2, s2, s1);
3366     __ add(s1, s1, temp2);
3367     __ ubfx(temp2, temp0, 48, 8);
3368     __ add(s2, s2, s1);
3369     __ add(s1, s1, temp2);
3370     __ add(s2, s2, s1);
3371     __ add(s1, s1, temp0, Assembler::LSR, 56);
3372     __ add(s2, s2, s1);
3373 
3374     __ add(s1, s1, temp1, ext::uxtb);
3375     __ ubfx(temp2, temp1, 8, 8);
3376     __ add(s2, s2, s1);
3377     __ add(s1, s1, temp2);
3378     __ ubfx(temp2, temp1, 16, 8);
3379     __ add(s2, s2, s1);
3380     __ add(s1, s1, temp2);
3381     __ ubfx(temp2, temp1, 24, 8);
3382     __ add(s2, s2, s1);
3383     __ add(s1, s1, temp2);
3384     __ ubfx(temp2, temp1, 32, 8);
3385     __ add(s2, s2, s1);
3386     __ add(s1, s1, temp2);
3387     __ ubfx(temp2, temp1, 40, 8);
3388     __ add(s2, s2, s1);
3389     __ add(s1, s1, temp2);
3390     __ ubfx(temp2, temp1, 48, 8);
3391     __ add(s2, s2, s1);
3392     __ add(s1, s1, temp2);
3393     __ add(s2, s2, s1);
3394     __ add(s1, s1, temp1, Assembler::LSR, 56);
3395     __ add(s2, s2, s1);
3396 
3397     __ subs(count, count, 16);
3398     __ br(Assembler::HS, L_nmax_loop);
3399 
3400     // s1 = s1 % BASE
3401     __ lsr(temp0, s1, 16);
3402     __ lsl(temp1, temp0, 4);
3403     __ sub(temp1, temp1, temp0);
3404     __ add(temp1, temp1, s1, ext::uxth);
3405 
3406     __ lsr(temp0, temp1, 16);
3407     __ lsl(s1, temp0, 4);
3408     __ sub(s1, s1, temp0);
3409     __ add(s1, s1, temp1, ext::uxth);
3410 
3411     __ subs(temp0, s1, base);
3412     __ csel(s1, temp0, s1, Assembler::HS);
3413 
3414     // s2 = s2 % BASE
3415     __ lsr(temp0, s2, 16);
3416     __ lsl(temp1, temp0, 4);
3417     __ sub(temp1, temp1, temp0);
3418     __ add(temp1, temp1, s2, ext::uxth);
3419 
3420     __ lsr(temp0, temp1, 16);
3421     __ lsl(s2, temp0, 4);
3422     __ sub(s2, s2, temp0);
3423     __ add(s2, s2, temp1, ext::uxth);
3424 
3425     __ subs(temp0, s2, base);
3426     __ csel(s2, temp0, s2, Assembler::HS);
3427 
3428     __ subs(len, len, nmax);
3429     __ sub(count, nmax, 16);
3430     __ br(Assembler::HS, L_nmax_loop);
3431 
3432     __ bind(L_by16);
3433     __ adds(len, len, count);
3434     __ br(Assembler::LO, L_by1);
3435 
3436     __ bind(L_by16_loop);
3437 
3438     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3439 
3440     __ add(s1, s1, temp0, ext::uxtb);
3441     __ ubfx(temp2, temp0, 8, 8);
3442     __ add(s2, s2, s1);
3443     __ add(s1, s1, temp2);
3444     __ ubfx(temp2, temp0, 16, 8);
3445     __ add(s2, s2, s1);
3446     __ add(s1, s1, temp2);
3447     __ ubfx(temp2, temp0, 24, 8);
3448     __ add(s2, s2, s1);
3449     __ add(s1, s1, temp2);
3450     __ ubfx(temp2, temp0, 32, 8);
3451     __ add(s2, s2, s1);
3452     __ add(s1, s1, temp2);
3453     __ ubfx(temp2, temp0, 40, 8);
3454     __ add(s2, s2, s1);
3455     __ add(s1, s1, temp2);
3456     __ ubfx(temp2, temp0, 48, 8);
3457     __ add(s2, s2, s1);
3458     __ add(s1, s1, temp2);
3459     __ add(s2, s2, s1);
3460     __ add(s1, s1, temp0, Assembler::LSR, 56);
3461     __ add(s2, s2, s1);
3462 
3463     __ add(s1, s1, temp1, ext::uxtb);
3464     __ ubfx(temp2, temp1, 8, 8);
3465     __ add(s2, s2, s1);
3466     __ add(s1, s1, temp2);
3467     __ ubfx(temp2, temp1, 16, 8);
3468     __ add(s2, s2, s1);
3469     __ add(s1, s1, temp2);
3470     __ ubfx(temp2, temp1, 24, 8);
3471     __ add(s2, s2, s1);
3472     __ add(s1, s1, temp2);
3473     __ ubfx(temp2, temp1, 32, 8);
3474     __ add(s2, s2, s1);
3475     __ add(s1, s1, temp2);
3476     __ ubfx(temp2, temp1, 40, 8);
3477     __ add(s2, s2, s1);
3478     __ add(s1, s1, temp2);
3479     __ ubfx(temp2, temp1, 48, 8);
3480     __ add(s2, s2, s1);
3481     __ add(s1, s1, temp2);
3482     __ add(s2, s2, s1);
3483     __ add(s1, s1, temp1, Assembler::LSR, 56);
3484     __ add(s2, s2, s1);
3485 
3486     __ subs(len, len, 16);
3487     __ br(Assembler::HS, L_by16_loop);
3488 
3489     __ bind(L_by1);
3490     __ adds(len, len, 15);
3491     __ br(Assembler::LO, L_do_mod);
3492 
3493     __ bind(L_by1_loop);
3494     __ ldrb(temp0, Address(__ post(buff, 1)));
3495     __ add(s1, temp0, s1);
3496     __ add(s2, s2, s1);
3497     __ subs(len, len, 1);
3498     __ br(Assembler::HS, L_by1_loop);
3499 
3500     __ bind(L_do_mod);
3501     // s1 = s1 % BASE
3502     __ lsr(temp0, s1, 16);
3503     __ lsl(temp1, temp0, 4);
3504     __ sub(temp1, temp1, temp0);
3505     __ add(temp1, temp1, s1, ext::uxth);
3506 
3507     __ lsr(temp0, temp1, 16);
3508     __ lsl(s1, temp0, 4);
3509     __ sub(s1, s1, temp0);
3510     __ add(s1, s1, temp1, ext::uxth);
3511 
3512     __ subs(temp0, s1, base);
3513     __ csel(s1, temp0, s1, Assembler::HS);
3514 
3515     // s2 = s2 % BASE
3516     __ lsr(temp0, s2, 16);
3517     __ lsl(temp1, temp0, 4);
3518     __ sub(temp1, temp1, temp0);
3519     __ add(temp1, temp1, s2, ext::uxth);
3520 
3521     __ lsr(temp0, temp1, 16);
3522     __ lsl(s2, temp0, 4);
3523     __ sub(s2, s2, temp0);
3524     __ add(s2, s2, temp1, ext::uxth);
3525 
3526     __ subs(temp0, s2, base);
3527     __ csel(s2, temp0, s2, Assembler::HS);
3528 
3529     // Combine lower bits and higher bits
3530     __ bind(L_combine);
3531     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3532 
3533     __ ret(lr);
3534 
3535     return start;
3536   }
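       // In C, approximately (the straightforward form of the checksum the
       // stub above computes; the stub unrolls the byte loop and defers the
       // expensive modulo operations using NMAX, but the result is the same):
       //
       //   unsigned int adler32(unsigned int adler, unsigned char *buff, int len) {
       //     unsigned long s1 = adler & 0xffff;
       //     unsigned long s2 = (adler >> 16) & 0xffff;
       //     for (int i = 0; i < len; i++) {
       //       s1 = (s1 + buff[i]) % 65521;
       //       s2 = (s2 + s1)      % 65521;
       //     }
       //     return (unsigned int)((s2 << 16) | s1);
       //   }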
3537 
3538   /**
3539    *  Arguments:
3540    *
3541    *  Input:
3542    *    c_rarg0   - x address
3543    *    c_rarg1   - x length
3544    *    c_rarg2   - y address
3545    *    c_rarg3   - y length
3546    *    c_rarg4   - z address
3547    *    c_rarg5   - z length
3548    */
3549   address generate_multiplyToLen() {
3550     __ align(CodeEntryAlignment);
3551     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3552 
3553     address start = __ pc();
3554     const Register x     = r0;
3555     const Register xlen  = r1;
3556     const Register y     = r2;
3557     const Register ylen  = r3;
3558     const Register z     = r4;
3559     const Register zlen  = r5;
3560 
3561     const Register tmp1  = r10;
3562     const Register tmp2  = r11;
3563     const Register tmp3  = r12;
3564     const Register tmp4  = r13;
3565     const Register tmp5  = r14;
3566     const Register tmp6  = r15;
3567     const Register tmp7  = r16;
3568 
3569     BLOCK_COMMENT("Entry:");
3570     __ enter(); // required for proper stackwalking of RuntimeStub frame
3571     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3572     __ leave(); // required for proper stackwalking of RuntimeStub frame
3573     __ ret(lr);
3574 
3575     return start;
3576   }
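       // In C, approximately (a sketch of the schoolbook multiplication
       // performed by multiply_to_len; x, y and z are arrays of 32-bit
       // digits stored most-significant digit first, as in BigInteger,
       // and zlen == xlen + ylen):
       //
       //   void multiply_to_len(unsigned int *x, int xlen,
       //                        unsigned int *y, int ylen,
       //                        unsigned int *z, int zlen) {
       //     memset(z, 0, zlen * sizeof (unsigned int));
       //     for (int i = xlen - 1; i >= 0; i--) {
       //       unsigned long carry = 0;
       //       for (int j = ylen - 1, k = i + j + 1; j >= 0; j--, k--) {
       //         unsigned long p = (unsigned long)x[i] * y[j] + z[k] + carry;
       //         z[k] = (unsigned int)p;
       //         carry = p >> 32;
       //       }
       //       z[i] = (unsigned int)carry;
       //     }
       //   }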
3577 
3578   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3579                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3580                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3581     // Karatsuba multiplication performs a 128*128 -> 256-bit
3582     // multiplication in three 128-bit multiplications and a few
3583     // additions.
3584     //
3585     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3586     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3587     //
3588     // Inputs:
3589     //
3590     // A0 in a.d[0]     (subkey)
3591     // A1 in a.d[1]
3592     // (A1+A0) in a1_xor_a0.d[0]
3593     //
3594     // B0 in b.d[0]     (state)
3595     // B1 in b.d[1]
3596 
3597     __ ext(tmp1, __ T16B, b, b, 0x08);
3598     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3599     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3600     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3601     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3602 
3603     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3604     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3605     __ eor(tmp2, __ T16B, tmp2, tmp4);
3606     __ eor(tmp2, __ T16B, tmp2, tmp3);
3607 
3608     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3609     __ ins(result_hi, __ D, tmp2, 0, 1);
3610     __ ins(result_lo, __ D, tmp2, 1, 0);
3611   }
3612 
3613   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3614                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3615     const FloatRegister t0 = result;
3616 
3617     // The GCM field polynomial f is z^128 + p(z), where p =
3618     // z^7+z^2+z+1.
3619     //
3620     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3621     //
3622     // so, given that the product we're reducing is
3623     //    a == lo + hi * z^128
3624     // substituting,
3625     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3626     //
3627     // we reduce by multiplying hi by p(z) and subtracting the result
3628     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3629     // bits we can do this with two 64-bit multiplications, lo*p and
3630     // hi*p.
3631 
3632     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3633     __ ext(t1, __ T16B, t0, z, 8);
3634     __ eor(hi, __ T16B, hi, t1);
3635     __ ext(t1, __ T16B, z, t0, 8);
3636     __ eor(lo, __ T16B, lo, t1);
3637     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3638     __ eor(result, __ T16B, lo, t0);
3639   }
3640 
3641   /**
3642    *  Arguments:
3643    *
3644    *  Input:
3645    *  c_rarg0   - current state address
3646    *  c_rarg1   - H key address
3647    *  c_rarg2   - data address
3648    *  c_rarg3   - number of blocks
3649    *
3650    *  Output:
3651    *  Updated state at c_rarg0
3652    */
3653   address generate_ghash_processBlocks() {
3654     // Bafflingly, GCM uses little-endian for the byte order, but
3655     // big-endian for the bit order.  For example, the polynomial 1 is
3656     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3657     //
3658     // So, we must either reverse the bytes in each word and do
3659     // everything big-endian or reverse the bits in each byte and do
3660     // it little-endian.  On AArch64 it's more idiomatic to reverse
3661     // the bits in each byte (we have an instruction, RBIT, to do
3662     // that) and keep the data in little-endian bit order throughout the
3663     // calculation, bit-reversing the inputs and outputs.
3664 
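       // In effect, for each 16-byte block the stub computes
       //
       //   state = (state ^ data[i]) * H
       //
       // where * is the carry-less (polynomial) product in GF(2^128)
       // reduced modulo z^128 + z^7 + z^2 + z + 1, H is the subkey, and
       // the bit/byte-order adjustments described above are applied on
       // the way in and out.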
3665     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3666     __ align(wordSize * 2);
3667     address p = __ pc();
3668     __ emit_int64(0x87);  // The low-order bits of the field
3669                           // polynomial (i.e. p = z^7+z^2+z+1)
3670                           // repeated in the low and high parts of a
3671                           // 128-bit vector
3672     __ emit_int64(0x87);
3673 
3674     __ align(CodeEntryAlignment);
3675     address start = __ pc();
3676 
3677     Register state   = c_rarg0;
3678     Register subkeyH = c_rarg1;
3679     Register data    = c_rarg2;
3680     Register blocks  = c_rarg3;
3681 
3682     FloatRegister vzr = v30;
3683     __ eor(vzr, __ T16B, vzr, vzr); // zero register
3684 
3685     __ ldrq(v0, Address(state));
3686     __ ldrq(v1, Address(subkeyH));
3687 
3688     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
3689     __ rbit(v0, __ T16B, v0);
3690     __ rev64(v1, __ T16B, v1);
3691     __ rbit(v1, __ T16B, v1);
3692 
3693     __ ldrq(v26, p);
3694 
3695     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v16
3696     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
3697 
3698     {
3699       Label L_ghash_loop;
3700       __ bind(L_ghash_loop);
3701 
3702       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
3703                                                  // reversing each byte
3704       __ rbit(v2, __ T16B, v2);
3705       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
3706 
3707       // Multiply state in v2 by subkey in v1
3708       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
3709                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
3710                      /*temps*/v6, v20, v18, v21);
3711       // Reduce v7:v5 by the field polynomial
3712       ghash_reduce(v0, v5, v7, v26, vzr, v20);
3713 
3714       __ sub(blocks, blocks, 1);
3715       __ cbnz(blocks, L_ghash_loop);
3716     }
3717 
3718     // The bit-reversed result is at this point in v0
3719     __ rev64(v1, __ T16B, v0);
3720     __ rbit(v1, __ T16B, v1);
3721 
3722     __ st1(v1, __ T16B, state);
3723     __ ret(lr);
3724 
3725     return start;
3726   }
3727 
3728   // Continuation point for throwing of implicit exceptions that are
3729   // not handled in the current activation. Fabricates an exception
3730   // oop and initiates normal exception dispatching in this
3731   // frame. Since we need to preserve callee-saved values (currently
3732   // only for C2, but done for C1 as well) we need a callee-saved oop
3733   // map and therefore have to make these stubs into RuntimeStubs
3734   // rather than BufferBlobs.  If the compiler needs all registers to
3735   // be preserved between the fault point and the exception handler
3736   // then it must assume responsibility for that in
3737   // AbstractCompiler::continuation_for_implicit_null_exception or
3738   // continuation_for_implicit_division_by_zero_exception. All other
3739   // implicit exceptions (e.g., NullPointerException or
3740   // AbstractMethodError on entry) are either at call sites or
3741   // otherwise assume that stack unwinding will be initiated, so
3742   // caller saved registers were assumed volatile in the compiler.
3743 
3744 #undef __
3745 #define __ masm->
3746 
3747   address generate_throw_exception(const char* name,
3748                                    address runtime_entry,
3749                                    Register arg1 = noreg,
3750                                    Register arg2 = noreg) {
3751     // Information about frame layout at time of blocking runtime call.
3752     // Note that we only have to preserve callee-saved registers since
3753     // the compilers are responsible for supplying a continuation point
3754     // if they expect all registers to be preserved.
3755     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
3756     enum layout {
3757       rfp_off = 0,
3758       rfp_off2,
3759       return_off,
3760       return_off2,
3761       framesize // inclusive of return address
3762     };
3763 
3764     int insts_size = 512;
3765     int locs_size  = 64;
3766 
3767     CodeBuffer code(name, insts_size, locs_size);
3768     OopMapSet* oop_maps  = new OopMapSet();
3769     MacroAssembler* masm = new MacroAssembler(&code);
3770 
3771     address start = __ pc();
3772 
3773     // This is an inlined and slightly modified version of call_VM
3774     // which has the ability to fetch the return PC out of
3775     // thread-local storage and also sets up last_Java_sp slightly
3776     // differently than the real call_VM
3777 
3778     __ enter(); // Save FP and LR before call
3779 
3780     assert(is_even(framesize/2), "sp not 16-byte aligned");
3781 
3782     // lr and fp are already in place
3783     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
3784 
3785     int frame_complete = __ pc() - start;
3786 
3787     // Set up last_Java_sp and last_Java_fp
3788     address the_pc = __ pc();
3789     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
3790 
3791     // Call runtime
3792     if (arg1 != noreg) {
3793       assert(arg2 != c_rarg1, "clobbered");
3794       __ mov(c_rarg1, arg1);
3795     }
3796     if (arg2 != noreg) {
3797       __ mov(c_rarg2, arg2);
3798     }
3799     __ mov(c_rarg0, rthread);
3800     BLOCK_COMMENT("call runtime_entry");
3801     __ mov(rscratch1, runtime_entry);
3802     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
3803 
3804     // Generate oop map
3805     OopMap* map = new OopMap(framesize, 0);
3806 
3807     oop_maps->add_gc_map(the_pc - start, map);
3808 
3809     __ reset_last_Java_frame(true);
3810     __ maybe_isb();
3811 
3812     __ leave();
3813 
3814     // check for pending exceptions
3815 #ifdef ASSERT
3816     Label L;
3817     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
3818     __ cbnz(rscratch1, L);
3819     __ should_not_reach_here();
3820     __ bind(L);
3821 #endif // ASSERT
3822     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3823 
3824 
3825     // codeBlob framesize is in words (not VMRegImpl::slot_size)
3826     RuntimeStub* stub =
3827       RuntimeStub::new_runtime_stub(name,
3828                                     &code,
3829                                     frame_complete,
3830                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3831                                     oop_maps, false);
3832     return stub->entry_point();
3833   }
3834 
3835   class MontgomeryMultiplyGenerator : public MacroAssembler {
3836 
3837     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3838       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
3839 
3840     RegSet _toSave;
3841     bool _squaring;
3842 
3843   public:
3844     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3845       : MacroAssembler(as->code()), _squaring(squaring) {
3846 
3847       // Register allocation
3848 
3849       Register reg = c_rarg0;
3850       Pa_base = reg;       // Argument registers
3851       if (squaring)
3852         Pb_base = Pa_base;
3853       else
3854         Pb_base = ++reg;
3855       Pn_base = ++reg;
3856       Rlen= ++reg;
3857       inv = ++reg;
3858       Pm_base = ++reg;
3859 
3860                           // Working registers:
3861       Ra =  ++reg;        // The current digit of a, b, n, and m.
3862       Rb =  ++reg;
3863       Rm =  ++reg;
3864       Rn =  ++reg;
3865 
3866       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
3867       Pb =  ++reg;
3868       Pm =  ++reg;
3869       Pn =  ++reg;
3870 
3871       t0 =  ++reg;        // Three registers which form a
3872       t1 =  ++reg;        // triple-precision accumulator.
3873       t2 =  ++reg;
3874 
3875       Ri =  ++reg;        // Inner and outer loop indexes.
3876       Rj =  ++reg;
3877 
3878       Rhi_ab = ++reg;     // Product registers: low and high parts
3879       Rlo_ab = ++reg;     // of a*b and m*n.
3880       Rhi_mn = ++reg;
3881       Rlo_mn = ++reg;
3882 
3883       // r19 and up are callee-saved.
3884       _toSave = RegSet::range(r19, reg) + Pm_base;
3885     }
3886 
3887   private:
3888     void save_regs() {
3889       push(_toSave, sp);
3890     }
3891 
3892     void restore_regs() {
3893       pop(_toSave, sp);
3894     }
3895 
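         // Run 'block' count times, unrolled two copies per loop
         // iteration; an odd count enters the loop at the second copy.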
3896     template <typename T>
3897     void unroll_2(Register count, T block) {
3898       Label loop, end, odd;
3899       tbnz(count, 0, odd);
3900       cbz(count, end);
3901       align(16);
3902       bind(loop);
3903       (this->*block)();
3904       bind(odd);
3905       (this->*block)();
3906       subs(count, count, 2);
3907       br(Assembler::GT, loop);
3908       bind(end);
3909     }
3910 
3911     template <typename T>
3912     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3913       Label loop, end, odd;
3914       tbnz(count, 0, odd);
3915       cbz(count, end);
3916       align(16);
3917       bind(loop);
3918       (this->*block)(d, s, tmp);
3919       bind(odd);
3920       (this->*block)(d, s, tmp);
3921       subs(count, count, 2);
3922       br(Assembler::GT, loop);
3923       bind(end);
3924     }
3925 
3926     void pre1(RegisterOrConstant i) {
3927       block_comment("pre1");
3928       // Pa = Pa_base;
3929       // Pb = Pb_base + i;
3930       // Pm = Pm_base;
3931       // Pn = Pn_base + i;
3932       // Ra = *Pa;
3933       // Rb = *Pb;
3934       // Rm = *Pm;
3935       // Rn = *Pn;
3936       ldr(Ra, Address(Pa_base));
3937       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3938       ldr(Rm, Address(Pm_base));
3939       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3940       lea(Pa, Address(Pa_base));
3941       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3942       lea(Pm, Address(Pm_base));
3943       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3944 
3945       // Zero the m*n result.
3946       mov(Rhi_mn, zr);
3947       mov(Rlo_mn, zr);
3948     }
3949 
3950     // The core multiply-accumulate step of a Montgomery
3951     // multiplication.  The idea is to schedule operations as a
3952     // pipeline so that instructions with long latencies (loads and
3953     // multiplies) have time to complete before their results are
3954     // used.  This benefits in-order implementations of the
3955     // architecture the most, but out-of-order ones also benefit.
3956     void step() {
3957       block_comment("step");
3958       // MACC(Ra, Rb, t0, t1, t2);
3959       // Ra = *++Pa;
3960       // Rb = *--Pb;
3961       umulh(Rhi_ab, Ra, Rb);
3962       mul(Rlo_ab, Ra, Rb);
3963       ldr(Ra, pre(Pa, wordSize));
3964       ldr(Rb, pre(Pb, -wordSize));
3965       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
3966                                        // previous iteration.
3967       // MACC(Rm, Rn, t0, t1, t2);
3968       // Rm = *++Pm;
3969       // Rn = *--Pn;
3970       umulh(Rhi_mn, Rm, Rn);
3971       mul(Rlo_mn, Rm, Rn);
3972       ldr(Rm, pre(Pm, wordSize));
3973       ldr(Rn, pre(Pn, -wordSize));
3974       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3975     }
3976 
3977     void post1() {
3978       block_comment("post1");
3979 
3980       // MACC(Ra, Rb, t0, t1, t2);
3981       // Ra = *++Pa;
3982       // Rb = *--Pb;
3983       umulh(Rhi_ab, Ra, Rb);
3984       mul(Rlo_ab, Ra, Rb);
3985       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3986       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3987 
3988       // *Pm = Rm = t0 * inv;
3989       mul(Rm, t0, inv);
3990       str(Rm, Address(Pm));
3991 
3992       // MACC(Rm, Rn, t0, t1, t2);
3993       // t0 = t1; t1 = t2; t2 = 0;
3994       umulh(Rhi_mn, Rm, Rn);
3995 
3996 #ifndef PRODUCT
3997       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
3998       {
3999         mul(Rlo_mn, Rm, Rn);
4000         add(Rlo_mn, t0, Rlo_mn);
4001         Label ok;
4002         cbz(Rlo_mn, ok); {
4003           stop("broken Montgomery multiply");
4004         } bind(ok);
4005       }
4006 #endif
4007       // We have very carefully set things up so that
4008       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4009       // the lower half of Rm * Rn because we know the result already:
4010       // it must be -t0.  t0 + (-t0) must generate a carry iff
4011       // t0 != 0.  So, rather than do a mul and an adds we just set
4012       // the carry flag iff t0 is nonzero.
4013       //
4014       // mul(Rlo_mn, Rm, Rn);
4015       // adds(zr, t0, Rlo_mn);
4016       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4017       adcs(t0, t1, Rhi_mn);
4018       adc(t1, t2, zr);
4019       mov(t2, zr);
4020     }
4021 
4022     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
4023       block_comment("pre2");
4024       // Pa = Pa_base + i-len;
4025       // Pb = Pb_base + len;
4026       // Pm = Pm_base + i-len;
4027       // Pn = Pn_base + len;
4028 
4029       if (i.is_register()) {
4030         sub(Rj, i.as_register(), len);
4031       } else {
4032         mov(Rj, i.as_constant());
4033         sub(Rj, Rj, len);
4034       }
4035       // Rj == i-len
4036 
4037       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
4038       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
4039       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4040       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
4041 
4042       // Ra = *++Pa;
4043       // Rb = *--Pb;
4044       // Rm = *++Pm;
4045       // Rn = *--Pn;
4046       ldr(Ra, pre(Pa, wordSize));
4047       ldr(Rb, pre(Pb, -wordSize));
4048       ldr(Rm, pre(Pm, wordSize));
4049       ldr(Rn, pre(Pn, -wordSize));
4050 
4051       mov(Rhi_mn, zr);
4052       mov(Rlo_mn, zr);
4053     }
4054 
4055     void post2(RegisterOrConstant i, RegisterOrConstant len) {
4056       block_comment("post2");
4057       if (i.is_constant()) {
4058         mov(Rj, i.as_constant()-len.as_constant());
4059       } else {
4060         sub(Rj, i.as_register(), len);
4061       }
4062 
4063       adds(t0, t0, Rlo_mn); // The pending m*n, low part
4064 
4065       // As soon as we know the least significant digit of our result,
4066       // store it.
4067       // Pm_base[i-len] = t0;
4068       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4069 
4070       // t0 = t1; t1 = t2; t2 = 0;
4071       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
4072       adc(t1, t2, zr);
4073       mov(t2, zr);
4074     }
4075 
4076     // A carry in t0 after Montgomery multiplication means that we
4077     // should subtract multiples of n from our result in m.  We'll
4078     // keep doing that until there is no carry.
4079     void normalize(RegisterOrConstant len) {
4080       block_comment("normalize");
4081       // while (t0)
4082       //   t0 = sub(Pm_base, Pn_base, t0, len);
4083       Label loop, post, again;
4084       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
4085       cbz(t0, post); {
4086         bind(again); {
4087           mov(i, zr);
4088           mov(cnt, len);
4089           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4090           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4091           subs(zr, zr, zr); // set carry flag, i.e. no borrow
4092           align(16);
4093           bind(loop); {
4094             sbcs(Rm, Rm, Rn);
4095             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4096             add(i, i, 1);
4097             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4098             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4099             sub(cnt, cnt, 1);
4100           } cbnz(cnt, loop);
4101           sbc(t0, t0, zr);
4102         } cbnz(t0, again);
4103       } bind(post);
4104     }
4105 
4106     // Move memory at s to d, reversing words.
4107     //    Increments d to end of copied memory
4108     //    Destroys tmp1, tmp2
4109     //    Preserves len
4110     //    Leaves s pointing to the address which was in d at start
4111     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
4112       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
4113 
4114       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
4115       mov(tmp1, len);
4116       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
4117       sub(s, d, len, ext::uxtw, LogBytesPerWord);
4118     }
4119     // where
4120     void reverse1(Register d, Register s, Register tmp) {
4121       ldr(tmp, pre(s, -wordSize));
4122       ror(tmp, tmp, 32);
4123       str(tmp, post(d, wordSize));
4124     }
4125 
4126     void step_squaring() {
4127       // An extra ACC
4128       step();
4129       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4130     }
4131 
4132     void last_squaring(RegisterOrConstant i) {
4133       Label dont;
4134       // if ((i & 1) == 0) {
4135       tbnz(i.as_register(), 0, dont); {
4136         // MACC(Ra, Rb, t0, t1, t2);
4137         // Ra = *++Pa;
4138         // Rb = *--Pb;
4139         umulh(Rhi_ab, Ra, Rb);
4140         mul(Rlo_ab, Ra, Rb);
4141         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4142       } bind(dont);
4143     }
4144 
4145     void extra_step_squaring() {
4146       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4147 
4148       // MACC(Rm, Rn, t0, t1, t2);
4149       // Rm = *++Pm;
4150       // Rn = *--Pn;
4151       umulh(Rhi_mn, Rm, Rn);
4152       mul(Rlo_mn, Rm, Rn);
4153       ldr(Rm, pre(Pm, wordSize));
4154       ldr(Rn, pre(Pn, -wordSize));
4155     }
4156 
4157     void post1_squaring() {
4158       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4159 
4160       // *Pm = Rm = t0 * inv;
4161       mul(Rm, t0, inv);
4162       str(Rm, Address(Pm));
4163 
4164       // MACC(Rm, Rn, t0, t1, t2);
4165       // t0 = t1; t1 = t2; t2 = 0;
4166       umulh(Rhi_mn, Rm, Rn);
4167 
4168 #ifndef PRODUCT
4169       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4170       {
4171         mul(Rlo_mn, Rm, Rn);
4172         add(Rlo_mn, t0, Rlo_mn);
4173         Label ok;
4174         cbz(Rlo_mn, ok); {
4175           stop("broken Montgomery multiply");
4176         } bind(ok);
4177       }
4178 #endif
4179       // We have very carefully set things up so that
4180       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4181       // the lower half of Rm * Rn because we know the result already:
4182       // it must be -t0.  t0 + (-t0) must generate a carry iff
4183       // t0 != 0.  So, rather than do a mul and an adds we just set
4184       // the carry flag iff t0 is nonzero.
4185       //
4186       // mul(Rlo_mn, Rm, Rn);
4187       // adds(zr, t0, Rlo_mn);
4188       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4189       adcs(t0, t1, Rhi_mn);
4190       adc(t1, t2, zr);
4191       mov(t2, zr);
4192     }
4193 
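         // Accumulate the 128-bit product Rhi:Rlo into the
         // triple-precision accumulator t2:t1:t0.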
4194     void acc(Register Rhi, Register Rlo,
4195              Register t0, Register t1, Register t2) {
4196       adds(t0, t0, Rlo);
4197       adcs(t1, t1, Rhi);
4198       adc(t2, t2, zr);
4199     }
4200 
4201   public:
4202     /**
4203      * Fast Montgomery multiplication.  The derivation of the
4204      * algorithm is in A Cryptographic Library for the Motorola
4205      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
4206      *
4207      * Arguments:
4208      *
4209      * Inputs for multiplication:
4210      *   c_rarg0   - int array elements a
4211      *   c_rarg1   - int array elements b
4212      *   c_rarg2   - int array elements n (the modulus)
4213      *   c_rarg3   - int length
4214      *   c_rarg4   - int inv
4215      *   c_rarg5   - int array elements m (the result)
4216      *
4217      * Inputs for squaring:
4218      *   c_rarg0   - int array elements a
4219      *   c_rarg1   - int array elements n (the modulus)
4220      *   c_rarg2   - int length
4221      *   c_rarg3   - int inv
4222      *   c_rarg4   - int array elements m (the result)
4223      *
4224      */
4225     address generate_multiply() {
4226       Label argh, nothing;
4227       bind(argh);
4228       stop("MontgomeryMultiply total_allocation must be <= 8192");
4229 
4230       align(CodeEntryAlignment);
4231       address entry = pc();
4232 
4233       cbzw(Rlen, nothing);
4234 
4235       enter();
4236 
4237       // Make room.
4238       cmpw(Rlen, 512);
4239       br(Assembler::HI, argh);
4240       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4241       andr(sp, Ra, -2 * wordSize);
4242 
4243       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4244 
4245       {
4246         // Copy input args, reversing as we go.  We use Ra as a
4247         // temporary variable.
4248         reverse(Ra, Pa_base, Rlen, t0, t1);
4249         if (!_squaring)
4250           reverse(Ra, Pb_base, Rlen, t0, t1);
4251         reverse(Ra, Pn_base, Rlen, t0, t1);
4252       }
4253 
4254       // Push all call-saved registers and also Pm_base which we'll need
4255       // at the end.
4256       save_regs();
4257 
4258 #ifndef PRODUCT
4259       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
4260       {
4261         ldr(Rn, Address(Pn_base, 0));
4262         mul(Rlo_mn, Rn, inv);
4263         cmp(Rlo_mn, -1);
4264         Label ok;
4265         br(EQ, ok); {
4266           stop("broken inverse in Montgomery multiply");
4267         } bind(ok);
4268       }
4269 #endif
4270 
4271       mov(Pm_base, Ra);
4272 
4273       mov(t0, zr);
4274       mov(t1, zr);
4275       mov(t2, zr);
4276 
4277       block_comment("for (int i = 0; i < len; i++) {");
4278       mov(Ri, zr); {
4279         Label loop, end;
4280         cmpw(Ri, Rlen);
4281         br(Assembler::GE, end);
4282 
4283         bind(loop);
4284         pre1(Ri);
4285 
4286         block_comment("  for (j = i; j; j--) {"); {
4287           movw(Rj, Ri);
4288           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4289         } block_comment("  } // j");
4290 
4291         post1();
4292         addw(Ri, Ri, 1);
4293         cmpw(Ri, Rlen);
4294         br(Assembler::LT, loop);
4295         bind(end);
4296         block_comment("} // i");
4297       }
4298 
4299       block_comment("for (int i = len; i < 2*len; i++) {");
4300       mov(Ri, Rlen); {
4301         Label loop, end;
4302         cmpw(Ri, Rlen, Assembler::LSL, 1);
4303         br(Assembler::GE, end);
4304 
4305         bind(loop);
4306         pre2(Ri, Rlen);
4307 
4308         block_comment("  for (j = len*2-i-1; j; j--) {"); {
4309           lslw(Rj, Rlen, 1);
4310           subw(Rj, Rj, Ri);
4311           subw(Rj, Rj, 1);
4312           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4313         } block_comment("  } // j");
4314 
4315         post2(Ri, Rlen);
4316         addw(Ri, Ri, 1);
4317         cmpw(Ri, Rlen, Assembler::LSL, 1);
4318         br(Assembler::LT, loop);
4319         bind(end);
4320       }
4321       block_comment("} // i");
4322 
4323       normalize(Rlen);
4324 
4325       mov(Ra, Pm_base);  // Save Pm_base in Ra
4326       restore_regs();  // Restore caller's Pm_base
4327 
4328       // Copy our result into caller's Pm_base
4329       reverse(Pm_base, Ra, Rlen, t0, t1);
4330 
4331       leave();
4332       bind(nothing);
4333       ret(lr);
4334 
4335       return entry;
4336     }
4337     // In C, approximately:
4338 
4339     // void
4340     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
4341     //                     unsigned long Pn_base[], unsigned long Pm_base[],
4342     //                     unsigned long inv, int len) {
4343     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4344     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4345     //   unsigned long Ra, Rb, Rn, Rm;
4346 
4347     //   int i;
4348 
4349     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4350 
4351     //   for (i = 0; i < len; i++) {
4352     //     int j;
4353 
4354     //     Pa = Pa_base;
4355     //     Pb = Pb_base + i;
4356     //     Pm = Pm_base;
4357     //     Pn = Pn_base + i;
4358 
4359     //     Ra = *Pa;
4360     //     Rb = *Pb;
4361     //     Rm = *Pm;
4362     //     Rn = *Pn;
4363 
4364     //     int iters = i;
4365     //     for (j = 0; iters--; j++) {
4366     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4367     //       MACC(Ra, Rb, t0, t1, t2);
4368     //       Ra = *++Pa;
4369     //       Rb = *--Pb;
4370     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4371     //       MACC(Rm, Rn, t0, t1, t2);
4372     //       Rm = *++Pm;
4373     //       Rn = *--Pn;
4374     //     }
4375 
4376     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
4377     //     MACC(Ra, Rb, t0, t1, t2);
4378     //     *Pm = Rm = t0 * inv;
4379     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4380     //     MACC(Rm, Rn, t0, t1, t2);
4381 
4382     //     assert(t0 == 0, "broken Montgomery multiply");
4383 
4384     //     t0 = t1; t1 = t2; t2 = 0;
4385     //   }
4386 
4387     //   for (i = len; i < 2*len; i++) {
4388     //     int j;
4389 
4390     //     Pa = Pa_base + i-len;
4391     //     Pb = Pb_base + len;
4392     //     Pm = Pm_base + i-len;
4393     //     Pn = Pn_base + len;
4394 
4395     //     Ra = *++Pa;
4396     //     Rb = *--Pb;
4397     //     Rm = *++Pm;
4398     //     Rn = *--Pn;
4399 
4400     //     int iters = len*2-i-1;
4401     //     for (j = i-len+1; iters--; j++) {
4402     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4403     //       MACC(Ra, Rb, t0, t1, t2);
4404     //       Ra = *++Pa;
4405     //       Rb = *--Pb;
4406     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4407     //       MACC(Rm, Rn, t0, t1, t2);
4408     //       Rm = *++Pm;
4409     //       Rn = *--Pn;
4410     //     }
4411 
4412     //     Pm_base[i-len] = t0;
4413     //     t0 = t1; t1 = t2; t2 = 0;
4414     //   }
4415 
4416     //   while (t0)
4417     //     t0 = sub(Pm_base, Pn_base, t0, len);
4418     // }
4419 
4420     /**
     * Fast Montgomery squaring.  This uses asymptotically 25% fewer
     * multiplies than Montgomery multiplication, so it should be up to
     * 25% faster.  (The squaring pass needs only about half of the
     * a[i]*a[j] cross products, since each appears twice and is folded
     * into one doubled multiply-accumulate; the reduction still costs a
     * full multiply per word, so the total is roughly 1.5*n^2 word
     * multiplies instead of 2*n^2.)  However, its loop control is more
     * complex and it may actually run slower on some machines.
4425      *
4426      * Arguments:
4427      *
4428      * Inputs:
4429      *   c_rarg0   - int array elements a
4430      *   c_rarg1   - int array elements n (the modulus)
4431      *   c_rarg2   - int length
4432      *   c_rarg3   - int inv
4433      *   c_rarg4   - int array elements m (the result)
4434      *
4435      */
4436     address generate_square() {
4437       Label argh;
4438       bind(argh);
4439       stop("MontgomeryMultiply total_allocation must be <= 8192");
4440 
4441       align(CodeEntryAlignment);
4442       address entry = pc();
4443 
4444       enter();
4445 
4446       // Make room.
4447       cmpw(Rlen, 512);
4448       br(Assembler::HI, argh);
4449       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4450       andr(sp, Ra, -2 * wordSize);
4451 
4452       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4453 
4454       {
4455         // Copy input args, reversing as we go.  We use Ra as a
4456         // temporary variable.
4457         reverse(Ra, Pa_base, Rlen, t0, t1);
4458         reverse(Ra, Pn_base, Rlen, t0, t1);
4459       }
4460 
4461       // Push all call-saved registers and also Pm_base which we'll need
4462       // at the end.
4463       save_regs();
4464 
4465       mov(Pm_base, Ra);
4466 
4467       mov(t0, zr);
4468       mov(t1, zr);
4469       mov(t2, zr);
4470 
4471       block_comment("for (int i = 0; i < len; i++) {");
4472       mov(Ri, zr); {
4473         Label loop, end;
4474         bind(loop);
4475         cmp(Ri, Rlen);
4476         br(Assembler::GE, end);
4477 
4478         pre1(Ri);
4479 
4480         block_comment("for (j = (i+1)/2; j; j--) {"); {
4481           add(Rj, Ri, 1);
4482           lsr(Rj, Rj, 1);
4483           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4484         } block_comment("  } // j");
4485 
4486         last_squaring(Ri);
4487 
4488         block_comment("  for (j = i/2; j; j--) {"); {
4489           lsr(Rj, Ri, 1);
4490           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4491         } block_comment("  } // j");
4492 
4493         post1_squaring();
4494         add(Ri, Ri, 1);
4495         cmp(Ri, Rlen);
4496         br(Assembler::LT, loop);
4497 
4498         bind(end);
4499         block_comment("} // i");
4500       }
4501 
4502       block_comment("for (int i = len; i < 2*len; i++) {");
4503       mov(Ri, Rlen); {
4504         Label loop, end;
4505         bind(loop);
4506         cmp(Ri, Rlen, Assembler::LSL, 1);
4507         br(Assembler::GE, end);
4508 
4509         pre2(Ri, Rlen);
4510 
4511         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4512           lsl(Rj, Rlen, 1);
4513           sub(Rj, Rj, Ri);
4514           sub(Rj, Rj, 1);
4515           lsr(Rj, Rj, 1);
4516           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4517         } block_comment("  } // j");
4518 
4519         last_squaring(Ri);
4520 
4521         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4522           lsl(Rj, Rlen, 1);
4523           sub(Rj, Rj, Ri);
4524           lsr(Rj, Rj, 1);
4525           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4526         } block_comment("  } // j");
4527 
4528         post2(Ri, Rlen);
4529         add(Ri, Ri, 1);
4530         cmp(Ri, Rlen, Assembler::LSL, 1);
4531 
4532         br(Assembler::LT, loop);
4533         bind(end);
4534         block_comment("} // i");
4535       }
4536 
4537       normalize(Rlen);
4538 
4539       mov(Ra, Pm_base);  // Save Pm_base in Ra
4540       restore_regs();  // Restore caller's Pm_base
4541 
4542       // Copy our result into caller's Pm_base
4543       reverse(Pm_base, Ra, Rlen, t0, t1);
4544 
4545       leave();
4546       ret(lr);
4547 
4548       return entry;
4549     }
4550     // In C, approximately:
4551 
4552     // void
4553     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4554     //                   unsigned long Pm_base[], unsigned long inv, int len) {
4555     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4556     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4557     //   unsigned long Ra, Rb, Rn, Rm;
4558 
4559     //   int i;
4560 
4561     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4562 
4563     //   for (i = 0; i < len; i++) {
4564     //     int j;
4565 
4566     //     Pa = Pa_base;
4567     //     Pb = Pa_base + i;
4568     //     Pm = Pm_base;
4569     //     Pn = Pn_base + i;
4570 
4571     //     Ra = *Pa;
4572     //     Rb = *Pb;
4573     //     Rm = *Pm;
4574     //     Rn = *Pn;
4575 
4576     //     int iters = (i+1)/2;
4577     //     for (j = 0; iters--; j++) {
4578     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4579     //       MACC2(Ra, Rb, t0, t1, t2);
4580     //       Ra = *++Pa;
4581     //       Rb = *--Pb;
4582     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4583     //       MACC(Rm, Rn, t0, t1, t2);
4584     //       Rm = *++Pm;
4585     //       Rn = *--Pn;
4586     //     }
4587     //     if ((i & 1) == 0) {
4588     //       assert(Ra == Pa_base[j], "must be");
4589     //       MACC(Ra, Ra, t0, t1, t2);
4590     //     }
4591     //     iters = i/2;
4592     //     assert(iters == i-j, "must be");
4593     //     for (; iters--; j++) {
4594     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4595     //       MACC(Rm, Rn, t0, t1, t2);
4596     //       Rm = *++Pm;
4597     //       Rn = *--Pn;
4598     //     }
4599 
4600     //     *Pm = Rm = t0 * inv;
4601     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4602     //     MACC(Rm, Rn, t0, t1, t2);
4603 
4604     //     assert(t0 == 0, "broken Montgomery multiply");
4605 
4606     //     t0 = t1; t1 = t2; t2 = 0;
4607     //   }
4608 
4609     //   for (i = len; i < 2*len; i++) {
4610     //     int start = i-len+1;
4611     //     int end = start + (len - start)/2;
4612     //     int j;
4613 
4614     //     Pa = Pa_base + i-len;
4615     //     Pb = Pa_base + len;
4616     //     Pm = Pm_base + i-len;
4617     //     Pn = Pn_base + len;
4618 
4619     //     Ra = *++Pa;
4620     //     Rb = *--Pb;
4621     //     Rm = *++Pm;
4622     //     Rn = *--Pn;
4623 
4624     //     int iters = (2*len-i-1)/2;
4625     //     assert(iters == end-start, "must be");
4626     //     for (j = start; iters--; j++) {
4627     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4628     //       MACC2(Ra, Rb, t0, t1, t2);
4629     //       Ra = *++Pa;
4630     //       Rb = *--Pb;
4631     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4632     //       MACC(Rm, Rn, t0, t1, t2);
4633     //       Rm = *++Pm;
4634     //       Rn = *--Pn;
4635     //     }
4636     //     if ((i & 1) == 0) {
4637     //       assert(Ra == Pa_base[j], "must be");
4638     //       MACC(Ra, Ra, t0, t1, t2);
4639     //     }
    //     iters = (2*len-i)/2;
4641     //     assert(iters == len-j, "must be");
4642     //     for (; iters--; j++) {
4643     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4644     //       MACC(Rm, Rn, t0, t1, t2);
4645     //       Rm = *++Pm;
4646     //       Rn = *--Pn;
4647     //     }
4648     //     Pm_base[i-len] = t0;
4649     //     t0 = t1; t1 = t2; t2 = 0;
4650     //   }
4651 
4652     //   while (t0)
4653     //     t0 = sub(Pm_base, Pn_base, t0, len);
4654     // }
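
    // MACC2(A, B, t0, t1, t2) above is the doubled multiply-accumulate:
    // it adds the product A*B into the accumulator twice, folding the
    // symmetric terms a[i]*a[j] and a[j]*a[i] of the square into a single
    // step.  A sketch under the same unsigned __int128 assumption as the
    // MACC sketch above (illustration only, not the code the stub emits):

    // static void MACC2(unsigned long A, unsigned long B,
    //                   unsigned long &t0, unsigned long &t1, unsigned long &t2) {
    //   unsigned __int128 p = (unsigned __int128)A * B;
    //   unsigned long lo = (unsigned long)p, hi = (unsigned long)(p >> 64);
    //   for (int k = 0; k < 2; k++) {   // accumulate the product twice
    //     unsigned long h = hi;
    //     t0 += lo;  h += (t0 < lo);    // carry out of t0
    //     t1 += h;   t2 += (t1 < h);    // carry out of t1
    //   }
    // }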
4655   };
4656 
4657   // Initialization
4658   void generate_initial() {
    // Generate the initial stubs and initialize the entry points.
4660 
    // Entry points that exist on all platforms.  Note: this is code
    // that could be shared among different platforms - however, the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure.  See also the comment
    // in stubRoutines.hpp.
4666 
4667     StubRoutines::_forward_exception_entry = generate_forward_exception();
4668 
4669     StubRoutines::_call_stub_entry =
4670       generate_call_stub(StubRoutines::_call_stub_return_address);
4671 
    // This entry is referenced by megamorphic calls.
4673     StubRoutines::_catch_exception_entry = generate_catch_exception();
4674 
4675     // Build this early so it's available for the interpreter.
4676     StubRoutines::_throw_StackOverflowError_entry =
4677       generate_throw_exception("StackOverflowError throw_exception",
4678                                CAST_FROM_FN_PTR(address,
4679                                                 SharedRuntime::
4680                                                 throw_StackOverflowError));
4681     if (UseCRC32Intrinsics) {
      // Set the CRC table address before generating the stub code that uses it.
4683       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4684       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4685     }
4686   }
4687 
4688   void generate_all() {
4689     // support for verify_oop (must happen after universe_init)
4690     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
4691     StubRoutines::_throw_AbstractMethodError_entry =
4692       generate_throw_exception("AbstractMethodError throw_exception",
4693                                CAST_FROM_FN_PTR(address,
4694                                                 SharedRuntime::
4695                                                 throw_AbstractMethodError));
4696 
4697     StubRoutines::_throw_IncompatibleClassChangeError_entry =
4698       generate_throw_exception("IncompatibleClassChangeError throw_exception",
4699                                CAST_FROM_FN_PTR(address,
4700                                                 SharedRuntime::
4701                                                 throw_IncompatibleClassChangeError));
4702 
4703     StubRoutines::_throw_NullPointerException_at_call_entry =
4704       generate_throw_exception("NullPointerException at call throw_exception",
4705                                CAST_FROM_FN_PTR(address,
4706                                                 SharedRuntime::
4707                                                 throw_NullPointerException_at_call));
4708 
4709     // arraycopy stubs used by compilers
4710     generate_arraycopy_stubs();
4711 
4712     if (UseMultiplyToLenIntrinsic) {
4713       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4714     }
4715 
4716     if (UseMontgomeryMultiplyIntrinsic) {
4717       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4718       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4719       StubRoutines::_montgomeryMultiply = g.generate_multiply();
4720     }
4721 
4722     if (UseMontgomerySquareIntrinsic) {
4723       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4724       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it is faster for the modulus sizes we care about.
4727       StubRoutines::_montgomerySquare = g.generate_multiply();
4728     }
4729 
4730 #ifndef BUILTIN_SIM
4731     // generate GHASH intrinsics code
4732     if (UseGHASHIntrinsics) {
4733       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
4734     }
4735 
4736     if (UseAESIntrinsics) {
4737       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4738       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4739       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4740       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
4741     }
4742 
4743     if (UseSHA1Intrinsics) {
4744       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
4745       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
4746     }
4747     if (UseSHA256Intrinsics) {
4748       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
4749       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
4750     }
4751 
4752     if (UseCRC32CIntrinsics) {
4753       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
4754     }
4755 
4756     // generate Adler32 intrinsics code
4757     if (UseAdler32Intrinsics) {
4758       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
4759     }
4760 
4761     // Safefetch stubs.
4762     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
4763                                                        &StubRoutines::_safefetch32_fault_pc,
4764                                                        &StubRoutines::_safefetch32_continuation_pc);
4765     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
4766                                                        &StubRoutines::_safefetchN_fault_pc,
4767                                                        &StubRoutines::_safefetchN_continuation_pc);
4768 #endif
4769   }
4770 
4771  public:
4772   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
4773     if (all) {
4774       generate_all();
4775     } else {
4776       generate_initial();
4777     }
4778   }
4779 }; // end class declaration
4780 
4781 void StubGenerator_generate(CodeBuffer* code, bool all) {
4782   StubGenerator g(code, all);
4783 }