1 /*
   2  * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "interpreter/interpreter.hpp"
  30 #include "nativeInst_aarch64.hpp"
  31 #include "oops/instanceOop.hpp"
  32 #include "oops/method.hpp"
  33 #include "oops/objArrayKlass.hpp"
  34 #include "oops/oop.inline.hpp"
  35 #include "prims/methodHandles.hpp"
  36 #include "runtime/frame.inline.hpp"
  37 #include "runtime/handles.inline.hpp"
  38 #include "runtime/sharedRuntime.hpp"
  39 #include "runtime/stubCodeGenerator.hpp"
  40 #include "runtime/stubRoutines.hpp"
  41 #include "runtime/thread.inline.hpp"
  42 #ifdef COMPILER2
  43 #include "opto/runtime.hpp"
  44 #endif
  45 
  46 #ifdef BUILTIN_SIM
  47 #include "../../../../../../simulator/simulator.hpp"
  48 #endif
  49 
  50 // Declaration and definition of StubGenerator (no .hpp file).
  51 // For a more detailed description of the stub routine structure
  52 // see the comment in stubRoutines.hpp
  53 
  54 #undef __
  55 #define __ _masm->
  56 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
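// TIMES_OOP scales a (sign-extended 32-bit) index register by the in-heap
// oop size: 4 bytes when compressed oops are in use, 8 bytes otherwise.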
  57 
  58 #ifdef PRODUCT
  59 #define BLOCK_COMMENT(str) /* nothing */
  60 #else
  61 #define BLOCK_COMMENT(str) __ block_comment(str)
  62 #endif
  63 
  64 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  65 
  66 // Stub Code definitions
  67 
  68 class StubGenerator: public StubCodeGenerator {
  69  private:
  70 
  71 #ifdef PRODUCT
  72 #define inc_counter_np(counter) ((void)0)
  73 #else
  74   void inc_counter_np_(int& counter) {
  75     __ lea(rscratch2, ExternalAddress((address)&counter));
  76     __ ldrw(rscratch1, Address(rscratch2));
  77     __ addw(rscratch1, rscratch1, 1);
  78     __ strw(rscratch1, Address(rscratch2));
  79   }
  80 #define inc_counter_np(counter) \
  81   BLOCK_COMMENT("inc_counter " #counter); \
  82   inc_counter_np_(counter);
  83 #endif
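  // n.b. the load/add/store sequence above is not atomic, so counters
  // bumped from stubs running on several threads are only approximate;
  // in PRODUCT builds inc_counter_np compiles away entirely.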
  84 
  85   // Call stubs are used to call Java from C
  86   //
  87   // Arguments:
  88   //    c_rarg0:   call wrapper address                   address
  89   //    c_rarg1:   result                                 address
  90   //    c_rarg2:   result type                            BasicType
  91   //    c_rarg3:   method                                 Method*
  92   //    c_rarg4:   (interpreter) entry point              address
  93   //    c_rarg5:   parameters                             intptr_t*
  94   //    c_rarg6:   parameter size (in words)              int
  95   //    c_rarg7:   thread                                 Thread*
  96   //
  97   // There is no return from the stub itself as any Java result
  98   // is written to result
  99   //
 100   // we save r30 (lr) as the return PC at the base of the frame and
 101   // link r29 (fp) below it as the frame pointer installing sp (r31)
 102   // into fp.
 103   //
 104   // we save r0-r7, which accounts for all the c arguments.
 105   //
 106   // TODO: strictly do we need to save them all? they are treated as
 107   // volatile by C so could we omit saving the ones we are going to
 108   // place in global registers (thread? method?) or those we only use
 109   // during setup of the Java call?
 110   //
  // we don't need to save r8 which C uses as the indirect result
  // location register.
 113   //
 114   // we don't need to save r9-r15 which both C and Java treat as
 115   // volatile
 116   //
 117   // we don't need to save r16-18 because Java does not use them
 118   //
 119   // we save r19-r28 which Java uses as scratch registers and C
 120   // expects to be callee-save
 121   //
 122   // we save the bottom 64 bits of each value stored in v8-v15; it is
 123   // the responsibility of the caller to preserve larger values.
 124   //
 125   // so the stub frame looks like this when we enter Java code
 126   //
 127   //     [ return_from_Java     ] <--- sp
 128   //     [ argument word n      ]
 129   //      ...
 130   // -27 [ argument word 1      ]
 131   // -26 [ saved v15            ] <--- sp_after_call
 132   // -25 [ saved v14            ]
 133   // -24 [ saved v13            ]
 134   // -23 [ saved v12            ]
 135   // -22 [ saved v11            ]
 136   // -21 [ saved v10            ]
 137   // -20 [ saved v9             ]
 138   // -19 [ saved v8             ]
 139   // -18 [ saved r28            ]
 140   // -17 [ saved r27            ]
 141   // -16 [ saved r26            ]
 142   // -15 [ saved r25            ]
 143   // -14 [ saved r24            ]
 144   // -13 [ saved r23            ]
 145   // -12 [ saved r22            ]
 146   // -11 [ saved r21            ]
 147   // -10 [ saved r20            ]
 148   //  -9 [ saved r19            ]
 149   //  -8 [ call wrapper    (r0) ]
 150   //  -7 [ result          (r1) ]
 151   //  -6 [ result type     (r2) ]
 152   //  -5 [ method          (r3) ]
 153   //  -4 [ entry point     (r4) ]
 154   //  -3 [ parameters      (r5) ]
 155   //  -2 [ parameter size  (r6) ]
 156   //  -1 [ thread (r7)          ]
 157   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 158   //   1 [ saved lr       (r30) ]
 159 
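  // For reference, C++ code reaches this stub through the CallStub
  // function pointer declared in stubRoutines.hpp; modulo exact parameter
  // names it looks like the sketch below, with each argument arriving in
  // c_rarg0..c_rarg7 in the order listed above.
  //
  //   typedef void (*CallStub)(address   call_wrapper,    // c_rarg0
  //                            intptr_t* result,          // c_rarg1
  //                            BasicType result_type,     // c_rarg2
  //                            Method*   method,          // c_rarg3
  //                            address   entry_point,     // c_rarg4
  //                            intptr_t* parameters,      // c_rarg5
  //                            int       parameter_size,  // c_rarg6
  //                            Thread*   thread);         // c_rarg7
  //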
 160   // Call stub stack layout word offsets from fp
 161   enum call_stub_layout {
 162     sp_after_call_off = -26,
 163 
 164     d15_off            = -26,
 165     d13_off            = -24,
 166     d11_off            = -22,
 167     d9_off             = -20,
 168 
 169     r28_off            = -18,
 170     r26_off            = -16,
 171     r24_off            = -14,
 172     r22_off            = -12,
 173     r20_off            = -10,
 174     call_wrapper_off   =  -8,
 175     result_off         =  -7,
 176     result_type_off    =  -6,
 177     method_off         =  -5,
 178     entry_point_off    =  -4,
 179     parameter_size_off =  -2,
 180     thread_off         =  -1,
 181     fp_f               =   0,
 182     retaddr_off        =   1,
 183   };
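  // n.b. only every other saved-register slot gets a named offset because
  // the registers are saved and restored in pairs with stp/stpd (e.g.
  // d15_off names the slot for the v15/v14 pair, r28_off the r28/r27 pair).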
 184 
 185   address generate_call_stub(address& return_address) {
 186     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 187            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 188            "adjust this code");
 189 
 190     StubCodeMark mark(this, "StubRoutines", "call_stub");
 191     address start = __ pc();
 192 
 193     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 194 
 195     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 196     const Address result        (rfp, result_off         * wordSize);
 197     const Address result_type   (rfp, result_type_off    * wordSize);
 198     const Address method        (rfp, method_off         * wordSize);
 199     const Address entry_point   (rfp, entry_point_off    * wordSize);
 200     const Address parameter_size(rfp, parameter_size_off * wordSize);
 201 
 202     const Address thread        (rfp, thread_off         * wordSize);
 203 
 204     const Address d15_save      (rfp, d15_off * wordSize);
 205     const Address d13_save      (rfp, d13_off * wordSize);
 206     const Address d11_save      (rfp, d11_off * wordSize);
 207     const Address d9_save       (rfp, d9_off * wordSize);
 208 
 209     const Address r28_save      (rfp, r28_off * wordSize);
 210     const Address r26_save      (rfp, r26_off * wordSize);
 211     const Address r24_save      (rfp, r24_off * wordSize);
 212     const Address r22_save      (rfp, r22_off * wordSize);
 213     const Address r20_save      (rfp, r20_off * wordSize);
 214 
 215     // stub code
 216 
 217     // we need a C prolog to bootstrap the x86 caller into the sim
 218     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 219 
 220     address aarch64_entry = __ pc();
 221 
 222 #ifdef BUILTIN_SIM
 223     // Save sender's SP for stack traces.
 224     __ mov(rscratch1, sp);
 225     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 226 #endif
 227     // set up frame and move sp to end of save area
 228     __ enter();
 229     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 230 
 231     // save register parameters and Java scratch/global registers
 232     // n.b. we save thread even though it gets installed in
 233     // rthread because we want to sanity check rthread later
 234     __ str(c_rarg7,  thread);
 235     __ strw(c_rarg6, parameter_size);
 236     __ stp(c_rarg4, c_rarg5,  entry_point);
 237     __ stp(c_rarg2, c_rarg3,  result_type);
 238     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 239 
 240     __ stp(r20, r19,   r20_save);
 241     __ stp(r22, r21,   r22_save);
 242     __ stp(r24, r23,   r24_save);
 243     __ stp(r26, r25,   r26_save);
 244     __ stp(r28, r27,   r28_save);
 245 
 246     __ stpd(v9,  v8,   d9_save);
 247     __ stpd(v11, v10,  d11_save);
 248     __ stpd(v13, v12,  d13_save);
 249     __ stpd(v15, v14,  d15_save);
 250 
 251     // install Java thread in global register now we have saved
 252     // whatever value it held
 253     __ mov(rthread, c_rarg7);
 254     // And method
 255     __ mov(rmethod, c_rarg3);
 256 
 257     // set up the heapbase register
 258     __ reinit_heapbase();
 259 
 260 #ifdef ASSERT
 261     // make sure we have no pending exceptions
 262     {
 263       Label L;
 264       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 265       __ cmp(rscratch1, (unsigned)NULL_WORD);
 266       __ br(Assembler::EQ, L);
 267       __ stop("StubRoutines::call_stub: entered with pending exception");
 268       __ BIND(L);
 269     }
 270 #endif
 271     // pass parameters if any
 272     __ mov(esp, sp);
 273     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 274     __ andr(sp, rscratch1, -2 * wordSize);
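    // n.b. the AArch64 ABI requires sp to stay 16-byte aligned, so the
    // parameter area is rounded down to a 2-word boundary here.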
 275 
 276     BLOCK_COMMENT("pass parameters if any");
 277     Label parameters_done;
 278     // parameter count is still in c_rarg6
 279     // and parameter pointer identifying param 1 is in c_rarg5
 280     __ cbzw(c_rarg6, parameters_done);
 281 
 282     address loop = __ pc();
 283     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 284     __ subsw(c_rarg6, c_rarg6, 1);
 285     __ push(rscratch1);
 286     __ br(Assembler::GT, loop);
 287 
 288     __ BIND(parameters_done);
 289 
    // call Java entry -- passing the Method* and current sp
 291     //      rmethod: Method*
 292     //      r13: sender sp
 293     BLOCK_COMMENT("call Java function");
 294     __ mov(r13, sp);
 295     __ blr(c_rarg4);
 296 
 297     // tell the simulator we have returned to the stub
 298 
 299     // we do this here because the notify will already have been done
 300     // if we get to the next instruction via an exception
 301     //
 302     // n.b. adding this instruction here affects the calculation of
 303     // whether or not a routine returns to the call stub (used when
 304     // doing stack walks) since the normal test is to check the return
 305     // pc against the address saved below. so we may need to allow for
 306     // this extra instruction in the check.
 307 
 308     if (NotifySimulator) {
 309       __ notify(Assembler::method_reentry);
 310     }
 311     // save current address for use by exception handling code
 312 
 313     return_address = __ pc();
 314 
 315     // store result depending on type (everything that is not
 316     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 317     // n.b. this assumes Java returns an integral result in r0
 318     // and a floating result in j_farg0
 319     __ ldr(j_rarg2, result);
 320     Label is_long, is_float, is_double, exit;
 321     __ ldr(j_rarg1, result_type);
 322     __ cmp(j_rarg1, T_OBJECT);
 323     __ br(Assembler::EQ, is_long);
 324     __ cmp(j_rarg1, T_LONG);
 325     __ br(Assembler::EQ, is_long);
 326     __ cmp(j_rarg1, T_FLOAT);
 327     __ br(Assembler::EQ, is_float);
 328     __ cmp(j_rarg1, T_DOUBLE);
 329     __ br(Assembler::EQ, is_double);
 330 
 331     // handle T_INT case
 332     __ strw(r0, Address(j_rarg2));
 333 
 334     __ BIND(exit);
 335 
 336     // pop parameters
 337     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 338 
 339 #ifdef ASSERT
 340     // verify that threads correspond
 341     {
 342       Label L, S;
 343       __ ldr(rscratch1, thread);
 344       __ cmp(rthread, rscratch1);
 345       __ br(Assembler::NE, S);
 346       __ get_thread(rscratch1);
 347       __ cmp(rthread, rscratch1);
 348       __ br(Assembler::EQ, L);
 349       __ BIND(S);
 350       __ stop("StubRoutines::call_stub: threads must correspond");
 351       __ BIND(L);
 352     }
 353 #endif
 354 
 355     // restore callee-save registers
 356     __ ldpd(v15, v14,  d15_save);
 357     __ ldpd(v13, v12,  d13_save);
 358     __ ldpd(v11, v10,  d11_save);
 359     __ ldpd(v9,  v8,   d9_save);
 360 
 361     __ ldp(r28, r27,   r28_save);
 362     __ ldp(r26, r25,   r26_save);
 363     __ ldp(r24, r23,   r24_save);
 364     __ ldp(r22, r21,   r22_save);
 365     __ ldp(r20, r19,   r20_save);
 366 
 367     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 368     __ ldrw(c_rarg2, result_type);
 369     __ ldr(c_rarg3,  method);
 370     __ ldp(c_rarg4, c_rarg5,  entry_point);
 371     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 372 
 373 #ifndef PRODUCT
 374     // tell the simulator we are about to end Java execution
 375     if (NotifySimulator) {
 376       __ notify(Assembler::method_exit);
 377     }
 378 #endif
 379     // leave frame and return to caller
 380     __ leave();
 381     __ ret(lr);
 382 
 383     // handle return types different from T_INT
 384 
 385     __ BIND(is_long);
 386     __ str(r0, Address(j_rarg2, 0));
 387     __ br(Assembler::AL, exit);
 388 
 389     __ BIND(is_float);
 390     __ strs(j_farg0, Address(j_rarg2, 0));
 391     __ br(Assembler::AL, exit);
 392 
 393     __ BIND(is_double);
 394     __ strd(j_farg0, Address(j_rarg2, 0));
 395     __ br(Assembler::AL, exit);
 396 
 397     return start;
 398   }
 399 
 400   // Return point for a Java call if there's an exception thrown in
 401   // Java code.  The exception is caught and transformed into a
 402   // pending exception stored in JavaThread that can be tested from
 403   // within the VM.
 404   //
 405   // Note: Usually the parameters are removed by the callee. In case
 406   // of an exception crossing an activation frame boundary, that is
 407   // not the case if the callee is compiled code => need to setup the
 408   // rsp.
 409   //
 410   // r0: exception oop
 411 
 412   // NOTE: this is used as a target from the signal handler so it
 413   // needs an x86 prolog which returns into the current simulator
 414   // executing the generated catch_exception code. so the prolog
 415   // needs to install rax in a sim register and adjust the sim's
 416   // restart pc to enter the generated code at the start position
 417   // then return from native to simulated execution.
 418 
 419   address generate_catch_exception() {
 420     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 421     address start = __ pc();
 422 
 423     // same as in generate_call_stub():
 424     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 425     const Address thread        (rfp, thread_off         * wordSize);
 426 
 427 #ifdef ASSERT
 428     // verify that threads correspond
 429     {
 430       Label L, S;
 431       __ ldr(rscratch1, thread);
 432       __ cmp(rthread, rscratch1);
 433       __ br(Assembler::NE, S);
 434       __ get_thread(rscratch1);
 435       __ cmp(rthread, rscratch1);
 436       __ br(Assembler::EQ, L);
 437       __ bind(S);
 438       __ stop("StubRoutines::catch_exception: threads must correspond");
 439       __ bind(L);
 440     }
 441 #endif
 442 
 443     // set pending exception
 444     __ verify_oop(r0);
 445 
 446     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 447     __ mov(rscratch1, (address)__FILE__);
 448     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 449     __ movw(rscratch1, (int)__LINE__);
 450     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 451 
 452     // complete return to VM
 453     assert(StubRoutines::_call_stub_return_address != NULL,
 454            "_call_stub_return_address must have been generated before");
 455     __ b(StubRoutines::_call_stub_return_address);
 456 
 457     return start;
 458   }
 459 
 460   // Continuation point for runtime calls returning with a pending
 461   // exception.  The pending exception check happened in the runtime
 462   // or native call stub.  The pending exception in Thread is
 463   // converted into a Java-level exception.
 464   //
 465   // Contract with Java-level exception handlers:
 466   // r0: exception
 467   // r3: throwing pc
 468   //
 469   // NOTE: At entry of this stub, exception-pc must be in LR !!
 470 
 471   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog
 473 
 474   address generate_forward_exception() {
 475     StubCodeMark mark(this, "StubRoutines", "forward exception");
 476     address start = __ pc();
 477 
 478     // Upon entry, LR points to the return address returning into
 479     // Java (interpreted or compiled) code; i.e., the return address
 480     // becomes the throwing pc.
 481     //
 482     // Arguments pushed before the runtime call are still on the stack
 483     // but the exception handler will reset the stack pointer ->
 484     // ignore them.  A potential result in registers can be ignored as
 485     // well.
 486 
 487 #ifdef ASSERT
 488     // make sure this code is only executed if there is a pending exception
 489     {
 490       Label L;
 491       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 492       __ cbnz(rscratch1, L);
 493       __ stop("StubRoutines::forward exception: no pending exception (1)");
 494       __ bind(L);
 495     }
 496 #endif
 497 
 498     // compute exception handler into r19
 499 
 500     // call the VM to find the handler address associated with the
 501     // caller address. pass thread in r0 and caller pc (ret address)
 502     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 503     // the stack.
 504     __ mov(c_rarg1, lr);
 505     // lr will be trashed by the VM call so we move it to R19
 506     // (callee-saved) because we also need to pass it to the handler
 507     // returned by this call.
 508     __ mov(r19, lr);
 509     BLOCK_COMMENT("call exception_handler_for_return_address");
 510     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 511                          SharedRuntime::exception_handler_for_return_address),
 512                     rthread, c_rarg1);
 513     // we should not really care that lr is no longer the callee
 514     // address. we saved the value the handler needs in r19 so we can
 515     // just copy it to r3. however, the C2 handler will push its own
    // frame and call into the VM, and the VM code asserts that
 517     // the PC for the frame above the handler belongs to a compiled
 518     // Java method. So, we restore lr here to satisfy that assert.
 519     __ mov(lr, r19);
 520     // setup r0 & r3 & clear pending exception
 521     __ mov(r3, r19);
 522     __ mov(r19, r0);
 523     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 524     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 525 
 526 #ifdef ASSERT
 527     // make sure exception is set
 528     {
 529       Label L;
 530       __ cbnz(r0, L);
 531       __ stop("StubRoutines::forward exception: no pending exception (2)");
 532       __ bind(L);
 533     }
 534 #endif
 535 
 536     // continue at exception handler
 537     // r0: exception
 538     // r3: throwing pc
 539     // r19: exception handler
 540     __ verify_oop(r0);
 541     __ br(r19);
 542 
 543     return start;
 544   }
 545 
 546   // Non-destructive plausibility checks for oops
 547   //
 548   // Arguments:
 549   //    r0: oop to verify
 550   //    rscratch1: error message
 551   //
  // Stack after saving c_rarg3 and c_rarg2:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
 559   address generate_verify_oop() {
 560 
 561     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 562     address start = __ pc();
 563 
 564     Label exit, error;
 565 
 566     // save c_rarg2 and c_rarg3
 567     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 568 
 569     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 570     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 571     __ ldr(c_rarg3, Address(c_rarg2));
 572     __ add(c_rarg3, c_rarg3, 1);
 573     __ str(c_rarg3, Address(c_rarg2));
 574 
 575     // object is in r0
 576     // make sure object is 'reasonable'
 577     __ cbz(r0, exit); // if obj is NULL it is OK
 578 
 579     // Check if the oop is in the right area of memory
 580     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 581     __ andr(c_rarg2, r0, c_rarg3);
 582     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 583 
 584     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 585     // instruction here because the flags register is live.
 586     __ eor(c_rarg2, c_rarg2, c_rarg3);
 587     __ cbnz(c_rarg2, error);
 588 
    // make sure klass is 'reasonable', i.e. not zero.
 590     __ load_klass(r0, r0);  // get klass
 591     __ cbz(r0, error);      // if klass is NULL it is broken
 592 
 593     // return if everything seems ok
 594     __ bind(exit);
 595 
 596     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 597     __ ret(lr);
 598 
 599     // handle errors
 600     __ bind(error);
 601     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 602 
 603     __ push(RegSet::range(r0, r29), sp);
 604     // debug(char* msg, int64_t pc, int64_t regs[])
 605     __ mov(c_rarg0, rscratch1);      // pass address of error message
 606     __ mov(c_rarg1, lr);             // pass return address
 607     __ mov(c_rarg2, sp);             // pass address of regs on stack
 608 #ifndef PRODUCT
 609     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 610 #endif
 611     BLOCK_COMMENT("call MacroAssembler::debug");
 612     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 613     __ blrt(rscratch1, 3, 0, 1);
 614 
 615     return start;
 616   }
 617 
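  // The forward/backward decision (and hence the real overlap check) for
  // the copy stubs in this file is made in generate_conjoint_copy(), which
  // branches to the no-overlap entry when a forward copy is safe, so this
  // helper only needs to emit an unconditional branch.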
 618   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 619 
 620   // Generate code for an array write pre barrier
 621   //
 622   //     addr    -  starting address
 623   //     count   -  element count
  //     dest_uninitialized - true if the destination is known to be uninitialized
  //
  //     Destroys no registers except rscratch1 and rscratch2
 627   //
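  // n.b. in the switch below the CardTable/ModRef case labels are nested
  // inside the G1 'if' block.  That is legal C++: the switch jumps straight
  // to those labels and hits the inner break, while a G1 barrier with
  // dest_uninitialized simply falls past the end of the 'if' and emits
  // nothing.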
 628   void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
 629     BarrierSet* bs = Universe::heap()->barrier_set();
 630     switch (bs->kind()) {
 631     case BarrierSet::G1SATBCTLogging:
      // With G1, don't generate the call if we statically know that the target is uninitialized
 633       if (!dest_uninitialized) {
 634         __ push_call_clobbered_registers();
 635         if (count == c_rarg0) {
 636           if (addr == c_rarg1) {
 637             // exactly backwards!!
 638             __ mov(rscratch1, c_rarg0);
 639             __ mov(c_rarg0, c_rarg1);
 640             __ mov(c_rarg1, rscratch1);
 641           } else {
 642             __ mov(c_rarg1, count);
 643             __ mov(c_rarg0, addr);
 644           }
 645         } else {
 646           __ mov(c_rarg0, addr);
 647           __ mov(c_rarg1, count);
 648         }
 649         __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
 650         __ pop_call_clobbered_registers();
 651         break;
 652       case BarrierSet::CardTableForRS:
 653       case BarrierSet::CardTableExtension:
 654       case BarrierSet::ModRef:
 655         break;
 656       default:
 657         ShouldNotReachHere();
 658 
 659       }
 660     }
 661   }
 662 
 663   //
 664   // Generate code for an array write post barrier
 665   //
 666   //  Input:
 667   //     start    - register containing starting address of destination array
 668   //     end      - register containing ending address of destination array
 669   //     scratch  - scratch register
 670   //
 671   //  The input registers are overwritten.
 672   //  The ending address is inclusive.
 673   void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
 674     assert_different_registers(start, end, scratch);
 675     BarrierSet* bs = Universe::heap()->barrier_set();
 676     switch (bs->kind()) {
 677       case BarrierSet::G1SATBCTLogging:
 678 
 679         {
 680           __ push_call_clobbered_registers();
 681           // must compute element count unless barrier set interface is changed (other platforms supply count)
 682           assert_different_registers(start, end, scratch);
 683           __ lea(scratch, Address(end, BytesPerHeapOop));
 684           __ sub(scratch, scratch, start);               // subtract start to get #bytes
 685           __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
 686           __ mov(c_rarg0, start);
 687           __ mov(c_rarg1, scratch);
 688           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
 689           __ pop_call_clobbered_registers();
 690         }
 691         break;
 692       case BarrierSet::CardTableForRS:
 693       case BarrierSet::CardTableExtension:
 694         {
 695           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
 696           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
 697 
 698           Label L_loop;
 699 
 700            __ lsr(start, start, CardTableModRefBS::card_shift);
 701            __ lsr(end, end, CardTableModRefBS::card_shift);
 702            __ sub(end, end, start); // number of bytes to copy
 703 
 704           const Register count = end; // 'end' register contains bytes count now
 705           __ load_byte_map_base(scratch);
 706           __ add(start, start, scratch);
 707           if (UseConcMarkSweepGC) {
 708             __ membar(__ StoreStore);
 709           }
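          // dirty every card covering the destination range: storing a
          // zero byte marks a card dirty (the dirty card value is 0).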
 710           __ BIND(L_loop);
 711           __ strb(zr, Address(start, count));
 712           __ subs(count, count, 1);
 713           __ br(Assembler::GE, L_loop);
 714         }
 715         break;
 716       default:
 717         ShouldNotReachHere();
 718 
 719     }
 720   }
 721 
 722   // The inner part of zero_words().  This is the bulk operation,
 723   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 724   // caller is responsible for zeroing the last few words.
 725   //
 726   // Inputs:
 727   // r10: the HeapWord-aligned base address of an array to zero.
 728   // r11: the count in HeapWords, r11 > 0.
 729   //
 730   // Returns r10 and r11, adjusted for the caller to clear.
 731   // r10: the base address of the tail of words left to clear.
 732   // r11: the number of words in the tail.
 733   //      r11 < MacroAssembler::zero_words_block_size.
 734 
 735   address generate_zero_blocks() {
 736     Label store_pair, loop_store_pair, done;
 737     Label base_aligned;
 738 
 739     Register base = r10, cnt = r11;
 740 
 741     __ align(CodeEntryAlignment);
 742     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 743     address start = __ pc();
 744 
 745     if (UseBlockZeroing) {
 746       int zva_length = VM_Version::zva_length();
 747 
 748       // Ensure ZVA length can be divided by 16. This is required by
 749       // the subsequent operations.
 750       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 751 
 752       __ tbz(base, 3, base_aligned);
 753       __ str(zr, Address(__ post(base, 8)));
 754       __ sub(cnt, cnt, 1);
 755       __ bind(base_aligned);
 756 
 757       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 758       // alignment.
 759       Label small;
 760       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
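      // cnt is in words but low_limit is in bytes, hence the >> 3 below.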
 761       __ cmp(cnt, low_limit >> 3);
 762       __ br(Assembler::LT, small);
 763       __ zero_dcache_blocks(base, cnt);
 764       __ bind(small);
 765     }
 766 
 767     {
 768       // Number of stp instructions we'll unroll
 769       const int unroll =
 770         MacroAssembler::zero_words_block_size / 2;
 771       // Clear the remaining blocks.
 772       Label loop;
 773       __ subs(cnt, cnt, unroll * 2);
 774       __ br(Assembler::LT, done);
 775       __ bind(loop);
 776       for (int i = 0; i < unroll; i++)
 777         __ stp(zr, zr, __ post(base, 16));
 778       __ subs(cnt, cnt, unroll * 2);
 779       __ br(Assembler::GE, loop);
 780       __ bind(done);
 781       __ add(cnt, cnt, unroll * 2);
 782     }
 783 
 784     __ ret(lr);
 785 
 786     return start;
 787   }
 788 
 789 
 790   typedef enum {
 791     copy_forwards = 1,
 792     copy_backwards = -1
 793   } copy_direction;
 794 
 795   // Bulk copy of blocks of 8 words.
 796   //
 797   // count is a count of words.
 798   //
 799   // Precondition: count >= 8
 800   //
 801   // Postconditions:
 802   //
 803   // The least significant bit of count contains the remaining count
 804   // of words to copy.  The rest of count is trash.
 805   //
 806   // s and d are adjusted to point to the remaining words to copy
 807   //
 808   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 809                            copy_direction direction) {
 810     int unit = wordSize * direction;
 811     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 812 
 813     int offset;
 814     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 815       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 816     const Register stride = r13;
 817 
 818     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 819     assert_different_registers(s, d, count, rscratch1);
 820 
 821     Label again, drain;
 822     const char *stub_name;
 823     if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
 825     else
 826       stub_name = "backward_copy_longs";
 827     StubCodeMark mark(this, "StubRoutines", stub_name);
 828     __ align(CodeEntryAlignment);
 829     __ bind(start);
 830 
 831     Label unaligned_copy_long;
 832     if (AvoidUnalignedAccesses) {
 833       __ tbnz(d, 3, unaligned_copy_long);
 834     }
 835 
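    // For a forwards copy s and d are biased downwards so that the fixed
    // offsets used in the loop below (2, 4, 6, 8 * unit) address the data
    // starting at the original s and d.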
 836     if (direction == copy_forwards) {
 837       __ sub(s, s, bias);
 838       __ sub(d, d, bias);
 839     }
 840 
 841 #ifdef ASSERT
 842     // Make sure we are never given < 8 words
 843     {
 844       Label L;
 845       __ cmp(count, 8);
 846       __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
 848       __ bind(L);
 849     }
 850 #endif
 851 
 852     // Fill 8 registers
 853     if (UseSIMDForMemoryOps) {
 854       __ ldpq(v0, v1, Address(s, 4 * unit));
 855       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 856     } else {
 857       __ ldp(t0, t1, Address(s, 2 * unit));
 858       __ ldp(t2, t3, Address(s, 4 * unit));
 859       __ ldp(t4, t5, Address(s, 6 * unit));
 860       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 861     }
 862 
 863     __ subs(count, count, 16);
 864     __ br(Assembler::LO, drain);
 865 
 866     int prefetch = PrefetchCopyIntervalInBytes;
 867     bool use_stride = false;
 868     if (direction == copy_backwards) {
 869        use_stride = prefetch > 256;
 870        prefetch = -prefetch;
 871        if (use_stride) __ mov(stride, prefetch);
 872     }
 873 
 874     __ bind(again);
 875 
 876     if (PrefetchCopyIntervalInBytes > 0)
 877       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 878 
 879     if (UseSIMDForMemoryOps) {
 880       __ stpq(v0, v1, Address(d, 4 * unit));
 881       __ ldpq(v0, v1, Address(s, 4 * unit));
 882       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 883       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 884     } else {
 885       __ stp(t0, t1, Address(d, 2 * unit));
 886       __ ldp(t0, t1, Address(s, 2 * unit));
 887       __ stp(t2, t3, Address(d, 4 * unit));
 888       __ ldp(t2, t3, Address(s, 4 * unit));
 889       __ stp(t4, t5, Address(d, 6 * unit));
 890       __ ldp(t4, t5, Address(s, 6 * unit));
 891       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 892       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 893     }
 894 
 895     __ subs(count, count, 8);
 896     __ br(Assembler::HS, again);
 897 
 898     // Drain
 899     __ bind(drain);
 900     if (UseSIMDForMemoryOps) {
 901       __ stpq(v0, v1, Address(d, 4 * unit));
 902       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 903     } else {
 904       __ stp(t0, t1, Address(d, 2 * unit));
 905       __ stp(t2, t3, Address(d, 4 * unit));
 906       __ stp(t4, t5, Address(d, 6 * unit));
 907       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 908     }
 909 
 910     {
 911       Label L1, L2;
 912       __ tbz(count, exact_log2(4), L1);
 913       if (UseSIMDForMemoryOps) {
 914         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 915         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 916       } else {
 917         __ ldp(t0, t1, Address(s, 2 * unit));
 918         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 919         __ stp(t0, t1, Address(d, 2 * unit));
 920         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 921       }
 922       __ bind(L1);
 923 
 924       if (direction == copy_forwards) {
 925         __ add(s, s, bias);
 926         __ add(d, d, bias);
 927       }
 928 
 929       __ tbz(count, 1, L2);
 930       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 931       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 932       __ bind(L2);
 933     }
 934 
 935     __ ret(lr);
 936 
 937     if (AvoidUnalignedAccesses) {
 938       Label drain, again;
 939       // Register order for storing. Order is different for backward copy.
 940 
 941       __ bind(unaligned_copy_long);
 942 
      // source address is even (16-byte) aligned, target is only odd
      // (8-byte) aligned
 944       //
 945       // when forward copying word pairs we read long pairs at offsets
 946       // {0, 2, 4, 6} (in long words). when backwards copying we read
 947       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 948       // address by -2 in the forwards case so we can compute the
 949       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 950       // or -1.
 951       //
 952       // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
 958       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 959       // offsets {1, 3, 5, 7, 8} * unit.
 960 
 961       if (direction == copy_forwards) {
 962         __ sub(s, s, 16);
 963         __ sub(d, d, 8);
 964       }
 965 
 966       // Fill 8 registers
 967       //
 968       // for forwards copy s was offset by -16 from the original input
 969       // value of s so the register contents are at these offsets
 970       // relative to the 64 bit block addressed by that original input
 971       // and so on for each successive 64 byte block when s is updated
 972       //
 973       // t0 at offset 0,  t1 at offset 8
 974       // t2 at offset 16, t3 at offset 24
 975       // t4 at offset 32, t5 at offset 40
 976       // t6 at offset 48, t7 at offset 56
 977 
 978       // for backwards copy s was not offset so the register contents
 979       // are at these offsets into the preceding 64 byte block
 980       // relative to that original input and so on for each successive
 981       // preceding 64 byte block when s is updated. this explains the
 982       // slightly counter-intuitive looking pattern of register usage
 983       // in the stp instructions for backwards copy.
 984       //
 985       // t0 at offset -16, t1 at offset -8
 986       // t2 at offset -32, t3 at offset -24
 987       // t4 at offset -48, t5 at offset -40
 988       // t6 at offset -64, t7 at offset -56
 989 
 990       __ ldp(t0, t1, Address(s, 2 * unit));
 991       __ ldp(t2, t3, Address(s, 4 * unit));
 992       __ ldp(t4, t5, Address(s, 6 * unit));
 993       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 994 
 995       __ subs(count, count, 16);
 996       __ br(Assembler::LO, drain);
 997 
 998       int prefetch = PrefetchCopyIntervalInBytes;
 999       bool use_stride = false;
1000       if (direction == copy_backwards) {
1001          use_stride = prefetch > 256;
1002          prefetch = -prefetch;
1003          if (use_stride) __ mov(stride, prefetch);
1004       }
1005 
1006       __ bind(again);
1007 
1008       if (PrefetchCopyIntervalInBytes > 0)
1009         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1010 
1011       if (direction == copy_forwards) {
1012        // allowing for the offset of -8 the store instructions place
1013        // registers into the target 64 bit block at the following
1014        // offsets
1015        //
1016        // t0 at offset 0
1017        // t1 at offset 8,  t2 at offset 16
1018        // t3 at offset 24, t4 at offset 32
1019        // t5 at offset 40, t6 at offset 48
1020        // t7 at offset 56
1021 
1022         __ str(t0, Address(d, 1 * unit));
1023         __ stp(t1, t2, Address(d, 2 * unit));
1024         __ ldp(t0, t1, Address(s, 2 * unit));
1025         __ stp(t3, t4, Address(d, 4 * unit));
1026         __ ldp(t2, t3, Address(s, 4 * unit));
1027         __ stp(t5, t6, Address(d, 6 * unit));
1028         __ ldp(t4, t5, Address(s, 6 * unit));
1029         __ str(t7, Address(__ pre(d, 8 * unit)));
1030         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1031       } else {
1032        // d was not offset when we started so the registers are
1033        // written into the 64 bit block preceding d with the following
1034        // offsets
1035        //
1036        // t1 at offset -8
1037        // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
1039        // t7 at offset -56, t4 at offset -48
1040        //                   t6 at offset -64
1041        //
1042        // note that this matches the offsets previously noted for the
1043        // loads
1044 
1045         __ str(t1, Address(d, 1 * unit));
1046         __ stp(t3, t0, Address(d, 3 * unit));
1047         __ ldp(t0, t1, Address(s, 2 * unit));
1048         __ stp(t5, t2, Address(d, 5 * unit));
1049         __ ldp(t2, t3, Address(s, 4 * unit));
1050         __ stp(t7, t4, Address(d, 7 * unit));
1051         __ ldp(t4, t5, Address(s, 6 * unit));
1052         __ str(t6, Address(__ pre(d, 8 * unit)));
1053         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1054       }
1055 
1056       __ subs(count, count, 8);
1057       __ br(Assembler::HS, again);
1058 
1059       // Drain
1060       //
1061       // this uses the same pattern of offsets and register arguments
1062       // as above
1063       __ bind(drain);
1064       if (direction == copy_forwards) {
1065         __ str(t0, Address(d, 1 * unit));
1066         __ stp(t1, t2, Address(d, 2 * unit));
1067         __ stp(t3, t4, Address(d, 4 * unit));
1068         __ stp(t5, t6, Address(d, 6 * unit));
1069         __ str(t7, Address(__ pre(d, 8 * unit)));
1070       } else {
1071         __ str(t1, Address(d, 1 * unit));
1072         __ stp(t3, t0, Address(d, 3 * unit));
1073         __ stp(t5, t2, Address(d, 5 * unit));
1074         __ stp(t7, t4, Address(d, 7 * unit));
1075         __ str(t6, Address(__ pre(d, 8 * unit)));
1076       }
1077       // now we need to copy any remaining part block which may
1078       // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
1080       // have each such subblock
1081       {
1082         Label L1, L2;
1083         __ tbz(count, exact_log2(4), L1);
1084        // this is the same as above but copying only 4 longs hence
       // with only one intervening stp between the str instructions
1086        // but note that the offsets and registers still follow the
1087        // same pattern
1088         __ ldp(t0, t1, Address(s, 2 * unit));
1089         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1090         if (direction == copy_forwards) {
1091           __ str(t0, Address(d, 1 * unit));
1092           __ stp(t1, t2, Address(d, 2 * unit));
1093           __ str(t3, Address(__ pre(d, 4 * unit)));
1094         } else {
1095           __ str(t1, Address(d, 1 * unit));
1096           __ stp(t3, t0, Address(d, 3 * unit));
1097           __ str(t2, Address(__ pre(d, 4 * unit)));
1098         }
1099         __ bind(L1);
1100 
1101         __ tbz(count, 1, L2);
1102        // this is the same as above but copying only 2 longs hence
1103        // there is no intervening stp between the str instructions
1104        // but note that the offset and register patterns are still
1105        // the same
1106         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1107         if (direction == copy_forwards) {
1108           __ str(t0, Address(d, 1 * unit));
1109           __ str(t1, Address(__ pre(d, 2 * unit)));
1110         } else {
1111           __ str(t1, Address(d, 1 * unit));
1112           __ str(t0, Address(__ pre(d, 2 * unit)));
1113         }
1114         __ bind(L2);
1115 
1116        // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written
1118 
1119        if (direction == copy_forwards) {
1120          __ add(s, s, 16);
1121          __ add(d, d, 8);
1122        }
1123 
1124       }
1125 
1126       __ ret(lr);
1127       }
1128   }
1129 
1130   // Small copy: less than 16 bytes.
1131   //
1132   // NB: Ignores all of the bits of count which represent more than 15
1133   // bytes, so a caller doesn't have to mask them.
1134 
1135   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1136     bool is_backwards = step < 0;
1137     size_t granularity = uabs(step);
1138     int direction = is_backwards ? -1 : 1;
1139     int unit = wordSize * direction;
1140 
1141     Label Lpair, Lword, Lint, Lshort, Lbyte;
1142 
1143     assert(granularity
1144            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1145 
1146     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1147 
1148     // ??? I don't know if this bit-test-and-branch is the right thing
1149     // to do.  It does a lot of jumping, resulting in several
1150     // mispredicted branches.  It might make more sense to do this
1151     // with something like Duff's device with a single computed branch.
1152 
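    // Worked example (assumed values, granularity == 1, i.e. a byte copy):
    // for count == 13 (0b1101) the cascade below copies one 8-byte chunk
    // (bit 3 set), one 4-byte chunk (bit 2 set), skips the 2-byte chunk
    // (bit 1 clear) and finishes with a single byte (bit 0 set): 13 bytes.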
1153     __ tbz(count, 3 - exact_log2(granularity), Lword);
1154     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1155     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1156     __ bind(Lword);
1157 
1158     if (granularity <= sizeof (jint)) {
1159       __ tbz(count, 2 - exact_log2(granularity), Lint);
1160       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1161       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1162       __ bind(Lint);
1163     }
1164 
1165     if (granularity <= sizeof (jshort)) {
1166       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1167       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1168       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1169       __ bind(Lshort);
1170     }
1171 
1172     if (granularity <= sizeof (jbyte)) {
1173       __ tbz(count, 0, Lbyte);
1174       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1175       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1176       __ bind(Lbyte);
1177     }
1178   }
1179 
1180   Label copy_f, copy_b;
1181 
1182   // All-singing all-dancing memory copy.
1183   //
1184   // Copy count units of memory from s to d.  The size of a unit is
1185   // step, which can be positive or negative depending on the direction
1186   // of copy.  If is_aligned is false, we align the source address.
1187   //
1188 
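  // For example, generate_conjoint_copy() below passes step == -size, so
  // for 4-byte elements granularity is 4 and the copy runs backwards;
  // generate_disjoint_copy() passes +size and copies forwards.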
1189   void copy_memory(bool is_aligned, Register s, Register d,
1190                    Register count, Register tmp, int step) {
1191     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1192     bool is_backwards = step < 0;
1193     int granularity = uabs(step);
1194     const Register t0 = r3, t1 = r4;
1195 
1196     // <= 96 bytes do inline. Direction doesn't matter because we always
1197     // load all the data before writing anything
1198     Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
1199     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1200     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1201     const Register send = r17, dend = r18;
1202 
1203     if (PrefetchCopyIntervalInBytes > 0)
1204       __ prfm(Address(s, 0), PLDL1KEEP);
1205     __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
1206     __ br(Assembler::HI, copy_big);
1207 
1208     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1209     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
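    // send/dend point one element past the end of the source/destination.
    // The small cases below copy a leading chunk from s/d and a trailing
    // chunk ending at send/dend; for lengths short of twice the chunk size
    // the two copies simply overlap in the middle, which is harmless
    // because all the loads are issued before any store.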
1210 
1211     __ cmp(count, 16/granularity);
1212     __ br(Assembler::LS, copy16);
1213 
1214     __ cmp(count, 64/granularity);
1215     __ br(Assembler::HI, copy80);
1216 
1217     __ cmp(count, 32/granularity);
1218     __ br(Assembler::LS, copy32);
1219 
1220     // 33..64 bytes
1221     if (UseSIMDForMemoryOps) {
1222       __ ldpq(v0, v1, Address(s, 0));
1223       __ ldpq(v2, v3, Address(send, -32));
1224       __ stpq(v0, v1, Address(d, 0));
1225       __ stpq(v2, v3, Address(dend, -32));
1226     } else {
1227       __ ldp(t0, t1, Address(s, 0));
1228       __ ldp(t2, t3, Address(s, 16));
1229       __ ldp(t4, t5, Address(send, -32));
1230       __ ldp(t6, t7, Address(send, -16));
1231 
1232       __ stp(t0, t1, Address(d, 0));
1233       __ stp(t2, t3, Address(d, 16));
1234       __ stp(t4, t5, Address(dend, -32));
1235       __ stp(t6, t7, Address(dend, -16));
1236     }
1237     __ b(finish);
1238 
1239     // 17..32 bytes
1240     __ bind(copy32);
1241     __ ldp(t0, t1, Address(s, 0));
1242     __ ldp(t2, t3, Address(send, -16));
1243     __ stp(t0, t1, Address(d, 0));
1244     __ stp(t2, t3, Address(dend, -16));
1245     __ b(finish);
1246 
1247     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
1249     __ bind(copy80);
1250     if (UseSIMDForMemoryOps) {
1251       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1252       __ ldpq(v4, v5, Address(send, -32));
1253       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1254       __ stpq(v4, v5, Address(dend, -32));
1255     } else {
1256       __ ldp(t0, t1, Address(s, 0));
1257       __ ldp(t2, t3, Address(s, 16));
1258       __ ldp(t4, t5, Address(s, 32));
1259       __ ldp(t6, t7, Address(s, 48));
1260       __ ldp(t8, t9, Address(send, -16));
1261 
1262       __ stp(t0, t1, Address(d, 0));
1263       __ stp(t2, t3, Address(d, 16));
1264       __ stp(t4, t5, Address(d, 32));
1265       __ stp(t6, t7, Address(d, 48));
1266       __ stp(t8, t9, Address(dend, -16));
1267     }
1268     __ b(finish);
1269 
1270     // 0..16 bytes
1271     __ bind(copy16);
1272     __ cmp(count, 8/granularity);
1273     __ br(Assembler::LO, copy8);
1274 
1275     // 8..16 bytes
1276     __ ldr(t0, Address(s, 0));
1277     __ ldr(t1, Address(send, -8));
1278     __ str(t0, Address(d, 0));
1279     __ str(t1, Address(dend, -8));
1280     __ b(finish);
1281 
1282     if (granularity < 8) {
1283       // 4..7 bytes
1284       __ bind(copy8);
1285       __ tbz(count, 2 - exact_log2(granularity), copy4);
1286       __ ldrw(t0, Address(s, 0));
1287       __ ldrw(t1, Address(send, -4));
1288       __ strw(t0, Address(d, 0));
1289       __ strw(t1, Address(dend, -4));
1290       __ b(finish);
1291       if (granularity < 4) {
1292         // 0..3 bytes
1293         __ bind(copy4);
1294         __ cbz(count, finish); // get rid of 0 case
1295         if (granularity == 2) {
1296           __ ldrh(t0, Address(s, 0));
1297           __ strh(t0, Address(d, 0));
1298         } else { // granularity == 1
1299           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1300           // the first and last byte.
1301           // Handle the 3 byte case by loading and storing base + count/2
1302           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean that in the 1 byte case we load/store the same
1304           // byte 3 times.
1305           __ lsr(count, count, 1);
1306           __ ldrb(t0, Address(s, 0));
1307           __ ldrb(t1, Address(send, -1));
1308           __ ldrb(t2, Address(s, count));
1309           __ strb(t0, Address(d, 0));
1310           __ strb(t1, Address(dend, -1));
1311           __ strb(t2, Address(d, count));
1312         }
1313         __ b(finish);
1314       }
1315     }
1316 
1317     __ bind(copy_big);
1318     if (is_backwards) {
1319       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1320       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1321     }
1322 
    // Now we've got the small case out of the way, we can align the
    // source address on a 2-word boundary.
1325 
1326     Label aligned;
1327 
1328     if (is_aligned) {
1329       // We may have to adjust by 1 word to get s 2-word-aligned.
1330       __ tbz(s, exact_log2(wordSize), aligned);
1331       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1332       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1333       __ sub(count, count, wordSize/granularity);
1334     } else {
1335       if (is_backwards) {
1336         __ andr(rscratch2, s, 2 * wordSize - 1);
1337       } else {
1338         __ neg(rscratch2, s);
1339         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1340       }
1341       // rscratch2 is the byte adjustment needed to align s.
1342       __ cbz(rscratch2, aligned);
1343       int shift = exact_log2(granularity);
1344       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1345       __ sub(count, count, rscratch2);
1346 
1347 #if 0
1348       // ?? This code is only correct for a disjoint copy.  It may or
1349       // may not make sense to use it in that case.
1350 
1351       // Copy the first pair; s and d may not be aligned.
1352       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1353       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1354 
1355       // Align s and d, adjust count
1356       if (is_backwards) {
1357         __ sub(s, s, rscratch2);
1358         __ sub(d, d, rscratch2);
1359       } else {
1360         __ add(s, s, rscratch2);
1361         __ add(d, d, rscratch2);
1362       }
1363 #else
1364       copy_memory_small(s, d, rscratch2, rscratch1, step);
1365 #endif
1366     }
1367 
1368     __ bind(aligned);
1369 
1370     // s is now 2-word-aligned.
1371 
1372     // We have a count of units and some trailing bytes.  Adjust the
1373     // count and do a bulk copy of words.
1374     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1375     if (direction == copy_forwards)
1376       __ bl(copy_f);
1377     else
1378       __ bl(copy_b);
1379 
1380     // And the tail.
1381     copy_memory_small(s, d, count, tmp, step);
1382 
1383     if (granularity >= 8) __ bind(copy8);
1384     if (granularity >= 4) __ bind(copy4);
1385     __ bind(finish);
1386   }
1387 
1388 
1389   void clobber_registers() {
1390 #ifdef ASSERT
1391     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1392     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1393     for (Register r = r3; r <= r18; r++)
1394       if (r != rscratch1) __ mov(r, rscratch1);
1395 #endif
1396   }
1397 
1398   // Scan over array at a for count oops, verifying each one.
1399   // Preserves a and count, clobbers rscratch1 and rscratch2.
1400   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1401     Label loop, end;
1402     __ mov(rscratch1, a);
1403     __ mov(rscratch2, zr);
1404     __ bind(loop);
1405     __ cmp(rscratch2, count);
1406     __ br(Assembler::HS, end);
1407     if (size == (size_t)wordSize) {
1408       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1409       __ verify_oop(temp);
1410     } else {
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); // load narrow oop into temp
1412       __ decode_heap_oop(temp); // calls verify_oop
1413     }
1414     __ add(rscratch2, rscratch2, size);
1415     __ b(loop);
1416     __ bind(end);
1417   }
1418 
1419   // Arguments:
1420   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1421   //             ignored
1422   //   is_oop  - true => oop array, so generate store check code
1423   //   name    - stub name string
1424   //
1425   // Inputs:
1426   //   c_rarg0   - source array address
1427   //   c_rarg1   - destination array address
1428   //   c_rarg2   - element count, treated as ssize_t, can be zero
1429   //
1430   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1431   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1433   //
1434   // Side Effects:
1435   //   disjoint_int_copy_entry is set to the no-overlap entry point
1436   //   used by generate_conjoint_int_oop_copy().
1437   //
1438   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1439                                   const char *name, bool dest_uninitialized = false) {
1440     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1441     __ align(CodeEntryAlignment);
1442     StubCodeMark mark(this, "StubRoutines", name);
1443     address start = __ pc();
1444     __ enter();
1445 
1446     if (entry != NULL) {
1447       *entry = __ pc();
1448       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1449       BLOCK_COMMENT("Entry:");
1450     }
1451 
1452     if (is_oop) {
1453       __ push(RegSet::of(d, count), sp);
1454       // no registers are destroyed by this call
1455       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1456     }
1457     copy_memory(aligned, s, d, count, rscratch1, size);
1458     if (is_oop) {
1459       __ pop(RegSet::of(d, count), sp);
1460       if (VerifyOops)
1461         verify_oop_array(size, d, count, r16);
1462       __ sub(count, count, 1); // make an inclusive end pointer
1463       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1464       gen_write_ref_array_post_barrier(d, count, rscratch1);
1465     }
1466     __ leave();
1467     __ mov(r0, zr); // return 0
1468     __ ret(lr);
1469 #ifdef BUILTIN_SIM
1470     {
1471       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1472       sim->notifyCompile(const_cast<char*>(name), start);
1473     }
1474 #endif
1475     return start;
1476   }
1477 
1478   // Arguments:
1479   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1480   //             ignored
1481   //   is_oop  - true => oop array, so generate store check code
1482   //   name    - stub name string
1483   //
1484   // Inputs:
1485   //   c_rarg0   - source array address
1486   //   c_rarg1   - destination array address
1487   //   c_rarg2   - element count, treated as ssize_t, can be zero
1488   //
1489   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1490   // the hardware handle it.  The two dwords within qwords that span
1491   // cache line boundaries will still be loaded and stored atomically.
1492   //
1493   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1494                                  address *entry, const char *name,
1495                                  bool dest_uninitialized = false) {
1496     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1497 
1498     StubCodeMark mark(this, "StubRoutines", name);
1499     address start = __ pc();
1500     __ enter();
1501 
1502     if (entry != NULL) {
1503       *entry = __ pc();
1504       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1505       BLOCK_COMMENT("Entry:");
1506     }
1507 
1508     // use fwd copy when (d-s) above_equal (count*size)
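         // That is, roughly (a C sketch of the unsigned test below):
         //   if ((uint64_t)(d - s) >= (uint64_t)count << exact_log2(size))
         //     goto nooverlap_target;  // regions cannot overlap destructively
         // Otherwise fall through and copy backwards (step = -size).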
1509     __ sub(rscratch1, d, s);
1510     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1511     __ br(Assembler::HS, nooverlap_target);
1512 
1513     if (is_oop) {
1514       __ push(RegSet::of(d, count), sp);
1515       // no registers are destroyed by this call
1516       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1517     }
1518     copy_memory(aligned, s, d, count, rscratch1, -size);
1519     if (is_oop) {
1520       __ pop(RegSet::of(d, count), sp);
1521       if (VerifyOops)
1522         verify_oop_array(size, d, count, r16);
1523       __ sub(count, count, 1); // make an inclusive end pointer
1524       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1525       gen_write_ref_array_post_barrier(d, count, rscratch1);
1526     }
1527     __ leave();
1528     __ mov(r0, zr); // return 0
1529     __ ret(lr);
1530 #ifdef BUILTIN_SIM
1531     {
1532       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1533       sim->notifyCompile(const_cast<char*>(name), start);
1534     }
1535 #endif
1536     return start;
1537   }
1538 
1539   // Arguments:
1540   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1541   //             ignored
1542   //   name    - stub name string
1543   //
1544   // Inputs:
1545   //   c_rarg0   - source array address
1546   //   c_rarg1   - destination array address
1547   //   c_rarg2   - element count, treated as ssize_t, can be zero
1548   //
1549   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1550   // we let the hardware handle it.  The one to eight bytes within words,
1551   // dwords or qwords that span cache line boundaries will still be loaded
1552   // and stored atomically.
1553   //
1554   // Side Effects:
1562   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1563   //   used by generate_conjoint_byte_copy().
1564   //
1565   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1566     const bool not_oop = false;
1567     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1568   }
1569 
1570   // Arguments:
1571   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1572   //             ignored
1573   //   name    - stub name string
1574   //
1575   // Inputs:
1576   //   c_rarg0   - source array address
1577   //   c_rarg1   - destination array address
1578   //   c_rarg2   - element count, treated as ssize_t, can be zero
1579   //
1580   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1581   // we let the hardware handle it.  The one to eight bytes within words,
1582   // dwords or qwords that span cache line boundaries will still be loaded
1583   // and stored atomically.
1584   //
1585   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1586                                       address* entry, const char *name) {
1587     const bool not_oop = false;
1588     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1589   }
1590 
1591   // Arguments:
1592   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1593   //             ignored
1594   //   name    - stub name string
1595   //
1596   // Inputs:
1597   //   c_rarg0   - source array address
1598   //   c_rarg1   - destination array address
1599   //   c_rarg2   - element count, treated as ssize_t, can be zero
1600   //
1601   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1602   // let the hardware handle it.  The two or four words within dwords
1603   // or qwords that span cache line boundaries will still be loaded
1604   // and stored atomically.
1605   //
1606   // Side Effects:
1607   //   disjoint_short_copy_entry is set to the no-overlap entry point
1608   //   used by generate_conjoint_short_copy().
1609   //
1610   address generate_disjoint_short_copy(bool aligned,
1611                                        address* entry, const char *name) {
1612     const bool not_oop = false;
1613     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1614   }
1615 
1616   // Arguments:
1617   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1618   //             ignored
1619   //   name    - stub name string
1620   //
1621   // Inputs:
1622   //   c_rarg0   - source array address
1623   //   c_rarg1   - destination array address
1624   //   c_rarg2   - element count, treated as ssize_t, can be zero
1625   //
1626   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1627   // let the hardware handle it.  The two or four words within dwords
1628   // or qwords that span cache line boundaries will still be loaded
1629   // and stored atomically.
1630   //
1631   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1632                                        address *entry, const char *name) {
1633     const bool not_oop = false;
1634     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1635 
1636   }
1637   // Arguments:
1638   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1639   //             ignored
1640   //   name    - stub name string
1641   //
1642   // Inputs:
1643   //   c_rarg0   - source array address
1644   //   c_rarg1   - destination array address
1645   //   c_rarg2   - element count, treated as ssize_t, can be zero
1646   //
1647   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1648   // the hardware handle it.  The two dwords within qwords that span
1649   // cache line boundaries will still be loaded and stored atomically.
1650   //
1651   // Side Effects:
1652   //   disjoint_int_copy_entry is set to the no-overlap entry point
1653   //   used by generate_conjoint_int_oop_copy().
1654   //
1655   address generate_disjoint_int_copy(bool aligned, address *entry,
1656                                          const char *name, bool dest_uninitialized = false) {
1657     const bool not_oop = false;
1658     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1659   }
1660 
1661   // Arguments:
1662   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1663   //             ignored
1664   //   name    - stub name string
1665   //
1666   // Inputs:
1667   //   c_rarg0   - source array address
1668   //   c_rarg1   - destination array address
1669   //   c_rarg2   - element count, treated as ssize_t, can be zero
1670   //
1671   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1672   // the hardware handle it.  The two dwords within qwords that span
1673   // cache line boundaries will still be loaded and stored atomically.
1674   //
1675   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1676                                      address *entry, const char *name,
1677                                      bool dest_uninitialized = false) {
1678     const bool not_oop = false;
1679     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1680   }
1681 
1682 
1683   // Arguments:
1684   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1685   //             ignored
1686   //   name    - stub name string
1687   //
1688   // Inputs:
1689   //   c_rarg0   - source array address
1690   //   c_rarg1   - destination array address
1691   //   c_rarg2   - element count, treated as size_t, can be zero
1692   //
1693   // Side Effects:
1694   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1695   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1696   //
1697   address generate_disjoint_long_copy(bool aligned, address *entry,
1698                                           const char *name, bool dest_uninitialized = false) {
1699     const bool not_oop = false;
1700     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1701   }
1702 
1703   // Arguments:
1704   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1705   //             ignored
1706   //   name    - stub name string
1707   //
1708   // Inputs:
1709   //   c_rarg0   - source array address
1710   //   c_rarg1   - destination array address
1711   //   c_rarg2   - element count, treated as size_t, can be zero
1712   //
1713   address generate_conjoint_long_copy(bool aligned,
1714                                       address nooverlap_target, address *entry,
1715                                       const char *name, bool dest_uninitialized = false) {
1716     const bool not_oop = false;
1717     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1718   }
1719 
1720   // Arguments:
1721   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1722   //             ignored
1723   //   name    - stub name string
1724   //
1725   // Inputs:
1726   //   c_rarg0   - source array address
1727   //   c_rarg1   - destination array address
1728   //   c_rarg2   - element count, treated as size_t, can be zero
1729   //
1730   // Side Effects:
1731   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1732   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1733   //
1734   address generate_disjoint_oop_copy(bool aligned, address *entry,
1735                                      const char *name, bool dest_uninitialized) {
1736     const bool is_oop = true;
1737     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1738     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1739   }
1740 
1741   // Arguments:
1742   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1743   //             ignored
1744   //   name    - stub name string
1745   //
1746   // Inputs:
1747   //   c_rarg0   - source array address
1748   //   c_rarg1   - destination array address
1749   //   c_rarg2   - element count, treated as size_t, can be zero
1750   //
1751   address generate_conjoint_oop_copy(bool aligned,
1752                                      address nooverlap_target, address *entry,
1753                                      const char *name, bool dest_uninitialized) {
1754     const bool is_oop = true;
1755     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1756     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1757                                   name, dest_uninitialized);
1758   }
1759 
1760 
1761   // Helper for generating a dynamic type check.
1762   // Smashes rscratch1.
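       // Branches to L_success if sub_klass is a subtype of super_klass
       // (via the fast and then slow subtype-check paths); falls through
       // on failure.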
1763   void generate_type_check(Register sub_klass,
1764                            Register super_check_offset,
1765                            Register super_klass,
1766                            Label& L_success) {
1767     assert_different_registers(sub_klass, super_check_offset, super_klass);
1768 
1769     BLOCK_COMMENT("type_check:");
1770 
1771     Label L_miss;
1772 
1773     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1774                                      super_check_offset);
1775     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1776 
1777     // Fall through on failure!
1778     __ BIND(L_miss);
1779   }
1780 
1781   //
1782   //  Generate checkcasting array copy stub
1783   //
1784   //  Input:
1785   //    c_rarg0   - source array address
1786   //    c_rarg1   - destination array address
1787   //    c_rarg2   - element count, treated as ssize_t, can be zero
1788   //    c_rarg3   - size_t ckoff (super_check_offset)
1789   //    c_rarg4   - oop ckval (super_klass)
1790   //
1791   //  Output:
1792   //    r0 ==  0  -  success
1793   //    r0 == -1^K - failure, where K is partial transfer count
1794   //
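       //  (Illustrative example of the encoding: if 3 oops were copied before
       //  a failing element, r0 == ~3 == -4, and the caller recovers K as ~r0;
       //  a fully successful copy returns 0.)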
1795   address generate_checkcast_copy(const char *name, address *entry,
1796                                   bool dest_uninitialized = false) {
1797 
1798     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1799 
1800     // Input registers (after setup_arg_regs)
1801     const Register from        = c_rarg0;   // source array address
1802     const Register to          = c_rarg1;   // destination array address
1803     const Register count       = c_rarg2;   // elements count
1804     const Register ckoff       = c_rarg3;   // super_check_offset
1805     const Register ckval       = c_rarg4;   // super_klass
1806 
1807     // Registers used as temps (r18, r19, r20 are save-on-entry)
1808     const Register count_save  = r21;       // orig elements count
1809     const Register start_to    = r20;       // destination array start address
1810     const Register copied_oop  = r18;       // actual oop copied
1811     const Register r19_klass   = r19;       // oop._klass
1812 
1813     //---------------------------------------------------------------
1814     // Assembler stub will be used for this call to arraycopy
1815     // if the two arrays are subtypes of Object[] but the
1816     // destination array type is not equal to or a supertype
1817     // of the source type.  Each element must be separately
1818     // checked.
1819 
1820     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1821                                copied_oop, r19_klass, count_save);
1822 
1823     __ align(CodeEntryAlignment);
1824     StubCodeMark mark(this, "StubRoutines", name);
1825     address start = __ pc();
1826 
1827     __ enter(); // required for proper stackwalking of RuntimeStub frame
1828 
1829 #ifdef ASSERT
1830     // caller guarantees that the arrays really are different
1831     // otherwise, we would have to make conjoint checks
1832     { Label L;
1833       array_overlap_test(L, TIMES_OOP);
1834       __ stop("checkcast_copy within a single array");
1835       __ bind(L);
1836     }
1837 #endif //ASSERT
1838 
1839     // Caller of this entry point must set up the argument registers.
1840     if (entry != NULL) {
1841       *entry = __ pc();
1842       BLOCK_COMMENT("Entry:");
1843     }
1844 
1845     // Empty array:  Nothing to do.
1846     __ cbz(count, L_done);
1847 
1848     __ push(RegSet::of(r18, r19, r20, r21), sp);
1849 
1850 #ifdef ASSERT
1851     BLOCK_COMMENT("assert consistent ckoff/ckval");
1852     // The ckoff and ckval must be mutually consistent,
1853     // even though caller generates both.
1854     { Label L;
1855       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1856       __ ldrw(start_to, Address(ckval, sco_offset));
1857       __ cmpw(ckoff, start_to);
1858       __ br(Assembler::EQ, L);
1859       __ stop("super_check_offset inconsistent");
1860       __ bind(L);
1861     }
1862 #endif //ASSERT
1863 
1864     gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1865 
1866     // save the original count
1867     __ mov(count_save, count);
1868 
1869     // Copy from low to high addresses
1870     __ mov(start_to, to);              // Save destination array start address
1871     __ b(L_load_element);
1872 
1873     // ======== begin loop ========
1874     // (Loop is rotated; its entry is L_load_element.)
1875     // Loop control:
1876     //   for (; count != 0; count--) {
1877     //     copied_oop = load_heap_oop(from++);
1878     //     ... generate_type_check ...;
1879     //     store_heap_oop(to++, copied_oop);
1880     //   }
1881     __ align(OptoLoopAlignment);
1882 
1883     __ BIND(L_store_element);
1884     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1885     __ sub(count, count, 1);
1886     __ cbz(count, L_do_card_marks);
1887 
1888     // ======== loop entry is here ========
1889     __ BIND(L_load_element);
1890     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1891     __ cbz(copied_oop, L_store_element);
1892 
1893     __ load_klass(r19_klass, copied_oop);// query the object klass
1894     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1895     // ======== end loop ========
1896 
1897     // It was a real error; we must depend on the caller to finish the job.
1898     // Register count = remaining oops, count_save = total oops.
1899     // Emit GC store barriers for the oops we have copied and report
1900     // their number to the caller.
1901 
1902     __ subs(count, count_save, count);     // K = partially copied oop count
1903     __ eon(count, count, zr);                   // report (-1^K) to caller (eon with zr == bitwise NOT)
1904     __ br(Assembler::EQ, L_done_pop);
1905 
1906     __ BIND(L_do_card_marks);
1907     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1908     gen_write_ref_array_post_barrier(start_to, to, rscratch1);
1909 
1910     __ bind(L_done_pop);
1911     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1912     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1913 
1914     __ bind(L_done);
1915     __ mov(r0, count);
1916     __ leave();
1917     __ ret(lr);
1918 
1919     return start;
1920   }
1921 
1922   // Perform range checks on the proposed arraycopy.
1923   // Kills temp, but nothing else.
1924   // Also, clean the sign bits of src_pos and dst_pos.
1925   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1926                               Register src_pos, // source position (c_rarg1)
1927                               Register dst,     // destination array oop (c_rarg2)
1928                               Register dst_pos, // destination position (c_rarg3)
1929                               Register length,
1930                               Register temp,
1931                               Label& L_failed) {
1932     BLOCK_COMMENT("arraycopy_range_checks:");
1933 
1934     assert_different_registers(rscratch1, temp);
1935 
1936     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1937     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1938     __ addw(temp, length, src_pos);
1939     __ cmpw(temp, rscratch1);
1940     __ br(Assembler::HI, L_failed);
1941 
1942     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1943     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1944     __ addw(temp, length, dst_pos);
1945     __ cmpw(temp, rscratch1);
1946     __ br(Assembler::HI, L_failed);
1947 
1948     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1949     __ movw(src_pos, src_pos);
1950     __ movw(dst_pos, dst_pos);
1951 
1952     BLOCK_COMMENT("arraycopy_range_checks done");
1953   }
1954 
1955   // These stubs get called from some dumb test routine.
1956   // I'll write them properly when they're called from
1957   // something that's actually doing something.
1958   static void fake_arraycopy_stub(address src, address dst, int count) {
1959     assert(count == 0, "huh?");
1960   }
1961 
1962 
1963   //
1964   //  Generate 'unsafe' array copy stub
1965   //  Though just as safe as the other stubs, it takes an unscaled
1966   //  size_t argument instead of an element count.
1967   //
1968   //  Input:
1969   //    c_rarg0   - source array address
1970   //    c_rarg1   - destination array address
1971   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1972   //
1973   // Examines the alignment of the operands and dispatches
1974   // to a long, int, short, or byte copy loop.
1975   //
1976   address generate_unsafe_copy(const char *name,
1977                                address byte_copy_entry,
1978                                address short_copy_entry,
1979                                address int_copy_entry,
1980                                address long_copy_entry) {
1981     Label L_long_aligned, L_int_aligned, L_short_aligned;
1982     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1983 
1984     __ align(CodeEntryAlignment);
1985     StubCodeMark mark(this, "StubRoutines", name);
1986     address start = __ pc();
1987     __ enter(); // required for proper stackwalking of RuntimeStub frame
1988 
1989     // bump this on entry, not on exit:
1990     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1991 
1992     __ orr(rscratch1, s, d);
1993     __ orr(rscratch1, rscratch1, count);
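         // The bitwise OR of s, d and count lets a single test of the low
         // bits check the alignment of all three values at once: the widest
         // element size whose alignment mask comes out clear is used below.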
1994 
1995     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1996     __ cbz(rscratch1, L_long_aligned);
1997     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1998     __ cbz(rscratch1, L_int_aligned);
1999     __ tbz(rscratch1, 0, L_short_aligned);
2000     __ b(RuntimeAddress(byte_copy_entry));
2001 
2002     __ BIND(L_short_aligned);
2003     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2004     __ b(RuntimeAddress(short_copy_entry));
2005     __ BIND(L_int_aligned);
2006     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2007     __ b(RuntimeAddress(int_copy_entry));
2008     __ BIND(L_long_aligned);
2009     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2010     __ b(RuntimeAddress(long_copy_entry));
2011 
2012     return start;
2013   }
2014 
2015   //
2016   //  Generate generic array copy stubs
2017   //
2018   //  Input:
2019   //    c_rarg0    -  src oop
2020   //    c_rarg1    -  src_pos (32-bits)
2021   //    c_rarg2    -  dst oop
2022   //    c_rarg3    -  dst_pos (32-bits)
2023   //    c_rarg4    -  element count (32-bits)
2024   //
2025   //  Output:
2026   //    r0 ==  0  -  success
2027   //    r0 == -1^K - failure, where K is partial transfer count
2028   //
2029   address generate_generic_copy(const char *name,
2030                                 address byte_copy_entry, address short_copy_entry,
2031                                 address int_copy_entry, address oop_copy_entry,
2032                                 address long_copy_entry, address checkcast_copy_entry) {
2033 
2034     Label L_failed, L_failed_0, L_objArray;
2035     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2036 
2037     // Input registers
2038     const Register src        = c_rarg0;  // source array oop
2039     const Register src_pos    = c_rarg1;  // source position
2040     const Register dst        = c_rarg2;  // destination array oop
2041     const Register dst_pos    = c_rarg3;  // destination position
2042     const Register length     = c_rarg4;
2043 
2044     StubCodeMark mark(this, "StubRoutines", name);
2045 
2046     __ align(CodeEntryAlignment);
2047     address start = __ pc();
2048 
2049     __ enter(); // required for proper stackwalking of RuntimeStub frame
2050 
2051     // bump this on entry, not on exit:
2052     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2053 
2054     //-----------------------------------------------------------------------
2055     // Assembler stub will be used for this call to arraycopy
2056     // if the following conditions are met:
2057     //
2058     // (1) src and dst must not be null.
2059     // (2) src_pos must not be negative.
2060     // (3) dst_pos must not be negative.
2061     // (4) length  must not be negative.
2062     // (5) src klass and dst klass should be the same and not NULL.
2063     // (6) src and dst should be arrays.
2064     // (7) src_pos + length must not exceed length of src.
2065     // (8) dst_pos + length must not exceed length of dst.
2066     //
2067 
2068     //  if (src == NULL) return -1;
2069     __ cbz(src, L_failed);
2070 
2071     //  if (src_pos < 0) return -1;
2072     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2073 
2074     //  if (dst == NULL) return -1;
2075     __ cbz(dst, L_failed);
2076 
2077     //  if (dst_pos < 0) return -1;
2078     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2079 
2080     // registers used as temp
2081     const Register scratch_length    = r16; // elements count to copy
2082     const Register scratch_src_klass = r17; // array klass
2083     const Register lh                = r18; // layout helper
2084 
2085     //  if (length < 0) return -1;
2086     __ movw(scratch_length, length);        // length (elements count, 32-bit value)
2087     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2088 
2089     __ load_klass(scratch_src_klass, src);
2090 #ifdef ASSERT
2091     //  assert(src->klass() != NULL);
2092     {
2093       BLOCK_COMMENT("assert klasses not null {");
2094       Label L1, L2;
2095       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2096       __ bind(L1);
2097       __ stop("broken null klass");
2098       __ bind(L2);
2099       __ load_klass(rscratch1, dst);
2100       __ cbz(rscratch1, L1);     // this would be broken also
2101       BLOCK_COMMENT("} assert klasses not null done");
2102     }
2103 #endif
2104 
2105     // Load layout helper (32-bits)
2106     //
2107     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2108     // 32        30    24            16              8     2                 0
2109     //
2110     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2111     //
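         //   For example (illustrative only): an int[] typeArray has
         //   array_tag == 0x3 and log2_element_size == 2, while an objArray
         //   has array_tag == 0x2.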
2112 
2113     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2114 
2115     // Handle objArrays completely differently...
2116     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2117     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2118     __ movw(rscratch1, objArray_lh);
2119     __ eorw(rscratch2, lh, rscratch1);
2120     __ cbzw(rscratch2, L_objArray);
2121 
2122     //  if (src->klass() != dst->klass()) return -1;
2123     __ load_klass(rscratch2, dst);
2124     __ eor(rscratch2, rscratch2, scratch_src_klass);
2125     __ cbnz(rscratch2, L_failed);
2126 
2127     //  if (!src->is_Array()) return -1;
2128     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2129 
2130     // At this point, it is known to be a typeArray (array_tag 0x3).
2131 #ifdef ASSERT
2132     {
2133       BLOCK_COMMENT("assert primitive array {");
2134       Label L;
2135       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2136       __ cmpw(lh, rscratch2);
2137       __ br(Assembler::GE, L);
2138       __ stop("must be a primitive array");
2139       __ bind(L);
2140       BLOCK_COMMENT("} assert primitive array done");
2141     }
2142 #endif
2143 
2144     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2145                            rscratch2, L_failed);
2146 
2147     // TypeArrayKlass
2148     //
2149     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2150     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2151     //
2152 
2153     const Register rscratch1_offset = rscratch1;    // array offset
2154     const Register r18_elsize = lh; // element size
2155 
2156     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2157            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2158     __ add(src, src, rscratch1_offset);           // src array offset
2159     __ add(dst, dst, rscratch1_offset);           // dst array offset
2160     BLOCK_COMMENT("choose copy loop based on element size");
2161 
2162     // next registers should be set before the jump to corresponding stub
2163     const Register from     = c_rarg0;  // source array address
2164     const Register to       = c_rarg1;  // destination array address
2165     const Register count    = c_rarg2;  // elements count
2166 
2167     // 'from', 'to' and 'count' must be set up in this order, because
2168     // they alias the incoming 'src', 'src_pos' and 'dst' registers.
2169 
2170     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2171 
2172     // The possible values of elsize are 0-3, i.e. exact_log2(element
2173     // size in bytes).  We do a simple bitwise binary search.
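         // (Illustrative: elsize 0 = byte, 1 = short, 2 = int, 3 = long.
         // Testing bit 1 first splits {byte, short} from {int, long}; bit 0
         // then selects within each pair.)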
2174   __ BIND(L_copy_bytes);
2175     __ tbnz(r18_elsize, 1, L_copy_ints);
2176     __ tbnz(r18_elsize, 0, L_copy_shorts);
2177     __ lea(from, Address(src, src_pos));// src_addr
2178     __ lea(to,   Address(dst, dst_pos));// dst_addr
2179     __ movw(count, scratch_length); // length
2180     __ b(RuntimeAddress(byte_copy_entry));
2181 
2182   __ BIND(L_copy_shorts);
2183     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2184     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2185     __ movw(count, scratch_length); // length
2186     __ b(RuntimeAddress(short_copy_entry));
2187 
2188   __ BIND(L_copy_ints);
2189     __ tbnz(r18_elsize, 0, L_copy_longs);
2190     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2191     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2192     __ movw(count, scratch_length); // length
2193     __ b(RuntimeAddress(int_copy_entry));
2194 
2195   __ BIND(L_copy_longs);
2196 #ifdef ASSERT
2197     {
2198       BLOCK_COMMENT("assert long copy {");
2199       Label L;
2200       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2201       __ cmpw(r18_elsize, LogBytesPerLong);
2202       __ br(Assembler::EQ, L);
2203       __ stop("must be long copy, but elsize is wrong");
2204       __ bind(L);
2205       BLOCK_COMMENT("} assert long copy done");
2206     }
2207 #endif
2208     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2209     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2210     __ movw(count, scratch_length); // length
2211     __ b(RuntimeAddress(long_copy_entry));
2212 
2213     // ObjArrayKlass
2214   __ BIND(L_objArray);
2215     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2216 
2217     Label L_plain_copy, L_checkcast_copy;
2218     //  test array classes for subtyping
2219     __ load_klass(r18, dst);
2220     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2221     __ br(Assembler::NE, L_checkcast_copy);
2222 
2223     // Identically typed arrays can be copied without element-wise checks.
2224     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2225                            rscratch2, L_failed);
2226 
2227     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2228     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2229     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2230     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2231     __ movw(count, scratch_length); // length
2232   __ BIND(L_plain_copy);
2233     __ b(RuntimeAddress(oop_copy_entry));
2234 
2235   __ BIND(L_checkcast_copy);
2236     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2237     {
2238       // Before looking at dst.length, make sure dst is also an objArray.
2239       __ ldrw(rscratch1, Address(r18, lh_offset));
2240       __ movw(rscratch2, objArray_lh);
2241       __ eorw(rscratch1, rscratch1, rscratch2);
2242       __ cbnzw(rscratch1, L_failed);
2243 
2244       // It is safe to examine both src.length and dst.length.
2245       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2246                              r18, L_failed);
2247 
2248       const Register rscratch2_dst_klass = rscratch2;
2249       __ load_klass(rscratch2_dst_klass, dst); // reload
2250 
2251       // Marshal the base address arguments now, freeing registers.
2252       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2253       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2254       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2255       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2256       __ movw(count, length);           // length (reloaded)
2257       Register sco_temp = c_rarg3;      // this register is free now
2258       assert_different_registers(from, to, count, sco_temp,
2259                                  rscratch2_dst_klass, scratch_src_klass);
2260       // assert_clean_int(count, sco_temp);
2261 
2262       // Generate the type check.
2263       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2264       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2265       // assert_clean_int(sco_temp, r18);
2266       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2267 
2268       // Fetch destination element klass from the ObjArrayKlass header.
2269       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2270       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2271       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2272 
2273       // the checkcast_copy loop needs two extra arguments:
2274       assert(c_rarg3 == sco_temp, "#3 already in place");
2275       // Set up arguments for checkcast_copy_entry.
2276       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2277       __ b(RuntimeAddress(checkcast_copy_entry));
2278     }
2279 
2280   __ BIND(L_failed);
2281     __ mov(r0, -1);
2282     __ leave();   // required for proper stackwalking of RuntimeStub frame
2283     __ ret(lr);
2284 
2285     return start;
2286   }
2287 
2288   //
2289   // Generate stub for array fill. If "aligned" is true, the
2290   // "to" address is assumed to be heapword aligned.
2291   //
2292   // Arguments for generated stub:
2293   //   to:    c_rarg0
2294   //   value: c_rarg1
2295   //   count: c_rarg2 treated as signed
2296   //
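       // Rough outline (a sketch, not the exact instruction sequence below):
       // the fill value is widened to a repeating 64-bit pattern (e.g. a byte
       // value 0xAB becomes 0xABABABABABABABAB via successive bfi inserts),
       // 'to' is aligned to 8 bytes, whole 8-byte words are filled, and any
       // sub-word tail is handled separately.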
2297   address generate_fill(BasicType t, bool aligned, const char *name) {
2298     __ align(CodeEntryAlignment);
2299     StubCodeMark mark(this, "StubRoutines", name);
2300     address start = __ pc();
2301 
2302     BLOCK_COMMENT("Entry:");
2303 
2304     const Register to        = c_rarg0;  // destination array address
2305     const Register value     = c_rarg1;  // value
2306     const Register count     = c_rarg2;  // elements count
2307 
2308     const Register bz_base = r10;        // base for block_zero routine
2309     const Register cnt_words = r11;      // temp register
2310 
2311     __ enter();
2312 
2313     Label L_fill_elements, L_exit1;
2314 
2315     int shift = -1;
2316     switch (t) {
2317       case T_BYTE:
2318         shift = 0;
2319         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2320         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2321         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2322         __ br(Assembler::LO, L_fill_elements);
2323         break;
2324       case T_SHORT:
2325         shift = 1;
2326         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2327         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2328         __ br(Assembler::LO, L_fill_elements);
2329         break;
2330       case T_INT:
2331         shift = 2;
2332         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2333         __ br(Assembler::LO, L_fill_elements);
2334         break;
2335       default: ShouldNotReachHere();
2336     }
2337 
2338     // Align the destination ('to') address to an 8-byte boundary.
2339     Label L_skip_align1, L_skip_align2, L_skip_align4;
2340     if (!aligned) {
2341       switch (t) {
2342         case T_BYTE:
2343           // One byte misalignment happens only for byte arrays.
2344           __ tbz(to, 0, L_skip_align1);
2345           __ strb(value, Address(__ post(to, 1)));
2346           __ subw(count, count, 1);
2347           __ bind(L_skip_align1);
2348           // Fallthrough
2349         case T_SHORT:
2350           // Two bytes misalignment happens only for byte and short (char) arrays.
2351           __ tbz(to, 1, L_skip_align2);
2352           __ strh(value, Address(__ post(to, 2)));
2353           __ subw(count, count, 2 >> shift);
2354           __ bind(L_skip_align2);
2355           // Fallthrough
2356         case T_INT:
2357           // Align to 8 bytes; we know we are 4-byte aligned at this point.
2358           __ tbz(to, 2, L_skip_align4);
2359           __ strw(value, Address(__ post(to, 4)));
2360           __ subw(count, count, 4 >> shift);
2361           __ bind(L_skip_align4);
2362           break;
2363         default: ShouldNotReachHere();
2364       }
2365     }
2366 
2367     //
2368     //  Fill large chunks
2369     //
2370     __ lsrw(cnt_words, count, 3 - shift); // number of words
2371     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2372     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2373     if (UseBlockZeroing) {
2374       Label non_block_zeroing, rest;
2375       // If the fill value is zero we can use the fast zero_words().
2376       __ cbnz(value, non_block_zeroing);
2377       __ mov(bz_base, to);
2378       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2379       __ zero_words(bz_base, cnt_words);
2380       __ b(rest);
2381       __ bind(non_block_zeroing);
2382       __ fill_words(to, cnt_words, value);
2383       __ bind(rest);
2384     } else {
2385       __ fill_words(to, cnt_words, value);
2386     }
2387 
2388     // Remaining count is less than 8 bytes. Fill it by a single store.
2389     // Note that the total length is no less than 8 bytes.
2390     if (t == T_BYTE || t == T_SHORT) {
2391       Label L_exit1;
2392       __ cbzw(count, L_exit1);
2393       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2394       __ str(value, Address(to, -8));    // overwrite some elements
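           // (E.g. with 3 trailing bytes left, this 8-byte store also rewrites
           // the 5 bytes just before them, which already hold the same repeated
           // pattern, so the overlap is harmless; the total length is known to
           // be at least 8 bytes here.)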
2395       __ bind(L_exit1);
2396       __ leave();
2397       __ ret(lr);
2398     }
2399 
2400     // Handle fills of less than 8 bytes.
2401     Label L_fill_2, L_fill_4, L_exit2;
2402     __ bind(L_fill_elements);
2403     switch (t) {
2404       case T_BYTE:
2405         __ tbz(count, 0, L_fill_2);
2406         __ strb(value, Address(__ post(to, 1)));
2407         __ bind(L_fill_2);
2408         __ tbz(count, 1, L_fill_4);
2409         __ strh(value, Address(__ post(to, 2)));
2410         __ bind(L_fill_4);
2411         __ tbz(count, 2, L_exit2);
2412         __ strw(value, Address(to));
2413         break;
2414       case T_SHORT:
2415         __ tbz(count, 0, L_fill_4);
2416         __ strh(value, Address(__ post(to, 2)));
2417         __ bind(L_fill_4);
2418         __ tbz(count, 1, L_exit2);
2419         __ strw(value, Address(to));
2420         break;
2421       case T_INT:
2422         __ cbzw(count, L_exit2);
2423         __ strw(value, Address(to));
2424         break;
2425       default: ShouldNotReachHere();
2426     }
2427     __ bind(L_exit2);
2428     __ leave();
2429     __ ret(lr);
2430     return start;
2431   }
2432 
2433   void generate_arraycopy_stubs() {
2434     address entry;
2435     address entry_jbyte_arraycopy;
2436     address entry_jshort_arraycopy;
2437     address entry_jint_arraycopy;
2438     address entry_oop_arraycopy;
2439     address entry_jlong_arraycopy;
2440     address entry_checkcast_arraycopy;
2441 
2442     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2443     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2444 
2445     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2446 
2447     //*** jbyte
2448     // Always need aligned and unaligned versions
2449     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2450                                                                                   "jbyte_disjoint_arraycopy");
2451     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2452                                                                                   &entry_jbyte_arraycopy,
2453                                                                                   "jbyte_arraycopy");
2454     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2455                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2456     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2457                                                                                   "arrayof_jbyte_arraycopy");
2458 
2459     //*** jshort
2460     // Always need aligned and unaligned versions
2461     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2462                                                                                     "jshort_disjoint_arraycopy");
2463     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2464                                                                                     &entry_jshort_arraycopy,
2465                                                                                     "jshort_arraycopy");
2466     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2467                                                                                     "arrayof_jshort_disjoint_arraycopy");
2468     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2469                                                                                     "arrayof_jshort_arraycopy");
2470 
2471     //*** jint
2472     // Aligned versions
2473     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2474                                                                                 "arrayof_jint_disjoint_arraycopy");
2475     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2476                                                                                 "arrayof_jint_arraycopy");
2477     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2478     // entry_jint_arraycopy always points to the unaligned version
2479     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2480                                                                                 "jint_disjoint_arraycopy");
2481     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2482                                                                                 &entry_jint_arraycopy,
2483                                                                                 "jint_arraycopy");
2484 
2485     //*** jlong
2486     // It is always aligned
2487     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2488                                                                                   "arrayof_jlong_disjoint_arraycopy");
2489     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2490                                                                                   "arrayof_jlong_arraycopy");
2491     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2492     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2493 
2494     //*** oops
2495     {
2496       // With compressed oops we need unaligned versions; notice that
2497       // we overwrite entry_oop_arraycopy.
2498       bool aligned = !UseCompressedOops;
2499 
2500       StubRoutines::_arrayof_oop_disjoint_arraycopy
2501         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2502                                      /*dest_uninitialized*/false);
2503       StubRoutines::_arrayof_oop_arraycopy
2504         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2505                                      /*dest_uninitialized*/false);
2506       // Aligned versions without pre-barriers
2507       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2508         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2509                                      /*dest_uninitialized*/true);
2510       StubRoutines::_arrayof_oop_arraycopy_uninit
2511         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2512                                      /*dest_uninitialized*/true);
2513     }
2514 
2515     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2516     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2517     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2518     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2519 
2520     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2521     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2522                                                                         /*dest_uninitialized*/true);
2523 
2524     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2525                                                               entry_jbyte_arraycopy,
2526                                                               entry_jshort_arraycopy,
2527                                                               entry_jint_arraycopy,
2528                                                               entry_jlong_arraycopy);
2529 
2530     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2531                                                                entry_jbyte_arraycopy,
2532                                                                entry_jshort_arraycopy,
2533                                                                entry_jint_arraycopy,
2534                                                                entry_oop_arraycopy,
2535                                                                entry_jlong_arraycopy,
2536                                                                entry_checkcast_arraycopy);
2537 
2538     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2539     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2540     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2541     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2542     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2543     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2544   }
2545 
2546   void generate_math_stubs() { Unimplemented(); }
2547 
2548   // Arguments:
2549   //
2550   // Inputs:
2551   //   c_rarg0   - source byte array address
2552   //   c_rarg1   - destination byte array address
2553   //   c_rarg2   - K (key) in little endian int array
2554   //
2555   address generate_aescrypt_encryptBlock() {
2556     __ align(CodeEntryAlignment);
2557     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2558 
2559     Label L_doLast;
2560 
2561     const Register from        = c_rarg0;  // source array address
2562     const Register to          = c_rarg1;  // destination array address
2563     const Register key         = c_rarg2;  // key array address
2564     const Register keylen      = rscratch1;
2565 
2566     address start = __ pc();
2567     __ enter();
2568 
2569     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
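         // keylen is the expanded key length in ints: 44, 52 or 60 for
         // AES-128, AES-192 or AES-256 respectively (i.e. 4 * (rounds + 1)).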
2570 
2571     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2572 
2573     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2574     __ rev32(v1, __ T16B, v1);
2575     __ rev32(v2, __ T16B, v2);
2576     __ rev32(v3, __ T16B, v3);
2577     __ rev32(v4, __ T16B, v4);
2578     __ aese(v0, v1);
2579     __ aesmc(v0, v0);
2580     __ aese(v0, v2);
2581     __ aesmc(v0, v0);
2582     __ aese(v0, v3);
2583     __ aesmc(v0, v0);
2584     __ aese(v0, v4);
2585     __ aesmc(v0, v0);
2586 
2587     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2588     __ rev32(v1, __ T16B, v1);
2589     __ rev32(v2, __ T16B, v2);
2590     __ rev32(v3, __ T16B, v3);
2591     __ rev32(v4, __ T16B, v4);
2592     __ aese(v0, v1);
2593     __ aesmc(v0, v0);
2594     __ aese(v0, v2);
2595     __ aesmc(v0, v0);
2596     __ aese(v0, v3);
2597     __ aesmc(v0, v0);
2598     __ aese(v0, v4);
2599     __ aesmc(v0, v0);
2600 
2601     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2602     __ rev32(v1, __ T16B, v1);
2603     __ rev32(v2, __ T16B, v2);
2604 
2605     __ cmpw(keylen, 44);
2606     __ br(Assembler::EQ, L_doLast);
2607 
2608     __ aese(v0, v1);
2609     __ aesmc(v0, v0);
2610     __ aese(v0, v2);
2611     __ aesmc(v0, v0);
2612 
2613     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2614     __ rev32(v1, __ T16B, v1);
2615     __ rev32(v2, __ T16B, v2);
2616 
2617     __ cmpw(keylen, 52);
2618     __ br(Assembler::EQ, L_doLast);
2619 
2620     __ aese(v0, v1);
2621     __ aesmc(v0, v0);
2622     __ aese(v0, v2);
2623     __ aesmc(v0, v0);
2624 
2625     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2626     __ rev32(v1, __ T16B, v1);
2627     __ rev32(v2, __ T16B, v2);
2628 
2629     __ BIND(L_doLast);
2630 
2631     __ aese(v0, v1);
2632     __ aesmc(v0, v0);
2633     __ aese(v0, v2);
2634 
2635     __ ld1(v1, __ T16B, key);
2636     __ rev32(v1, __ T16B, v1);
2637     __ eor(v0, __ T16B, v0, v1);
2638 
2639     __ st1(v0, __ T16B, to);
2640 
2641     __ mov(r0, 0);
2642 
2643     __ leave();
2644     __ ret(lr);
2645 
2646     return start;
2647   }
2648 
2649   // Arguments:
2650   //
2651   // Inputs:
2652   //   c_rarg0   - source byte array address
2653   //   c_rarg1   - destination byte array address
2654   //   c_rarg2   - K (key) in little endian int array
2655   //
2656   address generate_aescrypt_decryptBlock() {
2657     assert(UseAES, "need AES instructions and misaligned SSE support");
2658     __ align(CodeEntryAlignment);
2659     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2660     Label L_doLast;
2661 
2662     const Register from        = c_rarg0;  // source array address
2663     const Register to          = c_rarg1;  // destination array address
2664     const Register key         = c_rarg2;  // key array address
2665     const Register keylen      = rscratch1;
2666 
2667     address start = __ pc();
2668     __ enter(); // required for proper stackwalking of RuntimeStub frame
2669 
2670     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2671 
2672     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2673 
2674     __ ld1(v5, __ T16B, __ post(key, 16));
2675     __ rev32(v5, __ T16B, v5);
2676 
2677     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2678     __ rev32(v1, __ T16B, v1);
2679     __ rev32(v2, __ T16B, v2);
2680     __ rev32(v3, __ T16B, v3);
2681     __ rev32(v4, __ T16B, v4);
2682     __ aesd(v0, v1);
2683     __ aesimc(v0, v0);
2684     __ aesd(v0, v2);
2685     __ aesimc(v0, v0);
2686     __ aesd(v0, v3);
2687     __ aesimc(v0, v0);
2688     __ aesd(v0, v4);
2689     __ aesimc(v0, v0);
2690 
2691     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2692     __ rev32(v1, __ T16B, v1);
2693     __ rev32(v2, __ T16B, v2);
2694     __ rev32(v3, __ T16B, v3);
2695     __ rev32(v4, __ T16B, v4);
2696     __ aesd(v0, v1);
2697     __ aesimc(v0, v0);
2698     __ aesd(v0, v2);
2699     __ aesimc(v0, v0);
2700     __ aesd(v0, v3);
2701     __ aesimc(v0, v0);
2702     __ aesd(v0, v4);
2703     __ aesimc(v0, v0);
2704 
2705     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2706     __ rev32(v1, __ T16B, v1);
2707     __ rev32(v2, __ T16B, v2);
2708 
2709     __ cmpw(keylen, 44);
2710     __ br(Assembler::EQ, L_doLast);
2711 
2712     __ aesd(v0, v1);
2713     __ aesimc(v0, v0);
2714     __ aesd(v0, v2);
2715     __ aesimc(v0, v0);
2716 
2717     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2718     __ rev32(v1, __ T16B, v1);
2719     __ rev32(v2, __ T16B, v2);
2720 
2721     __ cmpw(keylen, 52);
2722     __ br(Assembler::EQ, L_doLast);
2723 
2724     __ aesd(v0, v1);
2725     __ aesimc(v0, v0);
2726     __ aesd(v0, v2);
2727     __ aesimc(v0, v0);
2728 
2729     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2730     __ rev32(v1, __ T16B, v1);
2731     __ rev32(v2, __ T16B, v2);
2732 
2733     __ BIND(L_doLast);
2734 
2735     __ aesd(v0, v1);
2736     __ aesimc(v0, v0);
2737     __ aesd(v0, v2);
2738 
2739     __ eor(v0, __ T16B, v0, v5);
2740 
2741     __ st1(v0, __ T16B, to);
2742 
2743     __ mov(r0, 0);
2744 
2745     __ leave();
2746     __ ret(lr);
2747 
2748     return start;
2749   }
2750 
2751   // Arguments:
2752   //
2753   // Inputs:
2754   //   c_rarg0   - source byte array address
2755   //   c_rarg1   - destination byte array address
2756   //   c_rarg2   - K (key) in little endian int array
2757   //   c_rarg3   - r vector byte array address
2758   //   c_rarg4   - input length
2759   //
2760   // Output:
2761   //   x0        - input length
2762   //
2763   address generate_cipherBlockChaining_encryptAESCrypt() {
2764     assert(UseAES, "need AES instructions and misaligned SSE support");
2765     __ align(CodeEntryAlignment);
2766     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2767 
2768     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2769 
2770     const Register from        = c_rarg0;  // source array address
2771     const Register to          = c_rarg1;  // destination array address
2772     const Register key         = c_rarg2;  // key array address
2773     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address
2774                                            // and left holding the last encrypted block
2775     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2776     const Register keylen      = rscratch1;
2777 
2778     address start = __ pc();
2779 
2780       __ enter();
2781 
2782       __ movw(rscratch2, len_reg);
2783 
2784       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2785 
2786       __ ld1(v0, __ T16B, rvec);
2787 
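           // keylen is the expanded key length in ints: 44, 52 or 60 for
           // 128-, 192- and 256-bit keys respectively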
2788       __ cmpw(keylen, 52);
2789       __ br(Assembler::CC, L_loadkeys_44);
2790       __ br(Assembler::EQ, L_loadkeys_52);
2791 
2792       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2793       __ rev32(v17, __ T16B, v17);
2794       __ rev32(v18, __ T16B, v18);
2795     __ BIND(L_loadkeys_52);
2796       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2797       __ rev32(v19, __ T16B, v19);
2798       __ rev32(v20, __ T16B, v20);
2799     __ BIND(L_loadkeys_44);
2800       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2801       __ rev32(v21, __ T16B, v21);
2802       __ rev32(v22, __ T16B, v22);
2803       __ rev32(v23, __ T16B, v23);
2804       __ rev32(v24, __ T16B, v24);
2805       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2806       __ rev32(v25, __ T16B, v25);
2807       __ rev32(v26, __ T16B, v26);
2808       __ rev32(v27, __ T16B, v27);
2809       __ rev32(v28, __ T16B, v28);
2810       __ ld1(v29, v30, v31, __ T16B, key);
2811       __ rev32(v29, __ T16B, v29);
2812       __ rev32(v30, __ T16B, v30);
2813       __ rev32(v31, __ T16B, v31);
2814 
2815     __ BIND(L_aes_loop);
2816       __ ld1(v1, __ T16B, __ post(from, 16));
2817       __ eor(v0, __ T16B, v0, v1);
2818 
2819       __ br(Assembler::CC, L_rounds_44);
2820       __ br(Assembler::EQ, L_rounds_52);
2821 
2822       __ aese(v0, v17); __ aesmc(v0, v0);
2823       __ aese(v0, v18); __ aesmc(v0, v0);
2824     __ BIND(L_rounds_52);
2825       __ aese(v0, v19); __ aesmc(v0, v0);
2826       __ aese(v0, v20); __ aesmc(v0, v0);
2827     __ BIND(L_rounds_44);
2828       __ aese(v0, v21); __ aesmc(v0, v0);
2829       __ aese(v0, v22); __ aesmc(v0, v0);
2830       __ aese(v0, v23); __ aesmc(v0, v0);
2831       __ aese(v0, v24); __ aesmc(v0, v0);
2832       __ aese(v0, v25); __ aesmc(v0, v0);
2833       __ aese(v0, v26); __ aesmc(v0, v0);
2834       __ aese(v0, v27); __ aesmc(v0, v0);
2835       __ aese(v0, v28); __ aesmc(v0, v0);
2836       __ aese(v0, v29); __ aesmc(v0, v0);
2837       __ aese(v0, v30);
2838       __ eor(v0, __ T16B, v0, v31);
2839 
2840       __ st1(v0, __ T16B, __ post(to, 16));
2841 
2842       __ subw(len_reg, len_reg, 16);
2843       __ cbnzw(len_reg, L_aes_loop);
2844 
2845       __ st1(v0, __ T16B, rvec);
2846 
2847       __ mov(r0, rscratch2);
2848 
2849       __ leave();
2850       __ ret(lr);
2851 
2852       return start;
2853   }
2854 
2855   // Arguments:
2856   //
2857   // Inputs:
2858   //   c_rarg0   - source byte array address
2859   //   c_rarg1   - destination byte array address
2860   //   c_rarg2   - K (key) in little endian int array
2861   //   c_rarg3   - r vector byte array address
2862   //   c_rarg4   - input length
2863   //
2864   // Output:
2865   //   r0        - input length
2866   //
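       // For reference, a rough C-level sketch (AES_decrypt stands for the
       // keyed block decryption; names are illustrative only):
       //
       //   prev = *rvec;                         // IV, or last block of the previous call
       //   for (i = 0; i < len; i += 16) {
       //     cipher = from[i..i+15];
       //     to[i..i+15] = AES_decrypt(cipher, key) ^ prev;
       //     prev = cipher;
       //   }
       //   *rvec = prev;
       //   return len;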
2867   address generate_cipherBlockChaining_decryptAESCrypt() {
2868     assert(UseAES, "need AES instruction support");
2869     __ align(CodeEntryAlignment);
2870     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2871 
2872     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2873 
2874     const Register from        = c_rarg0;  // source array address
2875     const Register to          = c_rarg1;  // destination array address
2876     const Register key         = c_rarg2;  // key array address
2877     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address;
2878                                            // on exit it holds the last ciphertext block consumed
2879     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2880     const Register keylen      = rscratch1;
2881 
2882     address start = __ pc();
2883 
2884       __ enter();
2885 
2886       __ movw(rscratch2, len_reg);
2887 
2888       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2889 
2890       __ ld1(v2, __ T16B, rvec);
2891 
2892       __ ld1(v31, __ T16B, __ post(key, 16));
2893       __ rev32(v31, __ T16B, v31);
2894 
2895       __ cmpw(keylen, 52);
2896       __ br(Assembler::CC, L_loadkeys_44);
2897       __ br(Assembler::EQ, L_loadkeys_52);
2898 
2899       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2900       __ rev32(v17, __ T16B, v17);
2901       __ rev32(v18, __ T16B, v18);
2902     __ BIND(L_loadkeys_52);
2903       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2904       __ rev32(v19, __ T16B, v19);
2905       __ rev32(v20, __ T16B, v20);
2906     __ BIND(L_loadkeys_44);
2907       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2908       __ rev32(v21, __ T16B, v21);
2909       __ rev32(v22, __ T16B, v22);
2910       __ rev32(v23, __ T16B, v23);
2911       __ rev32(v24, __ T16B, v24);
2912       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2913       __ rev32(v25, __ T16B, v25);
2914       __ rev32(v26, __ T16B, v26);
2915       __ rev32(v27, __ T16B, v27);
2916       __ rev32(v28, __ T16B, v28);
2917       __ ld1(v29, v30, __ T16B, key);
2918       __ rev32(v29, __ T16B, v29);
2919       __ rev32(v30, __ T16B, v30);
2920 
2921     __ BIND(L_aes_loop);
2922       __ ld1(v0, __ T16B, __ post(from, 16));
2923       __ orr(v1, __ T16B, v0, v0);
2924 
2925       __ br(Assembler::CC, L_rounds_44);
2926       __ br(Assembler::EQ, L_rounds_52);
2927 
2928       __ aesd(v0, v17); __ aesimc(v0, v0);
2929       __ aesd(v0, v18); __ aesimc(v0, v0);
2930     __ BIND(L_rounds_52);
2931       __ aesd(v0, v19); __ aesimc(v0, v0);
2932       __ aesd(v0, v20); __ aesimc(v0, v0);
2933     __ BIND(L_rounds_44);
2934       __ aesd(v0, v21); __ aesimc(v0, v0);
2935       __ aesd(v0, v22); __ aesimc(v0, v0);
2936       __ aesd(v0, v23); __ aesimc(v0, v0);
2937       __ aesd(v0, v24); __ aesimc(v0, v0);
2938       __ aesd(v0, v25); __ aesimc(v0, v0);
2939       __ aesd(v0, v26); __ aesimc(v0, v0);
2940       __ aesd(v0, v27); __ aesimc(v0, v0);
2941       __ aesd(v0, v28); __ aesimc(v0, v0);
2942       __ aesd(v0, v29); __ aesimc(v0, v0);
2943       __ aesd(v0, v30);
2944       __ eor(v0, __ T16B, v0, v31);
2945       __ eor(v0, __ T16B, v0, v2);
2946 
2947       __ st1(v0, __ T16B, __ post(to, 16));
2948       __ orr(v2, __ T16B, v1, v1);
2949 
2950       __ subw(len_reg, len_reg, 16);
2951       __ cbnzw(len_reg, L_aes_loop);
2952 
2953       __ st1(v2, __ T16B, rvec);
2954 
2955       __ mov(r0, rscratch2);
2956 
2957       __ leave();
2958       __ ret(lr);
2959 
2960     return start;
2961   }
2962 
2963   // Arguments:
2964   //
2965   // Inputs:
2966   //   c_rarg0   - byte[]  source+offset
2967   //   c_rarg1   - int[]   SHA.state
2968   //   c_rarg2   - int     offset
2969   //   c_rarg3   - int     limit
2970   //
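       // If multi_block is true the stub consumes successive 64-byte blocks,
       // advancing ofs until it passes limit, and returns the updated offset
       // in c_rarg0; otherwise it compresses a single block.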
2971   address generate_sha1_implCompress(bool multi_block, const char *name) {
2972     __ align(CodeEntryAlignment);
2973     StubCodeMark mark(this, "StubRoutines", name);
2974     address start = __ pc();
2975 
2976     Register buf   = c_rarg0;
2977     Register state = c_rarg1;
2978     Register ofs   = c_rarg2;
2979     Register limit = c_rarg3;
2980 
2981     Label keys;
2982     Label sha1_loop;
2983 
2984     // load the keys into v0..v3
2985     __ adr(rscratch1, keys);
2986     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2987     // load the 5-word state into v6, v7
2988     __ ldrq(v6, Address(state, 0));
2989     __ ldrs(v7, Address(state, 16));
2990 
2991 
2992     __ BIND(sha1_loop);
2993     // load 64 bytes of data into v16..v19
2994     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2995     __ rev32(v16, __ T16B, v16);
2996     __ rev32(v17, __ T16B, v17);
2997     __ rev32(v18, __ T16B, v18);
2998     __ rev32(v19, __ T16B, v19);
2999 
3000     // do the sha1
3001     __ addv(v4, __ T4S, v16, v0);
3002     __ orr(v20, __ T16B, v6, v6);
3003 
3004     FloatRegister d0 = v16;
3005     FloatRegister d1 = v17;
3006     FloatRegister d2 = v18;
3007     FloatRegister d3 = v19;
3008 
3009     for (int round = 0; round < 20; round++) {
3010       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3011       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3012       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3013       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3014       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3015 
3016       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3017       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3018       __ sha1h(tmp2, __ T4S, v20);
3019       if (round < 5)
3020         __ sha1c(v20, __ T4S, tmp3, tmp4);
3021       else if (round < 10 || round >= 15)
3022         __ sha1p(v20, __ T4S, tmp3, tmp4);
3023       else
3024         __ sha1m(v20, __ T4S, tmp3, tmp4);
3025       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3026 
3027       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3028     }
3029 
3030     __ addv(v7, __ T2S, v7, v21);
3031     __ addv(v6, __ T4S, v6, v20);
3032 
3033     if (multi_block) {
3034       __ add(ofs, ofs, 64);
3035       __ cmp(ofs, limit);
3036       __ br(Assembler::LE, sha1_loop);
3037       __ mov(c_rarg0, ofs); // return ofs
3038     }
3039 
3040     __ strq(v6, Address(state, 0));
3041     __ strs(v7, Address(state, 16));
3042 
3043     __ ret(lr);
3044 
3045     __ bind(keys);
3046     __ emit_int32(0x5a827999);
3047     __ emit_int32(0x6ed9eba1);
3048     __ emit_int32(0x8f1bbcdc);
3049     __ emit_int32(0xca62c1d6);
3050 
3051     return start;
3052   }
3053 
3054 
3055   // Arguments:
3056   //
3057   // Inputs:
3058   //   c_rarg0   - byte[]  source+offset
3059   //   c_rarg1   - int[]   SHA.state
3060   //   c_rarg2   - int     offset
3061   //   c_rarg3   - int     limit
3062   //
3063   address generate_sha256_implCompress(bool multi_block, const char *name) {
3064     static const uint32_t round_consts[64] = {
3065       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3066       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3067       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3068       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3069       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3070       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3071       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3072       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3073       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3074       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3075       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3076       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3077       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3078       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3079       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3080       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3081     };
3082     __ align(CodeEntryAlignment);
3083     StubCodeMark mark(this, "StubRoutines", name);
3084     address start = __ pc();
3085 
3086     Register buf   = c_rarg0;
3087     Register state = c_rarg1;
3088     Register ofs   = c_rarg2;
3089     Register limit = c_rarg3;
3090 
3091     Label sha256_loop;
3092 
3093     __ stpd(v8, v9, __ pre(sp, -32));
3094     __ stpd(v10, v11, Address(sp, 16));
3095 
3096 // dga == v0
3097 // dgb == v1
3098 // dg0 == v2
3099 // dg1 == v3
3100 // dg2 == v4
3101 // t0 == v6
3102 // t1 == v7
3103 
3104     // load the 64 round constants into v16..v31, four per register
3105     __ lea(rscratch1, ExternalAddress((address)round_consts));
3106     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3107     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3108     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3109     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3110 
3111     // load the 8-word (256-bit) state
3112     __ ldpq(v0, v1, state);
3113 
3114     __ BIND(sha256_loop);
3115     // load 64 bytes of data into v8..v11
3116     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3117     __ rev32(v8, __ T16B, v8);
3118     __ rev32(v9, __ T16B, v9);
3119     __ rev32(v10, __ T16B, v10);
3120     __ rev32(v11, __ T16B, v11);
3121 
3122     __ addv(v6, __ T4S, v8, v16);
3123     __ orr(v2, __ T16B, v0, v0);
3124     __ orr(v3, __ T16B, v1, v1);
3125 
3126     FloatRegister d0 = v8;
3127     FloatRegister d1 = v9;
3128     FloatRegister d2 = v10;
3129     FloatRegister d3 = v11;
3130 
3131 
3132     for (int round = 0; round < 16; round++) {
3133       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3134       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3135       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3136       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3137 
3138       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3139        __ orr(v4, __ T16B, v2, v2);
3140       if (round < 15)
3141         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3142       __ sha256h(v2, __ T4S, v3, tmp2);
3143       __ sha256h2(v3, __ T4S, v4, tmp2);
3144       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3145 
3146       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3147     }
3148 
3149     __ addv(v0, __ T4S, v0, v2);
3150     __ addv(v1, __ T4S, v1, v3);
3151 
3152     if (multi_block) {
3153       __ add(ofs, ofs, 64);
3154       __ cmp(ofs, limit);
3155       __ br(Assembler::LE, sha256_loop);
3156       __ mov(c_rarg0, ofs); // return ofs
3157     }
3158 
3159     __ ldpd(v10, v11, Address(sp, 16));
3160     __ ldpd(v8, v9, __ post(sp, 32));
3161 
3162     __ stpq(v0, v1, state);
3163 
3164     __ ret(lr);
3165 
3166     return start;
3167   }
3168 
3169 #ifndef BUILTIN_SIM
3170   // Safefetch stubs.
3171   void generate_safefetch(const char* name, int size, address* entry,
3172                           address* fault_pc, address* continuation_pc) {
3173     // safefetch signatures:
3174     //   int      SafeFetch32(int*      adr, int      errValue);
3175     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3176     //
3177     // arguments:
3178     //   c_rarg0 = adr
3179     //   c_rarg1 = errValue
3180     //
3181     // result:
3182     //   r0       = *adr or errValue
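         //
         // Typical use (illustrative values): SafeFetch32(p, -1) yields *p if
         // the load succeeds, or -1 if the faulting load is redirected to
         // continuation_pc by the signal handler.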
3183 
3184     StubCodeMark mark(this, "StubRoutines", name);
3185 
3186     // Entry point, pc or function descriptor.
3187     *entry = __ pc();
3188 
3189     // Load *adr into c_rarg1, may fault.
3190     *fault_pc = __ pc();
3191     switch (size) {
3192       case 4:
3193         // int32_t
3194         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3195         break;
3196       case 8:
3197         // int64_t
3198         __ ldr(c_rarg1, Address(c_rarg0, 0));
3199         break;
3200       default:
3201         ShouldNotReachHere();
3202     }
3203 
3204     // return errValue or *adr
3205     *continuation_pc = __ pc();
3206     __ mov(r0, c_rarg1);
3207     __ ret(lr);
3208   }
3209 #endif
3210 
3211   /**
3212    *  Arguments:
3213    *
3214    * Inputs:
3215    *   c_rarg0   - int crc
3216    *   c_rarg1   - byte* buf
3217    *   c_rarg2   - int length
3218    *
3219    * Output:
3220    *       r0    - int crc result
3221    */
3222   address generate_updateBytesCRC32() {
3223     assert(UseCRC32Intrinsics, "what are we doing here?");
3224 
3225     __ align(CodeEntryAlignment);
3226     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3227 
3228     address start = __ pc();
3229 
3230     const Register crc   = c_rarg0;  // crc
3231     const Register buf   = c_rarg1;  // source java byte array address
3232     const Register len   = c_rarg2;  // length
3233     const Register table0 = c_rarg3; // crc_table address
3234     const Register table1 = c_rarg4;
3235     const Register table2 = c_rarg5;
3236     const Register table3 = c_rarg6;
3237     const Register tmp3 = c_rarg7;
3238 
3239     BLOCK_COMMENT("Entry:");
3240     __ enter(); // required for proper stackwalking of RuntimeStub frame
3241 
3242     __ kernel_crc32(crc, buf, len,
3243               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3244 
3245     __ leave(); // required for proper stackwalking of RuntimeStub frame
3246     __ ret(lr);
3247 
3248     return start;
3249   }
3250 
3251   /**
3252    *  Arguments:
3253    *
3254    * Inputs:
3255    *   c_rarg0   - int crc
3256    *   c_rarg1   - byte* buf
3257    *   c_rarg2   - int length
3258    *   c_rarg3   - int* table
3259    *
3260    * Output:
3261    *       r0   - int crc result
3262    */
3263   address generate_updateBytesCRC32C() {
3264     assert(UseCRC32CIntrinsics, "what are we doing here?");
3265 
3266     __ align(CodeEntryAlignment);
3267     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3268 
3269     address start = __ pc();
3270 
3271     const Register crc   = c_rarg0;  // crc
3272     const Register buf   = c_rarg1;  // source java byte array address
3273     const Register len   = c_rarg2;  // length
3274     const Register table0 = c_rarg3; // crc_table address
3275     const Register table1 = c_rarg4;
3276     const Register table2 = c_rarg5;
3277     const Register table3 = c_rarg6;
3278     const Register tmp3 = c_rarg7;
3279 
3280     BLOCK_COMMENT("Entry:");
3281     __ enter(); // required for proper stackwalking of RuntimeStub frame
3282 
3283     __ kernel_crc32c(crc, buf, len,
3284               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3285 
3286     __ leave(); // required for proper stackwalking of RuntimeStub frame
3287     __ ret(lr);
3288 
3289     return start;
3290   }
3291 
3292   /**
3293    *  Arguments:
3294    *
3295    *  Inputs:
3296    *   c_rarg0   - int   adler
3297    *   c_rarg1   - byte* buff
3298    *   c_rarg2   - int   len
3299    *
3300    * Output:
3301    *   c_rarg0   - int adler result
3302    */
3303   address generate_updateBytesAdler32() {
3304     __ align(CodeEntryAlignment);
3305     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3306     address start = __ pc();
3307 
3308     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3309 
3310     // Aliases
3311     Register adler  = c_rarg0;
3312     Register s1     = c_rarg0;
3313     Register s2     = c_rarg3;
3314     Register buff   = c_rarg1;
3315     Register len    = c_rarg2;
3316     Register nmax  = r4;
3317     Register base = r5;
3318     Register count = r6;
3319     Register temp0 = rscratch1;
3320     Register temp1 = rscratch2;
3321     Register temp2 = r7;
3322 
3323     // Max number of bytes we can process before having to take the mod
3324     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
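         // A quick check of that bound for n = 5552:
         //   255*5552*5553/2 + 5553*65520 = 4294690200 <= 4294967295 (2^32-1)
         // whereas n = 5553 already exceeds it.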
3325     unsigned long BASE = 0xfff1;
3326     unsigned long NMAX = 0x15B0;
3327 
3328     __ mov(base, BASE);
3329     __ mov(nmax, NMAX);
3330 
3331     // s1 is initialized to the lower 16 bits of adler
3332     // s2 is initialized to the upper 16 bits of adler
3333     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3334     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3335 
3336     // The pipelined loop needs at least 16 elements for one iteration;
3337     // for shorter inputs it is more efficient to branch straight to the cleanup loop
3338     __ cmp(len, 16);
3339     __ br(Assembler::HS, L_nmax);
3340     __ cbz(len, L_combine);
3341 
3342     __ bind(L_simple_by1_loop);
3343     __ ldrb(temp0, Address(__ post(buff, 1)));
3344     __ add(s1, s1, temp0);
3345     __ add(s2, s2, s1);
3346     __ subs(len, len, 1);
3347     __ br(Assembler::HI, L_simple_by1_loop);
3348 
3349     // s1 = s1 % BASE
3350     __ subs(temp0, s1, base);
3351     __ csel(s1, temp0, s1, Assembler::HS);
3352 
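         // The reductions below use the identity 2^16 == 15 (mod BASE):
         //   s == (s & 0xffff) + 15 * (s >> 16)  (mod BASE)
         // computed as (s >> 16) * 16 - (s >> 16) plus the low half and
         // finished with a conditional subtraction of BASE (the larger
         // reductions later apply the folding step twice first).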
3353     // s2 = s2 % BASE
3354     __ lsr(temp0, s2, 16);
3355     __ lsl(temp1, temp0, 4);
3356     __ sub(temp1, temp1, temp0);
3357     __ add(s2, temp1, s2, ext::uxth);
3358 
3359     __ subs(temp0, s2, base);
3360     __ csel(s2, temp0, s2, Assembler::HS);
3361 
3362     __ b(L_combine);
3363 
3364     __ bind(L_nmax);
3365     __ subs(len, len, nmax);
3366     __ sub(count, nmax, 16);
3367     __ br(Assembler::LO, L_by16);
3368 
3369     __ bind(L_nmax_loop);
3370 
3371     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3372 
3373     __ add(s1, s1, temp0, ext::uxtb);
3374     __ ubfx(temp2, temp0, 8, 8);
3375     __ add(s2, s2, s1);
3376     __ add(s1, s1, temp2);
3377     __ ubfx(temp2, temp0, 16, 8);
3378     __ add(s2, s2, s1);
3379     __ add(s1, s1, temp2);
3380     __ ubfx(temp2, temp0, 24, 8);
3381     __ add(s2, s2, s1);
3382     __ add(s1, s1, temp2);
3383     __ ubfx(temp2, temp0, 32, 8);
3384     __ add(s2, s2, s1);
3385     __ add(s1, s1, temp2);
3386     __ ubfx(temp2, temp0, 40, 8);
3387     __ add(s2, s2, s1);
3388     __ add(s1, s1, temp2);
3389     __ ubfx(temp2, temp0, 48, 8);
3390     __ add(s2, s2, s1);
3391     __ add(s1, s1, temp2);
3392     __ add(s2, s2, s1);
3393     __ add(s1, s1, temp0, Assembler::LSR, 56);
3394     __ add(s2, s2, s1);
3395 
3396     __ add(s1, s1, temp1, ext::uxtb);
3397     __ ubfx(temp2, temp1, 8, 8);
3398     __ add(s2, s2, s1);
3399     __ add(s1, s1, temp2);
3400     __ ubfx(temp2, temp1, 16, 8);
3401     __ add(s2, s2, s1);
3402     __ add(s1, s1, temp2);
3403     __ ubfx(temp2, temp1, 24, 8);
3404     __ add(s2, s2, s1);
3405     __ add(s1, s1, temp2);
3406     __ ubfx(temp2, temp1, 32, 8);
3407     __ add(s2, s2, s1);
3408     __ add(s1, s1, temp2);
3409     __ ubfx(temp2, temp1, 40, 8);
3410     __ add(s2, s2, s1);
3411     __ add(s1, s1, temp2);
3412     __ ubfx(temp2, temp1, 48, 8);
3413     __ add(s2, s2, s1);
3414     __ add(s1, s1, temp2);
3415     __ add(s2, s2, s1);
3416     __ add(s1, s1, temp1, Assembler::LSR, 56);
3417     __ add(s2, s2, s1);
3418 
3419     __ subs(count, count, 16);
3420     __ br(Assembler::HS, L_nmax_loop);
3421 
3422     // s1 = s1 % BASE
3423     __ lsr(temp0, s1, 16);
3424     __ lsl(temp1, temp0, 4);
3425     __ sub(temp1, temp1, temp0);
3426     __ add(temp1, temp1, s1, ext::uxth);
3427 
3428     __ lsr(temp0, temp1, 16);
3429     __ lsl(s1, temp0, 4);
3430     __ sub(s1, s1, temp0);
3431     __ add(s1, s1, temp1, ext::uxth);
3432 
3433     __ subs(temp0, s1, base);
3434     __ csel(s1, temp0, s1, Assembler::HS);
3435 
3436     // s2 = s2 % BASE
3437     __ lsr(temp0, s2, 16);
3438     __ lsl(temp1, temp0, 4);
3439     __ sub(temp1, temp1, temp0);
3440     __ add(temp1, temp1, s2, ext::uxth);
3441 
3442     __ lsr(temp0, temp1, 16);
3443     __ lsl(s2, temp0, 4);
3444     __ sub(s2, s2, temp0);
3445     __ add(s2, s2, temp1, ext::uxth);
3446 
3447     __ subs(temp0, s2, base);
3448     __ csel(s2, temp0, s2, Assembler::HS);
3449 
3450     __ subs(len, len, nmax);
3451     __ sub(count, nmax, 16);
3452     __ br(Assembler::HS, L_nmax_loop);
3453 
3454     __ bind(L_by16);
3455     __ adds(len, len, count);
3456     __ br(Assembler::LO, L_by1);
3457 
3458     __ bind(L_by16_loop);
3459 
3460     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3461 
3462     __ add(s1, s1, temp0, ext::uxtb);
3463     __ ubfx(temp2, temp0, 8, 8);
3464     __ add(s2, s2, s1);
3465     __ add(s1, s1, temp2);
3466     __ ubfx(temp2, temp0, 16, 8);
3467     __ add(s2, s2, s1);
3468     __ add(s1, s1, temp2);
3469     __ ubfx(temp2, temp0, 24, 8);
3470     __ add(s2, s2, s1);
3471     __ add(s1, s1, temp2);
3472     __ ubfx(temp2, temp0, 32, 8);
3473     __ add(s2, s2, s1);
3474     __ add(s1, s1, temp2);
3475     __ ubfx(temp2, temp0, 40, 8);
3476     __ add(s2, s2, s1);
3477     __ add(s1, s1, temp2);
3478     __ ubfx(temp2, temp0, 48, 8);
3479     __ add(s2, s2, s1);
3480     __ add(s1, s1, temp2);
3481     __ add(s2, s2, s1);
3482     __ add(s1, s1, temp0, Assembler::LSR, 56);
3483     __ add(s2, s2, s1);
3484 
3485     __ add(s1, s1, temp1, ext::uxtb);
3486     __ ubfx(temp2, temp1, 8, 8);
3487     __ add(s2, s2, s1);
3488     __ add(s1, s1, temp2);
3489     __ ubfx(temp2, temp1, 16, 8);
3490     __ add(s2, s2, s1);
3491     __ add(s1, s1, temp2);
3492     __ ubfx(temp2, temp1, 24, 8);
3493     __ add(s2, s2, s1);
3494     __ add(s1, s1, temp2);
3495     __ ubfx(temp2, temp1, 32, 8);
3496     __ add(s2, s2, s1);
3497     __ add(s1, s1, temp2);
3498     __ ubfx(temp2, temp1, 40, 8);
3499     __ add(s2, s2, s1);
3500     __ add(s1, s1, temp2);
3501     __ ubfx(temp2, temp1, 48, 8);
3502     __ add(s2, s2, s1);
3503     __ add(s1, s1, temp2);
3504     __ add(s2, s2, s1);
3505     __ add(s1, s1, temp1, Assembler::LSR, 56);
3506     __ add(s2, s2, s1);
3507 
3508     __ subs(len, len, 16);
3509     __ br(Assembler::HS, L_by16_loop);
3510 
3511     __ bind(L_by1);
3512     __ adds(len, len, 15);
3513     __ br(Assembler::LO, L_do_mod);
3514 
3515     __ bind(L_by1_loop);
3516     __ ldrb(temp0, Address(__ post(buff, 1)));
3517     __ add(s1, temp0, s1);
3518     __ add(s2, s2, s1);
3519     __ subs(len, len, 1);
3520     __ br(Assembler::HS, L_by1_loop);
3521 
3522     __ bind(L_do_mod);
3523     // s1 = s1 % BASE
3524     __ lsr(temp0, s1, 16);
3525     __ lsl(temp1, temp0, 4);
3526     __ sub(temp1, temp1, temp0);
3527     __ add(temp1, temp1, s1, ext::uxth);
3528 
3529     __ lsr(temp0, temp1, 16);
3530     __ lsl(s1, temp0, 4);
3531     __ sub(s1, s1, temp0);
3532     __ add(s1, s1, temp1, ext::uxth);
3533 
3534     __ subs(temp0, s1, base);
3535     __ csel(s1, temp0, s1, Assembler::HS);
3536 
3537     // s2 = s2 % BASE
3538     __ lsr(temp0, s2, 16);
3539     __ lsl(temp1, temp0, 4);
3540     __ sub(temp1, temp1, temp0);
3541     __ add(temp1, temp1, s2, ext::uxth);
3542 
3543     __ lsr(temp0, temp1, 16);
3544     __ lsl(s2, temp0, 4);
3545     __ sub(s2, s2, temp0);
3546     __ add(s2, s2, temp1, ext::uxth);
3547 
3548     __ subs(temp0, s2, base);
3549     __ csel(s2, temp0, s2, Assembler::HS);
3550 
3551     // Combine lower bits and higher bits
3552     __ bind(L_combine);
3553     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3554 
3555     __ ret(lr);
3556 
3557     return start;
3558   }
3559 
3560   /**
3561    *  Arguments:
3562    *
3563    *  Input:
3564    *    c_rarg0   - x address
3565    *    c_rarg1   - x length
3566    *    c_rarg2   - y address
3567    *    c_rarg3   - y length
3568    *    c_rarg4   - z address
3569    *    c_rarg5   - z length
3570    */
3571   address generate_multiplyToLen() {
3572     __ align(CodeEntryAlignment);
3573     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3574 
3575     address start = __ pc();
3576     const Register x     = r0;
3577     const Register xlen  = r1;
3578     const Register y     = r2;
3579     const Register ylen  = r3;
3580     const Register z     = r4;
3581     const Register zlen  = r5;
3582 
3583     const Register tmp1  = r10;
3584     const Register tmp2  = r11;
3585     const Register tmp3  = r12;
3586     const Register tmp4  = r13;
3587     const Register tmp5  = r14;
3588     const Register tmp6  = r15;
3589     const Register tmp7  = r16;
3590 
3591     BLOCK_COMMENT("Entry:");
3592     __ enter(); // required for proper stackwalking of RuntimeStub frame
3593     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3594     __ leave(); // required for proper stackwalking of RuntimeStub frame
3595     __ ret(lr);
3596 
3597     return start;
3598   }
3599 
3600   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3601                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3602                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3603     // Karatsuba multiplication performs a 128*128 -> 256-bit
3604     // multiplication in three 128-bit multiplications and a few
3605     // additions.
3606     //
3607     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3608     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3609     //
3610     // Inputs:
3611     //
3612     // A0 in a.d[0]     (subkey)
3613     // A1 in a.d[1]
3614     // (A1+A0) in a1_xor_a0.d[0]
3615     //
3616     // B0 in b.d[0]     (state)
3617     // B1 in b.d[1]
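         //
         // As a sketch, writing ^ for GF(2) addition and * for the carry-less
         // multiply:
         //
         //   C = A1*B1;  D = A0*B0;  E = (A1^A0)*(B1^B0);
         //   mid = C ^ D ^ E;                    // 128-bit middle term
         //   result_hi = C ^ (mid >> 64);        // C1:(C0+C1+D1+E1)
         //   result_lo = D ^ (mid << 64);        // (D1+C0+D0+E0):D0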
3618 
3619     __ ext(tmp1, __ T16B, b, b, 0x08);
3620     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3621     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3622     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3623     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3624 
3625     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3626     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3627     __ eor(tmp2, __ T16B, tmp2, tmp4);
3628     __ eor(tmp2, __ T16B, tmp2, tmp3);
3629 
3630     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3631     __ ins(result_hi, __ D, tmp2, 0, 1);
3632     __ ins(result_lo, __ D, tmp2, 1, 0);
3633   }
3634 
3635   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3636                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3637     const FloatRegister t0 = result;
3638 
3639     // The GCM field polynomial f is z^128 + p(z), where p =
3640     // z^7+z^2+z+1.
3641     //
3642     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3643     //
3644     // so, given that the product we're reducing is
3645     //    a == lo + hi * z^128
3646     // substituting,
3647     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3648     //
3649     // we reduce by multiplying hi by p(z) and subtracting the result
3650     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3651     // bits we can do this with two 64-bit carry-less multiplications,
3652     // one for each half of hi.
3653 
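         // In limb form (hi = hi1:hi0, lo = lo1:lo0, all arithmetic carry-less):
         //
         //   u = hi1 * p;             // 128-bit product u1:u0
         //   hi0 ^= u1;  lo1 ^= u0;   // fold the z^192 term down one limb
         //   result = lo ^ hi0 * p;   // final 64-bit multiply finishes the reduction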
3654     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3655     __ ext(t1, __ T16B, t0, z, 8);
3656     __ eor(hi, __ T16B, hi, t1);
3657     __ ext(t1, __ T16B, z, t0, 8);
3658     __ eor(lo, __ T16B, lo, t1);
3659     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3660     __ eor(result, __ T16B, lo, t0);
3661   }
3662 
3663   /**
3664    *  Arguments:
3665    *
3666    *  Input:
3667    *  c_rarg0   - current state address
3668    *  c_rarg1   - H key address
3669    *  c_rarg2   - data address
3670    *  c_rarg3   - number of blocks
3671    *
3672    *  Output:
3673    *  Updated state at c_rarg0
3674    */
3675   address generate_ghash_processBlocks() {
3676     // Bafflingly, GCM uses little-endian for the byte order, but
3677     // big-endian for the bit order.  For example, the polynomial 1 is
3678     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3679     //
3680     // So, we must either reverse the bytes in each word and do
3681     // everything big-endian or reverse the bits in each byte and do
3682     // it little-endian.  On AArch64 it's more idiomatic to reverse
3683     // the bits in each byte (we have an instruction, RBIT, to do
3684     // that) and keep the data in little-endian bit order throughout the
3685     // calculation, bit-reversing the inputs and outputs.
3686 
3687     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3688     __ align(wordSize * 2);
3689     address p = __ pc();
3690     __ emit_int64(0x87);  // The low-order bits of the field
3691                           // polynomial (i.e. p = z^7+z^2+z+1)
3692                           // repeated in the low and high parts of a
3693                           // 128-bit vector
3694     __ emit_int64(0x87);
3695 
3696     __ align(CodeEntryAlignment);
3697     address start = __ pc();
3698 
3699     Register state   = c_rarg0;
3700     Register subkeyH = c_rarg1;
3701     Register data    = c_rarg2;
3702     Register blocks  = c_rarg3;
3703 
3704     FloatRegister vzr = v30;
3705     __ eor(vzr, __ T16B, vzr, vzr); // zero register
3706 
3707     __ ldrq(v0, Address(state));
3708     __ ldrq(v1, Address(subkeyH));
3709 
3710     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
3711     __ rbit(v0, __ T16B, v0);
3712     __ rev64(v1, __ T16B, v1);
3713     __ rbit(v1, __ T16B, v1);
3714 
3715     __ ldrq(v26, p);
3716 
3717     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v16
3718     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
3719 
3720     {
3721       Label L_ghash_loop;
3722       __ bind(L_ghash_loop);
3723 
3724       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
3725                                                  // reversing each byte
3726       __ rbit(v2, __ T16B, v2);
3727       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
3728 
3729       // Multiply state in v2 by subkey in v1
3730       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
3731                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
3732                      /*temps*/v6, v20, v18, v21);
3733       // Reduce v7:v5 by the field polynomial
3734       ghash_reduce(v0, v5, v7, v26, vzr, v20);
3735 
3736       __ sub(blocks, blocks, 1);
3737       __ cbnz(blocks, L_ghash_loop);
3738     }
3739 
3740     // The bit-reversed result is at this point in v0
3741     __ rev64(v1, __ T16B, v0);
3742     __ rbit(v1, __ T16B, v1);
3743 
3744     __ st1(v1, __ T16B, state);
3745     __ ret(lr);
3746 
3747     return start;
3748   }
3749 
3750   // Continuation point for throwing of implicit exceptions that are
3751   // not handled in the current activation. Fabricates an exception
3752   // oop and initiates normal exception dispatching in this
3753   // frame. Since we need to preserve callee-saved values (currently
3754   // only for C2, but done for C1 as well) we need a callee-saved oop
3755   // map and therefore have to make these stubs into RuntimeStubs
3756   // rather than BufferBlobs.  If the compiler needs all registers to
3757   // be preserved between the fault point and the exception handler
3758   // then it must assume responsibility for that in
3759   // AbstractCompiler::continuation_for_implicit_null_exception or
3760   // continuation_for_implicit_division_by_zero_exception. All other
3761   // implicit exceptions (e.g., NullPointerException or
3762   // AbstractMethodError on entry) are either at call sites or
3763   // otherwise assume that stack unwinding will be initiated, so
3764   // caller saved registers were assumed volatile in the compiler.
3765 
3766 #undef __
3767 #define __ masm->
3768 
3769   address generate_throw_exception(const char* name,
3770                                    address runtime_entry,
3771                                    Register arg1 = noreg,
3772                                    Register arg2 = noreg) {
3773     // Information about frame layout at time of blocking runtime call.
3774     // Note that we only have to preserve callee-saved registers since
3775     // the compilers are responsible for supplying a continuation point
3776     // if they expect all registers to be preserved.
3777     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
3778     enum layout {
3779       rfp_off = 0,
3780       rfp_off2,
3781       return_off,
3782       return_off2,
3783       framesize // inclusive of return address
3784     };
3785 
3786     int insts_size = 512;
3787     int locs_size  = 64;
3788 
3789     CodeBuffer code(name, insts_size, locs_size);
3790     OopMapSet* oop_maps  = new OopMapSet();
3791     MacroAssembler* masm = new MacroAssembler(&code);
3792 
3793     address start = __ pc();
3794 
3795     // This is an inlined and slightly modified version of call_VM
3796     // which has the ability to fetch the return PC out of
3797     // thread-local storage and also sets up last_Java_sp slightly
3798     // differently than the real call_VM
3799 
3800     __ enter(); // Save FP and LR before call
3801 
3802     assert(is_even(framesize/2), "sp not 16-byte aligned");
3803 
3804     // lr and fp are already in place
3805     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
3806 
3807     int frame_complete = __ pc() - start;
3808 
3809     // Set up last_Java_sp and last_Java_fp
3810     address the_pc = __ pc();
3811     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
3812 
3813     // Call runtime
3814     if (arg1 != noreg) {
3815       assert(arg2 != c_rarg1, "clobbered");
3816       __ mov(c_rarg1, arg1);
3817     }
3818     if (arg2 != noreg) {
3819       __ mov(c_rarg2, arg2);
3820     }
3821     __ mov(c_rarg0, rthread);
3822     BLOCK_COMMENT("call runtime_entry");
3823     __ mov(rscratch1, runtime_entry);
3824     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
3825 
3826     // Generate oop map
3827     OopMap* map = new OopMap(framesize, 0);
3828 
3829     oop_maps->add_gc_map(the_pc - start, map);
3830 
3831     __ reset_last_Java_frame(true);
3832     __ maybe_isb();
3833 
3834     __ leave();
3835 
3836     // check for pending exceptions
3837 #ifdef ASSERT
3838     Label L;
3839     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
3840     __ cbnz(rscratch1, L);
3841     __ should_not_reach_here();
3842     __ bind(L);
3843 #endif // ASSERT
3844     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3845 
3846 
3847     // codeBlob framesize is in words (not VMRegImpl::slot_size)
3848     RuntimeStub* stub =
3849       RuntimeStub::new_runtime_stub(name,
3850                                     &code,
3851                                     frame_complete,
3852                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3853                                     oop_maps, false);
3854     return stub->entry_point();
3855   }
3856 
3857   class MontgomeryMultiplyGenerator : public MacroAssembler {
3858 
3859     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3860       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
3861 
3862     RegSet _toSave;
3863     bool _squaring;
3864 
3865   public:
3866     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3867       : MacroAssembler(as->code()), _squaring(squaring) {
3868 
3869       // Register allocation
3870 
3871       Register reg = c_rarg0;
3872       Pa_base = reg;       // Argument registers
3873       if (squaring)
3874         Pb_base = Pa_base;
3875       else
3876         Pb_base = ++reg;
3877       Pn_base = ++reg;
3878       Rlen = ++reg;
3879       inv = ++reg;
3880       Pm_base = ++reg;
3881 
3882                           // Working registers:
3883       Ra =  ++reg;        // The current digit of a, b, n, and m.
3884       Rb =  ++reg;
3885       Rm =  ++reg;
3886       Rn =  ++reg;
3887 
3888       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
3889       Pb =  ++reg;
3890       Pm =  ++reg;
3891       Pn =  ++reg;
3892 
3893       t0 =  ++reg;        // Three registers which form a
3894       t1 =  ++reg;        // triple-precision accumulator.
3895       t2 =  ++reg;
3896 
3897       Ri =  ++reg;        // Inner and outer loop indexes.
3898       Rj =  ++reg;
3899 
3900       Rhi_ab = ++reg;     // Product registers: low and high parts
3901       Rlo_ab = ++reg;     // of a*b and m*n.
3902       Rhi_mn = ++reg;
3903       Rlo_mn = ++reg;
3904 
3905       // r19 and up are callee-saved.
3906       _toSave = RegSet::range(r19, reg) + Pm_base;
3907     }
3908 
3909   private:
3910     void save_regs() {
3911       push(_toSave, sp);
3912     }
3913 
3914     void restore_regs() {
3915       pop(_toSave, sp);
3916     }
3917 
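         // Emit `block` in a loop unrolled by two, executing it `count` times;
         // an odd count is handled by entering at the second copy of the body.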
3918     template <typename T>
3919     void unroll_2(Register count, T block) {
3920       Label loop, end, odd;
3921       tbnz(count, 0, odd);
3922       cbz(count, end);
3923       align(16);
3924       bind(loop);
3925       (this->*block)();
3926       bind(odd);
3927       (this->*block)();
3928       subs(count, count, 2);
3929       br(Assembler::GT, loop);
3930       bind(end);
3931     }
3932 
3933     template <typename T>
3934     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3935       Label loop, end, odd;
3936       tbnz(count, 0, odd);
3937       cbz(count, end);
3938       align(16);
3939       bind(loop);
3940       (this->*block)(d, s, tmp);
3941       bind(odd);
3942       (this->*block)(d, s, tmp);
3943       subs(count, count, 2);
3944       br(Assembler::GT, loop);
3945       bind(end);
3946     }
3947 
3948     void pre1(RegisterOrConstant i) {
3949       block_comment("pre1");
3950       // Pa = Pa_base;
3951       // Pb = Pb_base + i;
3952       // Pm = Pm_base;
3953       // Pn = Pn_base + i;
3954       // Ra = *Pa;
3955       // Rb = *Pb;
3956       // Rm = *Pm;
3957       // Rn = *Pn;
3958       ldr(Ra, Address(Pa_base));
3959       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3960       ldr(Rm, Address(Pm_base));
3961       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3962       lea(Pa, Address(Pa_base));
3963       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3964       lea(Pm, Address(Pm_base));
3965       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3966 
3967       // Zero the m*n result.
3968       mov(Rhi_mn, zr);
3969       mov(Rlo_mn, zr);
3970     }
3971 
3972     // The core multiply-accumulate step of a Montgomery
3973     // multiplication.  The idea is to schedule operations as a
3974     // pipeline so that instructions with long latencies (loads and
3975     // multiplies) have time to complete before their results are
3976     // used.  This most benefits in-order implementations of the
3977     // architecture but out-of-order ones also benefit.
3978     void step() {
3979       block_comment("step");
3980       // MACC(Ra, Rb, t0, t1, t2);
3981       // Ra = *++Pa;
3982       // Rb = *--Pb;
3983       umulh(Rhi_ab, Ra, Rb);
3984       mul(Rlo_ab, Ra, Rb);
3985       ldr(Ra, pre(Pa, wordSize));
3986       ldr(Rb, pre(Pb, -wordSize));
3987       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
3988                                        // previous iteration.
3989       // MACC(Rm, Rn, t0, t1, t2);
3990       // Rm = *++Pm;
3991       // Rn = *--Pn;
3992       umulh(Rhi_mn, Rm, Rn);
3993       mul(Rlo_mn, Rm, Rn);
3994       ldr(Rm, pre(Pm, wordSize));
3995       ldr(Rn, pre(Pn, -wordSize));
3996       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3997     }
3998 
3999     void post1() {
4000       block_comment("post1");
4001 
4002       // MACC(Ra, Rb, t0, t1, t2);
4003       // Ra = *++Pa;
4004       // Rb = *--Pb;
4005       umulh(Rhi_ab, Ra, Rb);
4006       mul(Rlo_ab, Ra, Rb);
4007       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4008       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4009 
4010       // *Pm = Rm = t0 * inv;
4011       mul(Rm, t0, inv);
4012       str(Rm, Address(Pm));
4013 
4014       // MACC(Rm, Rn, t0, t1, t2);
4015       // t0 = t1; t1 = t2; t2 = 0;
4016       umulh(Rhi_mn, Rm, Rn);
4017 
4018 #ifndef PRODUCT
4019       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4020       {
4021         mul(Rlo_mn, Rm, Rn);
4022         add(Rlo_mn, t0, Rlo_mn);
4023         Label ok;
4024         cbz(Rlo_mn, ok); {
4025           stop("broken Montgomery multiply");
4026         } bind(ok);
4027       }
4028 #endif
4029       // We have very carefully set things up so that
4030       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4031       // the lower half of Rm * Rn because we know the result already:
4032       // it must be -t0.  t0 + (-t0) must generate a carry iff
4033       // t0 != 0.  So, rather than do a mul and an adds we just set
4034       // the carry flag iff t0 is nonzero.
4035       //
4036       // mul(Rlo_mn, Rm, Rn);
4037       // adds(zr, t0, Rlo_mn);
4038       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4039       adcs(t0, t1, Rhi_mn);
4040       adc(t1, t2, zr);
4041       mov(t2, zr);
4042     }
4043 
4044     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
4045       block_comment("pre2");
4046       // Pa = Pa_base + i-len;
4047       // Pb = Pb_base + len;
4048       // Pm = Pm_base + i-len;
4049       // Pn = Pn_base + len;
4050 
4051       if (i.is_register()) {
4052         sub(Rj, i.as_register(), len);
4053       } else {
4054         mov(Rj, i.as_constant());
4055         sub(Rj, Rj, len);
4056       }
4057       // Rj == i-len
4058 
4059       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
4060       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
4061       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4062       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
4063 
4064       // Ra = *++Pa;
4065       // Rb = *--Pb;
4066       // Rm = *++Pm;
4067       // Rn = *--Pn;
4068       ldr(Ra, pre(Pa, wordSize));
4069       ldr(Rb, pre(Pb, -wordSize));
4070       ldr(Rm, pre(Pm, wordSize));
4071       ldr(Rn, pre(Pn, -wordSize));
4072 
4073       mov(Rhi_mn, zr);
4074       mov(Rlo_mn, zr);
4075     }
4076 
4077     void post2(RegisterOrConstant i, RegisterOrConstant len) {
4078       block_comment("post2");
4079       if (i.is_constant()) {
4080         mov(Rj, i.as_constant()-len.as_constant());
4081       } else {
4082         sub(Rj, i.as_register(), len);
4083       }
4084 
4085       adds(t0, t0, Rlo_mn); // The pending m*n, low part
4086 
4087       // As soon as we know the least significant digit of our result,
4088       // store it.
4089       // Pm_base[i-len] = t0;
4090       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4091 
4092       // t0 = t1; t1 = t2; t2 = 0;
4093       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
4094       adc(t1, t2, zr);
4095       mov(t2, zr);
4096     }
4097 
4098     // A carry in t0 after Montgomery multiplication means that we
4099     // should subtract multiples of n from our result in m.  We'll
4100     // keep doing that until there is no carry.
4101     void normalize(RegisterOrConstant len) {
4102       block_comment("normalize");
4103       // while (t0)
4104       //   t0 = sub(Pm_base, Pn_base, t0, len);
4105       Label loop, post, again;
4106       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
4107       cbz(t0, post); {
4108         bind(again); {
4109           mov(i, zr);
4110           mov(cnt, len);
4111           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4112           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4113           subs(zr, zr, zr); // set carry flag, i.e. no borrow
4114           align(16);
4115           bind(loop); {
4116             sbcs(Rm, Rm, Rn);
4117             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4118             add(i, i, 1);
4119             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4120             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4121             sub(cnt, cnt, 1);
4122           } cbnz(cnt, loop);
4123           sbc(t0, t0, zr);
4124         } cbnz(t0, again);
4125       } bind(post);
4126     }
4127 
4128     // Move memory at s to d, reversing words.
4129     //    Increments d to end of copied memory
4130     //    Destroys tmp1, tmp2
4131     //    Preserves len
4132     //    Leaves s pointing to the address which was in d at start
4133     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
4134       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
4135 
4136       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
4137       mov(tmp1, len);
4138       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
4139       sub(s, d, len, ext::uxtw, LogBytesPerWord);
4140     }
4141     // where
4142     void reverse1(Register d, Register s, Register tmp) {
4143       ldr(tmp, pre(s, -wordSize));
4144       ror(tmp, tmp, 32);
4145       str(tmp, post(d, wordSize));
4146     }
4147 
4148     void step_squaring() {
4149       // An extra ACC
4150       step();
4151       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4152     }
4153 
4154     void last_squaring(RegisterOrConstant i) {
4155       Label dont;
4156       // if ((i & 1) == 0) {
4157       tbnz(i.as_register(), 0, dont); {
4158         // MACC(Ra, Rb, t0, t1, t2);
4159         // Ra = *++Pa;
4160         // Rb = *--Pb;
4161         umulh(Rhi_ab, Ra, Rb);
4162         mul(Rlo_ab, Ra, Rb);
4163         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4164       } bind(dont);
4165     }
4166 
4167     void extra_step_squaring() {
4168       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4169 
4170       // MACC(Rm, Rn, t0, t1, t2);
4171       // Rm = *++Pm;
4172       // Rn = *--Pn;
4173       umulh(Rhi_mn, Rm, Rn);
4174       mul(Rlo_mn, Rm, Rn);
4175       ldr(Rm, pre(Pm, wordSize));
4176       ldr(Rn, pre(Pn, -wordSize));
4177     }
4178 
4179     void post1_squaring() {
4180       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4181 
4182       // *Pm = Rm = t0 * inv;
4183       mul(Rm, t0, inv);
4184       str(Rm, Address(Pm));
4185 
4186       // MACC(Rm, Rn, t0, t1, t2);
4187       // t0 = t1; t1 = t2; t2 = 0;
4188       umulh(Rhi_mn, Rm, Rn);
4189 
4190 #ifndef PRODUCT
4191       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4192       {
4193         mul(Rlo_mn, Rm, Rn);
4194         add(Rlo_mn, t0, Rlo_mn);
4195         Label ok;
4196         cbz(Rlo_mn, ok); {
4197           stop("broken Montgomery multiply");
4198         } bind(ok);
4199       }
4200 #endif
4201       // We have very carefully set things up so that
4202       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4203       // the lower half of Rm * Rn because we know the result already:
4204       // it must be -t0.  t0 + (-t0) must generate a carry iff
4205       // t0 != 0.  So, rather than do a mul and an adds we just set
4206       // the carry flag iff t0 is nonzero.
4207       //
4208       // mul(Rlo_mn, Rm, Rn);
4209       // adds(zr, t0, Rlo_mn);
4210       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4211       adcs(t0, t1, Rhi_mn);
4212       adc(t1, t2, zr);
4213       mov(t2, zr);
4214     }
4215 
4216     void acc(Register Rhi, Register Rlo,
4217              Register t0, Register t1, Register t2) {
4218       adds(t0, t0, Rlo);
4219       adcs(t1, t1, Rhi);
4220       adc(t2, t2, zr);
4221     }
4222 
4223   public:
4224     /**
4225      * Fast Montgomery multiplication.  The derivation of the
4226      * algorithm is in A Cryptographic Library for the Motorola
4227      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
4228      *
4229      * Arguments:
4230      *
4231      * Inputs for multiplication:
4232      *   c_rarg0   - int array elements a
4233      *   c_rarg1   - int array elements b
4234      *   c_rarg2   - int array elements n (the modulus)
4235      *   c_rarg3   - int length
4236      *   c_rarg4   - int inv
4237      *   c_rarg5   - int array elements m (the result)
4238      *
4239      * Inputs for squaring:
4240      *   c_rarg0   - int array elements a
4241      *   c_rarg1   - int array elements n (the modulus)
4242      *   c_rarg2   - int length
4243      *   c_rarg3   - int inv
4244      *   c_rarg4   - int array elements m (the result)
4245      *
4246      */
4247     address generate_multiply() {
4248       Label argh, nothing;
4249       bind(argh);
4250       stop("MontgomeryMultiply total_allocation must be <= 8192");
4251 
4252       align(CodeEntryAlignment);
4253       address entry = pc();
4254 
4255       cbzw(Rlen, nothing);
4256 
4257       enter();
4258 
4259       // Make room.
4260       cmpw(Rlen, 512);
4261       br(Assembler::HI, argh);
4262       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4263       andr(sp, Ra, -2 * wordSize);
4264 
4265       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4266 
4267       {
4268         // Copy input args, reversing as we go.  We use Ra as a
4269         // temporary variable.
4270         reverse(Ra, Pa_base, Rlen, t0, t1);
4271         if (!_squaring)
4272           reverse(Ra, Pb_base, Rlen, t0, t1);
4273         reverse(Ra, Pn_base, Rlen, t0, t1);
4274       }
4275 
4276       // Push all call-saved registers and also Pm_base which we'll need
4277       // at the end.
4278       save_regs();
4279 
4280 #ifndef PRODUCT
4281       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
4282       {
4283         ldr(Rn, Address(Pn_base, 0));
4284         mul(Rlo_mn, Rn, inv);
4285         cmp(Rlo_mn, -1);
4286         Label ok;
4287         br(EQ, ok); {
4288           stop("broken inverse in Montgomery multiply");
4289         } bind(ok);
4290       }
4291 #endif
4292 
4293       mov(Pm_base, Ra);
4294 
4295       mov(t0, zr);
4296       mov(t1, zr);
4297       mov(t2, zr);
4298 
4299       block_comment("for (int i = 0; i < len; i++) {");
4300       mov(Ri, zr); {
4301         Label loop, end;
4302         cmpw(Ri, Rlen);
4303         br(Assembler::GE, end);
4304 
4305         bind(loop);
4306         pre1(Ri);
4307 
4308         block_comment("  for (j = i; j; j--) {"); {
4309           movw(Rj, Ri);
4310           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4311         } block_comment("  } // j");
4312 
4313         post1();
4314         addw(Ri, Ri, 1);
4315         cmpw(Ri, Rlen);
4316         br(Assembler::LT, loop);
4317         bind(end);
4318         block_comment("} // i");
4319       }
4320 
4321       block_comment("for (int i = len; i < 2*len; i++) {");
4322       mov(Ri, Rlen); {
4323         Label loop, end;
4324         cmpw(Ri, Rlen, Assembler::LSL, 1);
4325         br(Assembler::GE, end);
4326 
4327         bind(loop);
4328         pre2(Ri, Rlen);
4329 
4330         block_comment("  for (j = len*2-i-1; j; j--) {"); {
4331           lslw(Rj, Rlen, 1);
4332           subw(Rj, Rj, Ri);
4333           subw(Rj, Rj, 1);
4334           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4335         } block_comment("  } // j");
4336 
4337         post2(Ri, Rlen);
4338         addw(Ri, Ri, 1);
4339         cmpw(Ri, Rlen, Assembler::LSL, 1);
4340         br(Assembler::LT, loop);
4341         bind(end);
4342       }
4343       block_comment("} // i");
4344 
4345       normalize(Rlen);
4346 
4347       mov(Ra, Pm_base);  // Save Pm_base in Ra
4348       restore_regs();  // Restore caller's Pm_base
4349 
4350       // Copy our result into caller's Pm_base
4351       reverse(Pm_base, Ra, Rlen, t0, t1);
4352 
4353       leave();
4354       bind(nothing);
4355       ret(lr);
4356 
4357       return entry;
4358     }
4359     // In C, approximately:
4360 
4361     // void
4362     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
4363     //                     unsigned long Pn_base[], unsigned long Pm_base[],
4364     //                     unsigned long inv, int len) {
4365     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4366     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4367     //   unsigned long Ra, Rb, Rn, Rm;
4368 
4369     //   int i;
4370 
4371     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4372 
4373     //   for (i = 0; i < len; i++) {
4374     //     int j;
4375 
4376     //     Pa = Pa_base;
4377     //     Pb = Pb_base + i;
4378     //     Pm = Pm_base;
4379     //     Pn = Pn_base + i;
4380 
4381     //     Ra = *Pa;
4382     //     Rb = *Pb;
4383     //     Rm = *Pm;
4384     //     Rn = *Pn;
4385 
4386     //     int iters = i;
4387     //     for (j = 0; iters--; j++) {
4388     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4389     //       MACC(Ra, Rb, t0, t1, t2);
4390     //       Ra = *++Pa;
4391     //       Rb = *--Pb;
4392     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4393     //       MACC(Rm, Rn, t0, t1, t2);
4394     //       Rm = *++Pm;
4395     //       Rn = *--Pn;
4396     //     }
4397 
4398     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
4399     //     MACC(Ra, Rb, t0, t1, t2);
4400     //     *Pm = Rm = t0 * inv;
4401     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4402     //     MACC(Rm, Rn, t0, t1, t2);
4403 
4404     //     assert(t0 == 0, "broken Montgomery multiply");
4405 
4406     //     t0 = t1; t1 = t2; t2 = 0;
4407     //   }
4408 
4409     //   for (i = len; i < 2*len; i++) {
4410     //     int j;
4411 
4412     //     Pa = Pa_base + i-len;
4413     //     Pb = Pb_base + len;
4414     //     Pm = Pm_base + i-len;
4415     //     Pn = Pn_base + len;
4416 
4417     //     Ra = *++Pa;
4418     //     Rb = *--Pb;
4419     //     Rm = *++Pm;
4420     //     Rn = *--Pn;
4421 
4422     //     int iters = len*2-i-1;
4423     //     for (j = i-len+1; iters--; j++) {
4424     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4425     //       MACC(Ra, Rb, t0, t1, t2);
4426     //       Ra = *++Pa;
4427     //       Rb = *--Pb;
4428     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4429     //       MACC(Rm, Rn, t0, t1, t2);
4430     //       Rm = *++Pm;
4431     //       Rn = *--Pn;
4432     //     }
4433 
4434     //     Pm_base[i-len] = t0;
4435     //     t0 = t1; t1 = t2; t2 = 0;
4436     //   }
4437 
4438     //   while (t0)
4439     //     t0 = sub(Pm_base, Pn_base, t0, len);
4440     // }
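
         // The MACC and sub helpers referenced by the pseudocode above are not
         // defined in this file.  Purely as an illustrative sketch (assuming a
         // compiler that provides unsigned __int128), they might be written as
         // follows; this is not the generated stub code itself:

         // // t2:t1:t0 += a * b  (128-bit product added into the triple-word
         // // accumulator).
         // #define MACC(a, b, t0, t1, t2)                                  \
         //   do {                                                          \
         //     unsigned __int128 p = (unsigned __int128)(a) * (b);         \
         //     unsigned long lo = (unsigned long)p;                        \
         //     unsigned long hi = (unsigned long)(p >> 64);                \
         //     t0 += lo; hi += (t0 < lo);  /* carry out of the low word */ \
         //     t1 += hi; t2 += (t1 < hi);  /* carry into the top word   */ \
         //   } while (0)

         // // Subtract 0:Pn from t0:Pm in place and return the new top word,
         // // as in "t0 = sub(Pm_base, Pn_base, t0, len)" above.
         // static unsigned long sub(unsigned long Pm[], unsigned long Pn[],
         //                          unsigned long t0, int len) {
         //   unsigned long borrow = 0;
         //   for (int i = 0; i < len; i++) {
         //     unsigned long s = Pn[i] + borrow;
         //     unsigned long b = (s < borrow) | (Pm[i] < s);
         //     Pm[i] -= s;
         //     borrow = b;
         //   }
         //   return t0 - borrow;
         // }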
4441 
4442     /**
4443      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
4444      * multiplies than Montgomery multiplication so it should be up to
4445      * 25% faster.  However, its loop control is more complex and it
4446      * may actually run slower on some machines.
4447      *
4448      * Arguments:
4449      *
4450      * Inputs:
4451      *   c_rarg0   - int array elements a
4452      *   c_rarg1   - int array elements n (the modulus)
4453      *   c_rarg2   - int length
4454      *   c_rarg3   - int inv
4455      *   c_rarg4   - int array elements m (the result)
4456      *
4457      */
4458     address generate_square() {
4459       Label argh;
4460       bind(argh);
4461       stop("MontgomerySquare total_allocation must be <= 8192");
4462 
4463       align(CodeEntryAlignment);
4464       address entry = pc();
4465 
4466       enter();
4467 
4468       // Make room on the stack for the working arrays:
           // Rlen * 4 * sizeof(jint) bytes (at most 8192, since Rlen <= 512),
           // keeping sp 16-byte aligned.
4469       cmpw(Rlen, 512);
4470       br(Assembler::HI, argh);
4471       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4472       andr(sp, Ra, -2 * wordSize);
4473 
4474       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4475 
4476       {
4477         // Copy input args, reversing as we go.  We use Ra as a
4478         // temporary variable.
4479         reverse(Ra, Pa_base, Rlen, t0, t1);
4480         reverse(Ra, Pn_base, Rlen, t0, t1);
4481       }
4482 
4483       // Push all call-saved registers, and also Pm_base, which we'll
4484       // need at the end.
4485       save_regs();
4486 
4487       mov(Pm_base, Ra);
4488 
4489       mov(t0, zr);
4490       mov(t1, zr);
4491       mov(t2, zr);
4492 
4493       block_comment("for (int i = 0; i < len; i++) {");
4494       mov(Ri, zr); {
4495         Label loop, end;
4496         bind(loop);
4497         cmp(Ri, Rlen);
4498         br(Assembler::GE, end);
4499 
4500         pre1(Ri);
4501 
4502         block_comment("for (j = (i+1)/2; j; j--) {"); {
4503           add(Rj, Ri, 1);
4504           lsr(Rj, Rj, 1);
4505           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4506         } block_comment("  } // j");
4507 
4508         last_squaring(Ri);
4509 
4510         block_comment("  for (j = i/2; j; j--) {"); {
4511           lsr(Rj, Ri, 1);
4512           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4513         } block_comment("  } // j");
4514 
4515         post1_squaring();
4516         add(Ri, Ri, 1);
4517         cmp(Ri, Rlen);
4518         br(Assembler::LT, loop);
4519 
4520         bind(end);
4521         block_comment("} // i");
4522       }
4523 
4524       block_comment("for (int i = len; i < 2*len; i++) {");
4525       mov(Ri, Rlen); {
4526         Label loop, end;
4527         bind(loop);
4528         cmp(Ri, Rlen, Assembler::LSL, 1);
4529         br(Assembler::GE, end);
4530 
4531         pre2(Ri, Rlen);
4532 
4533         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4534           lsl(Rj, Rlen, 1);
4535           sub(Rj, Rj, Ri);
4536           sub(Rj, Rj, 1);
4537           lsr(Rj, Rj, 1);
4538           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4539         } block_comment("  } // j");
4540 
4541         last_squaring(Ri);
4542 
4543         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4544           lsl(Rj, Rlen, 1);
4545           sub(Rj, Rj, Ri);
4546           lsr(Rj, Rj, 1);
4547           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4548         } block_comment("  } // j");
4549 
4550         post2(Ri, Rlen);
4551         add(Ri, Ri, 1);
4552         cmp(Ri, Rlen, Assembler::LSL, 1);
4553 
4554         br(Assembler::LT, loop);
4555         bind(end);
4556         block_comment("} // i");
4557       }
4558 
4559       normalize(Rlen);
4560 
4561       mov(Ra, Pm_base);  // Save Pm_base in Ra
4562       restore_regs();  // Restore caller's Pm_base
4563 
4564       // Copy our result into caller's Pm_base
4565       reverse(Pm_base, Ra, Rlen, t0, t1);
4566 
4567       leave();
4568       ret(lr);
4569 
4570       return entry;
4571     }
4572     // In C, approximately:
4573 
4574     // void
4575     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4576     //                   unsigned long Pm_base[], unsigned long inv, int len) {
4577     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4578     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4579     //   unsigned long Ra, Rb, Rn, Rm;
4580 
4581     //   int i;
4582 
4583     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4584 
4585     //   for (i = 0; i < len; i++) {
4586     //     int j;
4587 
4588     //     Pa = Pa_base;
4589     //     Pb = Pa_base + i;
4590     //     Pm = Pm_base;
4591     //     Pn = Pn_base + i;
4592 
4593     //     Ra = *Pa;
4594     //     Rb = *Pb;
4595     //     Rm = *Pm;
4596     //     Rn = *Pn;
4597 
4598     //     int iters = (i+1)/2;
4599     //     for (j = 0; iters--; j++) {
4600     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4601     //       MACC2(Ra, Rb, t0, t1, t2);
4602     //       Ra = *++Pa;
4603     //       Rb = *--Pb;
4604     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4605     //       MACC(Rm, Rn, t0, t1, t2);
4606     //       Rm = *++Pm;
4607     //       Rn = *--Pn;
4608     //     }
4609     //     if ((i & 1) == 0) {
4610     //       assert(Ra == Pa_base[j], "must be");
4611     //       MACC(Ra, Ra, t0, t1, t2);
4612     //     }
4613     //     iters = i/2;
4614     //     assert(iters == i-j, "must be");
4615     //     for (; iters--; j++) {
4616     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4617     //       MACC(Rm, Rn, t0, t1, t2);
4618     //       Rm = *++Pm;
4619     //       Rn = *--Pn;
4620     //     }
4621 
4622     //     *Pm = Rm = t0 * inv;
4623     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4624     //     MACC(Rm, Rn, t0, t1, t2);
4625 
4626     //     assert(t0 == 0, "broken Montgomery multiply");
4627 
4628     //     t0 = t1; t1 = t2; t2 = 0;
4629     //   }
4630 
4631     //   for (i = len; i < 2*len; i++) {
4632     //     int start = i-len+1;
4633     //     int end = start + (len - start)/2;
4634     //     int j;
4635 
4636     //     Pa = Pa_base + i-len;
4637     //     Pb = Pa_base + len;
4638     //     Pm = Pm_base + i-len;
4639     //     Pn = Pn_base + len;
4640 
4641     //     Ra = *++Pa;
4642     //     Rb = *--Pb;
4643     //     Rm = *++Pm;
4644     //     Rn = *--Pn;
4645 
4646     //     int iters = (2*len-i-1)/2;
4647     //     assert(iters == end-start, "must be");
4648     //     for (j = start; iters--; j++) {
4649     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4650     //       MACC2(Ra, Rb, t0, t1, t2);
4651     //       Ra = *++Pa;
4652     //       Rb = *--Pb;
4653     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4654     //       MACC(Rm, Rn, t0, t1, t2);
4655     //       Rm = *++Pm;
4656     //       Rn = *--Pn;
4657     //     }
4658     //     if ((i & 1) == 0) {
4659     //       assert(Ra == Pa_base[j], "must be");
4660     //       MACC(Ra, Ra, t0, t1, t2);
4661     //     }
4662     //     iters = (2*len-i)/2;
4663     //     assert(iters == len-j, "must be");
4664     //     for (; iters--; j++) {
4665     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4666     //       MACC(Rm, Rn, t0, t1, t2);
4667     //       Rm = *++Pm;
4668     //       Rn = *--Pn;
4669     //     }
4670     //     Pm_base[i-len] = t0;
4671     //     t0 = t1; t1 = t2; t2 = 0;
4672     //   }
4673 
4674     //   while (t0)
4675     //     t0 = sub(Pm_base, Pn_base, t0, len);
4676     // }
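
         // MACC2, used only in the squaring pseudocode above, accumulates the
         // same 128-bit product twice (the doubled off-diagonal term).  A
         // minimal sketch, under the same assumptions as the MACC sketch given
         // after montgomery_multiply:

         // #define MACC2(a, b, t0, t1, t2)                                 \
         //   do {                                                          \
         //     MACC(a, b, t0, t1, t2);                                     \
         //     MACC(a, b, t0, t1, t2);                                     \
         //   } while (0)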
4677   };
4678 
4679   // Initialization
4680   void generate_initial() {
4681     // Generate the initial stubs and initialize the entry points.
4682 
4683     // Entry points that exist on all platforms.  Note: this is code
4684     // that could be shared among different platforms - however, the
4685     // benefit seems to be smaller than the disadvantage of having a
4686     // much more complicated generator structure.  See also the comment
4687     // in stubRoutines.hpp.
4688 
4689     StubRoutines::_forward_exception_entry = generate_forward_exception();
4690 
4691     StubRoutines::_call_stub_entry =
4692       generate_call_stub(StubRoutines::_call_stub_return_address);
4693 
4694     // This entry point is referenced by megamorphic calls.
4695     StubRoutines::_catch_exception_entry = generate_catch_exception();
4696 
4697     // Build this early so it's available for the interpreter.
4698     StubRoutines::_throw_StackOverflowError_entry =
4699       generate_throw_exception("StackOverflowError throw_exception",
4700                                CAST_FROM_FN_PTR(address,
4701                                                 SharedRuntime::throw_StackOverflowError));
4702     StubRoutines::_throw_delayed_StackOverflowError_entry =
4703       generate_throw_exception("delayed StackOverflowError throw_exception",
4704                                CAST_FROM_FN_PTR(address,
4705                                                 SharedRuntime::throw_delayed_StackOverflowError));
4706     if (UseCRC32Intrinsics) {
4707       // Set the table address before generating the stub, which uses it.
4708       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4709       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4710     }
4711   }
4712 
4713   void generate_all() {
4714     // support for verify_oop (must happen after universe_init)
4715     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
4716     StubRoutines::_throw_AbstractMethodError_entry =
4717       generate_throw_exception("AbstractMethodError throw_exception",
4718                                CAST_FROM_FN_PTR(address,
4719                                                 SharedRuntime::
4720                                                 throw_AbstractMethodError));
4721 
4722     StubRoutines::_throw_IncompatibleClassChangeError_entry =
4723       generate_throw_exception("IncompatibleClassChangeError throw_exception",
4724                                CAST_FROM_FN_PTR(address,
4725                                                 SharedRuntime::
4726                                                 throw_IncompatibleClassChangeError));
4727 
4728     StubRoutines::_throw_NullPointerException_at_call_entry =
4729       generate_throw_exception("NullPointerException at call throw_exception",
4730                                CAST_FROM_FN_PTR(address,
4731                                                 SharedRuntime::
4732                                                 throw_NullPointerException_at_call));
4733 
4734     // arraycopy stubs used by compilers
4735     generate_arraycopy_stubs();
4736 
4737     if (UseMultiplyToLenIntrinsic) {
4738       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4739     }
4740 
4741     if (UseMontgomeryMultiplyIntrinsic) {
4742       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4743       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4744       StubRoutines::_montgomeryMultiply = g.generate_multiply();
4745     }
4746 
4747     if (UseMontgomerySquareIntrinsic) {
4748       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4749       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4750       // We use generate_multiply() rather than generate_square()
4751       // because it's faster for the sizes of modulus we care about.
4752       StubRoutines::_montgomerySquare = g.generate_multiply();
4753     }
4754 
4755 #ifndef BUILTIN_SIM
4756     // generate GHASH intrinsics code
4757     if (UseGHASHIntrinsics) {
4758       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
4759     }
4760 
4761     if (UseAESIntrinsics) {
4762       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4763       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4764       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4765       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
4766     }
4767 
4768     if (UseSHA1Intrinsics) {
4769       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
4770       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
4771     }
4772     if (UseSHA256Intrinsics) {
4773       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
4774       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
4775     }
4776 
4777     if (UseCRC32CIntrinsics) {
4778       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
4779     }
4780 
4781     // generate Adler32 intrinsics code
4782     if (UseAdler32Intrinsics) {
4783       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
4784     }
4785 
4786     // Safefetch stubs.
4787     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
4788                                                        &StubRoutines::_safefetch32_fault_pc,
4789                                                        &StubRoutines::_safefetch32_continuation_pc);
4790     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
4791                                                        &StubRoutines::_safefetchN_fault_pc,
4792                                                        &StubRoutines::_safefetchN_continuation_pc);
4793 #endif
4794     StubRoutines::aarch64::set_completed();
4795   }
4796 
4797  public:
4798   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
4799     if (all) {
4800       generate_all();
4801     } else {
4802       generate_initial();
4803     }
4804   }
4805 }; // end class declaration
4806 
4807 void StubGenerator_generate(CodeBuffer* code, bool all) {
4808   StubGenerator g(code, all);
4809 }