1 /*
   2  * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "interpreter/interpreter.hpp"
  30 #include "nativeInst_aarch64.hpp"
  31 #include "oops/instanceOop.hpp"
  32 #include "oops/method.hpp"
  33 #include "oops/objArrayKlass.hpp"
  34 #include "oops/oop.inline.hpp"
  35 #include "prims/methodHandles.hpp"
  36 #include "runtime/frame.inline.hpp"
  37 #include "runtime/handles.inline.hpp"
  38 #include "runtime/sharedRuntime.hpp"
  39 #include "runtime/stubCodeGenerator.hpp"
  40 #include "runtime/stubRoutines.hpp"
  41 #include "runtime/thread.inline.hpp"
  42 #ifdef COMPILER2
  43 #include "opto/runtime.hpp"
  44 #endif
  45 
  46 #ifdef BUILTIN_SIM
  47 #include "../../../../../../simulator/simulator.hpp"
  48 #endif
  49 
  50 // Declaration and definition of StubGenerator (no .hpp file).
  51 // For a more detailed description of the stub routine structure
  52 // see the comment in stubRoutines.hpp
  53 
  54 #undef __
  55 #define __ _masm->
  56 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
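     // TIMES_OOP: scaled-index operand that sign-extends a 32-bit index and
     // scales it by the heap-oop size (4 bytes with compressed oops, 8 otherwise).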
  57 
  58 #ifdef PRODUCT
  59 #define BLOCK_COMMENT(str) /* nothing */
  60 #else
  61 #define BLOCK_COMMENT(str) __ block_comment(str)
  62 #endif
  63 
  64 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  65 
  66 // Stub Code definitions
  67 
  68 class StubGenerator: public StubCodeGenerator {
  69  private:
  70 
  71 #ifdef PRODUCT
  72 #define inc_counter_np(counter) ((void)0)
  73 #else
  74   void inc_counter_np_(int& counter) {
  75     __ lea(rscratch2, ExternalAddress((address)&counter));
  76     __ ldrw(rscratch1, Address(rscratch2));
  77     __ addw(rscratch1, rscratch1, 1);
  78     __ strw(rscratch1, Address(rscratch2));
  79   }
  80 #define inc_counter_np(counter) \
  81   BLOCK_COMMENT("inc_counter " #counter); \
  82   inc_counter_np_(counter);
  83 #endif
  84 
  85   // Call stubs are used to call Java from C
  86   //
  87   // Arguments:
  88   //    c_rarg0:   call wrapper address                   address
  89   //    c_rarg1:   result                                 address
  90   //    c_rarg2:   result type                            BasicType
  91   //    c_rarg3:   method                                 Method*
  92   //    c_rarg4:   (interpreter) entry point              address
  93   //    c_rarg5:   parameters                             intptr_t*
  94   //    c_rarg6:   parameter size (in words)              int
  95   //    c_rarg7:   thread                                 Thread*
  96   //
  97   // There is no return from the stub itself as any Java result
  98   // is written to result
  99   //
 100   // we save r30 (lr) as the return PC at the base of the frame and
 101   // link r29 (fp) below it as the frame pointer, then install sp
 102   // (r31) into fp.
 103   //
 104   // we save r0-r7, which accounts for all the c arguments.
 105   //
 106   // TODO: strictly do we need to save them all? they are treated as
 107   // volatile by C so could we omit saving the ones we are going to
 108   // place in global registers (thread? method?) or those we only use
 109   // during setup of the Java call?
 110   //
 111   // we don't need to save r8 which C uses as an indirect result location
 112   // return register.
 113   //
 114   // we don't need to save r9-r15 which both C and Java treat as
 115   // volatile
 116   //
 117   // we don't need to save r16-18 because Java does not use them
 118   //
 119   // we save r19-r28 which Java uses as scratch registers and C
 120   // expects to be callee-save
 121   //
 122   // we save the bottom 64 bits of each value stored in v8-v15; it is
 123   // the responsibility of the caller to preserve larger values.
 124   //
 125   // so the stub frame looks like this when we enter Java code
 126   //
 127   //     [ return_from_Java     ] <--- sp
 128   //     [ argument word n      ]
 129   //      ...
 130   // -27 [ argument word 1      ]
 131   // -26 [ saved v15            ] <--- sp_after_call
 132   // -25 [ saved v14            ]
 133   // -24 [ saved v13            ]
 134   // -23 [ saved v12            ]
 135   // -22 [ saved v11            ]
 136   // -21 [ saved v10            ]
 137   // -20 [ saved v9             ]
 138   // -19 [ saved v8             ]
 139   // -18 [ saved r28            ]
 140   // -17 [ saved r27            ]
 141   // -16 [ saved r26            ]
 142   // -15 [ saved r25            ]
 143   // -14 [ saved r24            ]
 144   // -13 [ saved r23            ]
 145   // -12 [ saved r22            ]
 146   // -11 [ saved r21            ]
 147   // -10 [ saved r20            ]
 148   //  -9 [ saved r19            ]
 149   //  -8 [ call wrapper    (r0) ]
 150   //  -7 [ result          (r1) ]
 151   //  -6 [ result type     (r2) ]
 152   //  -5 [ method          (r3) ]
 153   //  -4 [ entry point     (r4) ]
 154   //  -3 [ parameters      (r5) ]
 155   //  -2 [ parameter size  (r6) ]
 156   //  -1 [ thread (r7)          ]
 157   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 158   //   1 [ saved lr       (r30) ]
 159 
 160   // Call stub stack layout word offsets from fp
 161   enum call_stub_layout {
 162     sp_after_call_off = -26,
 163 
 164     d15_off            = -26,
 165     d13_off            = -24,
 166     d11_off            = -22,
 167     d9_off             = -20,
 168 
 169     r28_off            = -18,
 170     r26_off            = -16,
 171     r24_off            = -14,
 172     r22_off            = -12,
 173     r20_off            = -10,
 174     call_wrapper_off   =  -8,
 175     result_off         =  -7,
 176     result_type_off    =  -6,
 177     method_off         =  -5,
 178     entry_point_off    =  -4,
 179     parameter_size_off =  -2,
 180     thread_off         =  -1,
 181     fp_f               =   0,
 182     retaddr_off        =   1,
 183   };
 184 
 185   address generate_call_stub(address& return_address) {
 186     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 187            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 188            "adjust this code");
 189 
 190     StubCodeMark mark(this, "StubRoutines", "call_stub");
 191     address start = __ pc();
 192 
 193     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 194 
 195     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 196     const Address result        (rfp, result_off         * wordSize);
 197     const Address result_type   (rfp, result_type_off    * wordSize);
 198     const Address method        (rfp, method_off         * wordSize);
 199     const Address entry_point   (rfp, entry_point_off    * wordSize);
 200     const Address parameter_size(rfp, parameter_size_off * wordSize);
 201 
 202     const Address thread        (rfp, thread_off         * wordSize);
 203 
 204     const Address d15_save      (rfp, d15_off * wordSize);
 205     const Address d13_save      (rfp, d13_off * wordSize);
 206     const Address d11_save      (rfp, d11_off * wordSize);
 207     const Address d9_save       (rfp, d9_off * wordSize);
 208 
 209     const Address r28_save      (rfp, r28_off * wordSize);
 210     const Address r26_save      (rfp, r26_off * wordSize);
 211     const Address r24_save      (rfp, r24_off * wordSize);
 212     const Address r22_save      (rfp, r22_off * wordSize);
 213     const Address r20_save      (rfp, r20_off * wordSize);
 214 
 215     // stub code
 216 
 217     // we need a C prolog to bootstrap the x86 caller into the sim
 218     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 219 
 220     address aarch64_entry = __ pc();
 221 
 222 #ifdef BUILTIN_SIM
 223     // Save sender's SP for stack traces.
 224     __ mov(rscratch1, sp);
 225     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 226 #endif
 227     // set up frame and move sp to end of save area
 228     __ enter();
 229     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 230 
 231     // save register parameters and Java scratch/global registers
 232     // n.b. we save thread even though it gets installed in
 233     // rthread because we want to sanity check rthread later
 234     __ str(c_rarg7,  thread);
 235     __ strw(c_rarg6, parameter_size);
 236     __ stp(c_rarg4, c_rarg5,  entry_point);
 237     __ stp(c_rarg2, c_rarg3,  result_type);
 238     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 239 
 240     __ stp(r20, r19,   r20_save);
 241     __ stp(r22, r21,   r22_save);
 242     __ stp(r24, r23,   r24_save);
 243     __ stp(r26, r25,   r26_save);
 244     __ stp(r28, r27,   r28_save);
 245 
 246     __ stpd(v9,  v8,   d9_save);
 247     __ stpd(v11, v10,  d11_save);
 248     __ stpd(v13, v12,  d13_save);
 249     __ stpd(v15, v14,  d15_save);
 250 
 251     // install Java thread in global register now we have saved
 252     // whatever value it held
 253     __ mov(rthread, c_rarg7);
 254     // And method
 255     __ mov(rmethod, c_rarg3);
 256 
 257     // set up the heapbase register
 258     __ reinit_heapbase();
 259 
 260 #ifdef ASSERT
 261     // make sure we have no pending exceptions
 262     {
 263       Label L;
 264       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 265       __ cmp(rscratch1, (unsigned)NULL_WORD);
 266       __ br(Assembler::EQ, L);
 267       __ stop("StubRoutines::call_stub: entered with pending exception");
 268       __ BIND(L);
 269     }
 270 #endif
 271     // pass parameters if any
 272     __ mov(esp, sp);
 273     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
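         // round sp down to a 16-byte boundary; the AArch64 ABI requires
         // sp to remain 16-byte aligned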
 274     __ andr(sp, rscratch1, -2 * wordSize);
 275 
 276     BLOCK_COMMENT("pass parameters if any");
 277     Label parameters_done;
 278     // parameter count is still in c_rarg6
 279     // and parameter pointer identifying param 1 is in c_rarg5
 280     __ cbzw(c_rarg6, parameters_done);
 281 
 282     address loop = __ pc();
 283     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 284     __ subsw(c_rarg6, c_rarg6, 1);
 285     __ push(rscratch1);
 286     __ br(Assembler::GT, loop);
 287 
 288     __ BIND(parameters_done);
 289 
 290     // call Java entry -- passing Method* and current sp
 291     //      rmethod: Method*
 292     //      r13: sender sp
 293     BLOCK_COMMENT("call Java function");
 294     __ mov(r13, sp);
 295     __ blr(c_rarg4);
 296 
 297     // tell the simulator we have returned to the stub
 298 
 299     // we do this here because the notify will already have been done
 300     // if we get to the next instruction via an exception
 301     //
 302     // n.b. adding this instruction here affects the calculation of
 303     // whether or not a routine returns to the call stub (used when
 304     // doing stack walks) since the normal test is to check the return
 305     // pc against the address saved below. so we may need to allow for
 306     // this extra instruction in the check.
 307 
 308     if (NotifySimulator) {
 309       __ notify(Assembler::method_reentry);
 310     }
 311     // save current address for use by exception handling code
 312 
 313     return_address = __ pc();
 314 
 315     // store result depending on type (everything that is not
 316     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 317     // n.b. this assumes Java returns an integral result in r0
 318     // and a floating result in j_farg0
 319     __ ldr(j_rarg2, result);
 320     Label is_long, is_float, is_double, exit;
 321     __ ldr(j_rarg1, result_type);
 322     __ cmp(j_rarg1, T_OBJECT);
 323     __ br(Assembler::EQ, is_long);
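         // n.b. a T_OBJECT result is a 64-bit pointer, so it shares the
         // full-width store used for T_LONG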
 324     __ cmp(j_rarg1, T_LONG);
 325     __ br(Assembler::EQ, is_long);
 326     __ cmp(j_rarg1, T_FLOAT);
 327     __ br(Assembler::EQ, is_float);
 328     __ cmp(j_rarg1, T_DOUBLE);
 329     __ br(Assembler::EQ, is_double);
 330 
 331     // handle T_INT case
 332     __ strw(r0, Address(j_rarg2));
 333 
 334     __ BIND(exit);
 335 
 336     // pop parameters
 337     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 338 
 339 #ifdef ASSERT
 340     // verify that threads correspond
 341     {
 342       Label L, S;
 343       __ ldr(rscratch1, thread);
 344       __ cmp(rthread, rscratch1);
 345       __ br(Assembler::NE, S);
 346       __ get_thread(rscratch1);
 347       __ cmp(rthread, rscratch1);
 348       __ br(Assembler::EQ, L);
 349       __ BIND(S);
 350       __ stop("StubRoutines::call_stub: threads must correspond");
 351       __ BIND(L);
 352     }
 353 #endif
 354 
 355     // restore callee-save registers
 356     __ ldpd(v15, v14,  d15_save);
 357     __ ldpd(v13, v12,  d13_save);
 358     __ ldpd(v11, v10,  d11_save);
 359     __ ldpd(v9,  v8,   d9_save);
 360 
 361     __ ldp(r28, r27,   r28_save);
 362     __ ldp(r26, r25,   r26_save);
 363     __ ldp(r24, r23,   r24_save);
 364     __ ldp(r22, r21,   r22_save);
 365     __ ldp(r20, r19,   r20_save);
 366 
 367     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 368     __ ldrw(c_rarg2, result_type);
 369     __ ldr(c_rarg3,  method);
 370     __ ldp(c_rarg4, c_rarg5,  entry_point);
 371     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 372 
 373 #ifndef PRODUCT
 374     // tell the simulator we are about to end Java execution
 375     if (NotifySimulator) {
 376       __ notify(Assembler::method_exit);
 377     }
 378 #endif
 379     // leave frame and return to caller
 380     __ leave();
 381     __ ret(lr);
 382 
 383     // handle return types different from T_INT
 384 
 385     __ BIND(is_long);
 386     __ str(r0, Address(j_rarg2, 0));
 387     __ br(Assembler::AL, exit);
 388 
 389     __ BIND(is_float);
 390     __ strs(j_farg0, Address(j_rarg2, 0));
 391     __ br(Assembler::AL, exit);
 392 
 393     __ BIND(is_double);
 394     __ strd(j_farg0, Address(j_rarg2, 0));
 395     __ br(Assembler::AL, exit);
 396 
 397     return start;
 398   }
 399 
 400   // Return point for a Java call if there's an exception thrown in
 401   // Java code.  The exception is caught and transformed into a
 402   // pending exception stored in JavaThread that can be tested from
 403   // within the VM.
 404   //
 405   // Note: Usually the parameters are removed by the callee. In case
 406   // of an exception crossing an activation frame boundary, that is
 407   // not the case if the callee is compiled code => need to set up
 408   // the sp.
 409   //
 410   // r0: exception oop
 411 
 412   // NOTE: this is used as a target from the signal handler so it
 413   // needs an x86 prolog which returns into the current simulator
 414   // executing the generated catch_exception code. so the prolog
 415   // needs to install rax in a sim register and adjust the sim's
 416   // restart pc to enter the generated code at the start position
 417   // then return from native to simulated execution.
 418 
 419   address generate_catch_exception() {
 420     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 421     address start = __ pc();
 422 
 423     // same as in generate_call_stub():
 424     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 425     const Address thread        (rfp, thread_off         * wordSize);
 426 
 427 #ifdef ASSERT
 428     // verify that threads correspond
 429     {
 430       Label L, S;
 431       __ ldr(rscratch1, thread);
 432       __ cmp(rthread, rscratch1);
 433       __ br(Assembler::NE, S);
 434       __ get_thread(rscratch1);
 435       __ cmp(rthread, rscratch1);
 436       __ br(Assembler::EQ, L);
 437       __ bind(S);
 438       __ stop("StubRoutines::catch_exception: threads must correspond");
 439       __ bind(L);
 440     }
 441 #endif
 442 
 443     // set pending exception
 444     __ verify_oop(r0);
 445 
 446     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 447     __ mov(rscratch1, (address)__FILE__);
 448     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 449     __ movw(rscratch1, (int)__LINE__);
 450     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 451 
 452     // complete return to VM
 453     assert(StubRoutines::_call_stub_return_address != NULL,
 454            "_call_stub_return_address must have been generated before");
 455     __ b(StubRoutines::_call_stub_return_address);
 456 
 457     return start;
 458   }
 459 
 460   // Continuation point for runtime calls returning with a pending
 461   // exception.  The pending exception check happened in the runtime
 462   // or native call stub.  The pending exception in Thread is
 463   // converted into a Java-level exception.
 464   //
 465   // Contract with Java-level exception handlers:
 466   // r0: exception
 467   // r3: throwing pc
 468   //
 469   // NOTE: At entry of this stub, exception-pc must be in LR !!
 470 
 471   // NOTE: this is always used as a jump target within generated code
 472   // so it just needs to be generated code with no x86 prolog
 473 
 474   address generate_forward_exception() {
 475     StubCodeMark mark(this, "StubRoutines", "forward exception");
 476     address start = __ pc();
 477 
 478     // Upon entry, LR points to the return address returning into
 479     // Java (interpreted or compiled) code; i.e., the return address
 480     // becomes the throwing pc.
 481     //
 482     // Arguments pushed before the runtime call are still on the stack
 483     // but the exception handler will reset the stack pointer ->
 484     // ignore them.  A potential result in registers can be ignored as
 485     // well.
 486 
 487 #ifdef ASSERT
 488     // make sure this code is only executed if there is a pending exception
 489     {
 490       Label L;
 491       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 492       __ cbnz(rscratch1, L);
 493       __ stop("StubRoutines::forward exception: no pending exception (1)");
 494       __ bind(L);
 495     }
 496 #endif
 497 
 498     // compute exception handler into r19
 499 
 500     // call the VM to find the handler address associated with the
 501     // caller address. pass thread in r0 and caller pc (ret address)
 502     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 503     // the stack.
 504     __ mov(c_rarg1, lr);
 505     // lr will be trashed by the VM call so we move it to R19
 506     // (callee-saved) because we also need to pass it to the handler
 507     // returned by this call.
 508     __ mov(r19, lr);
 509     BLOCK_COMMENT("call exception_handler_for_return_address");
 510     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 511                          SharedRuntime::exception_handler_for_return_address),
 512                     rthread, c_rarg1);
 513     // we should not really care that lr is no longer the callee
 514     // address. we saved the value the handler needs in r19 so we can
 515     // just copy it to r3. however, the C2 handler will push its own
 516     // frame and then call into the VM, and the VM code asserts that
 517     // the PC for the frame above the handler belongs to a compiled
 518     // Java method. So, we restore lr here to satisfy that assert.
 519     __ mov(lr, r19);
 520     // setup r0 & r3 & clear pending exception
 521     __ mov(r3, r19);
 522     __ mov(r19, r0);
 523     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 524     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 525 
 526 #ifdef ASSERT
 527     // make sure exception is set
 528     {
 529       Label L;
 530       __ cbnz(r0, L);
 531       __ stop("StubRoutines::forward exception: no pending exception (2)");
 532       __ bind(L);
 533     }
 534 #endif
 535 
 536     // continue at exception handler
 537     // r0: exception
 538     // r3: throwing pc
 539     // r19: exception handler
 540     __ verify_oop(r0);
 541     __ br(r19);
 542 
 543     return start;
 544   }
 545 
 546   // Non-destructive plausibility checks for oops
 547   //
 548   // Arguments:
 549   //    r0: oop to verify
 550   //    rscratch1: error message
 551   //
 552   // Stack after saving c_rarg3:
 553   //    [tos + 0]: saved c_rarg3
 554   //    [tos + 1]: saved c_rarg2
 555   //    [tos + 2]: saved lr
 556   //    [tos + 3]: saved rscratch2
 557   //    [tos + 4]: saved r0
 558   //    [tos + 5]: saved rscratch1
 559   address generate_verify_oop() {
 560 
 561     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 562     address start = __ pc();
 563 
 564     Label exit, error;
 565 
 566     // save c_rarg2 and c_rarg3
 567     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 568 
 569     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 570     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 571     __ ldr(c_rarg3, Address(c_rarg2));
 572     __ add(c_rarg3, c_rarg3, 1);
 573     __ str(c_rarg3, Address(c_rarg2));
 574 
 575     // object is in r0
 576     // make sure object is 'reasonable'
 577     __ cbz(r0, exit); // if obj is NULL it is OK
 578 
 579     // Check if the oop is in the right area of memory
 580     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 581     __ andr(c_rarg2, r0, c_rarg3);
 582     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 583 
 584     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 585     // instruction here because the flags register is live.
 586     __ eor(c_rarg2, c_rarg2, c_rarg3);
 587     __ cbnz(c_rarg2, error);
 588 
 589     // make sure klass is 'reasonable', i.e. not null.
 590     __ load_klass(r0, r0);  // get klass
 591     __ cbz(r0, error);      // if klass is NULL it is broken
 592 
 593     // return if everything seems ok
 594     __ bind(exit);
 595 
 596     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 597     __ ret(lr);
 598 
 599     // handle errors
 600     __ bind(error);
 601     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 602 
 603     __ push(RegSet::range(r0, r29), sp);
 604     // debug(char* msg, int64_t pc, int64_t regs[])
 605     __ mov(c_rarg0, rscratch1);      // pass address of error message
 606     __ mov(c_rarg1, lr);             // pass return address
 607     __ mov(c_rarg2, sp);             // pass address of regs on stack
 608 #ifndef PRODUCT
 609     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 610 #endif
 611     BLOCK_COMMENT("call MacroAssembler::debug");
 612     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 613     __ blrt(rscratch1, 3, 0, 1);
 614 
 615     return start;
 616   }
 617 
 618   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 619 
 620   // Generate code for an array write pre barrier
 621   //
 622   //     addr               - starting address
 623   //     count              - element count
 624   //     dest_uninitialized - true iff the destination is known to be uninitialized
 625   //
 626   //     Destroys no registers except rscratch1 and rscratch2
 627   //
 628   void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
 629     BarrierSet* bs = Universe::heap()->barrier_set();
 630     switch (bs->kind()) {
 631     case BarrierSet::G1SATBCTLogging:
 632       // With G1, don't generate the call if we statically know that the target is uninitialized
 633       if (!dest_uninitialized) {
 634         __ push_call_clobbered_registers();
 635         if (count == c_rarg0) {
 636           if (addr == c_rarg1) {
 637             // exactly backwards!!
 638             __ mov(rscratch1, c_rarg0);
 639             __ mov(c_rarg0, c_rarg1);
 640             __ mov(c_rarg1, rscratch1);
 641           } else {
 642             __ mov(c_rarg1, count);
 643             __ mov(c_rarg0, addr);
 644           }
 645         } else {
 646           __ mov(c_rarg0, addr);
 647           __ mov(c_rarg1, count);
 648         }
 649         __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
 650         __ pop_call_clobbered_registers();
 651         break;
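           // n.b. the case labels below are nested inside the if-block above;
           // this is legal C++ (no initializations are bypassed) and behaves
           // exactly as if the break and labels were outside the if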
 652       case BarrierSet::CardTableForRS:
 653       case BarrierSet::CardTableExtension:
 654       case BarrierSet::ModRef:
 655         break;
 656       default:
 657         ShouldNotReachHere();
 658 
 659       }
 660     }
 661   }
 662 
 663   //
 664   // Generate code for an array write post barrier
 665   //
 666   //  Input:
 667   //     start    - register containing starting address of destination array
 668   //     end      - register containing ending address of destination array
 669   //     scratch  - scratch register
 670   //
 671   //  The input registers are overwritten.
 672   //  The ending address is inclusive.
 673   void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
 674     assert_different_registers(start, end, scratch);
 675     BarrierSet* bs = Universe::heap()->barrier_set();
 676     switch (bs->kind()) {
 677       case BarrierSet::G1SATBCTLogging:
 678 
 679         {
 680           __ push_call_clobbered_registers();
 681           // must compute element count unless barrier set interface is changed (other platforms supply count)
 682           assert_different_registers(start, end, scratch);
 683           __ lea(scratch, Address(end, BytesPerHeapOop));
 684           __ sub(scratch, scratch, start);               // subtract start to get #bytes
 685           __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
 686           __ mov(c_rarg0, start);
 687           __ mov(c_rarg1, scratch);
 688           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
 689           __ pop_call_clobbered_registers();
 690         }
 691         break;
 692       case BarrierSet::CardTableForRS:
 693       case BarrierSet::CardTableExtension:
 694         {
 695           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
 696           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
 697 
 698           Label L_loop;
 699 
 700            __ lsr(start, start, CardTableModRefBS::card_shift);
 701            __ lsr(end, end, CardTableModRefBS::card_shift);
 702            __ sub(end, end, start); // number of card table bytes to dirty
 703 
 704           const Register count = end; // 'end' register contains bytes count now
 705           __ load_byte_map_base(scratch);
 706           __ add(start, start, scratch);
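               // start now addresses the card table entry for the first card;
               // the loop below stores count + 1 zero bytes, dirtying every
               // card spanned by the destination range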
 707           if (UseConcMarkSweepGC) {
 708             __ membar(__ StoreStore);
 709           }
 710           __ BIND(L_loop);
 711           __ strb(zr, Address(start, count));
 712           __ subs(count, count, 1);
 713           __ br(Assembler::GE, L_loop);
 714         }
 715         break;
 716       default:
 717         ShouldNotReachHere();
 718 
 719     }
 720   }
 721 
 722   address generate_zero_longs(Register base, Register cnt) {
 723     Register tmp = rscratch1;
 724     Register tmp2 = rscratch2;
 725     int zva_length = VM_Version::zva_length();
 726     Label initial_table_end, loop_zva;
 727     Label fini;
 728 
 729     __ align(CodeEntryAlignment);
 730     StubCodeMark mark(this, "StubRoutines", "zero_longs");
 731     address start = __ pc();
 732 
 733     // Base must be 16 byte aligned. If not just return and let caller handle it
 734     __ tst(base, 0x0f);
 735     __ br(Assembler::NE, fini);
 736     // Align base with ZVA length.
 737     __ neg(tmp, base);
 738     __ andr(tmp, tmp, zva_length - 1);
 739 
 740     // tmp: the number of bytes to be filled to align the base with ZVA length.
 741     __ add(base, base, tmp);
 742     __ sub(cnt, cnt, tmp, Assembler::ASR, 3);
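         // computed jump into the table of stp instructions below: each stp
         // is a 4-byte instruction that zeroes 16 bytes, so branching back
         // from initial_table_end by (tmp >> 2) bytes executes exactly the
         // tmp / 16 stores needed to reach ZVA alignment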
 743     __ adr(tmp2, initial_table_end);
 744     __ sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
 745     __ br(tmp2);
 746 
 747     for (int i = -zva_length + 16; i < 0; i += 16)
 748       __ stp(zr, zr, Address(base, i));
 749     __ bind(initial_table_end);
 750 
 751     __ sub(cnt, cnt, zva_length >> 3);
 752     __ bind(loop_zva);
 753     __ dc(Assembler::ZVA, base);
 754     __ subs(cnt, cnt, zva_length >> 3);
 755     __ add(base, base, zva_length);
 756     __ br(Assembler::GE, loop_zva);
 757     __ add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
 758     __ bind(fini);
 759     __ ret(lr);
 760 
 761     return start;
 762   }
 763 
 764   typedef enum {
 765     copy_forwards = 1,
 766     copy_backwards = -1
 767   } copy_direction;
 768 
 769   // Bulk copy of blocks of 8 words.
 770   //
 771   // count is a count of words.
 772   //
 773   // Precondition: count >= 8
 774   //
 775   // Postconditions:
 776   //
 777   // The least significant bit of count contains the remaining count
 778   // of words to copy.  The rest of count is trash.
 779   //
 780   // s and d are adjusted to point to the remaining words to copy
 781   //
 782   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 783                            copy_direction direction) {
 784     int unit = wordSize * direction;
 785     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
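         // bias backs s and d up by the width of the first load/store pair
         // (2 words, or 4 with SIMD) so that the pre-indexed addressing in
         // the main loop starts at the original addresses for a forward copy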
 786 
 787     int offset;
 788     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 789       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 790     const Register stride = r13;
 791 
 792     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 793     assert_different_registers(s, d, count, rscratch1);
 794 
 795     Label again, drain;
 796     const char *stub_name;
 797     if (direction == copy_forwards)
 798       stub_name = "forward_copy_longs";
 799     else
 800       stub_name = "backward_copy_longs";
 801     StubCodeMark mark(this, "StubRoutines", stub_name);
 802     __ align(CodeEntryAlignment);
 803     __ bind(start);
 804 
 805     Label unaligned_copy_long;
 806     if (AvoidUnalignedAccesses) {
 807       __ tbnz(d, 3, unaligned_copy_long);
 808     }
 809 
 810     if (direction == copy_forwards) {
 811       __ sub(s, s, bias);
 812       __ sub(d, d, bias);
 813     }
 814 
 815 #ifdef ASSERT
 816     // Make sure we are never given < 8 words
 817     {
 818       Label L;
 819       __ cmp(count, 8);
 820       __ br(Assembler::GE, L);
 821       __ stop("generate_copy_longs called with < 8 words");
 822       __ bind(L);
 823     }
 824 #endif
 825 
 826     // Fill 8 registers
 827     if (UseSIMDForMemoryOps) {
 828       __ ldpq(v0, v1, Address(s, 4 * unit));
 829       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 830     } else {
 831       __ ldp(t0, t1, Address(s, 2 * unit));
 832       __ ldp(t2, t3, Address(s, 4 * unit));
 833       __ ldp(t4, t5, Address(s, 6 * unit));
 834       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 835     }
 836 
 837     __ subs(count, count, 16);
 838     __ br(Assembler::LO, drain);
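         // eight words are now buffered in registers; if fewer than another
         // eight remain, skip the main loop and just drain what we have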
 839 
 840     int prefetch = PrefetchCopyIntervalInBytes;
 841     bool use_stride = false;
 842     if (direction == copy_backwards) {
 843        use_stride = prefetch > 256;
 844        prefetch = -prefetch;
 845        if (use_stride) __ mov(stride, prefetch);
 846     }
 847 
 848     __ bind(again);
 849 
 850     if (PrefetchCopyIntervalInBytes > 0)
 851       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 852 
 853     if (UseSIMDForMemoryOps) {
 854       __ stpq(v0, v1, Address(d, 4 * unit));
 855       __ ldpq(v0, v1, Address(s, 4 * unit));
 856       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 857       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 858     } else {
 859       __ stp(t0, t1, Address(d, 2 * unit));
 860       __ ldp(t0, t1, Address(s, 2 * unit));
 861       __ stp(t2, t3, Address(d, 4 * unit));
 862       __ ldp(t2, t3, Address(s, 4 * unit));
 863       __ stp(t4, t5, Address(d, 6 * unit));
 864       __ ldp(t4, t5, Address(s, 6 * unit));
 865       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 866       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 867     }
 868 
 869     __ subs(count, count, 8);
 870     __ br(Assembler::HS, again);
 871 
 872     // Drain
 873     __ bind(drain);
 874     if (UseSIMDForMemoryOps) {
 875       __ stpq(v0, v1, Address(d, 4 * unit));
 876       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 877     } else {
 878       __ stp(t0, t1, Address(d, 2 * unit));
 879       __ stp(t2, t3, Address(d, 4 * unit));
 880       __ stp(t4, t5, Address(d, 6 * unit));
 881       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 882     }
 883 
 884     {
 885       Label L1, L2;
 886       __ tbz(count, exact_log2(4), L1);
 887       if (UseSIMDForMemoryOps) {
 888         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 889         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 890       } else {
 891         __ ldp(t0, t1, Address(s, 2 * unit));
 892         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 893         __ stp(t0, t1, Address(d, 2 * unit));
 894         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 895       }
 896       __ bind(L1);
 897 
 898       if (direction == copy_forwards) {
 899         __ add(s, s, bias);
 900         __ add(d, d, bias);
 901       }
 902 
 903       __ tbz(count, 1, L2);
 904       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 905       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 906       __ bind(L2);
 907     }
 908 
 909     __ ret(lr);
 910 
 911     if (AvoidUnalignedAccesses) {
 912       Label drain, again;
 913       // Register order for storing. Order is different for backward copy.
 914 
 915       __ bind(unaligned_copy_long);
 916 
 917       // source address is even aligned, target odd aligned
 918       //
 919       // when forward copying word pairs we read long pairs at offsets
 920       // {0, 2, 4, 6} (in long words). when backwards copying we read
 921       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 922       // address by -2 in the forwards case so we can compute the
 923       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 924       // or -1.
 925       //
 926       // when forward copying we need to store 1 word, 3 pairs and
 927       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 928       // zero offset we adjust the destination by -1 which means we
 929       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 930       //
 931       // When backwards copying we need to store 1 word, 3 pairs and
 932       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 933       // offsets {1, 3, 5, 7, 8} * unit.
 934 
 935       if (direction == copy_forwards) {
 936         __ sub(s, s, 16);
 937         __ sub(d, d, 8);
 938       }
 939 
 940       // Fill 8 registers
 941       //
 942       // for forwards copy s was offset by -16 from the original input
 943       // value of s so the register contents are at these offsets
 944       // relative to the 64 byte block addressed by that original input
 945       // and so on for each successive 64 byte block when s is updated
 946       //
 947       // t0 at offset 0,  t1 at offset 8
 948       // t2 at offset 16, t3 at offset 24
 949       // t4 at offset 32, t5 at offset 40
 950       // t6 at offset 48, t7 at offset 56
 951 
 952       // for backwards copy s was not offset so the register contents
 953       // are at these offsets into the preceding 64 byte block
 954       // relative to that original input and so on for each successive
 955       // preceding 64 byte block when s is updated. this explains the
 956       // slightly counter-intuitive looking pattern of register usage
 957       // in the stp instructions for backwards copy.
 958       //
 959       // t0 at offset -16, t1 at offset -8
 960       // t2 at offset -32, t3 at offset -24
 961       // t4 at offset -48, t5 at offset -40
 962       // t6 at offset -64, t7 at offset -56
 963 
 964       __ ldp(t0, t1, Address(s, 2 * unit));
 965       __ ldp(t2, t3, Address(s, 4 * unit));
 966       __ ldp(t4, t5, Address(s, 6 * unit));
 967       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 968 
 969       __ subs(count, count, 16);
 970       __ br(Assembler::LO, drain);
 971 
 972       int prefetch = PrefetchCopyIntervalInBytes;
 973       bool use_stride = false;
 974       if (direction == copy_backwards) {
 975          use_stride = prefetch > 256;
 976          prefetch = -prefetch;
 977          if (use_stride) __ mov(stride, prefetch);
 978       }
 979 
 980       __ bind(again);
 981 
 982       if (PrefetchCopyIntervalInBytes > 0)
 983         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 984 
 985       if (direction == copy_forwards) {
 986        // allowing for the offset of -8 the store instructions place
 987        // registers into the target 64 byte block at the following
 988        // offsets
 989        //
 990        // t0 at offset 0
 991        // t1 at offset 8,  t2 at offset 16
 992        // t3 at offset 24, t4 at offset 32
 993        // t5 at offset 40, t6 at offset 48
 994        // t7 at offset 56
 995 
 996         __ str(t0, Address(d, 1 * unit));
 997         __ stp(t1, t2, Address(d, 2 * unit));
 998         __ ldp(t0, t1, Address(s, 2 * unit));
 999         __ stp(t3, t4, Address(d, 4 * unit));
1000         __ ldp(t2, t3, Address(s, 4 * unit));
1001         __ stp(t5, t6, Address(d, 6 * unit));
1002         __ ldp(t4, t5, Address(s, 6 * unit));
1003         __ str(t7, Address(__ pre(d, 8 * unit)));
1004         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1005       } else {
1006        // d was not offset when we started so the registers are
1007        // written into the 64 byte block preceding d with the following
1008        // offsets
1009        //
1010        // t1 at offset -8
1011        // t3 at offset -24, t0 at offset -16
1012        // t5 at offset -40, t2 at offset -32
1013        // t7 at offset -56, t4 at offset -48
1014        //                   t6 at offset -64
1015        //
1016        // note that this matches the offsets previously noted for the
1017        // loads
1018 
1019         __ str(t1, Address(d, 1 * unit));
1020         __ stp(t3, t0, Address(d, 3 * unit));
1021         __ ldp(t0, t1, Address(s, 2 * unit));
1022         __ stp(t5, t2, Address(d, 5 * unit));
1023         __ ldp(t2, t3, Address(s, 4 * unit));
1024         __ stp(t7, t4, Address(d, 7 * unit));
1025         __ ldp(t4, t5, Address(s, 6 * unit));
1026         __ str(t6, Address(__ pre(d, 8 * unit)));
1027         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1028       }
1029 
1030       __ subs(count, count, 8);
1031       __ br(Assembler::HS, again);
1032 
1033       // Drain
1034       //
1035       // this uses the same pattern of offsets and register arguments
1036       // as above
1037       __ bind(drain);
1038       if (direction == copy_forwards) {
1039         __ str(t0, Address(d, 1 * unit));
1040         __ stp(t1, t2, Address(d, 2 * unit));
1041         __ stp(t3, t4, Address(d, 4 * unit));
1042         __ stp(t5, t6, Address(d, 6 * unit));
1043         __ str(t7, Address(__ pre(d, 8 * unit)));
1044       } else {
1045         __ str(t1, Address(d, 1 * unit));
1046         __ stp(t3, t0, Address(d, 3 * unit));
1047         __ stp(t5, t2, Address(d, 5 * unit));
1048         __ stp(t7, t4, Address(d, 7 * unit));
1049         __ str(t6, Address(__ pre(d, 8 * unit)));
1050       }
1051       // now we need to copy any remaining part block which may
1052       // include a 4 word subblock and/or a 2 word subblock.
1053       // bits 2 and 1 in the count are the tell-tale for whether we
1054       // have each such subblock
1055       {
1056         Label L1, L2;
1057         __ tbz(count, exact_log2(4), L1);
1058        // this is the same as above but copying only 4 longs hence
1059        // with only one intervening stp between the str instructions
1060        // but note that the offsets and registers still follow the
1061        // same pattern
1062         __ ldp(t0, t1, Address(s, 2 * unit));
1063         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1064         if (direction == copy_forwards) {
1065           __ str(t0, Address(d, 1 * unit));
1066           __ stp(t1, t2, Address(d, 2 * unit));
1067           __ str(t3, Address(__ pre(d, 4 * unit)));
1068         } else {
1069           __ str(t1, Address(d, 1 * unit));
1070           __ stp(t3, t0, Address(d, 3 * unit));
1071           __ str(t2, Address(__ pre(d, 4 * unit)));
1072         }
1073         __ bind(L1);
1074 
1075         __ tbz(count, 1, L2);
1076        // this is the same as above but copying only 2 longs hence
1077        // there is no intervening stp between the str instructions
1078        // but note that the offset and register patterns are still
1079        // the same
1080         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1081         if (direction == copy_forwards) {
1082           __ str(t0, Address(d, 1 * unit));
1083           __ str(t1, Address(__ pre(d, 2 * unit)));
1084         } else {
1085           __ str(t1, Address(d, 1 * unit));
1086           __ str(t0, Address(__ pre(d, 2 * unit)));
1087         }
1088         __ bind(L2);
1089 
1090        // for forwards copy we need to re-adjust the offsets we
1091        // applied so that s and d follow the last words written
1092 
1093        if (direction == copy_forwards) {
1094          __ add(s, s, 16);
1095          __ add(d, d, 8);
1096        }
1097 
1098       }
1099 
1100       __ ret(lr);
1101       }
1102   }
1103 
1104   // Small copy: less than 16 bytes.
1105   //
1106   // NB: Ignores all of the bits of count which represent more than 15
1107   // bytes, so a caller doesn't have to mask them.
1108 
1109   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1110     bool is_backwards = step < 0;
1111     size_t granularity = uabs(step);
1112     int direction = is_backwards ? -1 : 1;
1113     int unit = wordSize * direction;
1114 
1115     Label Lpair, Lword, Lint, Lshort, Lbyte;
1116 
1117     assert(granularity
1118            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1119 
1120     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1121 
1122     // ??? I don't know if this bit-test-and-branch is the right thing
1123     // to do.  It does a lot of jumping, resulting in several
1124     // mispredicted branches.  It might make more sense to do this
1125     // with something like Duff's device with a single computed branch.
1126 
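         // count is in units of the copy granularity, so bit
         // (3 - log2(granularity)) of count represents one 8-byte word:
         // copy a word if that bit is set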
1127     __ tbz(count, 3 - exact_log2(granularity), Lword);
1128     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1129     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1130     __ bind(Lword);
1131 
1132     if (granularity <= sizeof (jint)) {
1133       __ tbz(count, 2 - exact_log2(granularity), Lint);
1134       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1135       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1136       __ bind(Lint);
1137     }
1138 
1139     if (granularity <= sizeof (jshort)) {
1140       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1141       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1142       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1143       __ bind(Lshort);
1144     }
1145 
1146     if (granularity <= sizeof (jbyte)) {
1147       __ tbz(count, 0, Lbyte);
1148       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1149       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1150       __ bind(Lbyte);
1151     }
1152   }
1153 
1154   Label copy_f, copy_b;
1155 
1156   // All-singing all-dancing memory copy.
1157   //
1158   // Copy count units of memory from s to d.  The size of a unit is
1159   // step, which can be positive or negative depending on the direction
1160   // of copy.  If is_aligned is false, we align the source address.
1161   //
1162 
1163   void copy_memory(bool is_aligned, Register s, Register d,
1164                    Register count, Register tmp, int step) {
1165     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1166     bool is_backwards = step < 0;
1167     int granularity = uabs(step);
1168     const Register t0 = r3, t1 = r4;
1169 
1170     // copies of <= 96 bytes are done inline. Direction doesn't matter
1171     // because we always load all the data before writing anything
1172     Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
1173     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1174     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1175     const Register send = r17, dend = r18;
1176 
1177     if (PrefetchCopyIntervalInBytes > 0)
1178       __ prfm(Address(s, 0), PLDL1KEEP);
1179     __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
1180     __ br(Assembler::HI, copy_big);
1181 
1182     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1183     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
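         // send and dend point just past the last source/destination element;
         // each case below copies a block from each end so the two blocks
         // overlap and cover the middle without needing an exact count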
1184 
1185     __ cmp(count, 16/granularity);
1186     __ br(Assembler::LS, copy16);
1187 
1188     __ cmp(count, 64/granularity);
1189     __ br(Assembler::HI, copy80);
1190 
1191     __ cmp(count, 32/granularity);
1192     __ br(Assembler::LS, copy32);
1193 
1194     // 33..64 bytes
1195     if (UseSIMDForMemoryOps) {
1196       __ ldpq(v0, v1, Address(s, 0));
1197       __ ldpq(v2, v3, Address(send, -32));
1198       __ stpq(v0, v1, Address(d, 0));
1199       __ stpq(v2, v3, Address(dend, -32));
1200     } else {
1201       __ ldp(t0, t1, Address(s, 0));
1202       __ ldp(t2, t3, Address(s, 16));
1203       __ ldp(t4, t5, Address(send, -32));
1204       __ ldp(t6, t7, Address(send, -16));
1205 
1206       __ stp(t0, t1, Address(d, 0));
1207       __ stp(t2, t3, Address(d, 16));
1208       __ stp(t4, t5, Address(dend, -32));
1209       __ stp(t6, t7, Address(dend, -16));
1210     }
1211     __ b(finish);
1212 
1213     // 17..32 bytes
1214     __ bind(copy32);
1215     __ ldp(t0, t1, Address(s, 0));
1216     __ ldp(t2, t3, Address(send, -16));
1217     __ stp(t0, t1, Address(d, 0));
1218     __ stp(t2, t3, Address(dend, -16));
1219     __ b(finish);
1220 
1221     // 65..80/96 bytes
1222     // (96 bytes if SIMD because we do 32 bytes per instruction)
1223     __ bind(copy80);
1224     if (UseSIMDForMemoryOps) {
1225       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1226       __ ldpq(v4, v5, Address(send, -32));
1227       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1228       __ stpq(v4, v5, Address(dend, -32));
1229     } else {
1230       __ ldp(t0, t1, Address(s, 0));
1231       __ ldp(t2, t3, Address(s, 16));
1232       __ ldp(t4, t5, Address(s, 32));
1233       __ ldp(t6, t7, Address(s, 48));
1234       __ ldp(t8, t9, Address(send, -16));
1235 
1236       __ stp(t0, t1, Address(d, 0));
1237       __ stp(t2, t3, Address(d, 16));
1238       __ stp(t4, t5, Address(d, 32));
1239       __ stp(t6, t7, Address(d, 48));
1240       __ stp(t8, t9, Address(dend, -16));
1241     }
1242     __ b(finish);
1243 
1244     // 0..16 bytes
1245     __ bind(copy16);
1246     __ cmp(count, 8/granularity);
1247     __ br(Assembler::LO, copy8);
1248 
1249     // 8..16 bytes
1250     __ ldr(t0, Address(s, 0));
1251     __ ldr(t1, Address(send, -8));
1252     __ str(t0, Address(d, 0));
1253     __ str(t1, Address(dend, -8));
1254     __ b(finish);
1255 
1256     if (granularity < 8) {
1257       // 4..7 bytes
1258       __ bind(copy8);
1259       __ tbz(count, 2 - exact_log2(granularity), copy4);
1260       __ ldrw(t0, Address(s, 0));
1261       __ ldrw(t1, Address(send, -4));
1262       __ strw(t0, Address(d, 0));
1263       __ strw(t1, Address(dend, -4));
1264       __ b(finish);
1265       if (granularity < 4) {
1266         // 0..3 bytes
1267         __ bind(copy4);
1268         __ cbz(count, finish); // get rid of 0 case
1269         if (granularity == 2) {
1270           __ ldrh(t0, Address(s, 0));
1271           __ strh(t0, Address(d, 0));
1272         } else { // granularity == 1
1273           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1274           // the first and last byte.
1275           // Handle the 3 byte case by loading and storing base + count/2
1276           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1277           // This does mean that in the 1 byte case we load/store the same
1278           // byte 3 times.
1279           __ lsr(count, count, 1);
1280           __ ldrb(t0, Address(s, 0));
1281           __ ldrb(t1, Address(send, -1));
1282           __ ldrb(t2, Address(s, count));
1283           __ strb(t0, Address(d, 0));
1284           __ strb(t1, Address(dend, -1));
1285           __ strb(t2, Address(d, count));
1286         }
1287         __ b(finish);
1288       }
1289     }
1290 
1291     __ bind(copy_big);
1292     if (is_backwards) {
1293       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1294       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1295     }
1296 
1297     // Now we've got the small case out of the way we can align the
1298     // source address on a 2-word boundary.
1299 
1300     Label aligned;
1301 
1302     if (is_aligned) {
1303       // We may have to adjust by 1 word to get s 2-word-aligned.
1304       __ tbz(s, exact_log2(wordSize), aligned);
1305       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1306       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1307       __ sub(count, count, wordSize/granularity);
1308     } else {
1309       if (is_backwards) {
1310         __ andr(rscratch2, s, 2 * wordSize - 1);
1311       } else {
1312         __ neg(rscratch2, s);
1313         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1314       }
1315       // rscratch2 is the byte adjustment needed to align s.
1316       __ cbz(rscratch2, aligned);
1317       int shift = exact_log2(granularity);
1318       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1319       __ sub(count, count, rscratch2);
1320 
1321 #if 0
1322       // ?? This code is only correct for a disjoint copy.  It may or
1323       // may not make sense to use it in that case.
1324 
1325       // Copy the first pair; s and d may not be aligned.
1326       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1327       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1328 
1329       // Align s and d, adjust count
1330       if (is_backwards) {
1331         __ sub(s, s, rscratch2);
1332         __ sub(d, d, rscratch2);
1333       } else {
1334         __ add(s, s, rscratch2);
1335         __ add(d, d, rscratch2);
1336       }
1337 #else
1338       copy_memory_small(s, d, rscratch2, rscratch1, step);
1339 #endif
1340     }
1341 
1342     __ bind(aligned);
1343 
1344     // s is now 2-word-aligned.
1345 
1346     // We have a count of units and some trailing bytes.  Adjust the
1347     // count and do a bulk copy of words.
1348     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1349     if (direction == copy_forwards)
1350       __ bl(copy_f);
1351     else
1352       __ bl(copy_b);
1353 
1354     // And the tail.
1355     copy_memory_small(s, d, count, tmp, step);
1356 
1357     if (granularity >= 8) __ bind(copy8);
1358     if (granularity >= 4) __ bind(copy4);
1359     __ bind(finish);
1360   }
1361 
1362 
1363   void clobber_registers() {
1364 #ifdef ASSERT
1365     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1366     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
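         // fill r3..r18 with the pattern 0xdeadbeefdeadbeef so that any stale
         // use of a clobbered register shows up quickly in debug builds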
1367     for (Register r = r3; r <= r18; r++)
1368       if (r != rscratch1) __ mov(r, rscratch1);
1369 #endif
1370   }
1371 
1372   // Scan over array at a for count oops, verifying each one.
1373   // Preserves a and count, clobbers rscratch1 and rscratch2.
1374   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1375     Label loop, end;
1376     __ mov(rscratch1, a);
1377     __ mov(rscratch2, zr);
1378     __ bind(loop);
1379     __ cmp(rscratch2, count);
1380     __ br(Assembler::HS, end);
1381     if (size == (size_t)wordSize) {
1382       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1383       __ verify_oop(temp);
1384     } else {
1385       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1386       __ decode_heap_oop(temp); // calls verify_oop
1387     }
1388     __ add(rscratch2, rscratch2, size);
1389     __ b(loop);
1390     __ bind(end);
1391   }
1392 
1393   // Arguments:
1394   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1395   //             ignored
1396   //   is_oop  - true => oop array, so generate store check code
1397   //   name    - stub name string
1398   //
1399   // Inputs:
1400   //   c_rarg0   - source array address
1401   //   c_rarg1   - destination array address
1402   //   c_rarg2   - element count, treated as ssize_t, can be zero
1403   //
1404   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1405   // the hardware handle it.  The two dwords within qwords that span
1406   // cache line boundaries will still be loaded and stored atomically.
1407   //
1408   // Side Effects:
1409   //   disjoint_int_copy_entry is set to the no-overlap entry point
1410   //   used by generate_conjoint_int_oop_copy().
1411   //
1412   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1413                                   const char *name, bool dest_uninitialized = false) {
1414     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1415     __ align(CodeEntryAlignment);
1416     StubCodeMark mark(this, "StubRoutines", name);
1417     address start = __ pc();
1418     __ enter();
1419 
1420     if (entry != NULL) {
1421       *entry = __ pc();
1422       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1423       BLOCK_COMMENT("Entry:");
1424     }
1425 
1426     if (is_oop) {
1427       __ push(RegSet::of(d, count), sp);
1428       // no registers are destroyed by this call
1429       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1430     }
1431     copy_memory(aligned, s, d, count, rscratch1, size);
1432     if (is_oop) {
1433       __ pop(RegSet::of(d, count), sp);
1434       if (VerifyOops)
1435         verify_oop_array(size, d, count, r16);
1436       __ sub(count, count, 1); // make an inclusive end pointer
1437       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1438       gen_write_ref_array_post_barrier(d, count, rscratch1);
1439     }
1440     __ leave();
1441     __ mov(r0, zr); // return 0
1442     __ ret(lr);
1443 #ifdef BUILTIN_SIM
1444     {
1445       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1446       sim->notifyCompile(const_cast<char*>(name), start);
1447     }
1448 #endif
1449     return start;
1450   }
1451 
1452   // Arguments:
1453   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1454   //             ignored
1455   //   is_oop  - true => oop array, so generate store check code
1456   //   name    - stub name string
1457   //
1458   // Inputs:
1459   //   c_rarg0   - source array address
1460   //   c_rarg1   - destination array address
1461   //   c_rarg2   - element count, treated as ssize_t, can be zero
1462   //
1463   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1464   // the hardware handle it.  The two dwords within qwords that span
1465   // cache line boundaries will still be loaded and stored atomically.
1466   //
1467   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1468                                  address *entry, const char *name,
1469                                  bool dest_uninitialized = false) {
1470     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1471 
1472     StubCodeMark mark(this, "StubRoutines", name);
1473     address start = __ pc();
1474     __ enter();
1475 
1476     if (entry != NULL) {
1477       *entry = __ pc();
1478       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1479       BLOCK_COMMENT("Entry:");
1480     }
1481 
1482     // use fwd copy when (d-s) above_equal (count*size)
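         // A single unsigned comparison covers both safe cases: if d < s the
         // subtraction wraps to a huge unsigned value, so the branch below is
         // taken and we copy forwards (writing below the source can never
         // clobber unread data); if d >= s and (d - s) >= count*size the
         // regions are disjoint.  Only when 0 <= (d - s) < count*size do we
         // fall through to the backward copy.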
1483     __ sub(rscratch1, d, s);
1484     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1485     __ br(Assembler::HS, nooverlap_target);
1486 
1487     if (is_oop) {
1488       __ push(RegSet::of(d, count), sp);
1489       // no registers are destroyed by this call
1490       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1491     }
1492     copy_memory(aligned, s, d, count, rscratch1, -size);
1493     if (is_oop) {
1494       __ pop(RegSet::of(d, count), sp);
1495       if (VerifyOops)
1496         verify_oop_array(size, d, count, r16);
1497       __ sub(count, count, 1); // make an inclusive end pointer
1498       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1499       gen_write_ref_array_post_barrier(d, count, rscratch1);
1500     }
1501     __ leave();
1502     __ mov(r0, zr); // return 0
1503     __ ret(lr);
1504 #ifdef BUILTIN_SIM
1505     {
1506       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1507       sim->notifyCompile(const_cast<char*>(name), start);
1508     }
1509 #endif
1510     return start;
1511   }
1512 
1513   // Arguments:
1514   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1515   //             ignored
1516   //   name    - stub name string
1517   //
1518   // Inputs:
1519   //   c_rarg0   - source array address
1520   //   c_rarg1   - destination array address
1521   //   c_rarg2   - element count, treated as ssize_t, can be zero
1522   //
1523   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1524   // we let the hardware handle it.  The one to eight bytes within words,
1525   // dwords or qwords that span cache line boundaries will still be loaded
1526   // and stored atomically.
1527   //
1535   // Side Effects:
1536   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1537   //   used by generate_conjoint_byte_copy().
1538   //
1539   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1540     const bool not_oop = false;
1541     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1542   }
1543 
1544   // Arguments:
1545   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1546   //             ignored
1547   //   name    - stub name string
1548   //
1549   // Inputs:
1550   //   c_rarg0   - source array address
1551   //   c_rarg1   - destination array address
1552   //   c_rarg2   - element count, treated as ssize_t, can be zero
1553   //
1554   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1555   // we let the hardware handle it.  The one to eight bytes within words,
1556   // dwords or qwords that span cache line boundaries will still be loaded
1557   // and stored atomically.
1558   //
1559   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1560                                       address* entry, const char *name) {
1561     const bool not_oop = false;
1562     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1563   }
1564 
1565   // Arguments:
1566   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1567   //             ignored
1568   //   name    - stub name string
1569   //
1570   // Inputs:
1571   //   c_rarg0   - source array address
1572   //   c_rarg1   - destination array address
1573   //   c_rarg2   - element count, treated as ssize_t, can be zero
1574   //
1575   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1576   // let the hardware handle it.  The two or four words within dwords
1577   // or qwords that span cache line boundaries will still be loaded
1578   // and stored atomically.
1579   //
1580   // Side Effects:
1581   //   disjoint_short_copy_entry is set to the no-overlap entry point
1582   //   used by generate_conjoint_short_copy().
1583   //
1584   address generate_disjoint_short_copy(bool aligned,
1585                                        address* entry, const char *name) {
1586     const bool not_oop = false;
1587     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1588   }
1589 
1590   // Arguments:
1591   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1592   //             ignored
1593   //   name    - stub name string
1594   //
1595   // Inputs:
1596   //   c_rarg0   - source array address
1597   //   c_rarg1   - destination array address
1598   //   c_rarg2   - element count, treated as ssize_t, can be zero
1599   //
1600   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1601   // let the hardware handle it.  The two or four words within dwords
1602   // or qwords that span cache line boundaries will still be loaded
1603   // and stored atomically.
1604   //
1605   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1606                                        address *entry, const char *name) {
1607     const bool not_oop = false;
1608     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1609   }
1610 
1611   // Arguments:
1612   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1613   //             ignored
1614   //   name    - stub name string
1615   //
1616   // Inputs:
1617   //   c_rarg0   - source array address
1618   //   c_rarg1   - destination array address
1619   //   c_rarg2   - element count, treated as ssize_t, can be zero
1620   //
1621   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1622   // the hardware handle it.  The two dwords within qwords that span
1623   // cache line boundaries will still be loaded and stored atomically.
1624   //
1625   // Side Effects:
1626   //   disjoint_int_copy_entry is set to the no-overlap entry point
1627   //   used by generate_conjoint_int_oop_copy().
1628   //
1629   address generate_disjoint_int_copy(bool aligned, address *entry,
1630                                          const char *name, bool dest_uninitialized = false) {
1631     const bool not_oop = false;
1632     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1633   }
1634 
1635   // Arguments:
1636   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1637   //             ignored
1638   //   name    - stub name string
1639   //
1640   // Inputs:
1641   //   c_rarg0   - source array address
1642   //   c_rarg1   - destination array address
1643   //   c_rarg2   - element count, treated as ssize_t, can be zero
1644   //
1645   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1646   // the hardware handle it.  The two dwords within qwords that span
1647   // cache line boundaries will still be loaded and stored atomically.
1648   //
1649   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1650                                      address *entry, const char *name,
1651                                      bool dest_uninitialized = false) {
1652     const bool not_oop = false;
1653     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1654   }
1655 
1656 
1657   // Arguments:
1658   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1659   //             ignored
1660   //   name    - stub name string
1661   //
1662   // Inputs:
1663   //   c_rarg0   - source array address
1664   //   c_rarg1   - destination array address
1665   //   c_rarg2   - element count, treated as size_t, can be zero
1666   //
1667   // Side Effects:
1668   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1669   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1670   //
1671   address generate_disjoint_long_copy(bool aligned, address *entry,
1672                                           const char *name, bool dest_uninitialized = false) {
1673     const bool not_oop = false;
1674     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1675   }
1676 
1677   // Arguments:
1678   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1679   //             ignored
1680   //   name    - stub name string
1681   //
1682   // Inputs:
1683   //   c_rarg0   - source array address
1684   //   c_rarg1   - destination array address
1685   //   c_rarg2   - element count, treated as size_t, can be zero
1686   //
1687   address generate_conjoint_long_copy(bool aligned,
1688                                       address nooverlap_target, address *entry,
1689                                       const char *name, bool dest_uninitialized = false) {
1690     const bool not_oop = false;
1691     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1692   }
1693 
1694   // Arguments:
1695   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1696   //             ignored
1697   //   name    - stub name string
1698   //
1699   // Inputs:
1700   //   c_rarg0   - source array address
1701   //   c_rarg1   - destination array address
1702   //   c_rarg2   - element count, treated as size_t, can be zero
1703   //
1704   // Side Effects:
1705   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1706   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1707   //
1708   address generate_disjoint_oop_copy(bool aligned, address *entry,
1709                                      const char *name, bool dest_uninitialized) {
1710     const bool is_oop = true;
1711     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1712     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1713   }
1714 
1715   // Arguments:
1716   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1717   //             ignored
1718   //   name    - stub name string
1719   //
1720   // Inputs:
1721   //   c_rarg0   - source array address
1722   //   c_rarg1   - destination array address
1723   //   c_rarg2   - element count, treated as size_t, can be zero
1724   //
1725   address generate_conjoint_oop_copy(bool aligned,
1726                                      address nooverlap_target, address *entry,
1727                                      const char *name, bool dest_uninitialized) {
1728     const bool is_oop = true;
1729     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1730     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1731                                   name, dest_uninitialized);
1732   }
1733 
1734 
1735   // Helper for generating a dynamic type check.
1736   // Smashes rscratch1.
1737   void generate_type_check(Register sub_klass,
1738                            Register super_check_offset,
1739                            Register super_klass,
1740                            Label& L_success) {
1741     assert_different_registers(sub_klass, super_check_offset, super_klass);
1742 
1743     BLOCK_COMMENT("type_check:");
1744 
1745     Label L_miss;
1746 
1747     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1748                                      super_check_offset);
1749     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1750 
1751     // Fall through on failure!
1752     __ BIND(L_miss);
1753   }
1754 
1755   //
1756   //  Generate checkcasting array copy stub
1757   //
1758   //  Input:
1759   //    c_rarg0   - source array address
1760   //    c_rarg1   - destination array address
1761   //    c_rarg2   - element count, treated as ssize_t, can be zero
1762   //    c_rarg3   - size_t ckoff (super_check_offset)
1763   //    c_rarg4   - oop ckval (super_klass)
1764   //
1765   //  Output:
1766   //    r0 ==  0  -  success
1767   //    r0 == -1^K - failure, where K is partial transfer count
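       //               (-1^K is the bitwise complement of K: if, say, two
       //               elements were copied before the failing element,
       //               r0 == ~2 == -3 and the caller recovers K as ~r0)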
1768   //
1769   address generate_checkcast_copy(const char *name, address *entry,
1770                                   bool dest_uninitialized = false) {
1771 
1772     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1773 
1774     // Input registers (after setup_arg_regs)
1775     const Register from        = c_rarg0;   // source array address
1776     const Register to          = c_rarg1;   // destination array address
1777     const Register count       = c_rarg2;   // elements count
1778     const Register ckoff       = c_rarg3;   // super_check_offset
1779     const Register ckval       = c_rarg4;   // super_klass
1780 
1781     // Registers used as temps (r18, r19, r20 are save-on-entry)
1782     const Register count_save  = r21;       // orig elements count
1783     const Register start_to    = r20;       // destination array start address
1784     const Register copied_oop  = r18;       // actual oop copied
1785     const Register r19_klass   = r19;       // oop._klass
1786 
1787     //---------------------------------------------------------------
1788     // Assembler stub will be used for this call to arraycopy
1789     // if the two arrays are subtypes of Object[] but the
1790     // destination array type is not equal to or a supertype
1791     // of the source type.  Each element must be separately
1792     // checked.
1793 
1794     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1795                                copied_oop, r19_klass, count_save);
1796 
1797     __ align(CodeEntryAlignment);
1798     StubCodeMark mark(this, "StubRoutines", name);
1799     address start = __ pc();
1800 
1801     __ enter(); // required for proper stackwalking of RuntimeStub frame
1802 
1803 #ifdef ASSERT
1804     // caller guarantees that the arrays really are different
1805     // otherwise, we would have to make conjoint checks
1806     { Label L;
1807       array_overlap_test(L, TIMES_OOP);
1808       __ stop("checkcast_copy within a single array");
1809       __ bind(L);
1810     }
1811 #endif //ASSERT
1812 
1813     // Caller of this entry point must set up the argument registers.
1814     if (entry != NULL) {
1815       *entry = __ pc();
1816       BLOCK_COMMENT("Entry:");
1817     }
1818 
1819     // Empty array:  Nothing to do.
1820     __ cbz(count, L_done);
1821 
1822     __ push(RegSet::of(r18, r19, r20, r21), sp);
1823 
1824 #ifdef ASSERT
1825     BLOCK_COMMENT("assert consistent ckoff/ckval");
1826     // The ckoff and ckval must be mutually consistent,
1827     // even though caller generates both.
1828     { Label L;
1829       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1830       __ ldrw(start_to, Address(ckval, sco_offset));
1831       __ cmpw(ckoff, start_to);
1832       __ br(Assembler::EQ, L);
1833       __ stop("super_check_offset inconsistent");
1834       __ bind(L);
1835     }
1836 #endif //ASSERT
1837 
1838     gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1839 
1840     // save the original count
1841     __ mov(count_save, count);
1842 
1843     // Copy from low to high addresses
1844     __ mov(start_to, to);              // Save destination array start address
1845     __ b(L_load_element);
1846 
1847     // ======== begin loop ========
1848     // (Loop is rotated; its entry is L_load_element.)
1849     // Loop control:
1850     //   for (; count != 0; count--) {
1851     //     copied_oop = load_heap_oop(from++);
1852     //     ... generate_type_check ...;
1853     //     store_heap_oop(to++, copied_oop);
1854     //   }
1855     __ align(OptoLoopAlignment);
1856 
1857     __ BIND(L_store_element);
1858     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1859     __ sub(count, count, 1);
1860     __ cbz(count, L_do_card_marks);
1861 
1862     // ======== loop entry is here ========
1863     __ BIND(L_load_element);
1864     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1865     __ cbz(copied_oop, L_store_element);
1866 
1867     __ load_klass(r19_klass, copied_oop);// query the object klass
1868     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1869     // ======== end loop ========
1870 
1871     // It was a real error; we must depend on the caller to finish the job.
1872     // Register count = remaining oops, count_orig = total oops.
1873     // Emit GC store barriers for the oops we have copied and report
1874     // their number to the caller.
1875 
1876     __ subs(count, count_save, count);     // K = partially copied oop count
1877     __ eon(count, count, zr);                   // report (-1^K) to caller
1878     __ br(Assembler::EQ, L_done_pop);
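         // The flags are still those of the subs above (eon does not set flags),
         // so EQ means K == 0: nothing was copied and card marking can be skipped.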
1879 
1880     __ BIND(L_do_card_marks);
1881     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1882     gen_write_ref_array_post_barrier(start_to, to, rscratch1);
1883 
1884     __ bind(L_done_pop);
1885     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1886     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1887 
1888     __ bind(L_done);
1889     __ mov(r0, count);
1890     __ leave();
1891     __ ret(lr);
1892 
1893     return start;
1894   }
1895 
1896   // Perform range checks on the proposed arraycopy.
1897   // Kills temp, but nothing else.
1898   // Also, clean the sign bits of src_pos and dst_pos.
1899   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1900                               Register src_pos, // source position (c_rarg1)
1901                               Register dst,     // destination array oop (c_rarg2)
1902                               Register dst_pos, // destination position (c_rarg3)
1903                               Register length,
1904                               Register temp,
1905                               Label& L_failed) {
1906     BLOCK_COMMENT("arraycopy_range_checks:");
1907 
1908     assert_different_registers(rscratch1, temp);
1909 
1910     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1911     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1912     __ addw(temp, length, src_pos);
1913     __ cmpw(temp, rscratch1);
1914     __ br(Assembler::HI, L_failed);
1915 
1916     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1917     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1918     __ addw(temp, length, dst_pos);
1919     __ cmpw(temp, rscratch1);
1920     __ br(Assembler::HI, L_failed);
1921 
1922     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
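         // (a 32-bit register-to-register move zero-extends, so movw(reg, reg)
         // clears bits 63:32 while leaving the low word unchanged)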
1923     __ movw(src_pos, src_pos);
1924     __ movw(dst_pos, dst_pos);
1925 
1926     BLOCK_COMMENT("arraycopy_range_checks done");
1927   }
1928 
1929   // These stubs get called from some dumb test routine.
1930   // I'll write them properly when they're called from
1931   // something that's actually doing something.
1932   static void fake_arraycopy_stub(address src, address dst, int count) {
1933     assert(count == 0, "huh?");
1934   }
1935 
1936 
1937   //
1938   //  Generate 'unsafe' array copy stub
1939   //  Though just as safe as the other stubs, it takes an unscaled
1940   //  size_t argument instead of an element count.
1941   //
1942   //  Input:
1943   //    c_rarg0   - source array address
1944   //    c_rarg1   - destination array address
1945   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1946   //
1947   // Examines the alignment of the operands and dispatches
1948   // to a long, int, short, or byte copy loop.
1949   //
1950   address generate_unsafe_copy(const char *name,
1951                                address byte_copy_entry,
1952                                address short_copy_entry,
1953                                address int_copy_entry,
1954                                address long_copy_entry) {
1955     Label L_long_aligned, L_int_aligned, L_short_aligned;
1956     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1957 
1958     __ align(CodeEntryAlignment);
1959     StubCodeMark mark(this, "StubRoutines", name);
1960     address start = __ pc();
1961     __ enter(); // required for proper stackwalking of RuntimeStub frame
1962 
1963     // bump this on entry, not on exit:
1964     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1965 
1966     __ orr(rscratch1, s, d);
1967     __ orr(rscratch1, rscratch1, count);
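         // rscratch1 now holds (s | d | count); a low-order bit is set iff at
         // least one of the three values is misaligned at that granularity, so
         // testing its low bits selects the widest element size that is safe.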
1968 
1969     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1970     __ cbz(rscratch1, L_long_aligned);
1971     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1972     __ cbz(rscratch1, L_int_aligned);
1973     __ tbz(rscratch1, 0, L_short_aligned);
1974     __ b(RuntimeAddress(byte_copy_entry));
1975 
1976     __ BIND(L_short_aligned);
1977     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1978     __ b(RuntimeAddress(short_copy_entry));
1979     __ BIND(L_int_aligned);
1980     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1981     __ b(RuntimeAddress(int_copy_entry));
1982     __ BIND(L_long_aligned);
1983     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1984     __ b(RuntimeAddress(long_copy_entry));
1985 
1986     return start;
1987   }
1988 
1989   //
1990   //  Generate generic array copy stubs
1991   //
1992   //  Input:
1993   //    c_rarg0    -  src oop
1994   //    c_rarg1    -  src_pos (32-bits)
1995   //    c_rarg2    -  dst oop
1996   //    c_rarg3    -  dst_pos (32-bits)
1997   //    c_rarg4    -  element count (32-bits)
1998   //
1999   //  Output:
2000   //    r0 ==  0  -  success
2001   //    r0 == -1^K - failure, where K is partial transfer count
2002   //
2003   address generate_generic_copy(const char *name,
2004                                 address byte_copy_entry, address short_copy_entry,
2005                                 address int_copy_entry, address oop_copy_entry,
2006                                 address long_copy_entry, address checkcast_copy_entry) {
2007 
2008     Label L_failed, L_failed_0, L_objArray;
2009     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2010 
2011     // Input registers
2012     const Register src        = c_rarg0;  // source array oop
2013     const Register src_pos    = c_rarg1;  // source position
2014     const Register dst        = c_rarg2;  // destination array oop
2015     const Register dst_pos    = c_rarg3;  // destination position
2016     const Register length     = c_rarg4;
2017 
2018     StubCodeMark mark(this, "StubRoutines", name);
2019 
2020     __ align(CodeEntryAlignment);
2021     address start = __ pc();
2022 
2023     __ enter(); // required for proper stackwalking of RuntimeStub frame
2024 
2025     // bump this on entry, not on exit:
2026     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2027 
2028     //-----------------------------------------------------------------------
2029     // Assembler stub will be used for this call to arraycopy
2030     // if the following conditions are met:
2031     //
2032     // (1) src and dst must not be null.
2033     // (2) src_pos must not be negative.
2034     // (3) dst_pos must not be negative.
2035     // (4) length  must not be negative.
2036     // (5) src klass and dst klass should be the same and not NULL.
2037     // (6) src and dst should be arrays.
2038     // (7) src_pos + length must not exceed length of src.
2039     // (8) dst_pos + length must not exceed length of dst.
2040     //
2041 
2042     //  if (src == NULL) return -1;
2043     __ cbz(src, L_failed);
2044 
2045     //  if (src_pos < 0) return -1;
2046     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2047 
2048     //  if (dst == NULL) return -1;
2049     __ cbz(dst, L_failed);
2050 
2051     //  if (dst_pos < 0) return -1;
2052     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2053 
2054     // registers used as temp
2055     const Register scratch_length    = r16; // elements count to copy
2056     const Register scratch_src_klass = r17; // array klass
2057     const Register lh                = r18; // layout helper
2058 
2059     //  if (length < 0) return -1;
2060     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2061     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2062 
2063     __ load_klass(scratch_src_klass, src);
2064 #ifdef ASSERT
2065     //  assert(src->klass() != NULL);
2066     {
2067       BLOCK_COMMENT("assert klasses not null {");
2068       Label L1, L2;
2069       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2070       __ bind(L1);
2071       __ stop("broken null klass");
2072       __ bind(L2);
2073       __ load_klass(rscratch1, dst);
2074       __ cbz(rscratch1, L1);     // this would be broken also
2075       BLOCK_COMMENT("} assert klasses not null done");
2076     }
2077 #endif
2078 
2079     // Load layout helper (32-bits)
2080     //
2081     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2082     // 32        30    24            16              8     2                 0
2083     //
2084     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2085     //
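         // For example (assuming the usual encoding), an int[] has array_tag 0x3,
         // element_type T_INT and log2_element_size 2, while the header_size byte
         // depends on the compressed-oops configuration.  Because the tag occupies
         // the top bits, an array layout helper is always negative, which is what
         // the sign-bit test further down relies on.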
2086 
2087     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2088 
2089     // Handle objArrays completely differently...
2090     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2091     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2092     __ movw(rscratch1, objArray_lh);
2093     __ eorw(rscratch2, lh, rscratch1);
2094     __ cbzw(rscratch2, L_objArray);
2095 
2096     //  if (src->klass() != dst->klass()) return -1;
2097     __ load_klass(rscratch2, dst);
2098     __ eor(rscratch2, rscratch2, scratch_src_klass);
2099     __ cbnz(rscratch2, L_failed);
2100 
2101     //  if (!src->is_Array()) return -1;
2102     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2103 
2104     // At this point, it is known to be a typeArray (array_tag 0x3).
2105 #ifdef ASSERT
2106     {
2107       BLOCK_COMMENT("assert primitive array {");
2108       Label L;
2109       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2110       __ cmpw(lh, rscratch2);
2111       __ br(Assembler::GE, L);
2112       __ stop("must be a primitive array");
2113       __ bind(L);
2114       BLOCK_COMMENT("} assert primitive array done");
2115     }
2116 #endif
2117 
2118     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2119                            rscratch2, L_failed);
2120 
2121     // TypeArrayKlass
2122     //
2123     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2124     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2125     //
2126 
2127     const Register rscratch1_offset = rscratch1;    // array offset
2128     const Register r18_elsize = lh; // element size
2129 
2130     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2131            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2132     __ add(src, src, rscratch1_offset);           // src array offset
2133     __ add(dst, dst, rscratch1_offset);           // dst array offset
2134     BLOCK_COMMENT("choose copy loop based on element size");
2135 
2136     // next registers should be set before the jump to corresponding stub
2137     const Register from     = c_rarg0;  // source array address
2138     const Register to       = c_rarg1;  // destination array address
2139     const Register count    = c_rarg2;  // elements count
2140 
2141     // 'from', 'to' and 'count' must be set in this order, since they alias
2142     // 'src', 'src_pos' and 'dst' and would otherwise clobber values still needed.
2143 
2144     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2145 
2146     // The possible values of elsize are 0-3, i.e. exact_log2(element
2147     // size in bytes).  We do a simple bitwise binary search.
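         // Sketch of the dispatch below, keyed on the low two bits of the layout
         // helper (the log2 element size):
         //   bit1 = 0, bit0 = 0  ->  byte copy   (elsize == 0)
         //   bit1 = 0, bit0 = 1  ->  short copy  (elsize == 1)
         //   bit1 = 1, bit0 = 0  ->  int copy    (elsize == 2)
         //   bit1 = 1, bit0 = 1  ->  long copy   (elsize == 3)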
2148   __ BIND(L_copy_bytes);
2149     __ tbnz(r18_elsize, 1, L_copy_ints);
2150     __ tbnz(r18_elsize, 0, L_copy_shorts);
2151     __ lea(from, Address(src, src_pos));// src_addr
2152     __ lea(to,   Address(dst, dst_pos));// dst_addr
2153     __ movw(count, scratch_length); // length
2154     __ b(RuntimeAddress(byte_copy_entry));
2155 
2156   __ BIND(L_copy_shorts);
2157     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2158     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2159     __ movw(count, scratch_length); // length
2160     __ b(RuntimeAddress(short_copy_entry));
2161 
2162   __ BIND(L_copy_ints);
2163     __ tbnz(r18_elsize, 0, L_copy_longs);
2164     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2165     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2166     __ movw(count, scratch_length); // length
2167     __ b(RuntimeAddress(int_copy_entry));
2168 
2169   __ BIND(L_copy_longs);
2170 #ifdef ASSERT
2171     {
2172       BLOCK_COMMENT("assert long copy {");
2173       Label L;
2174       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2175       __ cmpw(r18_elsize, LogBytesPerLong);
2176       __ br(Assembler::EQ, L);
2177       __ stop("must be long copy, but elsize is wrong");
2178       __ bind(L);
2179       BLOCK_COMMENT("} assert long copy done");
2180     }
2181 #endif
2182     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2183     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2184     __ movw(count, scratch_length); // length
2185     __ b(RuntimeAddress(long_copy_entry));
2186 
2187     // ObjArrayKlass
2188   __ BIND(L_objArray);
2189     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2190 
2191     Label L_plain_copy, L_checkcast_copy;
2192     //  test array classes for subtyping
2193     __ load_klass(r18, dst);
2194     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2195     __ br(Assembler::NE, L_checkcast_copy);
2196 
2197     // Identically typed arrays can be copied without element-wise checks.
2198     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2199                            rscratch2, L_failed);
2200 
2201     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2202     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2203     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2204     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2205     __ movw(count, scratch_length); // length
2206   __ BIND(L_plain_copy);
2207     __ b(RuntimeAddress(oop_copy_entry));
2208 
2209   __ BIND(L_checkcast_copy);
2210     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2211     {
2212       // Before looking at dst.length, make sure dst is also an objArray.
2213       __ ldrw(rscratch1, Address(r18, lh_offset));
2214       __ movw(rscratch2, objArray_lh);
2215       __ eorw(rscratch1, rscratch1, rscratch2);
2216       __ cbnzw(rscratch1, L_failed);
2217 
2218       // It is safe to examine both src.length and dst.length.
2219       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2220                              r18, L_failed);
2221 
2222       const Register rscratch2_dst_klass = rscratch2;
2223       __ load_klass(rscratch2_dst_klass, dst); // reload
2224 
2225       // Marshal the base address arguments now, freeing registers.
2226       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2227       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2228       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2229       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2230       __ movw(count, length);           // length (reloaded)
2231       Register sco_temp = c_rarg3;      // this register is free now
2232       assert_different_registers(from, to, count, sco_temp,
2233                                  rscratch2_dst_klass, scratch_src_klass);
2234       // assert_clean_int(count, sco_temp);
2235 
2236       // Generate the type check.
2237       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2238       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2239       // assert_clean_int(sco_temp, r18);
2240       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2241 
2242       // Fetch destination element klass from the ObjArrayKlass header.
2243       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2244       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2245       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2246 
2247       // the checkcast_copy loop needs two extra arguments:
2248       assert(c_rarg3 == sco_temp, "#3 already in place");
2249       // Set up arguments for checkcast_copy_entry.
2250       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2251       __ b(RuntimeAddress(checkcast_copy_entry));
2252     }
2253 
2254   __ BIND(L_failed);
2255     __ mov(r0, -1);
2256     __ leave();   // required for proper stackwalking of RuntimeStub frame
2257     __ ret(lr);
2258 
2259     return start;
2260   }
2261 
2262   //
2263   // Generate stub for array fill. If "aligned" is true, the
2264   // "to" address is assumed to be heapword aligned.
2265   //
2266   // Arguments for generated stub:
2267   //   to:    c_rarg0
2268   //   value: c_rarg1
2269   //   count: c_rarg2 treated as signed
2270   //
2271   address generate_fill(BasicType t, bool aligned, const char *name) {
2272     __ align(CodeEntryAlignment);
2273     StubCodeMark mark(this, "StubRoutines", name);
2274     address start = __ pc();
2275 
2276     BLOCK_COMMENT("Entry:");
2277 
2278     const Register to        = c_rarg0;  // destination array address
2279     const Register value     = c_rarg1;  // value
2280     const Register count     = c_rarg2;  // elements count
2281 
2282     const Register bz_base = r10;        // base for block_zero routine
2283     const Register cnt_words = r11;      // temp register
2284 
2285     __ enter();
2286 
2287     Label L_fill_elements, L_exit1;
2288 
2289     int shift = -1;
2290     switch (t) {
2291       case T_BYTE:
2292         shift = 0;
2293         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2294         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2295         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2296         __ br(Assembler::LO, L_fill_elements);
2297         break;
2298       case T_SHORT:
2299         shift = 1;
2300         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2301         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2302         __ br(Assembler::LO, L_fill_elements);
2303         break;
2304       case T_INT:
2305         shift = 2;
2306         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2307         __ br(Assembler::LO, L_fill_elements);
2308         break;
2309       default: ShouldNotReachHere();
2310     }
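         // Whatever the element type, 'value' now holds the fill pattern
         // replicated to 32 bits (a byte is widened 8->16->32, a short 16->32,
         // an int is used as-is); it is widened to 64 bits just before the
         // bulk fill below.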
2311 
2312     // Align source address at 8 bytes address boundary.
2313     Label L_skip_align1, L_skip_align2, L_skip_align4;
2314     if (!aligned) {
2315       switch (t) {
2316         case T_BYTE:
2317           // One byte misalignment happens only for byte arrays.
2318           __ tbz(to, 0, L_skip_align1);
2319           __ strb(value, Address(__ post(to, 1)));
2320           __ subw(count, count, 1);
2321           __ bind(L_skip_align1);
2322           // Fallthrough
2323         case T_SHORT:
2324           // Two bytes misalignment happens only for byte and short (char) arrays.
2325           __ tbz(to, 1, L_skip_align2);
2326           __ strh(value, Address(__ post(to, 2)));
2327           __ subw(count, count, 2 >> shift);
2328           __ bind(L_skip_align2);
2329           // Fallthrough
2330         case T_INT:
2331           // Align to 8 bytes, we know we are 4 byte aligned to start.
2332           __ tbz(to, 2, L_skip_align4);
2333           __ strw(value, Address(__ post(to, 4)));
2334           __ subw(count, count, 4 >> shift);
2335           __ bind(L_skip_align4);
2336           break;
2337         default: ShouldNotReachHere();
2338       }
2339     }
2340 
2341     //
2342     //  Fill large chunks
2343     //
2344     __ lsrw(cnt_words, count, 3 - shift); // number of words
2345     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2346     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2347     if (UseBlockZeroing) {
2348       Label non_block_zeroing, rest;
2349       // count >= BlockZeroingLowLimit && value == 0
2350       __ cmp(cnt_words, BlockZeroingLowLimit >> 3);
2351       __ ccmp(value, 0 /* comparing value */, 0 /* NZCV */, Assembler::GE);
2352       __ br(Assembler::NE, non_block_zeroing);
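           // The ccmp chains the two tests: if cnt_words was below the limit
           // the flags are forced to NZCV = 0, so NE is taken; otherwise they
           // reflect (value == 0) and only an all-zero fill block-zeroes.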
2353       __ mov(bz_base, to);
2354       __ block_zero(bz_base, cnt_words, true);
2355       __ mov(to, bz_base);
2356       __ b(rest);
2357       __ bind(non_block_zeroing);
2358       __ fill_words(to, cnt_words, value);
2359       __ bind(rest);
2360     }
2361     else {
2362       __ fill_words(to, cnt_words, value);
2363     }
2364 
2365     // Remaining count is less than 8 bytes. Fill it by a single store.
2366     // Note that the total length is no less than 8 bytes.
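         // For example, an already-aligned 13-byte fill leaves count == 5 here;
         // the 64-bit store at (end - 8) rewrites the last three already-filled
         // bytes with the same pattern and covers the five remaining ones.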
2367     if (t == T_BYTE || t == T_SHORT) {
2368       Label L_exit1;
2369       __ cbzw(count, L_exit1);
2370       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2371       __ str(value, Address(to, -8));    // overwrite some elements
2372       __ bind(L_exit1);
2373       __ leave();
2374       __ ret(lr);
2375     }
2376 
2377     // Handle copies less than 8 bytes.
2378     Label L_fill_2, L_fill_4, L_exit2;
2379     __ bind(L_fill_elements);
2380     switch (t) {
2381       case T_BYTE:
2382         __ tbz(count, 0, L_fill_2);
2383         __ strb(value, Address(__ post(to, 1)));
2384         __ bind(L_fill_2);
2385         __ tbz(count, 1, L_fill_4);
2386         __ strh(value, Address(__ post(to, 2)));
2387         __ bind(L_fill_4);
2388         __ tbz(count, 2, L_exit2);
2389         __ strw(value, Address(to));
2390         break;
2391       case T_SHORT:
2392         __ tbz(count, 0, L_fill_4);
2393         __ strh(value, Address(__ post(to, 2)));
2394         __ bind(L_fill_4);
2395         __ tbz(count, 1, L_exit2);
2396         __ strw(value, Address(to));
2397         break;
2398       case T_INT:
2399         __ cbzw(count, L_exit2);
2400         __ strw(value, Address(to));
2401         break;
2402       default: ShouldNotReachHere();
2403     }
2404     __ bind(L_exit2);
2405     __ leave();
2406     __ ret(lr);
2407     return start;
2408   }
2409 
2410   void generate_arraycopy_stubs() {
2411     address entry;
2412     address entry_jbyte_arraycopy;
2413     address entry_jshort_arraycopy;
2414     address entry_jint_arraycopy;
2415     address entry_oop_arraycopy;
2416     address entry_jlong_arraycopy;
2417     address entry_checkcast_arraycopy;
2418 
2419     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2420     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2421 
2422     StubRoutines::aarch64::_zero_longs = generate_zero_longs(r10, r11);
2423 
2424     //*** jbyte
2425     // Always need aligned and unaligned versions
2426     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2427                                                                                   "jbyte_disjoint_arraycopy");
2428     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2429                                                                                   &entry_jbyte_arraycopy,
2430                                                                                   "jbyte_arraycopy");
2431     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2432                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2433     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2434                                                                                   "arrayof_jbyte_arraycopy");
2435 
2436     //*** jshort
2437     // Always need aligned and unaligned versions
2438     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2439                                                                                     "jshort_disjoint_arraycopy");
2440     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2441                                                                                     &entry_jshort_arraycopy,
2442                                                                                     "jshort_arraycopy");
2443     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2444                                                                                     "arrayof_jshort_disjoint_arraycopy");
2445     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2446                                                                                     "arrayof_jshort_arraycopy");
2447 
2448     //*** jint
2449     // Aligned versions
2450     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2451                                                                                 "arrayof_jint_disjoint_arraycopy");
2452     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2453                                                                                 "arrayof_jint_arraycopy");
2454     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2455     // entry_jint_arraycopy always points to the unaligned version
2456     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2457                                                                                 "jint_disjoint_arraycopy");
2458     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2459                                                                                 &entry_jint_arraycopy,
2460                                                                                 "jint_arraycopy");
2461 
2462     //*** jlong
2463     // It is always aligned
2464     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2465                                                                                   "arrayof_jlong_disjoint_arraycopy");
2466     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2467                                                                                   "arrayof_jlong_arraycopy");
2468     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2469     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2470 
2471     //*** oops
2472     {
2473       // With compressed oops we need unaligned versions; notice that
2474       // we overwrite entry_oop_arraycopy.
2475       bool aligned = !UseCompressedOops;
2476 
2477       StubRoutines::_arrayof_oop_disjoint_arraycopy
2478         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2479                                      /*dest_uninitialized*/false);
2480       StubRoutines::_arrayof_oop_arraycopy
2481         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2482                                      /*dest_uninitialized*/false);
2483       // Aligned versions without pre-barriers
2484       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2485         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2486                                      /*dest_uninitialized*/true);
2487       StubRoutines::_arrayof_oop_arraycopy_uninit
2488         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2489                                      /*dest_uninitialized*/true);
2490     }
2491 
2492     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2493     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2494     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2495     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2496 
2497     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2498     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2499                                                                         /*dest_uninitialized*/true);
2500 
2501     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2502                                                               entry_jbyte_arraycopy,
2503                                                               entry_jshort_arraycopy,
2504                                                               entry_jint_arraycopy,
2505                                                               entry_jlong_arraycopy);
2506 
2507     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2508                                                                entry_jbyte_arraycopy,
2509                                                                entry_jshort_arraycopy,
2510                                                                entry_jint_arraycopy,
2511                                                                entry_oop_arraycopy,
2512                                                                entry_jlong_arraycopy,
2513                                                                entry_checkcast_arraycopy);
2514 
2515     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2516     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2517     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2518     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2519     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2520     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2521   }
2522 
2523   void generate_math_stubs() { Unimplemented(); }
2524 
2525   // Arguments:
2526   //
2527   // Inputs:
2528   //   c_rarg0   - source byte array address
2529   //   c_rarg1   - destination byte array address
2530   //   c_rarg2   - K (key) in little endian int array
2531   //
2532   address generate_aescrypt_encryptBlock() {
2533     __ align(CodeEntryAlignment);
2534     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2535 
2536     Label L_doLast;
2537 
2538     const Register from        = c_rarg0;  // source array address
2539     const Register to          = c_rarg1;  // destination array address
2540     const Register key         = c_rarg2;  // key array address
2541     const Register keylen      = rscratch1;
2542 
2543     address start = __ pc();
2544     __ enter();
2545 
2546     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
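         // keylen is the expanded key length in ints: 44 for AES-128 (10 rounds),
         // 52 for AES-192 (12 rounds) and 60 for AES-256 (14 rounds).  The cmpw
         // checks against 44 and 52 below select how many extra rounds to run.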
2547 
2548     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2549 
2550     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2551     __ rev32(v1, __ T16B, v1);
2552     __ rev32(v2, __ T16B, v2);
2553     __ rev32(v3, __ T16B, v3);
2554     __ rev32(v4, __ T16B, v4);
2555     __ aese(v0, v1);
2556     __ aesmc(v0, v0);
2557     __ aese(v0, v2);
2558     __ aesmc(v0, v0);
2559     __ aese(v0, v3);
2560     __ aesmc(v0, v0);
2561     __ aese(v0, v4);
2562     __ aesmc(v0, v0);
2563 
2564     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2565     __ rev32(v1, __ T16B, v1);
2566     __ rev32(v2, __ T16B, v2);
2567     __ rev32(v3, __ T16B, v3);
2568     __ rev32(v4, __ T16B, v4);
2569     __ aese(v0, v1);
2570     __ aesmc(v0, v0);
2571     __ aese(v0, v2);
2572     __ aesmc(v0, v0);
2573     __ aese(v0, v3);
2574     __ aesmc(v0, v0);
2575     __ aese(v0, v4);
2576     __ aesmc(v0, v0);
2577 
2578     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2579     __ rev32(v1, __ T16B, v1);
2580     __ rev32(v2, __ T16B, v2);
2581 
2582     __ cmpw(keylen, 44);
2583     __ br(Assembler::EQ, L_doLast);
2584 
2585     __ aese(v0, v1);
2586     __ aesmc(v0, v0);
2587     __ aese(v0, v2);
2588     __ aesmc(v0, v0);
2589 
2590     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2591     __ rev32(v1, __ T16B, v1);
2592     __ rev32(v2, __ T16B, v2);
2593 
2594     __ cmpw(keylen, 52);
2595     __ br(Assembler::EQ, L_doLast);
2596 
2597     __ aese(v0, v1);
2598     __ aesmc(v0, v0);
2599     __ aese(v0, v2);
2600     __ aesmc(v0, v0);
2601 
2602     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2603     __ rev32(v1, __ T16B, v1);
2604     __ rev32(v2, __ T16B, v2);
2605 
2606     __ BIND(L_doLast);
2607 
2608     __ aese(v0, v1);
2609     __ aesmc(v0, v0);
2610     __ aese(v0, v2);
2611 
2612     __ ld1(v1, __ T16B, key);
2613     __ rev32(v1, __ T16B, v1);
2614     __ eor(v0, __ T16B, v0, v1);
2615 
2616     __ st1(v0, __ T16B, to);
2617 
2618     __ mov(r0, 0);
2619 
2620     __ leave();
2621     __ ret(lr);
2622 
2623     return start;
2624   }
2625 
2626   // Arguments:
2627   //
2628   // Inputs:
2629   //   c_rarg0   - source byte array address
2630   //   c_rarg1   - destination byte array address
2631   //   c_rarg2   - K (key) in little endian int array
2632   //
2633   address generate_aescrypt_decryptBlock() {
2634     assert(UseAES, "need AES instructions support");
2635     __ align(CodeEntryAlignment);
2636     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2637     Label L_doLast;
2638 
2639     const Register from        = c_rarg0;  // source array address
2640     const Register to          = c_rarg1;  // destination array address
2641     const Register key         = c_rarg2;  // key array address
2642     const Register keylen      = rscratch1;
2643 
2644     address start = __ pc();
2645     __ enter(); // required for proper stackwalking of RuntimeStub frame
2646 
2647     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2648 
2649     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2650 
2651     __ ld1(v5, __ T16B, __ post(key, 16));
2652     __ rev32(v5, __ T16B, v5);
2653 
2654     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2655     __ rev32(v1, __ T16B, v1);
2656     __ rev32(v2, __ T16B, v2);
2657     __ rev32(v3, __ T16B, v3);
2658     __ rev32(v4, __ T16B, v4);
2659     __ aesd(v0, v1);
2660     __ aesimc(v0, v0);
2661     __ aesd(v0, v2);
2662     __ aesimc(v0, v0);
2663     __ aesd(v0, v3);
2664     __ aesimc(v0, v0);
2665     __ aesd(v0, v4);
2666     __ aesimc(v0, v0);
2667 
2668     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2669     __ rev32(v1, __ T16B, v1);
2670     __ rev32(v2, __ T16B, v2);
2671     __ rev32(v3, __ T16B, v3);
2672     __ rev32(v4, __ T16B, v4);
2673     __ aesd(v0, v1);
2674     __ aesimc(v0, v0);
2675     __ aesd(v0, v2);
2676     __ aesimc(v0, v0);
2677     __ aesd(v0, v3);
2678     __ aesimc(v0, v0);
2679     __ aesd(v0, v4);
2680     __ aesimc(v0, v0);
2681 
2682     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2683     __ rev32(v1, __ T16B, v1);
2684     __ rev32(v2, __ T16B, v2);
2685 
2686     __ cmpw(keylen, 44);
2687     __ br(Assembler::EQ, L_doLast);
2688 
2689     __ aesd(v0, v1);
2690     __ aesimc(v0, v0);
2691     __ aesd(v0, v2);
2692     __ aesimc(v0, v0);
2693 
2694     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2695     __ rev32(v1, __ T16B, v1);
2696     __ rev32(v2, __ T16B, v2);
2697 
2698     __ cmpw(keylen, 52);
2699     __ br(Assembler::EQ, L_doLast);
2700 
2701     __ aesd(v0, v1);
2702     __ aesimc(v0, v0);
2703     __ aesd(v0, v2);
2704     __ aesimc(v0, v0);
2705 
2706     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2707     __ rev32(v1, __ T16B, v1);
2708     __ rev32(v2, __ T16B, v2);
2709 
2710     __ BIND(L_doLast);
2711 
2712     __ aesd(v0, v1);
2713     __ aesimc(v0, v0);
2714     __ aesd(v0, v2);
2715 
2716     __ eor(v0, __ T16B, v0, v5);
2717 
2718     __ st1(v0, __ T16B, to);
2719 
2720     __ mov(r0, 0);
2721 
2722     __ leave();
2723     __ ret(lr);
2724 
2725     return start;
2726   }
2727 
2728   // Arguments:
2729   //
2730   // Inputs:
2731   //   c_rarg0   - source byte array address
2732   //   c_rarg1   - destination byte array address
2733   //   c_rarg2   - K (key) in little endian int array
2734   //   c_rarg3   - r vector byte array address
2735   //   c_rarg4   - input length
2736   //
2737   // Output:
2738   //   r0        - input length
2739   //
2740   address generate_cipherBlockChaining_encryptAESCrypt() {
2741     assert(UseAES, "need AES instructions support");
2742     __ align(CodeEntryAlignment);
2743     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2744 
2745     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2746 
2747     const Register from        = c_rarg0;  // source array address
2748     const Register to          = c_rarg1;  // destination array address
2749     const Register key         = c_rarg2;  // key array address
2750     const Register rvec        = c_rarg3;  // r vector byte array, initialized from the init vector and
2751                                            // left holding the last ciphertext block on exit
2752     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2753     const Register keylen      = rscratch1;
2754 
2755     address start = __ pc();
2756       __ enter();
2757 
2758       __ mov(rscratch2, len_reg);
2759       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2760 
2761       __ ld1(v0, __ T16B, rvec);
2762 
2763       __ cmpw(keylen, 52);
2764       __ br(Assembler::CC, L_loadkeys_44);
2765       __ br(Assembler::EQ, L_loadkeys_52);
2766 
2767       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2768       __ rev32(v17, __ T16B, v17);
2769       __ rev32(v18, __ T16B, v18);
2770     __ BIND(L_loadkeys_52);
2771       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2772       __ rev32(v19, __ T16B, v19);
2773       __ rev32(v20, __ T16B, v20);
2774     __ BIND(L_loadkeys_44);
2775       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2776       __ rev32(v21, __ T16B, v21);
2777       __ rev32(v22, __ T16B, v22);
2778       __ rev32(v23, __ T16B, v23);
2779       __ rev32(v24, __ T16B, v24);
2780       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2781       __ rev32(v25, __ T16B, v25);
2782       __ rev32(v26, __ T16B, v26);
2783       __ rev32(v27, __ T16B, v27);
2784       __ rev32(v28, __ T16B, v28);
2785       __ ld1(v29, v30, v31, __ T16B, key);
2786       __ rev32(v29, __ T16B, v29);
2787       __ rev32(v30, __ T16B, v30);
2788       __ rev32(v31, __ T16B, v31);
2789 
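         // Main encryption loop.  v0 carries the CBC chaining value (the IV,
         // then each ciphertext block); it is XORed into the next plaintext
         // block before that block is encrypted.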
2790     __ BIND(L_aes_loop);
2791       __ ld1(v1, __ T16B, __ post(from, 16));
2792       __ eor(v0, __ T16B, v0, v1);
2793 
2794       __ br(Assembler::CC, L_rounds_44);
2795       __ br(Assembler::EQ, L_rounds_52);
2796 
2797       __ aese(v0, v17); __ aesmc(v0, v0);
2798       __ aese(v0, v18); __ aesmc(v0, v0);
2799     __ BIND(L_rounds_52);
2800       __ aese(v0, v19); __ aesmc(v0, v0);
2801       __ aese(v0, v20); __ aesmc(v0, v0);
2802     __ BIND(L_rounds_44);
2803       __ aese(v0, v21); __ aesmc(v0, v0);
2804       __ aese(v0, v22); __ aesmc(v0, v0);
2805       __ aese(v0, v23); __ aesmc(v0, v0);
2806       __ aese(v0, v24); __ aesmc(v0, v0);
2807       __ aese(v0, v25); __ aesmc(v0, v0);
2808       __ aese(v0, v26); __ aesmc(v0, v0);
2809       __ aese(v0, v27); __ aesmc(v0, v0);
2810       __ aese(v0, v28); __ aesmc(v0, v0);
2811       __ aese(v0, v29); __ aesmc(v0, v0);
2812       __ aese(v0, v30);
2813       __ eor(v0, __ T16B, v0, v31);
2814 
2815       __ st1(v0, __ T16B, __ post(to, 16));
2816       __ sub(len_reg, len_reg, 16);
2817       __ cbnz(len_reg, L_aes_loop);
2818 
2819       __ st1(v0, __ T16B, rvec);
2820 
2821       __ mov(r0, rscratch2);
2822 
2823       __ leave();
2824       __ ret(lr);
2825 
2826       return start;
2827   }
2828 
2829   // Arguments:
2830   //
2831   // Inputs:
2832   //   c_rarg0   - source byte array address
2833   //   c_rarg1   - destination byte array address
2834   //   c_rarg2   - K (key) in little endian int array
2835   //   c_rarg3   - r vector byte array address
2836   //   c_rarg4   - input length
2837   //
2838   // Output:
2839   //   r0        - input length
2840   //
2841   address generate_cipherBlockChaining_decryptAESCrypt() {
2842     assert(UseAES, "need AES instructions");
2843     __ align(CodeEntryAlignment);
2844     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2845 
2846     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2847 
2848     const Register from        = c_rarg0;  // source array address
2849     const Register to          = c_rarg1;  // destination array address
2850     const Register key         = c_rarg2;  // key array address
2851     const Register rvec        = c_rarg3;  // r byte array address; the array is initialized from the init vector (IV)
2852                                            // and is left holding the last input (cipher text) block on exit
2853     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2854     const Register keylen      = rscratch1;
2855 
2856     address start = __ pc();
2857       __ enter();
2858 
2859       __ mov(rscratch2, len_reg);
2860       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2861 
2862       __ ld1(v2, __ T16B, rvec);
2863 
2864       __ ld1(v31, __ T16B, __ post(key, 16));
2865       __ rev32(v31, __ T16B, v31);
2866 
2867       __ cmpw(keylen, 52);
2868       __ br(Assembler::CC, L_loadkeys_44);
2869       __ br(Assembler::EQ, L_loadkeys_52);
2870 
2871       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2872       __ rev32(v17, __ T16B, v17);
2873       __ rev32(v18, __ T16B, v18);
2874     __ BIND(L_loadkeys_52);
2875       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2876       __ rev32(v19, __ T16B, v19);
2877       __ rev32(v20, __ T16B, v20);
2878     __ BIND(L_loadkeys_44);
2879       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2880       __ rev32(v21, __ T16B, v21);
2881       __ rev32(v22, __ T16B, v22);
2882       __ rev32(v23, __ T16B, v23);
2883       __ rev32(v24, __ T16B, v24);
2884       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2885       __ rev32(v25, __ T16B, v25);
2886       __ rev32(v26, __ T16B, v26);
2887       __ rev32(v27, __ T16B, v27);
2888       __ rev32(v28, __ T16B, v28);
2889       __ ld1(v29, v30, __ T16B, key);
2890       __ rev32(v29, __ T16B, v29);
2891       __ rev32(v30, __ T16B, v30);
2892 
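         // Main decryption loop.  Keep a copy of each input cipher block in
         // v1: CBC decryption XORs the previous cipher block (v2) into the
         // decrypted output, and the saved copy then becomes the next
         // chaining value.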
2893     __ BIND(L_aes_loop);
2894       __ ld1(v0, __ T16B, __ post(from, 16));
2895       __ orr(v1, __ T16B, v0, v0);
2896 
2897       __ br(Assembler::CC, L_rounds_44);
2898       __ br(Assembler::EQ, L_rounds_52);
2899 
2900       __ aesd(v0, v17); __ aesimc(v0, v0);
2901       __ aesd(v0, v18); __ aesimc(v0, v0);
2902     __ BIND(L_rounds_52);
2903       __ aesd(v0, v19); __ aesimc(v0, v0);
2904       __ aesd(v0, v20); __ aesimc(v0, v0);
2905     __ BIND(L_rounds_44);
2906       __ aesd(v0, v21); __ aesimc(v0, v0);
2907       __ aesd(v0, v22); __ aesimc(v0, v0);
2908       __ aesd(v0, v23); __ aesimc(v0, v0);
2909       __ aesd(v0, v24); __ aesimc(v0, v0);
2910       __ aesd(v0, v25); __ aesimc(v0, v0);
2911       __ aesd(v0, v26); __ aesimc(v0, v0);
2912       __ aesd(v0, v27); __ aesimc(v0, v0);
2913       __ aesd(v0, v28); __ aesimc(v0, v0);
2914       __ aesd(v0, v29); __ aesimc(v0, v0);
2915       __ aesd(v0, v30);
2916       __ eor(v0, __ T16B, v0, v31);
2917       __ eor(v0, __ T16B, v0, v2);
2918 
2919       __ st1(v0, __ T16B, __ post(to, 16));
2920       __ orr(v2, __ T16B, v1, v1);
2921 
2922       __ sub(len_reg, len_reg, 16);
2923       __ cbnz(len_reg, L_aes_loop);
2924 
2925       __ st1(v2, __ T16B, rvec);
2926 
2927       __ mov(r0, rscratch2);
2928 
2929       __ leave();
2930       __ ret(lr);
2931 
2932     return start;
2933   }
2934 
2935   // Arguments:
2936   //
2937   // Inputs:
2938   //   c_rarg0   - byte[]  source+offset
2939   //   c_rarg1   - int[]   SHA.state
2940   //   c_rarg2   - int     offset
2941   //   c_rarg3   - int     limit
2942   //
2943   address generate_sha1_implCompress(bool multi_block, const char *name) {
2944     __ align(CodeEntryAlignment);
2945     StubCodeMark mark(this, "StubRoutines", name);
2946     address start = __ pc();
2947 
2948     Register buf   = c_rarg0;
2949     Register state = c_rarg1;
2950     Register ofs   = c_rarg2;
2951     Register limit = c_rarg3;
2952 
2953     Label keys;
2954     Label sha1_loop;
2955 
2956     // load the keys into v0..v3
2957     __ adr(rscratch1, keys);
2958     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2959     // load 5 words state into v6, v7
2960     __ ldrq(v6, Address(state, 0));
2961     __ ldrs(v7, Address(state, 16));
2962 
2963 
2964     __ BIND(sha1_loop);
2965     // load 64 bytes of data into v16..v19
2966     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2967     __ rev32(v16, __ T16B, v16);
2968     __ rev32(v17, __ T16B, v17);
2969     __ rev32(v18, __ T16B, v18);
2970     __ rev32(v19, __ T16B, v19);
2971 
2972     // do the sha1
2973     __ addv(v4, __ T4S, v16, v0);
2974     __ orr(v20, __ T16B, v6, v6);
2975 
2976     FloatRegister d0 = v16;
2977     FloatRegister d1 = v17;
2978     FloatRegister d2 = v18;
2979     FloatRegister d3 = v19;
2980 
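         // 20 iterations of 4 SHA-1 rounds each (80 rounds in total); the
         // message-schedule registers d0..d3 are rotated at the end of
         // every iteration.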
2981     for (int round = 0; round < 20; round++) {
2982       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2983       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2984       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2985       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2986       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2987 
2988       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2989       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2990       __ sha1h(tmp2, __ T4S, v20);
2991       if (round < 5)
2992         __ sha1c(v20, __ T4S, tmp3, tmp4);
2993       else if (round < 10 || round >= 15)
2994         __ sha1p(v20, __ T4S, tmp3, tmp4);
2995       else
2996         __ sha1m(v20, __ T4S, tmp3, tmp4);
2997       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2998 
2999       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3000     }
3001 
3002     __ addv(v7, __ T2S, v7, v21);
3003     __ addv(v6, __ T4S, v6, v20);
3004 
3005     if (multi_block) {
3006       __ add(ofs, ofs, 64);
3007       __ cmp(ofs, limit);
3008       __ br(Assembler::LE, sha1_loop);
3009       __ mov(c_rarg0, ofs); // return ofs
3010     }
3011 
3012     __ strq(v6, Address(state, 0));
3013     __ strs(v7, Address(state, 16));
3014 
3015     __ ret(lr);
3016 
3017     __ bind(keys);
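         // The four SHA-1 round constants, one for each group of 20 rounds.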
3018     __ emit_int32(0x5a827999);
3019     __ emit_int32(0x6ed9eba1);
3020     __ emit_int32(0x8f1bbcdc);
3021     __ emit_int32(0xca62c1d6);
3022 
3023     return start;
3024   }
3025 
3026 
3027   // Arguments:
3028   //
3029   // Inputs:
3030   //   c_rarg0   - byte[]  source+offset
3031   //   c_rarg1   - int[]   SHA.state
3032   //   c_rarg2   - int     offset
3033   //   c_rarg3   - int     limit
3034   //
3035   address generate_sha256_implCompress(bool multi_block, const char *name) {
3036     static const uint32_t round_consts[64] = {
3037       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3038       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3039       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3040       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3041       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3042       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3043       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3044       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3045       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3046       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3047       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3048       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3049       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3050       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3051       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3052       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3053     };
3054     __ align(CodeEntryAlignment);
3055     StubCodeMark mark(this, "StubRoutines", name);
3056     address start = __ pc();
3057 
3058     Register buf   = c_rarg0;
3059     Register state = c_rarg1;
3060     Register ofs   = c_rarg2;
3061     Register limit = c_rarg3;
3062 
3063     Label sha1_loop;
3064 
3065     __ stpd(v8, v9, __ pre(sp, -32));
3066     __ stpd(v10, v11, Address(sp, 16));
3067 
3068 // dga == v0
3069 // dgb == v1
3070 // dg0 == v2
3071 // dg1 == v3
3072 // dg2 == v4
3073 // t0 == v6
3074 // t1 == v7
3075 
3076     // load 16 keys to v16..v31
3077     __ lea(rscratch1, ExternalAddress((address)round_consts));
3078     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3079     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3080     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3081     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3082 
3083     // load 8 words (256 bits) state
3084     __ ldpq(v0, v1, state);
3085 
3086     __ BIND(sha1_loop);
3087     // load 64 bytes of data into v8..v11
3088     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3089     __ rev32(v8, __ T16B, v8);
3090     __ rev32(v9, __ T16B, v9);
3091     __ rev32(v10, __ T16B, v10);
3092     __ rev32(v11, __ T16B, v11);
3093 
3094     __ addv(v6, __ T4S, v8, v16);
3095     __ orr(v2, __ T16B, v0, v0);
3096     __ orr(v3, __ T16B, v1, v1);
3097 
3098     FloatRegister d0 = v8;
3099     FloatRegister d1 = v9;
3100     FloatRegister d2 = v10;
3101     FloatRegister d3 = v11;
3102 
3103 
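         // 16 iterations of 4 SHA-256 rounds each (64 rounds in total); the
         // message-schedule registers d0..d3 are rotated every iteration.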
3104     for (int round = 0; round < 16; round++) {
3105       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3106       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3107       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3108       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3109 
3110       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3111        __ orr(v4, __ T16B, v2, v2);
3112       if (round < 15)
3113         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3114       __ sha256h(v2, __ T4S, v3, tmp2);
3115       __ sha256h2(v3, __ T4S, v4, tmp2);
3116       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3117 
3118       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3119     }
3120 
3121     __ addv(v0, __ T4S, v0, v2);
3122     __ addv(v1, __ T4S, v1, v3);
3123 
3124     if (multi_block) {
3125       __ add(ofs, ofs, 64);
3126       __ cmp(ofs, limit);
3127       __ br(Assembler::LE, sha1_loop);
3128       __ mov(c_rarg0, ofs); // return ofs
3129     }
3130 
3131     __ ldpd(v10, v11, Address(sp, 16));
3132     __ ldpd(v8, v9, __ post(sp, 32));
3133 
3134     __ stpq(v0, v1, state);
3135 
3136     __ ret(lr);
3137 
3138     return start;
3139   }
3140 
3141 #ifndef BUILTIN_SIM
3142   // Safefetch stubs.
3143   void generate_safefetch(const char* name, int size, address* entry,
3144                           address* fault_pc, address* continuation_pc) {
3145     // safefetch signatures:
3146     //   int      SafeFetch32(int*      adr, int      errValue);
3147     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3148     //
3149     // arguments:
3150     //   c_rarg0 = adr
3151     //   c_rarg1 = errValue
3152     //
3153     // result:
3154     //   r0       = *adr or errValue
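         //
         // If the load at *fault_pc takes a fault, the signal handler resumes
         // execution at *continuation_pc with errValue still in c_rarg1.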
3155 
3156     StubCodeMark mark(this, "StubRoutines", name);
3157 
3158     // Entry point, pc or function descriptor.
3159     *entry = __ pc();
3160 
3161     // Load *adr into c_rarg1, may fault.
3162     *fault_pc = __ pc();
3163     switch (size) {
3164       case 4:
3165         // int32_t
3166         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3167         break;
3168       case 8:
3169         // int64_t
3170         __ ldr(c_rarg1, Address(c_rarg0, 0));
3171         break;
3172       default:
3173         ShouldNotReachHere();
3174     }
3175 
3176     // return errValue or *adr
3177     *continuation_pc = __ pc();
3178     __ mov(r0, c_rarg1);
3179     __ ret(lr);
3180   }
3181 #endif
3182 
3183   /**
3184    *  Arguments:
3185    *
3186    * Inputs:
3187    *   c_rarg0   - int crc
3188    *   c_rarg1   - byte* buf
3189    *   c_rarg2   - int length
3190    *
3191    * Output:
3192    *       r0    - int crc result
3193    */
3194   address generate_updateBytesCRC32() {
3195     assert(UseCRC32Intrinsics, "what are we doing here?");
3196 
3197     __ align(CodeEntryAlignment);
3198     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3199 
3200     address start = __ pc();
3201 
3202     const Register crc   = c_rarg0;  // crc
3203     const Register buf   = c_rarg1;  // source java byte array address
3204     const Register len   = c_rarg2;  // length
3205     const Register table0 = c_rarg3; // crc_table address
3206     const Register table1 = c_rarg4;
3207     const Register table2 = c_rarg5;
3208     const Register table3 = c_rarg6;
3209     const Register tmp3 = c_rarg7;
3210 
3211     BLOCK_COMMENT("Entry:");
3212     __ enter(); // required for proper stackwalking of RuntimeStub frame
3213 
3214     __ kernel_crc32(crc, buf, len,
3215               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3216 
3217     __ leave(); // required for proper stackwalking of RuntimeStub frame
3218     __ ret(lr);
3219 
3220     return start;
3221   }
3222 
3223   /**
3224    *  Arguments:
3225    *
3226    * Inputs:
3227    *   c_rarg0   - int crc
3228    *   c_rarg1   - byte* buf
3229    *   c_rarg2   - int length
3230    *   c_rarg3   - int* table
3231    *
3232    * Output:
3233    *       r0   - int crc result
3234    */
3235   address generate_updateBytesCRC32C() {
3236     assert(UseCRC32CIntrinsics, "what are we doing here?");
3237 
3238     __ align(CodeEntryAlignment);
3239     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3240 
3241     address start = __ pc();
3242 
3243     const Register crc   = c_rarg0;  // crc
3244     const Register buf   = c_rarg1;  // source java byte array address
3245     const Register len   = c_rarg2;  // length
3246     const Register table0 = c_rarg3; // crc_table address
3247     const Register table1 = c_rarg4;
3248     const Register table2 = c_rarg5;
3249     const Register table3 = c_rarg6;
3250     const Register tmp3 = c_rarg7;
3251 
3252     BLOCK_COMMENT("Entry:");
3253     __ enter(); // required for proper stackwalking of RuntimeStub frame
3254 
3255     __ kernel_crc32c(crc, buf, len,
3256               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3257 
3258     __ leave(); // required for proper stackwalking of RuntimeStub frame
3259     __ ret(lr);
3260 
3261     return start;
3262   }
3263 
3264   /***
3265    *  Arguments:
3266    *
3267    *  Inputs:
3268    *   c_rarg0   - int   adler
3269    *   c_rarg1   - byte* buff
3270    *   c_rarg2   - int   len
3271    *
3272    * Output:
3273    *   c_rarg0   - int adler result
3274    */
3275   address generate_updateBytesAdler32() {
3276     __ align(CodeEntryAlignment);
3277     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3278     address start = __ pc();
3279 
3280     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3281 
3282     // Aliases
3283     Register adler  = c_rarg0;
3284     Register s1     = c_rarg0;
3285     Register s2     = c_rarg3;
3286     Register buff   = c_rarg1;
3287     Register len    = c_rarg2;
3288     Register nmax  = r4;
3289     Register base = r5;
3290     Register count = r6;
3291     Register temp0 = rscratch1;
3292     Register temp1 = rscratch2;
3293     Register temp2 = r7;
3294 
3295     // Max number of bytes we can process before having to take the mod
3296     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3297     unsigned long BASE = 0xfff1;
3298     unsigned long NMAX = 0x15B0;
3299 
3300     __ mov(base, BASE);
3301     __ mov(nmax, NMAX);
3302 
3303     // s1 is initialized to the lower 16 bits of adler
3304     // s2 is initialized to the upper 16 bits of adler
3305     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3306     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3307 
3308     // The pipelined loop needs at least 16 bytes per iteration.
3309     // It would check this itself, but it is cheaper to branch straight to the byte-at-a-time cleanup loop for short inputs.
3310     __ cmp(len, 16);
3311     __ br(Assembler::HS, L_nmax);
3312     __ cbz(len, L_combine);
3313 
3314     __ bind(L_simple_by1_loop);
3315     __ ldrb(temp0, Address(__ post(buff, 1)));
3316     __ add(s1, s1, temp0);
3317     __ add(s2, s2, s1);
3318     __ subs(len, len, 1);
3319     __ br(Assembler::HI, L_simple_by1_loop);
3320 
3321     // s1 = s1 % BASE
3322     __ subs(temp0, s1, base);
3323     __ csel(s1, temp0, s1, Assembler::HS);
3324 
3325     // s2 = s2 % BASE
3326     __ lsr(temp0, s2, 16);
3327     __ lsl(temp1, temp0, 4);
3328     __ sub(temp1, temp1, temp0);
3329     __ add(s2, temp1, s2, ext::uxth);
3330 
3331     __ subs(temp0, s2, base);
3332     __ csel(s2, temp0, s2, Assembler::HS);
3333 
3334     __ b(L_combine);
3335 
3336     __ bind(L_nmax);
3337     __ subs(len, len, nmax);
3338     __ sub(count, nmax, 16);
3339     __ br(Assembler::LO, L_by16);
3340 
3341     __ bind(L_nmax_loop);
3342 
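         // Each iteration consumes 16 bytes (two 64-bit loads); for every
         // byte b the running sums are updated as s1 += b, s2 += s1.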
3343     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3344 
3345     __ add(s1, s1, temp0, ext::uxtb);
3346     __ ubfx(temp2, temp0, 8, 8);
3347     __ add(s2, s2, s1);
3348     __ add(s1, s1, temp2);
3349     __ ubfx(temp2, temp0, 16, 8);
3350     __ add(s2, s2, s1);
3351     __ add(s1, s1, temp2);
3352     __ ubfx(temp2, temp0, 24, 8);
3353     __ add(s2, s2, s1);
3354     __ add(s1, s1, temp2);
3355     __ ubfx(temp2, temp0, 32, 8);
3356     __ add(s2, s2, s1);
3357     __ add(s1, s1, temp2);
3358     __ ubfx(temp2, temp0, 40, 8);
3359     __ add(s2, s2, s1);
3360     __ add(s1, s1, temp2);
3361     __ ubfx(temp2, temp0, 48, 8);
3362     __ add(s2, s2, s1);
3363     __ add(s1, s1, temp2);
3364     __ add(s2, s2, s1);
3365     __ add(s1, s1, temp0, Assembler::LSR, 56);
3366     __ add(s2, s2, s1);
3367 
3368     __ add(s1, s1, temp1, ext::uxtb);
3369     __ ubfx(temp2, temp1, 8, 8);
3370     __ add(s2, s2, s1);
3371     __ add(s1, s1, temp2);
3372     __ ubfx(temp2, temp1, 16, 8);
3373     __ add(s2, s2, s1);
3374     __ add(s1, s1, temp2);
3375     __ ubfx(temp2, temp1, 24, 8);
3376     __ add(s2, s2, s1);
3377     __ add(s1, s1, temp2);
3378     __ ubfx(temp2, temp1, 32, 8);
3379     __ add(s2, s2, s1);
3380     __ add(s1, s1, temp2);
3381     __ ubfx(temp2, temp1, 40, 8);
3382     __ add(s2, s2, s1);
3383     __ add(s1, s1, temp2);
3384     __ ubfx(temp2, temp1, 48, 8);
3385     __ add(s2, s2, s1);
3386     __ add(s1, s1, temp2);
3387     __ add(s2, s2, s1);
3388     __ add(s1, s1, temp1, Assembler::LSR, 56);
3389     __ add(s2, s2, s1);
3390 
3391     __ subs(count, count, 16);
3392     __ br(Assembler::HS, L_nmax_loop);
3393 
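         // The modular reductions below use 2^16 == 15 (mod 65521):
         //   x mod BASE ~= (x >> 16) * 15 + (x & 0xffff)
         // Applying this twice leaves a value below 2*BASE, which one
         // conditional subtract of BASE then corrects.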
3394     // s1 = s1 % BASE
3395     __ lsr(temp0, s1, 16);
3396     __ lsl(temp1, temp0, 4);
3397     __ sub(temp1, temp1, temp0);
3398     __ add(temp1, temp1, s1, ext::uxth);
3399 
3400     __ lsr(temp0, temp1, 16);
3401     __ lsl(s1, temp0, 4);
3402     __ sub(s1, s1, temp0);
3403     __ add(s1, s1, temp1, ext::uxth);
3404 
3405     __ subs(temp0, s1, base);
3406     __ csel(s1, temp0, s1, Assembler::HS);
3407 
3408     // s2 = s2 % BASE
3409     __ lsr(temp0, s2, 16);
3410     __ lsl(temp1, temp0, 4);
3411     __ sub(temp1, temp1, temp0);
3412     __ add(temp1, temp1, s2, ext::uxth);
3413 
3414     __ lsr(temp0, temp1, 16);
3415     __ lsl(s2, temp0, 4);
3416     __ sub(s2, s2, temp0);
3417     __ add(s2, s2, temp1, ext::uxth);
3418 
3419     __ subs(temp0, s2, base);
3420     __ csel(s2, temp0, s2, Assembler::HS);
3421 
3422     __ subs(len, len, nmax);
3423     __ sub(count, nmax, 16);
3424     __ br(Assembler::HS, L_nmax_loop);
3425 
3426     __ bind(L_by16);
3427     __ adds(len, len, count);
3428     __ br(Assembler::LO, L_by1);
3429 
3430     __ bind(L_by16_loop);
3431 
3432     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3433 
3434     __ add(s1, s1, temp0, ext::uxtb);
3435     __ ubfx(temp2, temp0, 8, 8);
3436     __ add(s2, s2, s1);
3437     __ add(s1, s1, temp2);
3438     __ ubfx(temp2, temp0, 16, 8);
3439     __ add(s2, s2, s1);
3440     __ add(s1, s1, temp2);
3441     __ ubfx(temp2, temp0, 24, 8);
3442     __ add(s2, s2, s1);
3443     __ add(s1, s1, temp2);
3444     __ ubfx(temp2, temp0, 32, 8);
3445     __ add(s2, s2, s1);
3446     __ add(s1, s1, temp2);
3447     __ ubfx(temp2, temp0, 40, 8);
3448     __ add(s2, s2, s1);
3449     __ add(s1, s1, temp2);
3450     __ ubfx(temp2, temp0, 48, 8);
3451     __ add(s2, s2, s1);
3452     __ add(s1, s1, temp2);
3453     __ add(s2, s2, s1);
3454     __ add(s1, s1, temp0, Assembler::LSR, 56);
3455     __ add(s2, s2, s1);
3456 
3457     __ add(s1, s1, temp1, ext::uxtb);
3458     __ ubfx(temp2, temp1, 8, 8);
3459     __ add(s2, s2, s1);
3460     __ add(s1, s1, temp2);
3461     __ ubfx(temp2, temp1, 16, 8);
3462     __ add(s2, s2, s1);
3463     __ add(s1, s1, temp2);
3464     __ ubfx(temp2, temp1, 24, 8);
3465     __ add(s2, s2, s1);
3466     __ add(s1, s1, temp2);
3467     __ ubfx(temp2, temp1, 32, 8);
3468     __ add(s2, s2, s1);
3469     __ add(s1, s1, temp2);
3470     __ ubfx(temp2, temp1, 40, 8);
3471     __ add(s2, s2, s1);
3472     __ add(s1, s1, temp2);
3473     __ ubfx(temp2, temp1, 48, 8);
3474     __ add(s2, s2, s1);
3475     __ add(s1, s1, temp2);
3476     __ add(s2, s2, s1);
3477     __ add(s1, s1, temp1, Assembler::LSR, 56);
3478     __ add(s2, s2, s1);
3479 
3480     __ subs(len, len, 16);
3481     __ br(Assembler::HS, L_by16_loop);
3482 
3483     __ bind(L_by1);
3484     __ adds(len, len, 15);
3485     __ br(Assembler::LO, L_do_mod);
3486 
3487     __ bind(L_by1_loop);
3488     __ ldrb(temp0, Address(__ post(buff, 1)));
3489     __ add(s1, temp0, s1);
3490     __ add(s2, s2, s1);
3491     __ subs(len, len, 1);
3492     __ br(Assembler::HS, L_by1_loop);
3493 
3494     __ bind(L_do_mod);
3495     // s1 = s1 % BASE
3496     __ lsr(temp0, s1, 16);
3497     __ lsl(temp1, temp0, 4);
3498     __ sub(temp1, temp1, temp0);
3499     __ add(temp1, temp1, s1, ext::uxth);
3500 
3501     __ lsr(temp0, temp1, 16);
3502     __ lsl(s1, temp0, 4);
3503     __ sub(s1, s1, temp0);
3504     __ add(s1, s1, temp1, ext::uxth);
3505 
3506     __ subs(temp0, s1, base);
3507     __ csel(s1, temp0, s1, Assembler::HS);
3508 
3509     // s2 = s2 % BASE
3510     __ lsr(temp0, s2, 16);
3511     __ lsl(temp1, temp0, 4);
3512     __ sub(temp1, temp1, temp0);
3513     __ add(temp1, temp1, s2, ext::uxth);
3514 
3515     __ lsr(temp0, temp1, 16);
3516     __ lsl(s2, temp0, 4);
3517     __ sub(s2, s2, temp0);
3518     __ add(s2, s2, temp1, ext::uxth);
3519 
3520     __ subs(temp0, s2, base);
3521     __ csel(s2, temp0, s2, Assembler::HS);
3522 
3523     // Combine lower bits and higher bits
3524     __ bind(L_combine);
3525     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3526 
3527     __ ret(lr);
3528 
3529     return start;
3530   }
3531 
3532   /**
3533    *  Arguments:
3534    *
3535    *  Input:
3536    *    c_rarg0   - x address
3537    *    c_rarg1   - x length
3538    *    c_rarg2   - y address
3539    *    c_rarg3   - y length
3540    *    c_rarg4   - z address
3541    *    c_rarg5   - z length
3542    */
3543   address generate_multiplyToLen() {
3544     __ align(CodeEntryAlignment);
3545     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3546 
3547     address start = __ pc();
3548     const Register x     = r0;
3549     const Register xlen  = r1;
3550     const Register y     = r2;
3551     const Register ylen  = r3;
3552     const Register z     = r4;
3553     const Register zlen  = r5;
3554 
3555     const Register tmp1  = r10;
3556     const Register tmp2  = r11;
3557     const Register tmp3  = r12;
3558     const Register tmp4  = r13;
3559     const Register tmp5  = r14;
3560     const Register tmp6  = r15;
3561     const Register tmp7  = r16;
3562 
3563     BLOCK_COMMENT("Entry:");
3564     __ enter(); // required for proper stackwalking of RuntimeStub frame
3565     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3566     __ leave(); // required for proper stackwalking of RuntimeStub frame
3567     __ ret(lr);
3568 
3569     return start;
3570   }
3571 
3572   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3573                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3574                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3575     // Karatsuba multiplication performs a 128*128 -> 256-bit
3576     // multiplication in three 128-bit multiplications and a few
3577     // additions.
3578     //
3579     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3580     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3581     //
3582     // Inputs:
3583     //
3584     // A0 in a.d[0]     (subkey)
3585     // A1 in a.d[1]
3586     // (A1+A0) in a1_xor_a0.d[0]
3587     //
3588     // B0 in b.d[0]     (state)
3589     // B1 in b.d[1]
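         //
         // Roughly, in C (a sketch only: clmul() stands in for a
         // hypothetical 64x64 -> 128-bit carry-less multiply, and every
         // addition is a GF(2) XOR):
         //
         //   uint128 c = clmul(a1, b1);             // C1:C0
         //   uint128 d = clmul(a0, b0);             // D1:D0
         //   uint128 e = clmul(a1 ^ a0, b1 ^ b0);   // E1:E0
         //   result_hi = c.hi : (c.lo ^ c.hi ^ d.hi ^ e.hi)
         //   result_lo = (d.hi ^ c.lo ^ d.lo ^ e.lo) : d.lo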
3590 
3591     __ ext(tmp1, __ T16B, b, b, 0x08);
3592     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3593     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3594     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3595     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3596 
3597     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3598     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3599     __ eor(tmp2, __ T16B, tmp2, tmp4);
3600     __ eor(tmp2, __ T16B, tmp2, tmp3);
3601 
3602     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3603     __ ins(result_hi, __ D, tmp2, 0, 1);
3604     __ ins(result_lo, __ D, tmp2, 1, 0);
3605   }
3606 
3607   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3608                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3609     const FloatRegister t0 = result;
3610 
3611     // The GCM field polynomial f is z^128 + p(z), where p =
3612     // z^7+z^2+z+1.
3613     //
3614     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3615     //
3616     // so, given that the product we're reducing is
3617     //    a == lo + hi * z^128
3618     // substituting,
3619     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3620     //
3621     // we reduce by multiplying hi by p(z) and subtracting the result
3622     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3623     // bits we can do this with two 64-bit multiplications, lo*p and
3624     // hi*p.
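         //
         // Concretely the reduction is done as two 64-bit folds: hi.d[1]*p
         // is folded into the middle 128 bits of the product, then hi.d[0]*p
         // is folded into lo.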
3625 
3626     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3627     __ ext(t1, __ T16B, t0, z, 8);
3628     __ eor(hi, __ T16B, hi, t1);
3629     __ ext(t1, __ T16B, z, t0, 8);
3630     __ eor(lo, __ T16B, lo, t1);
3631     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3632     __ eor(result, __ T16B, lo, t0);
3633   }
3634 
3635   /**
3636    *  Arguments:
3637    *
3638    *  Input:
3639    *  c_rarg0   - current state address
3640    *  c_rarg1   - H key address
3641    *  c_rarg2   - data address
3642    *  c_rarg3   - number of blocks
3643    *
3644    *  Output:
3645    *  Updated state at c_rarg0
3646    */
3647   address generate_ghash_processBlocks() {
3648     // Bafflingly, GCM uses little-endian for the byte order, but
3649     // big-endian for the bit order.  For example, the polynomial 1 is
3650     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3651     //
3652     // So, we must either reverse the bytes in each word and do
3653     // everything big-endian or reverse the bits in each byte and do
3654     // it little-endian.  On AArch64 it's more idiomatic to reverse
3655     // the bits in each byte (we have an instruction, RBIT, to do
3656     // that) and keep the data in little-endian bit order throughout the
3657     // calculation, bit-reversing the inputs and outputs.
3658 
3659     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3660     __ align(wordSize * 2);
3661     address p = __ pc();
3662     __ emit_int64(0x87);  // The low-order bits of the field
3663                           // polynomial (i.e. p = z^7+z^2+z+1)
3664                           // repeated in the low and high parts of a
3665                           // 128-bit vector
3666     __ emit_int64(0x87);
3667 
3668     __ align(CodeEntryAlignment);
3669     address start = __ pc();
3670 
3671     Register state   = c_rarg0;
3672     Register subkeyH = c_rarg1;
3673     Register data    = c_rarg2;
3674     Register blocks  = c_rarg3;
3675 
3676     FloatRegister vzr = v30;
3677     __ eor(vzr, __ T16B, vzr, vzr); // zero register
3678 
3679     __ ldrq(v0, Address(state));
3680     __ ldrq(v1, Address(subkeyH));
3681 
3682     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
3683     __ rbit(v0, __ T16B, v0);
3684     __ rev64(v1, __ T16B, v1);
3685     __ rbit(v1, __ T16B, v1);
3686 
3687     __ ldrq(v26, p);
3688 
3689     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v16
3690     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
3691 
3692     {
3693       Label L_ghash_loop;
3694       __ bind(L_ghash_loop);
3695 
3696       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
3697                                                  // reversing each byte
3698       __ rbit(v2, __ T16B, v2);
3699       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
3700 
3701       // Multiply state in v2 by subkey in v1
3702       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
3703                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
3704                      /*temps*/v6, v20, v18, v21);
3705       // Reduce v7:v5 by the field polynomial
3706       ghash_reduce(v0, v5, v7, v26, vzr, v20);
3707 
3708       __ sub(blocks, blocks, 1);
3709       __ cbnz(blocks, L_ghash_loop);
3710     }
3711 
3712     // The bit-reversed result is at this point in v0
3713     __ rev64(v1, __ T16B, v0);
3714     __ rbit(v1, __ T16B, v1);
3715 
3716     __ st1(v1, __ T16B, state);
3717     __ ret(lr);
3718 
3719     return start;
3720   }
3721 
3722   // Continuation point for throwing of implicit exceptions that are
3723   // not handled in the current activation. Fabricates an exception
3724   // oop and initiates normal exception dispatching in this
3725   // frame. Since we need to preserve callee-saved values (currently
3726   // only for C2, but done for C1 as well) we need a callee-saved oop
3727   // map and therefore have to make these stubs into RuntimeStubs
3728   // rather than BufferBlobs.  If the compiler needs all registers to
3729   // be preserved between the fault point and the exception handler
3730   // then it must assume responsibility for that in
3731   // AbstractCompiler::continuation_for_implicit_null_exception or
3732   // continuation_for_implicit_division_by_zero_exception. All other
3733   // implicit exceptions (e.g., NullPointerException or
3734   // AbstractMethodError on entry) are either at call sites or
3735   // otherwise assume that stack unwinding will be initiated, so
3736   // caller saved registers were assumed volatile in the compiler.
3737 
3738 #undef __
3739 #define __ masm->
3740 
3741   address generate_throw_exception(const char* name,
3742                                    address runtime_entry,
3743                                    Register arg1 = noreg,
3744                                    Register arg2 = noreg) {
3745     // Information about frame layout at time of blocking runtime call.
3746     // Note that we only have to preserve callee-saved registers since
3747     // the compilers are responsible for supplying a continuation point
3748     // if they expect all registers to be preserved.
3749     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
3750     enum layout {
3751       rfp_off = 0,
3752       rfp_off2,
3753       return_off,
3754       return_off2,
3755       framesize // inclusive of return address
3756     };
3757 
3758     int insts_size = 512;
3759     int locs_size  = 64;
3760 
3761     CodeBuffer code(name, insts_size, locs_size);
3762     OopMapSet* oop_maps  = new OopMapSet();
3763     MacroAssembler* masm = new MacroAssembler(&code);
3764 
3765     address start = __ pc();
3766 
3767     // This is an inlined and slightly modified version of call_VM
3768     // which has the ability to fetch the return PC out of
3769     // thread-local storage and also sets up last_Java_sp slightly
3770     // differently than the real call_VM
3771 
3772     __ enter(); // Save FP and LR before call
3773 
3774     assert(is_even(framesize/2), "sp not 16-byte aligned");
3775 
3776     // lr and fp are already in place
3777     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
3778 
3779     int frame_complete = __ pc() - start;
3780 
3781     // Set up last_Java_sp and last_Java_fp
3782     address the_pc = __ pc();
3783     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
3784 
3785     // Call runtime
3786     if (arg1 != noreg) {
3787       assert(arg2 != c_rarg1, "clobbered");
3788       __ mov(c_rarg1, arg1);
3789     }
3790     if (arg2 != noreg) {
3791       __ mov(c_rarg2, arg2);
3792     }
3793     __ mov(c_rarg0, rthread);
3794     BLOCK_COMMENT("call runtime_entry");
3795     __ mov(rscratch1, runtime_entry);
3796     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
3797 
3798     // Generate oop map
3799     OopMap* map = new OopMap(framesize, 0);
3800 
3801     oop_maps->add_gc_map(the_pc - start, map);
3802 
3803     __ reset_last_Java_frame(true, true);
3804     __ maybe_isb();
3805 
3806     __ leave();
3807 
3808     // check for pending exceptions
3809 #ifdef ASSERT
3810     Label L;
3811     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
3812     __ cbnz(rscratch1, L);
3813     __ should_not_reach_here();
3814     __ bind(L);
3815 #endif // ASSERT
3816     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3817 
3818 
3819     // codeBlob framesize is in words (not VMRegImpl::slot_size)
3820     RuntimeStub* stub =
3821       RuntimeStub::new_runtime_stub(name,
3822                                     &code,
3823                                     frame_complete,
3824                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3825                                     oop_maps, false);
3826     return stub->entry_point();
3827   }
3828 
3829   class MontgomeryMultiplyGenerator : public MacroAssembler {
3830 
3831     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3832       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
3833 
3834     RegSet _toSave;
3835     bool _squaring;
3836 
3837   public:
3838     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3839       : MacroAssembler(as->code()), _squaring(squaring) {
3840 
3841       // Register allocation
3842 
3843       Register reg = c_rarg0;
3844       Pa_base = reg;       // Argument registers
3845       if (squaring)
3846         Pb_base = Pa_base;
3847       else
3848         Pb_base = ++reg;
3849       Pn_base = ++reg;
3850       Rlen= ++reg;
3851       inv = ++reg;
3852       Pm_base = ++reg;
3853 
3854                           // Working registers:
3855       Ra =  ++reg;        // The current digit of a, b, n, and m.
3856       Rb =  ++reg;
3857       Rm =  ++reg;
3858       Rn =  ++reg;
3859 
3860       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
3861       Pb =  ++reg;
3862       Pm =  ++reg;
3863       Pn =  ++reg;
3864 
3865       t0 =  ++reg;        // Three registers which form a
3866       t1 =  ++reg;        // triple-precision accumulator.
3867       t2 =  ++reg;
3868 
3869       Ri =  ++reg;        // Inner and outer loop indexes.
3870       Rj =  ++reg;
3871 
3872       Rhi_ab = ++reg;     // Product registers: low and high parts
3873       Rlo_ab = ++reg;     // of a*b and m*n.
3874       Rhi_mn = ++reg;
3875       Rlo_mn = ++reg;
3876 
3877       // r19 and up are callee-saved.
3878       _toSave = RegSet::range(r19, reg) + Pm_base;
3879     }
3880 
3881   private:
3882     void save_regs() {
3883       push(_toSave, sp);
3884     }
3885 
3886     void restore_regs() {
3887       pop(_toSave, sp);
3888     }
3889 
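         // Call block exactly count times, unrolled by two; an odd count
         // enters the loop at its second copy.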
3890     template <typename T>
3891     void unroll_2(Register count, T block) {
3892       Label loop, end, odd;
3893       tbnz(count, 0, odd);
3894       cbz(count, end);
3895       align(16);
3896       bind(loop);
3897       (this->*block)();
3898       bind(odd);
3899       (this->*block)();
3900       subs(count, count, 2);
3901       br(Assembler::GT, loop);
3902       bind(end);
3903     }
3904 
3905     template <typename T>
3906     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3907       Label loop, end, odd;
3908       tbnz(count, 0, odd);
3909       cbz(count, end);
3910       align(16);
3911       bind(loop);
3912       (this->*block)(d, s, tmp);
3913       bind(odd);
3914       (this->*block)(d, s, tmp);
3915       subs(count, count, 2);
3916       br(Assembler::GT, loop);
3917       bind(end);
3918     }
3919 
3920     void pre1(RegisterOrConstant i) {
3921       block_comment("pre1");
3922       // Pa = Pa_base;
3923       // Pb = Pb_base + i;
3924       // Pm = Pm_base;
3925       // Pn = Pn_base + i;
3926       // Ra = *Pa;
3927       // Rb = *Pb;
3928       // Rm = *Pm;
3929       // Rn = *Pn;
3930       ldr(Ra, Address(Pa_base));
3931       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3932       ldr(Rm, Address(Pm_base));
3933       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3934       lea(Pa, Address(Pa_base));
3935       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3936       lea(Pm, Address(Pm_base));
3937       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3938 
3939       // Zero the m*n result.
3940       mov(Rhi_mn, zr);
3941       mov(Rlo_mn, zr);
3942     }
3943 
3944     // The core multiply-accumulate step of a Montgomery
3945     // multiplication.  The idea is to schedule operations as a
3946     // pipeline so that instructions with long latencies (loads and
3947     // multiplies) have time to complete before their results are
3948     // used.  This most benefits in-order implementations of the
3949     // architecture but out-of-order ones also benefit.
3950     void step() {
3951       block_comment("step");
3952       // MACC(Ra, Rb, t0, t1, t2);
3953       // Ra = *++Pa;
3954       // Rb = *--Pb;
3955       umulh(Rhi_ab, Ra, Rb);
3956       mul(Rlo_ab, Ra, Rb);
3957       ldr(Ra, pre(Pa, wordSize));
3958       ldr(Rb, pre(Pb, -wordSize));
3959       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
3960                                        // previous iteration.
3961       // MACC(Rm, Rn, t0, t1, t2);
3962       // Rm = *++Pm;
3963       // Rn = *--Pn;
3964       umulh(Rhi_mn, Rm, Rn);
3965       mul(Rlo_mn, Rm, Rn);
3966       ldr(Rm, pre(Pm, wordSize));
3967       ldr(Rn, pre(Pn, -wordSize));
3968       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3969     }
3970 
3971     void post1() {
3972       block_comment("post1");
3973 
3974       // MACC(Ra, Rb, t0, t1, t2);
3975       // Ra = *++Pa;
3976       // Rb = *--Pb;
3977       umulh(Rhi_ab, Ra, Rb);
3978       mul(Rlo_ab, Ra, Rb);
3979       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3980       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3981 
3982       // *Pm = Rm = t0 * inv;
3983       mul(Rm, t0, inv);
3984       str(Rm, Address(Pm));
3985 
3986       // MACC(Rm, Rn, t0, t1, t2);
3987       // t0 = t1; t1 = t2; t2 = 0;
3988       umulh(Rhi_mn, Rm, Rn);
3989 
3990 #ifndef PRODUCT
3991       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
3992       {
3993         mul(Rlo_mn, Rm, Rn);
3994         add(Rlo_mn, t0, Rlo_mn);
3995         Label ok;
3996         cbz(Rlo_mn, ok); {
3997           stop("broken Montgomery multiply");
3998         } bind(ok);
3999       }
4000 #endif
4001       // We have very carefully set things up so that
4002       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4003       // the lower half of Rm * Rn because we know the result already:
4004       // it must be -t0.  t0 + (-t0) must generate a carry iff
4005       // t0 != 0.  So, rather than do a mul and an adds we just set
4006       // the carry flag iff t0 is nonzero.
4007       //
4008       // mul(Rlo_mn, Rm, Rn);
4009       // adds(zr, t0, Rlo_mn);
4010       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4011       adcs(t0, t1, Rhi_mn);
4012       adc(t1, t2, zr);
4013       mov(t2, zr);
4014     }
4015 
4016     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
4017       block_comment("pre2");
4018       // Pa = Pa_base + i-len;
4019       // Pb = Pb_base + len;
4020       // Pm = Pm_base + i-len;
4021       // Pn = Pn_base + len;
4022 
4023       if (i.is_register()) {
4024         sub(Rj, i.as_register(), len);
4025       } else {
4026         mov(Rj, i.as_constant());
4027         sub(Rj, Rj, len);
4028       }
4029       // Rj == i-len
4030 
4031       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
4032       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
4033       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4034       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
4035 
4036       // Ra = *++Pa;
4037       // Rb = *--Pb;
4038       // Rm = *++Pm;
4039       // Rn = *--Pn;
4040       ldr(Ra, pre(Pa, wordSize));
4041       ldr(Rb, pre(Pb, -wordSize));
4042       ldr(Rm, pre(Pm, wordSize));
4043       ldr(Rn, pre(Pn, -wordSize));
4044 
4045       mov(Rhi_mn, zr);
4046       mov(Rlo_mn, zr);
4047     }
4048 
4049     void post2(RegisterOrConstant i, RegisterOrConstant len) {
4050       block_comment("post2");
4051       if (i.is_constant()) {
4052         mov(Rj, i.as_constant()-len.as_constant());
4053       } else {
4054         sub(Rj, i.as_register(), len);
4055       }
4056 
4057       adds(t0, t0, Rlo_mn); // The pending m*n, low part
4058 
4059       // As soon as we know the least significant digit of our result,
4060       // store it.
4061       // Pm_base[i-len] = t0;
4062       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4063 
4064       // t0 = t1; t1 = t2; t2 = 0;
4065       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
4066       adc(t1, t2, zr);
4067       mov(t2, zr);
4068     }
4069 
4070     // A carry in t0 after Montgomery multiplication means that we
4071     // should subtract multiples of n from our result in m.  We'll
4072     // keep doing that until there is no carry.
4073     void normalize(RegisterOrConstant len) {
4074       block_comment("normalize");
4075       // while (t0)
4076       //   t0 = sub(Pm_base, Pn_base, t0, len);
4077       Label loop, post, again;
4078       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
4079       cbz(t0, post); {
4080         bind(again); {
4081           mov(i, zr);
4082           mov(cnt, len);
4083           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4084           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4085           subs(zr, zr, zr); // set carry flag, i.e. no borrow
4086           align(16);
4087           bind(loop); {
4088             sbcs(Rm, Rm, Rn);
4089             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4090             add(i, i, 1);
4091             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4092             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4093             sub(cnt, cnt, 1);
4094           } cbnz(cnt, loop);
4095           sbc(t0, t0, zr);
4096         } cbnz(t0, again);
4097       } bind(post);
4098     }
4099 
4100     // Move memory at s to d, reversing words.
4101     //    Increments d to end of copied memory
4102     //    Destroys tmp1, tmp2
4103     //    Preserves len
4104     //    Leaves s pointing to the address which was in d at start
4105     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
4106       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
4107 
4108       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
4109       mov(tmp1, len);
4110       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
4111       sub(s, d, len, ext::uxtw, LogBytesPerWord);
4112     }
4113     // where
4114     void reverse1(Register d, Register s, Register tmp) {
4115       ldr(tmp, pre(s, -wordSize));
4116       ror(tmp, tmp, 32);
4117       str(tmp, post(d, wordSize));
4118     }
4119 
4120     void step_squaring() {
4121       // An extra ACC
4122       step();
4123       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4124     }
4125 
4126     void last_squaring(RegisterOrConstant i) {
4127       Label dont;
4128       // if ((i & 1) == 0) {
4129       tbnz(i.as_register(), 0, dont); {
4130         // MACC(Ra, Rb, t0, t1, t2);
4131         // Ra = *++Pa;
4132         // Rb = *--Pb;
4133         umulh(Rhi_ab, Ra, Rb);
4134         mul(Rlo_ab, Ra, Rb);
4135         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4136       } bind(dont);
4137     }
4138 
4139     void extra_step_squaring() {
4140       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4141 
4142       // MACC(Rm, Rn, t0, t1, t2);
4143       // Rm = *++Pm;
4144       // Rn = *--Pn;
4145       umulh(Rhi_mn, Rm, Rn);
4146       mul(Rlo_mn, Rm, Rn);
4147       ldr(Rm, pre(Pm, wordSize));
4148       ldr(Rn, pre(Pn, -wordSize));
4149     }
4150 
4151     void post1_squaring() {
4152       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4153 
4154       // *Pm = Rm = t0 * inv;
4155       mul(Rm, t0, inv);
4156       str(Rm, Address(Pm));
4157 
4158       // MACC(Rm, Rn, t0, t1, t2);
4159       // t0 = t1; t1 = t2; t2 = 0;
4160       umulh(Rhi_mn, Rm, Rn);
4161 
4162 #ifndef PRODUCT
4163       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4164       {
4165         mul(Rlo_mn, Rm, Rn);
4166         add(Rlo_mn, t0, Rlo_mn);
4167         Label ok;
4168         cbz(Rlo_mn, ok); {
4169           stop("broken Montgomery multiply");
4170         } bind(ok);
4171       }
4172 #endif
4173       // We have very carefully set things up so that
4174       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4175       // the lower half of Rm * Rn because we know the result already:
4176       // it must be -t0.  t0 + (-t0) must generate a carry iff
4177       // t0 != 0.  So, rather than do a mul and an adds we just set
4178       // the carry flag iff t0 is nonzero.
4179       //
4180       // mul(Rlo_mn, Rm, Rn);
4181       // adds(zr, t0, Rlo_mn);
4182       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4183       adcs(t0, t1, Rhi_mn);
4184       adc(t1, t2, zr);
4185       mov(t2, zr);
4186     }
4187 
4188     void acc(Register Rhi, Register Rlo,
4189              Register t0, Register t1, Register t2) {
4190       adds(t0, t0, Rlo);
4191       adcs(t1, t1, Rhi);
4192       adc(t2, t2, zr);
4193     }
4194 
4195   public:
4196     /**
4197      * Fast Montgomery multiplication.  The derivation of the
4198      * algorithm is in A Cryptographic Library for the Motorola
4199      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
4200      *
4201      * Arguments:
4202      *
4203      * Inputs for multiplication:
4204      *   c_rarg0   - int array elements a
4205      *   c_rarg1   - int array elements b
4206      *   c_rarg2   - int array elements n (the modulus)
4207      *   c_rarg3   - int length
4208      *   c_rarg4   - int inv
4209      *   c_rarg5   - int array elements m (the result)
4210      *
4211      * Inputs for squaring:
4212      *   c_rarg0   - int array elements a
4213      *   c_rarg1   - int array elements n (the modulus)
4214      *   c_rarg2   - int length
4215      *   c_rarg3   - int inv
4216      *   c_rarg4   - int array elements m (the result)
4217      *
4218      */
4219     address generate_multiply() {
4220       Label argh, nothing;
4221       bind(argh);
4222       stop("MontgomeryMultiply total_allocation must be <= 8192");
4223 
4224       align(CodeEntryAlignment);
4225       address entry = pc();
4226 
4227       cbzw(Rlen, nothing);
4228 
4229       enter();
4230 
4231       // Make room.
4232       cmpw(Rlen, 512);
4233       br(Assembler::HI, argh);
4234       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4235       andr(sp, Ra, -2 * wordSize);
4236 
4237       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4238 
4239       {
4240         // Copy input args, reversing as we go.  We use Ra as a
4241         // temporary variable.
4242         reverse(Ra, Pa_base, Rlen, t0, t1);
4243         if (!_squaring)
4244           reverse(Ra, Pb_base, Rlen, t0, t1);
4245         reverse(Ra, Pn_base, Rlen, t0, t1);
4246       }
4247 
4248       // Push all call-saved registers and also Pm_base which we'll need
4249       // at the end.
4250       save_regs();
4251 
4252 #ifndef PRODUCT
4253       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
4254       {
4255         ldr(Rn, Address(Pn_base, 0));
4256         mul(Rlo_mn, Rn, inv);
4257         cmp(Rlo_mn, -1);
4258         Label ok;
4259         br(EQ, ok); {
4260           stop("broken inverse in Montgomery multiply");
4261         } bind(ok);
4262       }
4263 #endif
4264 
4265       mov(Pm_base, Ra);
4266 
4267       mov(t0, zr);
4268       mov(t1, zr);
4269       mov(t2, zr);
4270 
4271       block_comment("for (int i = 0; i < len; i++) {");
4272       mov(Ri, zr); {
4273         Label loop, end;
4274         cmpw(Ri, Rlen);
4275         br(Assembler::GE, end);
4276 
4277         bind(loop);
4278         pre1(Ri);
4279 
4280         block_comment("  for (j = i; j; j--) {"); {
4281           movw(Rj, Ri);
4282           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4283         } block_comment("  } // j");
4284 
4285         post1();
4286         addw(Ri, Ri, 1);
4287         cmpw(Ri, Rlen);
4288         br(Assembler::LT, loop);
4289         bind(end);
4290         block_comment("} // i");
4291       }
4292 
4293       block_comment("for (int i = len; i < 2*len; i++) {");
4294       mov(Ri, Rlen); {
4295         Label loop, end;
4296         cmpw(Ri, Rlen, Assembler::LSL, 1);
4297         br(Assembler::GE, end);
4298 
4299         bind(loop);
4300         pre2(Ri, Rlen);
4301 
4302         block_comment("  for (j = len*2-i-1; j; j--) {"); {
4303           lslw(Rj, Rlen, 1);
4304           subw(Rj, Rj, Ri);
4305           subw(Rj, Rj, 1);
4306           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4307         } block_comment("  } // j");
4308 
4309         post2(Ri, Rlen);
4310         addw(Ri, Ri, 1);
4311         cmpw(Ri, Rlen, Assembler::LSL, 1);
4312         br(Assembler::LT, loop);
4313         bind(end);
4314       }
4315       block_comment("} // i");
4316 
4317       normalize(Rlen);
4318 
4319       mov(Ra, Pm_base);  // Save Pm_base in Ra
4320       restore_regs();  // Restore caller's Pm_base
4321 
4322       // Copy our result into caller's Pm_base
4323       reverse(Pm_base, Ra, Rlen, t0, t1);
4324 
4325       leave();
4326       bind(nothing);
4327       ret(lr);
4328 
4329       return entry;
4330     }
4331     // In C, approximately:
4332 
4333     // void
4334     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
4335     //                     unsigned long Pn_base[], unsigned long Pm_base[],
4336     //                     unsigned long inv, int len) {
4337     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4338     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4339     //   unsigned long Ra, Rb, Rn, Rm;
4340 
4341     //   int i;
4342 
4343     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4344 
4345     //   for (i = 0; i < len; i++) {
4346     //     int j;
4347 
4348     //     Pa = Pa_base;
4349     //     Pb = Pb_base + i;
4350     //     Pm = Pm_base;
4351     //     Pn = Pn_base + i;
4352 
4353     //     Ra = *Pa;
4354     //     Rb = *Pb;
4355     //     Rm = *Pm;
4356     //     Rn = *Pn;
4357 
4358     //     int iters = i;
4359     //     for (j = 0; iters--; j++) {
4360     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4361     //       MACC(Ra, Rb, t0, t1, t2);
4362     //       Ra = *++Pa;
4363     //       Rb = *--Pb;
4364     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4365     //       MACC(Rm, Rn, t0, t1, t2);
4366     //       Rm = *++Pm;
4367     //       Rn = *--Pn;
4368     //     }
4369 
4370     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
4371     //     MACC(Ra, Rb, t0, t1, t2);
4372     //     *Pm = Rm = t0 * inv;
4373     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4374     //     MACC(Rm, Rn, t0, t1, t2);
4375 
4376     //     assert(t0 == 0, "broken Montgomery multiply");
4377 
4378     //     t0 = t1; t1 = t2; t2 = 0;
4379     //   }
4380 
4381     //   for (i = len; i < 2*len; i++) {
4382     //     int j;
4383 
4384     //     Pa = Pa_base + i-len;
4385     //     Pb = Pb_base + len;
4386     //     Pm = Pm_base + i-len;
4387     //     Pn = Pn_base + len;
4388 
4389     //     Ra = *++Pa;
4390     //     Rb = *--Pb;
4391     //     Rm = *++Pm;
4392     //     Rn = *--Pn;
4393 
4394     //     int iters = len*2-i-1;
4395     //     for (j = i-len+1; iters--; j++) {
4396     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4397     //       MACC(Ra, Rb, t0, t1, t2);
4398     //       Ra = *++Pa;
4399     //       Rb = *--Pb;
4400     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4401     //       MACC(Rm, Rn, t0, t1, t2);
4402     //       Rm = *++Pm;
4403     //       Rn = *--Pn;
4404     //     }
4405 
4406     //     Pm_base[i-len] = t0;
4407     //     t0 = t1; t1 = t2; t2 = 0;
4408     //   }
4409 
4410     //   while (t0)
4411     //     t0 = sub(Pm_base, Pn_base, t0, len);
4412     // }
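         // where MACC(A, B, t0, t1, t2) accumulates the 128-bit product A*B
         // into the triple-precision accumulator t2:t1:t0.  A sketch only
         // (the stub realizes it with umulh/mul plus the adds/adcs/adc in
         // acc()):
         //
         //   static void MACC(unsigned long A, unsigned long B,
         //                    unsigned long &t0, unsigned long &t1,
         //                    unsigned long &t2) {
         //     unsigned __int128 product = (unsigned __int128)A * B + t0;
         //     t0 = (unsigned long)product;
         //     unsigned __int128 sum = (product >> 64) + t1;
         //     t1 = (unsigned long)sum;
         //     t2 += (unsigned long)(sum >> 64);
         //   }
         //
         // and sub(Pm_base, Pn_base, t0, len) subtracts n from m once with
         // borrow propagation and returns t0 minus the final borrow.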
4413 
4414     /**
4415      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
4416      * multiplies than Montgomery multiplication so it should be up to
4417      * 25% faster.  However, its loop control is more complex and it
4418      * may actually run slower on some machines.
4419      *
4420      * Arguments:
4421      *
4422      * Inputs:
4423      *   c_rarg0   - int array elements a
4424      *   c_rarg1   - int array elements n (the modulus)
4425      *   c_rarg2   - int length
4426      *   c_rarg3   - int inv
4427      *   c_rarg4   - int array elements m (the result)
4428      *
4429      */
4430     address generate_square() {
4431       Label argh;
4432       bind(argh);
4433       stop("MontgomeryMultiply total_allocation must be <= 8192");
4434 
4435       align(CodeEntryAlignment);
4436       address entry = pc();
4437 
4438       enter();
4439 
      // Make room on the stack: 4 * Rlen jints (Rlen <= 512, so at
      // most 8192 bytes), and keep sp 16-byte aligned.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);
4445 
4446       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4447 
4448       {
4449         // Copy input args, reversing as we go.  We use Ra as a
4450         // temporary variable.
4451         reverse(Ra, Pa_base, Rlen, t0, t1);
4452         reverse(Ra, Pn_base, Rlen, t0, t1);
4453       }
4454 
4455       // Push all call-saved registers and also Pm_base which we'll need
4456       // at the end.
4457       save_regs();
4458 
4459       mov(Pm_base, Ra);
4460 
4461       mov(t0, zr);
4462       mov(t1, zr);
4463       mov(t2, zr);
4464 
4465       block_comment("for (int i = 0; i < len; i++) {");
4466       mov(Ri, zr); {
4467         Label loop, end;
4468         bind(loop);
4469         cmp(Ri, Rlen);
4470         br(Assembler::GE, end);
4471 
4472         pre1(Ri);
4473 
4474         block_comment("for (j = (i+1)/2; j; j--) {"); {
4475           add(Rj, Ri, 1);
4476           lsr(Rj, Rj, 1);
4477           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4478         } block_comment("  } // j");
4479 
4480         last_squaring(Ri);
4481 
4482         block_comment("  for (j = i/2; j; j--) {"); {
4483           lsr(Rj, Ri, 1);
4484           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4485         } block_comment("  } // j");
4486 
4487         post1_squaring();
4488         add(Ri, Ri, 1);
4489         cmp(Ri, Rlen);
4490         br(Assembler::LT, loop);
4491 
4492         bind(end);
4493         block_comment("} // i");
4494       }
4495 
4496       block_comment("for (int i = len; i < 2*len; i++) {");
4497       mov(Ri, Rlen); {
4498         Label loop, end;
4499         bind(loop);
4500         cmp(Ri, Rlen, Assembler::LSL, 1);
4501         br(Assembler::GE, end);
4502 
4503         pre2(Ri, Rlen);
4504 
4505         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4506           lsl(Rj, Rlen, 1);
4507           sub(Rj, Rj, Ri);
4508           sub(Rj, Rj, 1);
4509           lsr(Rj, Rj, 1);
4510           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4511         } block_comment("  } // j");
4512 
4513         last_squaring(Ri);
4514 
4515         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4516           lsl(Rj, Rlen, 1);
4517           sub(Rj, Rj, Ri);
4518           lsr(Rj, Rj, 1);
4519           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4520         } block_comment("  } // j");
4521 
4522         post2(Ri, Rlen);
4523         add(Ri, Ri, 1);
4524         cmp(Ri, Rlen, Assembler::LSL, 1);
4525 
4526         br(Assembler::LT, loop);
4527         bind(end);
4528         block_comment("} // i");
4529       }
4530 
4531       normalize(Rlen);
4532 
4533       mov(Ra, Pm_base);  // Save Pm_base in Ra
4534       restore_regs();  // Restore caller's Pm_base
4535 
4536       // Copy our result into caller's Pm_base
4537       reverse(Pm_base, Ra, Rlen, t0, t1);
4538 
4539       leave();
4540       ret(lr);
4541 
4542       return entry;
4543     }
4544     // In C, approximately:
4545 
4546     // void
4547     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4548     //                   unsigned long Pm_base[], unsigned long inv, int len) {
4549     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4550     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4551     //   unsigned long Ra, Rb, Rn, Rm;
4552 
4553     //   int i;
4554 
4555     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4556 
4557     //   for (i = 0; i < len; i++) {
4558     //     int j;
4559 
4560     //     Pa = Pa_base;
4561     //     Pb = Pa_base + i;
4562     //     Pm = Pm_base;
4563     //     Pn = Pn_base + i;
4564 
4565     //     Ra = *Pa;
4566     //     Rb = *Pb;
4567     //     Rm = *Pm;
4568     //     Rn = *Pn;
4569 
4570     //     int iters = (i+1)/2;
4571     //     for (j = 0; iters--; j++) {
4572     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4573     //       MACC2(Ra, Rb, t0, t1, t2);
4574     //       Ra = *++Pa;
4575     //       Rb = *--Pb;
4576     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4577     //       MACC(Rm, Rn, t0, t1, t2);
4578     //       Rm = *++Pm;
4579     //       Rn = *--Pn;
4580     //     }
4581     //     if ((i & 1) == 0) {
4582     //       assert(Ra == Pa_base[j], "must be");
4583     //       MACC(Ra, Ra, t0, t1, t2);
4584     //     }
4585     //     iters = i/2;
4586     //     assert(iters == i-j, "must be");
4587     //     for (; iters--; j++) {
4588     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4589     //       MACC(Rm, Rn, t0, t1, t2);
4590     //       Rm = *++Pm;
4591     //       Rn = *--Pn;
4592     //     }
4593 
4594     //     *Pm = Rm = t0 * inv;
4595     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4596     //     MACC(Rm, Rn, t0, t1, t2);
4597 
4598     //     assert(t0 == 0, "broken Montgomery multiply");
4599 
4600     //     t0 = t1; t1 = t2; t2 = 0;
4601     //   }
4602 
4603     //   for (i = len; i < 2*len; i++) {
4604     //     int start = i-len+1;
4605     //     int end = start + (len - start)/2;
4606     //     int j;
4607 
4608     //     Pa = Pa_base + i-len;
4609     //     Pb = Pa_base + len;
4610     //     Pm = Pm_base + i-len;
4611     //     Pn = Pn_base + len;
4612 
4613     //     Ra = *++Pa;
4614     //     Rb = *--Pb;
4615     //     Rm = *++Pm;
4616     //     Rn = *--Pn;
4617 
4618     //     int iters = (2*len-i-1)/2;
4619     //     assert(iters == end-start, "must be");
4620     //     for (j = start; iters--; j++) {
4621     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4622     //       MACC2(Ra, Rb, t0, t1, t2);
4623     //       Ra = *++Pa;
4624     //       Rb = *--Pb;
4625     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4626     //       MACC(Rm, Rn, t0, t1, t2);
4627     //       Rm = *++Pm;
4628     //       Rn = *--Pn;
4629     //     }
4630     //     if ((i & 1) == 0) {
4631     //       assert(Ra == Pa_base[j], "must be");
4632     //       MACC(Ra, Ra, t0, t1, t2);
4633     //     }
4634     //     iters =  (2*len-i)/2;
4635     //     assert(iters == len-j, "must be");
4636     //     for (; iters--; j++) {
4637     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4638     //       MACC(Rm, Rn, t0, t1, t2);
4639     //       Rm = *++Pm;
4640     //       Rn = *--Pn;
4641     //     }
4642     //     Pm_base[i-len] = t0;
4643     //     t0 = t1; t1 = t2; t2 = 0;
4644     //   }
4645 
4646     //   while (t0)
4647     //     t0 = sub(Pm_base, Pn_base, t0, len);
4648     // }
4649   };
4650 
4651   // Initialization
4652   void generate_initial() {
    // Generates the initial stubs and initializes the entry points
4654 
    // Entry points that exist on all platforms.  Note: this is code
    // that could be shared among different platforms - however, the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure.  See also the comment
    // in stubRoutines.hpp.
4660 
4661     StubRoutines::_forward_exception_entry = generate_forward_exception();
4662 
4663     StubRoutines::_call_stub_entry =
4664       generate_call_stub(StubRoutines::_call_stub_return_address);
4665 
    // Referenced by megamorphic calls.
4667     StubRoutines::_catch_exception_entry = generate_catch_exception();
4668 
4669     // Build this early so it's available for the interpreter.
4670     StubRoutines::_throw_StackOverflowError_entry =
4671       generate_throw_exception("StackOverflowError throw_exception",
4672                                CAST_FROM_FN_PTR(address,
4673                                                 SharedRuntime::
4674                                                 throw_StackOverflowError));
4675     if (UseCRC32Intrinsics) {
      // Set the table address before generating the stubs which use it.
4677       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4678       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4679     }
4680   }
4681 
4682   void generate_all() {
4683     // support for verify_oop (must happen after universe_init)
4684     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
4685     StubRoutines::_throw_AbstractMethodError_entry =
4686       generate_throw_exception("AbstractMethodError throw_exception",
4687                                CAST_FROM_FN_PTR(address,
4688                                                 SharedRuntime::
4689                                                 throw_AbstractMethodError));
4690 
4691     StubRoutines::_throw_IncompatibleClassChangeError_entry =
4692       generate_throw_exception("IncompatibleClassChangeError throw_exception",
4693                                CAST_FROM_FN_PTR(address,
4694                                                 SharedRuntime::
4695                                                 throw_IncompatibleClassChangeError));
4696 
4697     StubRoutines::_throw_NullPointerException_at_call_entry =
4698       generate_throw_exception("NullPointerException at call throw_exception",
4699                                CAST_FROM_FN_PTR(address,
4700                                                 SharedRuntime::
4701                                                 throw_NullPointerException_at_call));
4702 
4703     // arraycopy stubs used by compilers
4704     generate_arraycopy_stubs();
4705 
4706     if (UseMultiplyToLenIntrinsic) {
4707       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4708     }
4709 
4710     if (UseMontgomeryMultiplyIntrinsic) {
4711       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4712       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4713       StubRoutines::_montgomeryMultiply = g.generate_multiply();
4714     }
4715 
4716     if (UseMontgomerySquareIntrinsic) {
4717       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4718       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the modulus sizes we care about.
4721       StubRoutines::_montgomerySquare = g.generate_multiply();
4722     }
4723 
4724 #ifndef BUILTIN_SIM
4725     // generate GHASH intrinsics code
4726     if (UseGHASHIntrinsics) {
4727       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
4728     }
4729 
4730     if (UseAESIntrinsics) {
4731       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4732       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4733       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4734       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
4735     }
4736 
4737     if (UseSHA1Intrinsics) {
4738       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
4739       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
4740     }
4741     if (UseSHA256Intrinsics) {
4742       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
4743       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
4744     }
4745 
4746     if (UseCRC32CIntrinsics) {
4747       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
4748     }
4749 
4750     // generate Adler32 intrinsics code
4751     if (UseAdler32Intrinsics) {
4752       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
4753     }
4754 
4755     // Safefetch stubs.
4756     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
4757                                                        &StubRoutines::_safefetch32_fault_pc,
4758                                                        &StubRoutines::_safefetch32_continuation_pc);
4759     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
4760                                                        &StubRoutines::_safefetchN_fault_pc,
4761                                                        &StubRoutines::_safefetchN_continuation_pc);
4762 #endif
4763   }
4764 
4765  public:
4766   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
4767     if (all) {
4768       generate_all();
4769     } else {
4770       generate_initial();
4771     }
4772   }
4773 }; // end class declaration
4774 
4775 void StubGenerator_generate(CodeBuffer* code, bool all) {
4776   StubGenerator g(code, all);
4777 }