1 /*
   2  * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "interpreter/interpreter.hpp"
  30 #include "nativeInst_aarch64.hpp"
  31 #include "oops/instanceOop.hpp"
  32 #include "oops/method.hpp"
  33 #include "oops/objArrayKlass.hpp"
  34 #include "oops/oop.inline.hpp"
  35 #include "prims/methodHandles.hpp"
  36 #include "runtime/frame.inline.hpp"
  37 #include "runtime/handles.inline.hpp"
  38 #include "runtime/sharedRuntime.hpp"
  39 #include "runtime/stubCodeGenerator.hpp"
  40 #include "runtime/stubRoutines.hpp"
  41 #include "runtime/thread.inline.hpp"
  42 #include "utilities/top.hpp"
  43 #ifdef COMPILER2
  44 #include "opto/runtime.hpp"
  45 #endif
  46 
  47 #ifdef BUILTIN_SIM
  48 #include "../../../../../../simulator/simulator.hpp"
  49 #endif
  50 
  51 // Declaration and definition of StubGenerator (no .hpp file).
  52 // For a more detailed description of the stub routine structure
  53 // see the comment in stubRoutines.hpp
  54 
  55 #undef __
  56 #define __ _masm->
  57 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
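     // (TIMES_OOP scales an array index by the in-heap oop size: 4 bytes
     // when UseCompressedOops is set, 8 bytes otherwise, with the index
     // register treated as a sign-extended 32-bit value.)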
  58 
  59 #ifdef PRODUCT
  60 #define BLOCK_COMMENT(str) /* nothing */
  61 #else
  62 #define BLOCK_COMMENT(str) __ block_comment(str)
  63 #endif
  64 
  65 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  66 
  67 // Stub Code definitions
  68 
  69 class StubGenerator: public StubCodeGenerator {
  70  private:
  71 
  72 #ifdef PRODUCT
  73 #define inc_counter_np(counter) ((void)0)
  74 #else
  75   void inc_counter_np_(int& counter) {
  76     __ lea(rscratch2, ExternalAddress((address)&counter));
  77     __ ldrw(rscratch1, Address(rscratch2));
  78     __ addw(rscratch1, rscratch1, 1);
  79     __ strw(rscratch1, Address(rscratch2));
  80   }
  81 #define inc_counter_np(counter) \
  82   BLOCK_COMMENT("inc_counter " #counter); \
  83   inc_counter_np_(counter);
  84 #endif
  85 
  86   // Call stubs are used to call Java from C
  87   //
  88   // Arguments:
  89   //    c_rarg0:   call wrapper address                   address
  90   //    c_rarg1:   result                                 address
  91   //    c_rarg2:   result type                            BasicType
  92   //    c_rarg3:   method                                 Method*
  93   //    c_rarg4:   (interpreter) entry point              address
  94   //    c_rarg5:   parameters                             intptr_t*
  95   //    c_rarg6:   parameter size (in words)              int
  96   //    c_rarg7:   thread                                 Thread*
  97   //
  98   // There is no return from the stub itself as any Java result
  99   // is written to result
 100   //
 101   // we save r30 (lr) as the return PC at the base of the frame and
 102   // save r29 (fp) just below it, then install the new frame pointer
 103   // by copying sp (r31) into fp.
 104   //
 105   // we save r0-r7, which accounts for all the c arguments.
 106   //
 107   // TODO: strictly do we need to save them all? they are treated as
 108   // volatile by C so could we omit saving the ones we are going to
 109   // place in global registers (thread? method?) or those we only use
 110   // during setup of the Java call?
 111   //
 112   // we don't need to save r8 which C uses as an indirect result location
 113   // return register.
 114   //
 115   // we don't need to save r9-r15 which both C and Java treat as
 116   // volatile
 117   //
 118   // we don't need to save r16-18 because Java does not use them
 119   //
 120   // we save r19-r28 which Java uses as scratch registers and C
 121   // expects to be callee-save
 122   //
 123   // we save the bottom 64 bits of each value stored in v8-v15; it is
 124   // the responsibility of the caller to preserve larger values.
 125   //
 126   // so the stub frame looks like this when we enter Java code
 127   //
 128   //     [ return_from_Java     ] <--- sp
 129   //     [ argument word n      ]
 130   //      ...
 131   // -27 [ argument word 1      ]
 132   // -26 [ saved v15            ] <--- sp_after_call
 133   // -25 [ saved v14            ]
 134   // -24 [ saved v13            ]
 135   // -23 [ saved v12            ]
 136   // -22 [ saved v11            ]
 137   // -21 [ saved v10            ]
 138   // -20 [ saved v9             ]
 139   // -19 [ saved v8             ]
 140   // -18 [ saved r28            ]
 141   // -17 [ saved r27            ]
 142   // -16 [ saved r26            ]
 143   // -15 [ saved r25            ]
 144   // -14 [ saved r24            ]
 145   // -13 [ saved r23            ]
 146   // -12 [ saved r22            ]
 147   // -11 [ saved r21            ]
 148   // -10 [ saved r20            ]
 149   //  -9 [ saved r19            ]
 150   //  -8 [ call wrapper    (r0) ]
 151   //  -7 [ result          (r1) ]
 152   //  -6 [ result type     (r2) ]
 153   //  -5 [ method          (r3) ]
 154   //  -4 [ entry point     (r4) ]
 155   //  -3 [ parameters      (r5) ]
 156   //  -2 [ parameter size  (r6) ]
 157   //  -1 [ thread (r7)          ]
 158   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 159   //   1 [ saved lr       (r30) ]
 160 
 161   // Call stub stack layout word offsets from fp
 162   enum call_stub_layout {
 163     sp_after_call_off = -26,
 164 
 165     d15_off            = -26,
 166     d13_off            = -24,
 167     d11_off            = -22,
 168     d9_off             = -20,
 169 
 170     r28_off            = -18,
 171     r26_off            = -16,
 172     r24_off            = -14,
 173     r22_off            = -12,
 174     r20_off            = -10,
 175     call_wrapper_off   =  -8,
 176     result_off         =  -7,
 177     result_type_off    =  -6,
 178     method_off         =  -5,
 179     entry_point_off    =  -4,
 180     parameter_size_off =  -2,
 181     thread_off         =  -1,
 182     fp_f               =   0,
 183     retaddr_off        =   1,
 184   };
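       // All of the offsets above are word offsets from rfp, so, for example,
       // the thread argument is spilled at [rfp - wordSize].  Each pair of
       // callee-saved registers occupies two adjacent slots so that it can be
       // saved and restored with a single stp/ldp (or stpd/ldpd).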
 185 
 186   address generate_call_stub(address& return_address) {
 187     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 188            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 189            "adjust this code");
 190 
 191     StubCodeMark mark(this, "StubRoutines", "call_stub");
 192     address start = __ pc();
 193 
 194     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 195 
 196     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 197     const Address result        (rfp, result_off         * wordSize);
 198     const Address result_type   (rfp, result_type_off    * wordSize);
 199     const Address method        (rfp, method_off         * wordSize);
 200     const Address entry_point   (rfp, entry_point_off    * wordSize);
 201     const Address parameter_size(rfp, parameter_size_off * wordSize);
 202 
 203     const Address thread        (rfp, thread_off         * wordSize);
 204 
 205     const Address d15_save      (rfp, d15_off * wordSize);
 206     const Address d13_save      (rfp, d13_off * wordSize);
 207     const Address d11_save      (rfp, d11_off * wordSize);
 208     const Address d9_save       (rfp, d9_off * wordSize);
 209 
 210     const Address r28_save      (rfp, r28_off * wordSize);
 211     const Address r26_save      (rfp, r26_off * wordSize);
 212     const Address r24_save      (rfp, r24_off * wordSize);
 213     const Address r22_save      (rfp, r22_off * wordSize);
 214     const Address r20_save      (rfp, r20_off * wordSize);
 215 
 216     // stub code
 217 
 218     // we need a C prolog to bootstrap the x86 caller into the sim
 219     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 220 
 221     address aarch64_entry = __ pc();
 222 
 223 #ifdef BUILTIN_SIM
 224     // Save sender's SP for stack traces.
 225     __ mov(rscratch1, sp);
 226     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 227 #endif
 228     // set up frame and move sp to end of save area
 229     __ enter();
 230     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 231 
 232     // save register parameters and Java scratch/global registers
 233     // n.b. we save thread even though it gets installed in
 234     // rthread because we want to sanity check rthread later
 235     __ str(c_rarg7,  thread);
 236     __ strw(c_rarg6, parameter_size);
 237     __ stp(c_rarg4, c_rarg5,  entry_point);
 238     __ stp(c_rarg2, c_rarg3,  result_type);
 239     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 240 
 241     __ stp(r20, r19,   r20_save);
 242     __ stp(r22, r21,   r22_save);
 243     __ stp(r24, r23,   r24_save);
 244     __ stp(r26, r25,   r26_save);
 245     __ stp(r28, r27,   r28_save);
 246 
 247     __ stpd(v9,  v8,   d9_save);
 248     __ stpd(v11, v10,  d11_save);
 249     __ stpd(v13, v12,  d13_save);
 250     __ stpd(v15, v14,  d15_save);
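         // n.b. each stp/stpd above stores the higher-numbered register of
         // the pair at the lower address (the *_save slot) and its partner
         // one word above, matching the frame layout pictured earlier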
 251 
 252     // install Java thread in global register now that we have saved
 253     // whatever value it held
 254     __ mov(rthread, c_rarg7);
 255     // And method
 256     __ mov(rmethod, c_rarg3);
 257 
 258     // set up the heapbase register
 259     __ reinit_heapbase();
 260 
 261 #ifdef ASSERT
 262     // make sure we have no pending exceptions
 263     {
 264       Label L;
 265       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 266       __ cmp(rscratch1, (unsigned)NULL_WORD);
 267       __ br(Assembler::EQ, L);
 268       __ stop("StubRoutines::call_stub: entered with pending exception");
 269       __ BIND(L);
 270     }
 271 #endif
 272     // pass parameters if any
 273     __ mov(esp, sp);
 274     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 275     __ andr(sp, rscratch1, -2 * wordSize);
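         // (sp is dropped by parameter_size words below its old value, now
         // held in esp, and rounded down to a 16-byte boundary as required
         // for the AArch64 stack pointer)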
 276 
 277     BLOCK_COMMENT("pass parameters if any");
 278     Label parameters_done;
 279     // parameter count is still in c_rarg6
 280     // and parameter pointer identifying param 1 is in c_rarg5
 281     __ cbzw(c_rarg6, parameters_done);
 282 
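         // copy the Java arguments onto the newly allocated stack area: the
         // loop below walks the parameter array forwards, post-incrementing
         // c_rarg5, and pushes each word, so the last parameter ends up
         // nearest to sp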
 283     address loop = __ pc();
 284     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 285     __ subsw(c_rarg6, c_rarg6, 1);
 286     __ push(rscratch1);
 287     __ br(Assembler::GT, loop);
 288 
 289     __ BIND(parameters_done);
 290 
 291     // call Java entry -- passing Method*, and current sp
 292     //      rmethod: Method*
 293     //      r13: sender sp
 294     BLOCK_COMMENT("call Java function");
 295     __ mov(r13, sp);
 296     __ blr(c_rarg4);
 297 
 298     // tell the simulator we have returned to the stub
 299 
 300     // we do this here because the notify will already have been done
 301     // if we get to the next instruction via an exception
 302     //
 303     // n.b. adding this instruction here affects the calculation of
 304     // whether or not a routine returns to the call stub (used when
 305     // doing stack walks) since the normal test is to check the return
 306     // pc against the address saved below. so we may need to allow for
 307     // this extra instruction in the check.
 308 
 309     if (NotifySimulator) {
 310       __ notify(Assembler::method_reentry);
 311     }
 312     // save current address for use by exception handling code
 313 
 314     return_address = __ pc();
 315 
 316     // store result depending on type (everything that is not
 317     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 318     // n.b. this assumes Java returns an integral result in r0
 319     // and a floating result in j_farg0
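         // n.b. a T_OBJECT result takes the is_long path below since an oop
         // is stored back as a full 64-bit word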
 320     __ ldr(j_rarg2, result);
 321     Label is_long, is_float, is_double, exit;
 322     __ ldr(j_rarg1, result_type);
 323     __ cmp(j_rarg1, T_OBJECT);
 324     __ br(Assembler::EQ, is_long);
 325     __ cmp(j_rarg1, T_LONG);
 326     __ br(Assembler::EQ, is_long);
 327     __ cmp(j_rarg1, T_FLOAT);
 328     __ br(Assembler::EQ, is_float);
 329     __ cmp(j_rarg1, T_DOUBLE);
 330     __ br(Assembler::EQ, is_double);
 331 
 332     // handle T_INT case
 333     __ strw(r0, Address(j_rarg2));
 334 
 335     __ BIND(exit);
 336 
 337     // pop parameters
 338     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 339 
 340 #ifdef ASSERT
 341     // verify that threads correspond
 342     {
 343       Label L, S;
 344       __ ldr(rscratch1, thread);
 345       __ cmp(rthread, rscratch1);
 346       __ br(Assembler::NE, S);
 347       __ get_thread(rscratch1);
 348       __ cmp(rthread, rscratch1);
 349       __ br(Assembler::EQ, L);
 350       __ BIND(S);
 351       __ stop("StubRoutines::call_stub: threads must correspond");
 352       __ BIND(L);
 353     }
 354 #endif
 355 
 356     // restore callee-save registers
 357     __ ldpd(v15, v14,  d15_save);
 358     __ ldpd(v13, v12,  d13_save);
 359     __ ldpd(v11, v10,  d11_save);
 360     __ ldpd(v9,  v8,   d9_save);
 361 
 362     __ ldp(r28, r27,   r28_save);
 363     __ ldp(r26, r25,   r26_save);
 364     __ ldp(r24, r23,   r24_save);
 365     __ ldp(r22, r21,   r22_save);
 366     __ ldp(r20, r19,   r20_save);
 367 
 368     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 369     __ ldrw(c_rarg2, result_type);
 370     __ ldr(c_rarg3,  method);
 371     __ ldp(c_rarg4, c_rarg5,  entry_point);
 372     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 373 
 374 #ifndef PRODUCT
 375     // tell the simulator we are about to end Java execution
 376     if (NotifySimulator) {
 377       __ notify(Assembler::method_exit);
 378     }
 379 #endif
 380     // leave frame and return to caller
 381     __ leave();
 382     __ ret(lr);
 383 
 384     // handle return types different from T_INT
 385 
 386     __ BIND(is_long);
 387     __ str(r0, Address(j_rarg2, 0));
 388     __ br(Assembler::AL, exit);
 389 
 390     __ BIND(is_float);
 391     __ strs(j_farg0, Address(j_rarg2, 0));
 392     __ br(Assembler::AL, exit);
 393 
 394     __ BIND(is_double);
 395     __ strd(j_farg0, Address(j_rarg2, 0));
 396     __ br(Assembler::AL, exit);
 397 
 398     return start;
 399   }
 400 
 401   // Return point for a Java call if there's an exception thrown in
 402   // Java code.  The exception is caught and transformed into a
 403   // pending exception stored in JavaThread that can be tested from
 404   // within the VM.
 405   //
 406   // Note: Usually the parameters are removed by the callee. In case
 407   // of an exception crossing an activation frame boundary, that is
 408   // not the case if the callee is compiled code => need to set up the
 409   // sp.
 410   //
 411   // r0: exception oop
 412 
 413   // NOTE: this is used as a target from the signal handler so it
 414   // needs an x86 prolog which returns into the current simulator
 415   // executing the generated catch_exception code. so the prolog
 416   // needs to install rax in a sim register and adjust the sim's
 417   // restart pc to enter the generated code at the start position
 418   // then return from native to simulated execution.
 419 
 420   address generate_catch_exception() {
 421     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 422     address start = __ pc();
 423 
 424     // same as in generate_call_stub():
 425     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 426     const Address thread        (rfp, thread_off         * wordSize);
 427 
 428 #ifdef ASSERT
 429     // verify that threads correspond
 430     {
 431       Label L, S;
 432       __ ldr(rscratch1, thread);
 433       __ cmp(rthread, rscratch1);
 434       __ br(Assembler::NE, S);
 435       __ get_thread(rscratch1);
 436       __ cmp(rthread, rscratch1);
 437       __ br(Assembler::EQ, L);
 438       __ bind(S);
 439       __ stop("StubRoutines::catch_exception: threads must correspond");
 440       __ bind(L);
 441     }
 442 #endif
 443 
 444     // set pending exception
 445     __ verify_oop(r0);
 446 
 447     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 448     __ mov(rscratch1, (address)__FILE__);
 449     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 450     __ movw(rscratch1, (int)__LINE__);
 451     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 452 
 453     // complete return to VM
 454     assert(StubRoutines::_call_stub_return_address != NULL,
 455            "_call_stub_return_address must have been generated before");
 456     __ b(StubRoutines::_call_stub_return_address);
 457 
 458     return start;
 459   }
 460 
 461   // Continuation point for runtime calls returning with a pending
 462   // exception.  The pending exception check happened in the runtime
 463   // or native call stub.  The pending exception in Thread is
 464   // converted into a Java-level exception.
 465   //
 466   // Contract with Java-level exception handlers:
 467   // r0: exception
 468   // r3: throwing pc
 469   //
 470   // NOTE: At entry of this stub, exception-pc must be in LR !!
 471 
 472   // NOTE: this is always used as a jump target within generated code
 473   // so it just needs to be generated code with no x86 prolog
 474 
 475   address generate_forward_exception() {
 476     StubCodeMark mark(this, "StubRoutines", "forward exception");
 477     address start = __ pc();
 478 
 479     // Upon entry, LR points to the return address returning into
 480     // Java (interpreted or compiled) code; i.e., the return address
 481     // becomes the throwing pc.
 482     //
 483     // Arguments pushed before the runtime call are still on the stack
 484     // but the exception handler will reset the stack pointer ->
 485     // ignore them.  A potential result in registers can be ignored as
 486     // well.
 487 
 488 #ifdef ASSERT
 489     // make sure this code is only executed if there is a pending exception
 490     {
 491       Label L;
 492       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 493       __ cbnz(rscratch1, L);
 494       __ stop("StubRoutines::forward exception: no pending exception (1)");
 495       __ bind(L);
 496     }
 497 #endif
 498 
 499     // compute exception handler into r19
 500 
 501     // call the VM to find the handler address associated with the
 502     // caller address. pass thread in r0 and caller pc (ret address)
 503     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 504     // the stack.
 505     __ mov(c_rarg1, lr);
 506     // lr will be trashed by the VM call so we move it to R19
 507     // (callee-saved) because we also need to pass it to the handler
 508     // returned by this call.
 509     __ mov(r19, lr);
 510     BLOCK_COMMENT("call exception_handler_for_return_address");
 511     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 512                          SharedRuntime::exception_handler_for_return_address),
 513                     rthread, c_rarg1);
 514     // we should not really care that lr is no longer the callee
 515     // address. we saved the value the handler needs in r19 so we can
 516     // just copy it to r3. however, the C2 handler will push its own
 517     // frame and then call into the VM, and the VM code asserts that
 518     // the PC for the frame above the handler belongs to a compiled
 519     // Java method. So, we restore lr here to satisfy that assert.
 520     __ mov(lr, r19);
 521     // setup r0 & r3 & clear pending exception
 522     __ mov(r3, r19);
 523     __ mov(r19, r0);
 524     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 525     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 526 
 527 #ifdef ASSERT
 528     // make sure exception is set
 529     {
 530       Label L;
 531       __ cbnz(r0, L);
 532       __ stop("StubRoutines::forward exception: no pending exception (2)");
 533       __ bind(L);
 534     }
 535 #endif
 536 
 537     // continue at exception handler
 538     // r0: exception
 539     // r3: throwing pc
 540     // r19: exception handler
 541     __ verify_oop(r0);
 542     __ br(r19);
 543 
 544     return start;
 545   }
 546 
 547   // Non-destructive plausibility checks for oops
 548   //
 549   // Arguments:
 550   //    r0: oop to verify
 551   //    rscratch1: error message
 552   //
 553   // Stack after saving c_rarg3:
 554   //    [tos + 0]: saved c_rarg3
 555   //    [tos + 1]: saved c_rarg2
 556   //    [tos + 2]: saved lr
 557   //    [tos + 3]: saved rscratch2
 558   //    [tos + 4]: saved r0
 559   //    [tos + 5]: saved rscratch1
 560   address generate_verify_oop() {
 561 
 562     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 563     address start = __ pc();
 564 
 565     Label exit, error;
 566 
 567     // save c_rarg2 and c_rarg3
 568     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 569 
 570     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 571     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 572     __ ldr(c_rarg3, Address(c_rarg2));
 573     __ add(c_rarg3, c_rarg3, 1);
 574     __ str(c_rarg3, Address(c_rarg2));
 575 
 576     // object is in r0
 577     // make sure object is 'reasonable'
 578     __ cbz(r0, exit); // if obj is NULL it is OK
 579 
 580     // Check if the oop is in the right area of memory
 581     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 582     __ andr(c_rarg2, r0, c_rarg3);
 583     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 584 
 585     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 586     // instruction here because the flags register is live.
 587     __ eor(c_rarg2, c_rarg2, c_rarg3);
 588     __ cbnz(c_rarg2, error);
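         // (the oop is acceptable when (oop & verify_oop_mask()) ==
         // verify_oop_bits(); eor + cbnz is used so the live flags are
         // preserved)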
 589 
 590     // make sure klass is 'reasonable', i.e. not zero.
 591     __ load_klass(r0, r0);  // get klass
 592     __ cbz(r0, error);      // if klass is NULL it is broken
 593 
 594     // return if everything seems ok
 595     __ bind(exit);
 596 
 597     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 598     __ ret(lr);
 599 
 600     // handle errors
 601     __ bind(error);
 602     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 603 
 604     __ push(RegSet::range(r0, r29), sp);
 605     // debug(char* msg, int64_t pc, int64_t regs[])
 606     __ mov(c_rarg0, rscratch1);      // pass address of error message
 607     __ mov(c_rarg1, lr);             // pass return address
 608     __ mov(c_rarg2, sp);             // pass address of regs on stack
 609 #ifndef PRODUCT
 610     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 611 #endif
 612     BLOCK_COMMENT("call MacroAssembler::debug");
 613     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 614     __ blrt(rscratch1, 3, 0, 1);
 615 
 616     return start;
 617   }
 618 
 619   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 620 
 621   // Generate code for an array write pre barrier
 622   //
 623   //     addr    -  starting address
 624   //     count   -  element count
 625   //     tmp     - scratch register
 626   //
 627   //     Destroys no registers except rscratch1 and rscratch2
 628   //
 629   void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
 630     BarrierSet* bs = Universe::heap()->barrier_set();
 631     switch (bs->kind()) {
 632     case BarrierSet::G1SATBCTLogging:
 633       // With G1, don't generate the call if we statically know that the target is uninitialized
 634       if (!dest_uninitialized) {
 635         __ push_call_clobbered_registers();
 636         if (count == c_rarg0) {
 637           if (addr == c_rarg1) {
 638             // exactly backwards!!
 639             __ mov(rscratch1, c_rarg0);
 640             __ mov(c_rarg0, c_rarg1);
 641             __ mov(c_rarg1, rscratch1);
 642           } else {
 643             __ mov(c_rarg1, count);
 644             __ mov(c_rarg0, addr);
 645           }
 646         } else {
 647           __ mov(c_rarg0, addr);
 648           __ mov(c_rarg1, count);
 649         }
 650         __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
 651         __ pop_call_clobbered_registers();
 652       }
 653       break;
 654     case BarrierSet::CardTableForRS:
 655     case BarrierSet::CardTableExtension:
 656     case BarrierSet::ModRef:
 657       break;
 658     default:
 659       ShouldNotReachHere();
 660 
 661     }
 662   }
 663 
 664   //
 665   // Generate code for an array write post barrier
 666   //
 667   //  Input:
 668   //     start    - register containing starting address of destination array
 669   //     end      - register containing ending address of destination array
 670   //     scratch  - scratch register
 671   //
 672   //  The input registers are overwritten.
 673   //  The ending address is inclusive.
 674   void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
 675     assert_different_registers(start, end, scratch);
 676     BarrierSet* bs = Universe::heap()->barrier_set();
 677     switch (bs->kind()) {
 678       case BarrierSet::G1SATBCTLogging:
 679 
 680         {
 681           __ push_call_clobbered_registers();
 682           // must compute element count unless barrier set interface is changed (other platforms supply count)
 683           assert_different_registers(start, end, scratch);
 684           __ lea(scratch, Address(end, BytesPerHeapOop));
 685           __ sub(scratch, scratch, start);               // subtract start to get #bytes
 686           __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
 687           __ mov(c_rarg0, start);
 688           __ mov(c_rarg1, scratch);
 689           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
 690           __ pop_call_clobbered_registers();
 691         }
 692         break;
 693       case BarrierSet::CardTableForRS:
 694       case BarrierSet::CardTableExtension:
 695         {
 696           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
 697           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
 698 
 699           Label L_loop;
 700 
 701            __ lsr(start, start, CardTableModRefBS::card_shift);
 702            __ lsr(end, end, CardTableModRefBS::card_shift);
 703            __ sub(end, end, start); // number of bytes to copy
 704 
 705           const Register count = end; // 'end' register contains bytes count now
 706           __ load_byte_map_base(scratch);
 707           __ add(start, start, scratch);
 708           if (UseConcMarkSweepGC) {
 709             __ membar(__ StoreStore);
 710           }
 711           __ BIND(L_loop);
 712           __ strb(zr, Address(start, count));
 713           __ subs(count, count, 1);
 714           __ br(Assembler::HS, L_loop);
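               // (the loop above dirties every card from start + count down
               // to start inclusive, i.e. count + 1 card bytes in all)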
 715         }
 716         break;
 717       default:
 718         ShouldNotReachHere();
 719 
 720     }
 721   }
 722 
 723   typedef enum {
 724     copy_forwards = 1,
 725     copy_backwards = -1
 726   } copy_direction;
 727 
 728   // Bulk copy of blocks of 8 words.
 729   //
 730   // count is a count of words.
 731   //
 732   // Precondition: count >= 8
 733   //
 734   // Postconditions:
 735   //
 736   // The least significant bit of count contains the remaining count
 737   // of words to copy.  The rest of count is trash.
 738   //
 739   // s and d are adjusted to point to the remaining words to copy
 740   //
 741   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 742                            copy_direction direction) {
 743     int unit = wordSize * direction;
 744     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 745 
 746     int offset;
 747     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 748       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 749     const Register stride = r13;
 750 
 751     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 752     assert_different_registers(s, d, count, rscratch1);
 753 
 754     Label again, drain;
 755     const char *stub_name;
 756     if (direction == copy_forwards)
 757       stub_name = "forward_copy_longs";
 758     else
 759       stub_name = "backward_copy_longs";
 760     StubCodeMark mark(this, "StubRoutines", stub_name);
 761     __ align(CodeEntryAlignment);
 762     __ bind(start);
 763     if (direction == copy_forwards) {
 764       __ sub(s, s, bias);
 765       __ sub(d, d, bias);
 766     }
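         // (for a forwards copy, s and d are biased downwards by 2 (or 4
         // with SIMD) words so that the copy loop below can use fixed
         // positive offsets plus a single pre-indexed advance of 8 words
         // per iteration)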
 767 
 768 #ifdef ASSERT
 769     // Make sure we are never given < 8 words
 770     {
 771       Label L;
 772       __ cmp(count, 8);
 773       __ br(Assembler::GE, L);
 774       __ stop("generate_copy_longs called with < 8 words");
 775       __ bind(L);
 776     }
 777 #endif
 778 
 779     // Fill 8 registers
 780     if (UseSIMDForMemoryOps) {
 781       __ ldpq(v0, v1, Address(s, 4 * unit));
 782       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 783     } else {
 784       __ ldp(t0, t1, Address(s, 2 * unit));
 785       __ ldp(t2, t3, Address(s, 4 * unit));
 786       __ ldp(t4, t5, Address(s, 6 * unit));
 787       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 788     }
 789 
 790     __ subs(count, count, 16);
 791     __ br(Assembler::LO, drain);
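         // eight words are already buffered in the registers above; if fewer
         // than sixteen words were requested in total, skip the main loop
         // and just drain what has been loaded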
 792 
 793     int prefetch = PrefetchCopyIntervalInBytes;
 794     bool use_stride = false;
 795     if (direction == copy_backwards) {
 796        use_stride = prefetch > 256;
 797        prefetch = -prefetch;
 798        if (use_stride) __ mov(stride, prefetch);
 799     }
 800 
 801     __ bind(again);
 802 
 803     if (PrefetchCopyIntervalInBytes > 0)
 804       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 805 
 806     if (UseSIMDForMemoryOps) {
 807       __ stpq(v0, v1, Address(d, 4 * unit));
 808       __ ldpq(v0, v1, Address(s, 4 * unit));
 809       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 810       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 811     } else {
 812       __ stp(t0, t1, Address(d, 2 * unit));
 813       __ ldp(t0, t1, Address(s, 2 * unit));
 814       __ stp(t2, t3, Address(d, 4 * unit));
 815       __ ldp(t2, t3, Address(s, 4 * unit));
 816       __ stp(t4, t5, Address(d, 6 * unit));
 817       __ ldp(t4, t5, Address(s, 6 * unit));
 818       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 819       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 820     }
 821 
 822     __ subs(count, count, 8);
 823     __ br(Assembler::HS, again);
 824 
 825     // Drain
 826     __ bind(drain);
 827     if (UseSIMDForMemoryOps) {
 828       __ stpq(v0, v1, Address(d, 4 * unit));
 829       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 830     } else {
 831       __ stp(t0, t1, Address(d, 2 * unit));
 832       __ stp(t2, t3, Address(d, 4 * unit));
 833       __ stp(t4, t5, Address(d, 6 * unit));
 834       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 835     }
 836 
 837     {
 838       Label L1, L2;
 839       __ tbz(count, exact_log2(4), L1);
 840       if (UseSIMDForMemoryOps) {
 841         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 842         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 843       } else {
 844         __ ldp(t0, t1, Address(s, 2 * unit));
 845         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 846         __ stp(t0, t1, Address(d, 2 * unit));
 847         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 848       }
 849       __ bind(L1);
 850 
 851       if (direction == copy_forwards) {
 852         __ add(s, s, bias);
 853         __ add(d, d, bias);
 854       }
 855 
 856       __ tbz(count, 1, L2);
 857       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 858       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 859       __ bind(L2);
 860     }
 861 
 862     __ ret(lr);
 863   }
 864 
 865   // Small copy: less than 16 bytes.
 866   //
 867   // NB: Ignores all of the bits of count which represent more than 15
 868   // bytes, so a caller doesn't have to mask them.
 869 
 870   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
 871     bool is_backwards = step < 0;
 872     size_t granularity = uabs(step);
 873     int direction = is_backwards ? -1 : 1;
 874     int unit = wordSize * direction;
 875 
 876     Label Lpair, Lword, Lint, Lshort, Lbyte;
 877 
 878     assert(granularity
 879            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 880 
 881     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
 882 
 883     // ??? I don't know if this bit-test-and-branch is the right thing
 884     // to do.  It does a lot of jumping, resulting in several
 885     // mispredicted branches.  It might make more sense to do this
 886     // with something like Duff's device with a single computed branch.
 887 
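         // each tbz below tests one bit of the residual count: for a byte
         // copy bit 3 selects an 8-byte move, bit 2 a 4-byte move, bit 1 a
         // 2-byte move and bit 0 a final single byte; for larger
         // granularities the bit positions shift down by log2(granularity)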
 888     __ tbz(count, 3 - exact_log2(granularity), Lword);
 889     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
 890     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
 891     __ bind(Lword);
 892 
 893     if (granularity <= sizeof (jint)) {
 894       __ tbz(count, 2 - exact_log2(granularity), Lint);
 895       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 896       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 897       __ bind(Lint);
 898     }
 899 
 900     if (granularity <= sizeof (jshort)) {
 901       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 902       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 903       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 904       __ bind(Lshort);
 905     }
 906 
 907     if (granularity <= sizeof (jbyte)) {
 908       __ tbz(count, 0, Lbyte);
 909       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 910       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 911       __ bind(Lbyte);
 912     }
 913   }
 914 
 915   Label copy_f, copy_b;
 916 
 917   // All-singing all-dancing memory copy.
 918   //
 919   // Copy count units of memory from s to d.  The size of a unit is
 920   // step, which can be positive or negative depending on the direction
 921   // of copy.  If is_aligned is false, we align the source address.
 922   //
 923 
 924   void copy_memory(bool is_aligned, Register s, Register d,
 925                    Register count, Register tmp, int step) {
 926     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 927     bool is_backwards = step < 0;
 928     int granularity = uabs(step);
 929     const Register t0 = r3, t1 = r4;
 930 
 931     // Copies of <= 96 bytes are done inline. Direction doesn't matter because we always
 932     // load all the data before writing anything
 933     Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
 934     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
 935     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
 936     const Register send = r17, dend = r18;
 937 
 938     if (PrefetchCopyIntervalInBytes > 0)
 939       __ prfm(Address(s, 0), PLDL1KEEP);
 940     __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
 941     __ br(Assembler::HI, copy_big);
 942 
 943     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 944     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
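         // send/dend point just past the last source/destination element;
         // the inline cases below copy one block from the front and a
         // possibly overlapping block from the back, which is safe because
         // every load is issued before any store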
 945 
 946     __ cmp(count, 16/granularity);
 947     __ br(Assembler::LS, copy16);
 948 
 949     __ cmp(count, 64/granularity);
 950     __ br(Assembler::HI, copy80);
 951 
 952     __ cmp(count, 32/granularity);
 953     __ br(Assembler::LS, copy32);
 954 
 955     // 33..64 bytes
 956     if (UseSIMDForMemoryOps) {
 957       __ ldpq(v0, v1, Address(s, 0));
 958       __ ldpq(v2, v3, Address(send, -32));
 959       __ stpq(v0, v1, Address(d, 0));
 960       __ stpq(v2, v3, Address(dend, -32));
 961     } else {
 962       __ ldp(t0, t1, Address(s, 0));
 963       __ ldp(t2, t3, Address(s, 16));
 964       __ ldp(t4, t5, Address(send, -32));
 965       __ ldp(t6, t7, Address(send, -16));
 966 
 967       __ stp(t0, t1, Address(d, 0));
 968       __ stp(t2, t3, Address(d, 16));
 969       __ stp(t4, t5, Address(dend, -32));
 970       __ stp(t6, t7, Address(dend, -16));
 971     }
 972     __ b(finish);
 973 
 974     // 17..32 bytes
 975     __ bind(copy32);
 976     __ ldp(t0, t1, Address(s, 0));
 977     __ ldp(t2, t3, Address(send, -16));
 978     __ stp(t0, t1, Address(d, 0));
 979     __ stp(t2, t3, Address(dend, -16));
 980     __ b(finish);
 981 
 982     // 65..80/96 bytes
 983     // (96 bytes if SIMD because we do 32 bytes per instruction)
 984     __ bind(copy80);
 985     if (UseSIMDForMemoryOps) {
 986       __ ldpq(v0, v1, Address(s, 0));
 987       __ ldpq(v2, v3, Address(s, 32));
 988       __ ldpq(v4, v5, Address(send, -32));
 989       __ stpq(v0, v1, Address(d, 0));
 990       __ stpq(v2, v3, Address(d, 32));
 991       __ stpq(v4, v5, Address(dend, -32));
 992     } else {
 993       __ ldp(t0, t1, Address(s, 0));
 994       __ ldp(t2, t3, Address(s, 16));
 995       __ ldp(t4, t5, Address(s, 32));
 996       __ ldp(t6, t7, Address(s, 48));
 997       __ ldp(t8, t9, Address(send, -16));
 998 
 999       __ stp(t0, t1, Address(d, 0));
1000       __ stp(t2, t3, Address(d, 16));
1001       __ stp(t4, t5, Address(d, 32));
1002       __ stp(t6, t7, Address(d, 48));
1003       __ stp(t8, t9, Address(dend, -16));
1004     }
1005     __ b(finish);
1006 
1007     // 0..16 bytes
1008     __ bind(copy16);
1009     __ cmp(count, 8/granularity);
1010     __ br(Assembler::LO, copy8);
1011 
1012     // 8..16 bytes
1013     __ ldr(t0, Address(s, 0));
1014     __ ldr(t1, Address(send, -8));
1015     __ str(t0, Address(d, 0));
1016     __ str(t1, Address(dend, -8));
1017     __ b(finish);
1018 
1019     if (granularity < 8) {
1020       // 4..7 bytes
1021       __ bind(copy8);
1022       __ tbz(count, 2 - exact_log2(granularity), copy4);
1023       __ ldrw(t0, Address(s, 0));
1024       __ ldrw(t1, Address(send, -4));
1025       __ strw(t0, Address(d, 0));
1026       __ strw(t1, Address(dend, -4));
1027       __ b(finish);
1028       if (granularity < 4) {
1029         // 0..3 bytes
1030         __ bind(copy4);
1031         __ cbz(count, finish); // get rid of 0 case
1032         if (granularity == 2) {
1033           __ ldrh(t0, Address(s, 0));
1034           __ strh(t0, Address(d, 0));
1035         } else { // granularity == 1
1036           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1037           // the first and last byte.
1038           // Handle the 3 byte case by loading and storing base + count/2
1039           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1040           // This does mean that in the 1 byte case we load/store the same
1041           // byte 3 times.
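               // (e.g. for count == 3: t0 = s[0] -> d[0], t1 = s[2] -> d[2],
               // t2 = s[1] -> d[1])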
1042           __ lsr(count, count, 1);
1043           __ ldrb(t0, Address(s, 0));
1044           __ ldrb(t1, Address(send, -1));
1045           __ ldrb(t2, Address(s, count));
1046           __ strb(t0, Address(d, 0));
1047           __ strb(t1, Address(dend, -1));
1048           __ strb(t2, Address(d, count));
1049         }
1050         __ b(finish);
1051       }
1052     }
1053 
1054     __ bind(copy_big);
1055     if (is_backwards) {
1056       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1057       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1058     }
1059 
1060     // Now that we've got the small case out of the way, we can align
1061     // the source address on a 2-word boundary.
1062 
1063     Label aligned;
1064 
1065     if (is_aligned) {
1066       // We may have to adjust by 1 word to get s 2-word-aligned.
1067       __ tbz(s, exact_log2(wordSize), aligned);
1068       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1069       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1070       __ sub(count, count, wordSize/granularity);
1071     } else {
1072       if (is_backwards) {
1073         __ andr(rscratch2, s, 2 * wordSize - 1);
1074       } else {
1075         __ neg(rscratch2, s);
1076         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1077       }
1078       // rscratch2 is the byte adjustment needed to align s.
1079       __ cbz(rscratch2, aligned);
1080       int shift = exact_log2(granularity);
1081       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1082       __ sub(count, count, rscratch2);
1083 
1084 #if 0
1085       // ?? This code is only correct for a disjoint copy.  It may or
1086       // may not make sense to use it in that case.
1087 
1088       // Copy the first pair; s and d may not be aligned.
1089       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1090       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1091 
1092       // Align s and d, adjust count
1093       if (is_backwards) {
1094         __ sub(s, s, rscratch2);
1095         __ sub(d, d, rscratch2);
1096       } else {
1097         __ add(s, s, rscratch2);
1098         __ add(d, d, rscratch2);
1099       }
1100 #else
1101       copy_memory_small(s, d, rscratch2, rscratch1, step);
1102 #endif
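           // the unaligned head (rscratch2 elements, at most 15 bytes) is
           // copied by the small-copy routine, leaving s on a 2-word boundary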
1103     }
1104 
1105     __ bind(aligned);
1106 
1107     // s is now 2-word-aligned.
1108 
1109     // We have a count of units and some trailing bytes.  Adjust the
1110     // count and do a bulk copy of words.
1111     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1112     if (direction == copy_forwards)
1113       __ bl(copy_f);
1114     else
1115       __ bl(copy_b);
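         // the bulk copier takes the word count placed in rscratch2 above
         // and leaves s and d pointing at the remaining tail; count still
         // holds the original element count, whose low bits drive the tail
         // copy below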
1116 
1117     // And the tail.
1118     copy_memory_small(s, d, count, tmp, step);
1119 
1120     if (granularity >= 8) __ bind(copy8);
1121     if (granularity >= 4) __ bind(copy4);
1122     __ bind(finish);
1123   }
1124 
1125 
1126   void clobber_registers() {
1127 #ifdef ASSERT
1128     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1129     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1130     for (Register r = r3; r <= r18; r++)
1131       if (r != rscratch1) __ mov(r, rscratch1);
1132 #endif
1133   }
1134 
1135   // Scan over array at a for count oops, verifying each one.
1136   // Preserves a and count, clobbers rscratch1 and rscratch2.
1137   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1138     Label loop, end;
1139     __ mov(rscratch1, a);
1140     __ mov(rscratch2, zr);
1141     __ bind(loop);
1142     __ cmp(rscratch2, count);
1143     __ br(Assembler::HS, end);
1144     if (size == (size_t)wordSize) {
1145       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1146       __ verify_oop(temp);
1147     } else {
1148       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1149       __ decode_heap_oop(temp); // calls verify_oop
1150     }
1151     __ add(rscratch2, rscratch2, size);
1152     __ b(loop);
1153     __ bind(end);
1154   }
1155 
1156   // Arguments:
1157   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1158   //             ignored
1159   //   is_oop  - true => oop array, so generate store check code
1160   //   name    - stub name string
1161   //
1162   // Inputs:
1163   //   c_rarg0   - source array address
1164   //   c_rarg1   - destination array address
1165   //   c_rarg2   - element count, treated as ssize_t, can be zero
1166   //
1167   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1168   // the hardware handle it.  The two dwords within qwords that span
1169   // cache line boundaries will still be loaded and stored atomically.
1170   //
1171   // Side Effects:
1172   //   disjoint_int_copy_entry is set to the no-overlap entry point
1173   //   used by generate_conjoint_int_oop_copy().
1174   //
1175   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1176                                   const char *name, bool dest_uninitialized = false) {
1177     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1178     __ align(CodeEntryAlignment);
1179     StubCodeMark mark(this, "StubRoutines", name);
1180     address start = __ pc();
1181     __ enter();
1182 
1183     if (entry != NULL) {
1184       *entry = __ pc();
1185       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1186       BLOCK_COMMENT("Entry:");
1187     }
1188 
1189     if (is_oop) {
1190       __ push(RegSet::of(d, count), sp);
1191       // no registers are destroyed by this call
1192       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1193     }
1194     copy_memory(aligned, s, d, count, rscratch1, size);
1195     if (is_oop) {
1196       __ pop(RegSet::of(d, count), sp);
1197       if (VerifyOops)
1198         verify_oop_array(size, d, count, r16);
1199       __ sub(count, count, 1); // make an inclusive end pointer
1200       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1201       gen_write_ref_array_post_barrier(d, count, rscratch1);
1202     }
1203     __ leave();
1204     __ mov(r0, zr); // return 0
1205     __ ret(lr);
1206 #ifdef BUILTIN_SIM
1207     {
1208       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1209       sim->notifyCompile(const_cast<char*>(name), start);
1210     }
1211 #endif
1212     return start;
1213   }
1214 
1215   // Arguments:
1216   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1217   //             ignored
1218   //   is_oop  - true => oop array, so generate store check code
1219   //   name    - stub name string
1220   //
1221   // Inputs:
1222   //   c_rarg0   - source array address
1223   //   c_rarg1   - destination array address
1224   //   c_rarg2   - element count, treated as ssize_t, can be zero
1225   //
1226   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1227   // the hardware handle it.  The two dwords within qwords that span
1228   // cache line boundaries will still be loaded and stored atomically.
1229   //
1230   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1231                                  address *entry, const char *name,
1232                                  bool dest_uninitialized = false) {
1233     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1234 
1235     StubCodeMark mark(this, "StubRoutines", name);
1236     address start = __ pc();
1237     __ enter();
1238 
1239     if (entry != NULL) {
1240       *entry = __ pc();
1241       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1242       BLOCK_COMMENT("Entry:");
1243     }
1244 
1245     // use fwd copy when (d-s) above_equal (count*size)
1246     __ sub(rscratch1, d, s);
1247     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1248     __ br(Assembler::HS, nooverlap_target);
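         // (unsigned comparison: the branch is taken both when d is below s
         // and when d is at least count*size bytes above s, i.e. whenever a
         // forward copy cannot corrupt the source)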
1249 
1250     if (is_oop) {
1251       __ push(RegSet::of(d, count), sp);
1252       // no registers are destroyed by this call
1253       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1254     }
1255     copy_memory(aligned, s, d, count, rscratch1, -size);
1256     if (is_oop) {
1257       __ pop(RegSet::of(d, count), sp);
1258       if (VerifyOops)
1259         verify_oop_array(size, d, count, r16);
1260       __ sub(count, count, 1); // make an inclusive end pointer
1261       __ lea(count, Address(d, count, Address::uxtw(exact_log2(size))));
1262       gen_write_ref_array_post_barrier(d, count, rscratch1);
1263     }
1264     __ leave();
1265     __ mov(r0, zr); // return 0
1266     __ ret(lr);
1267 #ifdef BUILTIN_SIM
1268     {
1269       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1270       sim->notifyCompile(const_cast<char*>(name), start);
1271     }
1272 #endif
1273     return start;
1274   }
1275 
1276   // Arguments:
1277   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1278   //             ignored
1279   //   name    - stub name string
1280   //
1281   // Inputs:
1282   //   c_rarg0   - source array address
1283   //   c_rarg1   - destination array address
1284   //   c_rarg2   - element count, treated as ssize_t, can be zero
1285   //
1286   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1287   // we let the hardware handle it.  The one to eight bytes within words,
1288   // dwords or qwords that span cache line boundaries will still be loaded
1289   // and stored atomically.
1290   //
1298   // Side Effects:
1299   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1300   //   used by generate_conjoint_byte_copy().
1301   //
1302   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1303     const bool not_oop = false;
1304     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1305   }
1306 
1307   // Arguments:
1308   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1309   //             ignored
1310   //   name    - stub name string
1311   //
1312   // Inputs:
1313   //   c_rarg0   - source array address
1314   //   c_rarg1   - destination array address
1315   //   c_rarg2   - element count, treated as ssize_t, can be zero
1316   //
1317   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1318   // we let the hardware handle it.  The one to eight bytes within words,
1319   // dwords or qwords that span cache line boundaries will still be loaded
1320   // and stored atomically.
1321   //
1322   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1323                                       address* entry, const char *name) {
1324     const bool not_oop = false;
1325     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1326   }
1327 
1328   // Arguments:
1329   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1330   //             ignored
1331   //   name    - stub name string
1332   //
1333   // Inputs:
1334   //   c_rarg0   - source array address
1335   //   c_rarg1   - destination array address
1336   //   c_rarg2   - element count, treated as ssize_t, can be zero
1337   //
1338   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1339   // let the hardware handle it.  The two or four words within dwords
1340   // or qwords that span cache line boundaries will still be loaded
1341   // and stored atomically.
1342   //
1343   // Side Effects:
1344   //   disjoint_short_copy_entry is set to the no-overlap entry point
1345   //   used by generate_conjoint_short_copy().
1346   //
1347   address generate_disjoint_short_copy(bool aligned,
1348                                        address* entry, const char *name) {
1349     const bool not_oop = false;
1350     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1351   }
1352 
1353   // Arguments:
1354   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1355   //             ignored
1356   //   name    - stub name string
1357   //
1358   // Inputs:
1359   //   c_rarg0   - source array address
1360   //   c_rarg1   - destination array address
1361   //   c_rarg2   - element count, treated as ssize_t, can be zero
1362   //
1363   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1364   // let the hardware handle it.  The two or four words within dwords
1365   // or qwords that span cache line boundaries will still be loaded
1366   // and stored atomically.
1367   //
1368   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1369                                        address *entry, const char *name) {
1370     const bool not_oop = false;
1371     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1372 
1373   }
1374   // Arguments:
1375   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1376   //             ignored
1377   //   name    - stub name string
1378   //
1379   // Inputs:
1380   //   c_rarg0   - source array address
1381   //   c_rarg1   - destination array address
1382   //   c_rarg2   - element count, treated as ssize_t, can be zero
1383   //
1384   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1385   // the hardware handle it.  The two dwords within qwords that span
1386   // cache line boundaries will still be loaded and stored atomically.
1387   //
1388   // Side Effects:
1389   //   disjoint_int_copy_entry is set to the no-overlap entry point
1390   //   used by generate_conjoint_int_oop_copy().
1391   //
1392   address generate_disjoint_int_copy(bool aligned, address *entry,
1393                                          const char *name, bool dest_uninitialized = false) {
1394     const bool not_oop = false;
1395     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1396   }
1397 
1398   // Arguments:
1399   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1400   //             ignored
1401   //   name    - stub name string
1402   //
1403   // Inputs:
1404   //   c_rarg0   - source array address
1405   //   c_rarg1   - destination array address
1406   //   c_rarg2   - element count, treated as ssize_t, can be zero
1407   //
1408   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1409   // the hardware handle it.  The two dwords within qwords that span
1410   // cache line boundaries will still be loaded and stored atomically.
1411   //
1412   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1413                                      address *entry, const char *name,
1414                                      bool dest_uninitialized = false) {
1415     const bool not_oop = false;
1416     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1417   }
1418 
1419 
1420   // Arguments:
1421   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1422   //             ignored
1423   //   name    - stub name string
1424   //
1425   // Inputs:
1426   //   c_rarg0   - source array address
1427   //   c_rarg1   - destination array address
1428   //   c_rarg2   - element count, treated as size_t, can be zero
1429   //
1430   // Side Effects:
1431   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1432   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1433   //
1434   address generate_disjoint_long_copy(bool aligned, address *entry,
1435                                           const char *name, bool dest_uninitialized = false) {
1436     const bool not_oop = false;
1437     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1438   }
1439 
1440   // Arguments:
1441   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1442   //             ignored
1443   //   name    - stub name string
1444   //
1445   // Inputs:
1446   //   c_rarg0   - source array address
1447   //   c_rarg1   - destination array address
1448   //   c_rarg2   - element count, treated as size_t, can be zero
1449   //
1450   address generate_conjoint_long_copy(bool aligned,
1451                                       address nooverlap_target, address *entry,
1452                                       const char *name, bool dest_uninitialized = false) {
1453     const bool not_oop = false;
1454     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1455   }
1456 
1457   // Arguments:
1458   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1459   //             ignored
1460   //   name    - stub name string
1461   //
1462   // Inputs:
1463   //   c_rarg0   - source array address
1464   //   c_rarg1   - destination array address
1465   //   c_rarg2   - element count, treated as size_t, can be zero
1466   //
1467   // Side Effects:
1468   //   *entry is set to the no-overlap entry point
1469   //   used by generate_conjoint_oop_copy().
1470   //
1471   address generate_disjoint_oop_copy(bool aligned, address *entry,
1472                                      const char *name, bool dest_uninitialized) {
1473     const bool is_oop = true;
1474     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1475     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1476   }
1477 
1478   // Arguments:
1479   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1480   //             ignored
1481   //   name    - stub name string
1482   //
1483   // Inputs:
1484   //   c_rarg0   - source array address
1485   //   c_rarg1   - destination array address
1486   //   c_rarg2   - element count, treated as size_t, can be zero
1487   //
1488   address generate_conjoint_oop_copy(bool aligned,
1489                                      address nooverlap_target, address *entry,
1490                                      const char *name, bool dest_uninitialized) {
1491     const bool is_oop = true;
1492     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1493     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1494                                   name, dest_uninitialized);
1495   }
1496 
1497 
1498   // Helper for generating a dynamic type check.
1499   // Smashes rscratch1.
1500   void generate_type_check(Register sub_klass,
1501                            Register super_check_offset,
1502                            Register super_klass,
1503                            Label& L_success) {
1504     assert_different_registers(sub_klass, super_check_offset, super_klass);
1505 
1506     BLOCK_COMMENT("type_check:");
1507 
1508     Label L_miss;
1509 
1510     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1511                                      super_check_offset);
1512     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1513 
1514     // Fall through on failure!
1515     __ BIND(L_miss);
1516   }
1517 
1518   //
1519   //  Generate checkcasting array copy stub
1520   //
1521   //  Input:
1522   //    c_rarg0   - source array address
1523   //    c_rarg1   - destination array address
1524   //    c_rarg2   - element count, treated as ssize_t, can be zero
1525   //    c_rarg3   - size_t ckoff (super_check_offset)
1526   //    c_rarg4   - oop ckval (super_klass)
1527   //
1528   //  Output:
1529   //    r0 ==  0  -  success
1530   //    r0 == -1^K - failure, where K is partial transfer count
1531   //
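  //  A rough C sketch of the behaviour (for illustration only, not the code
  //  the stub emits; is_assignable() is a placeholder for the ckoff/ckval
  //  subtype check): every non-null element is type-checked before it is
  //  stored, and a failed check makes the stub return the bitwise complement
  //  of the number of elements already copied, i.e. -1^K.
  //
  //    for (size_t i = 0; i < count; i++) {
  //      oop e = from[i];
  //      if (e != NULL && !is_assignable(e, ckoff, ckval)) return ~i;  // -1^i
  //      to[i] = e;
  //    }
  //    return 0;  // every element copied
  //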
1532   address generate_checkcast_copy(const char *name, address *entry,
1533                                   bool dest_uninitialized = false) {
1534 
1535     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1536 
1537     // Input registers (after setup_arg_regs)
1538     const Register from        = c_rarg0;   // source array address
1539     const Register to          = c_rarg1;   // destination array address
1540     const Register count       = c_rarg2;   // elements count
1541     const Register ckoff       = c_rarg3;   // super_check_offset
1542     const Register ckval       = c_rarg4;   // super_klass
1543 
1544     // Registers used as temps (r18, r19, r20 are save-on-entry)
1545     const Register count_save  = r21;       // orig elements count
1546     const Register start_to    = r20;       // destination array start address
1547     const Register copied_oop  = r18;       // actual oop copied
1548     const Register r19_klass   = r19;       // oop._klass
1549 
1550     //---------------------------------------------------------------
1551     // Assembler stub will be used for this call to arraycopy
1552     // if the two arrays are subtypes of Object[] but the
1553     // destination array type is not equal to or a supertype
1554     // of the source type.  Each element must be separately
1555     // checked.
1556 
1557     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1558                                copied_oop, r19_klass, count_save);
1559 
1560     __ align(CodeEntryAlignment);
1561     StubCodeMark mark(this, "StubRoutines", name);
1562     address start = __ pc();
1563 
1564     __ enter(); // required for proper stackwalking of RuntimeStub frame
1565 
1566 #ifdef ASSERT
1567     // caller guarantees that the arrays really are different
1568     // otherwise, we would have to make conjoint checks
1569     { Label L;
1570       array_overlap_test(L, TIMES_OOP);
1571       __ stop("checkcast_copy within a single array");
1572       __ bind(L);
1573     }
1574 #endif //ASSERT
1575 
1576     // Caller of this entry point must set up the argument registers.
1577     if (entry != NULL) {
1578       *entry = __ pc();
1579       BLOCK_COMMENT("Entry:");
1580     }
1581 
1582      // Empty array:  Nothing to do.
1583     __ cbz(count, L_done);
1584 
1585     __ push(RegSet::of(r18, r19, r20, r21), sp);
1586 
1587 #ifdef ASSERT
1588     BLOCK_COMMENT("assert consistent ckoff/ckval");
1589     // The ckoff and ckval must be mutually consistent,
1590     // even though caller generates both.
1591     { Label L;
1592       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1593       __ ldrw(start_to, Address(ckval, sco_offset));
1594       __ cmpw(ckoff, start_to);
1595       __ br(Assembler::EQ, L);
1596       __ stop("super_check_offset inconsistent");
1597       __ bind(L);
1598     }
1599 #endif //ASSERT
1600 
1601     gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1602 
1603     // save the original count
1604     __ mov(count_save, count);
1605 
1606     // Copy from low to high addresses
1607     __ mov(start_to, to);              // Save destination array start address
1608     __ b(L_load_element);
1609 
1610     // ======== begin loop ========
1611     // (Loop is rotated; its entry is L_load_element.)
1612     // Loop control:
1613     //   for (; count != 0; count--) {
1614     //     copied_oop = load_heap_oop(from++);
1615     //     ... generate_type_check ...;
1616     //     store_heap_oop(to++, copied_oop);
1617     //   }
1618     __ align(OptoLoopAlignment);
1619 
1620     __ BIND(L_store_element);
1621     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1622     __ sub(count, count, 1);
1623     __ cbz(count, L_do_card_marks);
1624 
1625     // ======== loop entry is here ========
1626     __ BIND(L_load_element);
1627     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1628     __ cbz(copied_oop, L_store_element);
1629 
1630     __ load_klass(r19_klass, copied_oop);// query the object klass
1631     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1632     // ======== end loop ========
1633 
1634     // It was a real error; we must depend on the caller to finish the job.
1635     // Register count = remaining oops, count_orig = total oops.
1636     // Emit GC store barriers for the oops we have copied and report
1637     // their number to the caller.
1638 
1639     __ subs(count, count_save, count);     // K = partially copied oop count
1640     __ eon(count, count, zr);                   // report (-1^K) to caller
1641     __ br(Assembler::EQ, L_done_pop);
1642 
1643     __ BIND(L_do_card_marks);
1644     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1645     gen_write_ref_array_post_barrier(start_to, to, rscratch1);
1646 
1647     __ bind(L_done_pop);
1648     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1649     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1650 
1651     __ bind(L_done);
1652     __ mov(r0, count);
1653     __ leave();
1654     __ ret(lr);
1655 
1656     return start;
1657   }
1658 
1659   // Perform range checks on the proposed arraycopy.
1660   // Kills temp, but nothing else.
1661   // Also, clean the sign bits of src_pos and dst_pos.
1662   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1663                               Register src_pos, // source position (c_rarg1)
1664                               Register dst,     // destination array oop (c_rarg2)
1665                               Register dst_pos, // destination position (c_rarg3)
1666                               Register length,
1667                               Register temp,
1668                               Label& L_failed) {
1669     BLOCK_COMMENT("arraycopy_range_checks:");
1670 
1671     assert_different_registers(rscratch1, temp);
1672 
1673     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1674     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1675     __ addw(temp, length, src_pos);
1676     __ cmpw(temp, rscratch1);
1677     __ br(Assembler::HI, L_failed);
1678 
1679     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1680     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1681     __ addw(temp, length, dst_pos);
1682     __ cmpw(temp, rscratch1);
1683     __ br(Assembler::HI, L_failed);
1684 
1685     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1686     __ movw(src_pos, src_pos);
1687     __ movw(dst_pos, dst_pos);
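    // (movw writes only the low 32 bits and zeroes bits 63:32, so this is
    //  roughly src_pos = (uint32_t)src_pos; dst_pos = (uint32_t)dst_pos; in C.)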
1688 
1689     BLOCK_COMMENT("arraycopy_range_checks done");
1690   }
1691 
1692   // These stubs get called from some dumb test routine.
1693   // I'll write them properly when they're called from
1694   // something that's actually doing something.
1695   static void fake_arraycopy_stub(address src, address dst, int count) {
1696     assert(count == 0, "huh?");
1697   }
1698 
1699 
1700   //
1701   //  Generate 'unsafe' array copy stub
1702   //  Though just as safe as the other stubs, it takes an unscaled
1703   //  size_t argument instead of an element count.
1704   //
1705   //  Input:
1706   //    c_rarg0   - source array address
1707   //    c_rarg1   - destination array address
1708   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1709   //
1710   // Examines the alignment of the operands and dispatches
1711   // to a long, int, short, or byte copy loop.
1712   //
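  //  Rough C sketch of the dispatch below (illustration only): the low bits
  //  of (src | dst | size) select the widest element width for which every
  //  access stays naturally aligned.
  //
  //    if (((s | d | size) & (BytesPerLong - 1)) == 0)      long_copy (s, d, size >> 3);
  //    else if (((s | d | size) & (BytesPerInt - 1)) == 0)  int_copy  (s, d, size >> 2);
  //    else if (((s | d | size) & 1) == 0)                  short_copy(s, d, size >> 1);
  //    else                                                 byte_copy (s, d, size);
  //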
1713   address generate_unsafe_copy(const char *name,
1714                                address byte_copy_entry,
1715                                address short_copy_entry,
1716                                address int_copy_entry,
1717                                address long_copy_entry) {
1718     Label L_long_aligned, L_int_aligned, L_short_aligned;
1719     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1720 
1721     __ align(CodeEntryAlignment);
1722     StubCodeMark mark(this, "StubRoutines", name);
1723     address start = __ pc();
1724     __ enter(); // required for proper stackwalking of RuntimeStub frame
1725 
1726     // bump this on entry, not on exit:
1727     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1728 
1729     __ orr(rscratch1, s, d);
1730     __ orr(rscratch1, rscratch1, count);
1731 
1732     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1733     __ cbz(rscratch1, L_long_aligned);
1734     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1735     __ cbz(rscratch1, L_int_aligned);
1736     __ tbz(rscratch1, 0, L_short_aligned);
1737     __ b(RuntimeAddress(byte_copy_entry));
1738 
1739     __ BIND(L_short_aligned);
1740     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1741     __ b(RuntimeAddress(short_copy_entry));
1742     __ BIND(L_int_aligned);
1743     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1744     __ b(RuntimeAddress(int_copy_entry));
1745     __ BIND(L_long_aligned);
1746     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1747     __ b(RuntimeAddress(long_copy_entry));
1748 
1749     return start;
1750   }
1751 
1752   //
1753   //  Generate generic array copy stubs
1754   //
1755   //  Input:
1756   //    c_rarg0    -  src oop
1757   //    c_rarg1    -  src_pos (32-bits)
1758   //    c_rarg2    -  dst oop
1759   //    c_rarg3    -  dst_pos (32-bits)
1760   //    c_rarg4    -  element count (32-bits)
1761   //
1762   //  Output:
1763   //    r0 ==  0  -  success
1764   //    r0 == -1^K - failure, where K is partial transfer count
1765   //
1766   address generate_generic_copy(const char *name,
1767                                 address byte_copy_entry, address short_copy_entry,
1768                                 address int_copy_entry, address oop_copy_entry,
1769                                 address long_copy_entry, address checkcast_copy_entry) {
1770 
1771     Label L_failed, L_failed_0, L_objArray;
1772     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1773 
1774     // Input registers
1775     const Register src        = c_rarg0;  // source array oop
1776     const Register src_pos    = c_rarg1;  // source position
1777     const Register dst        = c_rarg2;  // destination array oop
1778     const Register dst_pos    = c_rarg3;  // destination position
1779     const Register length     = c_rarg4;
1780 
1781     StubCodeMark mark(this, "StubRoutines", name);
1782 
1783     __ align(CodeEntryAlignment);
1784     address start = __ pc();
1785 
1786     __ enter(); // required for proper stackwalking of RuntimeStub frame
1787 
1788     // bump this on entry, not on exit:
1789     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1790 
1791     //-----------------------------------------------------------------------
1792     // Assembler stub will be used for this call to arraycopy
1793     // if the following conditions are met:
1794     //
1795     // (1) src and dst must not be null.
1796     // (2) src_pos must not be negative.
1797     // (3) dst_pos must not be negative.
1798     // (4) length  must not be negative.
1799     // (5) src klass and dst klass should be the same and not NULL.
1800     // (6) src and dst should be arrays.
1801     // (7) src_pos + length must not exceed length of src.
1802     // (8) dst_pos + length must not exceed length of dst.
1803     //
1804 
1805     //  if (src == NULL) return -1;
1806     __ cbz(src, L_failed);
1807 
1808     //  if (src_pos < 0) return -1;
1809     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
1810 
1811     //  if (dst == NULL) return -1;
1812     __ cbz(dst, L_failed);
1813 
1814     //  if (dst_pos < 0) return -1;
1815     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
1816 
1817     // registers used as temp
1818     const Register scratch_length    = r16; // elements count to copy
1819     const Register scratch_src_klass = r17; // array klass
1820     const Register lh                = r18; // layout helper
1821 
1822     //  if (length < 0) return -1;
1823     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
1824     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
1825 
1826     __ load_klass(scratch_src_klass, src);
1827 #ifdef ASSERT
1828     //  assert(src->klass() != NULL);
1829     {
1830       BLOCK_COMMENT("assert klasses not null {");
1831       Label L1, L2;
1832       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
1833       __ bind(L1);
1834       __ stop("broken null klass");
1835       __ bind(L2);
1836       __ load_klass(rscratch1, dst);
1837       __ cbz(rscratch1, L1);     // this would be broken also
1838       BLOCK_COMMENT("} assert klasses not null done");
1839     }
1840 #endif
1841 
1842     // Load layout helper (32-bits)
1843     //
1844     //  |array_tag|     | header_size | element_type |     |log2_element_size|
1845     // 32        30    24            16              8     2                 0
1846     //
1847     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
1848     //
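    // Roughly equivalent C for the fields used below (illustration only,
    // written in terms of the Klass::_lh_* constants):
    //
    //   int tag      = lh >> Klass::_lh_array_tag_shift;  // 0x3 typeArray, 0x2 objArray
    //   int hdr_size = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
    //   int log2_esz = (lh >> Klass::_lh_log2_element_size_shift) & Klass::_lh_log2_element_size_mask;
    //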
1849 
1850     const int lh_offset = in_bytes(Klass::layout_helper_offset());
1851 
1852     // Handle objArrays completely differently...
1853     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1854     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
1855     __ movw(rscratch1, objArray_lh);
1856     __ eorw(rscratch2, lh, rscratch1);
1857     __ cbzw(rscratch2, L_objArray);
1858 
1859     //  if (src->klass() != dst->klass()) return -1;
1860     __ load_klass(rscratch2, dst);
1861     __ eor(rscratch2, rscratch2, scratch_src_klass);
1862     __ cbnz(rscratch2, L_failed);
1863 
1864     //  if (!src->is_Array()) return -1;
1865     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
1866 
1867     // At this point, it is known to be a typeArray (array_tag 0x3).
1868 #ifdef ASSERT
1869     {
1870       BLOCK_COMMENT("assert primitive array {");
1871       Label L;
1872       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
1873       __ cmpw(lh, rscratch2);
1874       __ br(Assembler::GE, L);
1875       __ stop("must be a primitive array");
1876       __ bind(L);
1877       BLOCK_COMMENT("} assert primitive array done");
1878     }
1879 #endif
1880 
1881     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1882                            rscratch2, L_failed);
1883 
1884     // TypeArrayKlass
1885     //
1886     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
1887     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
1888     //
1889 
1890     const Register rscratch1_offset = rscratch1;    // array offset
1891     const Register r18_elsize = lh; // element size
1892 
1893     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
1894            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
1895     __ add(src, src, rscratch1_offset);           // src array offset
1896     __ add(dst, dst, rscratch1_offset);           // dst array offset
1897     BLOCK_COMMENT("choose copy loop based on element size");
1898 
1899     // The following registers must be set before the jump to the corresponding stub.
1900     const Register from     = c_rarg0;  // source array address
1901     const Register to       = c_rarg1;  // destination array address
1902     const Register count    = c_rarg2;  // elements count
1903 
1904     // 'from', 'to' and 'count' must be set in exactly this order: writing 'to' destroys
1905     // 'src_pos' (still needed for 'from'), and writing 'count' destroys 'dst' (still needed for 'to').
1906 
1907     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
1908 
1909     // The possible values of elsize are 0-3, i.e. exact_log2(element
1910     // size in bytes).  We do a simple bitwise binary search.
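    // (bit 1 of the log2 size separates {byte, short} from {int, long};
    //  bit 0 then picks the element within each pair.)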
1911   __ BIND(L_copy_bytes);
1912     __ tbnz(r18_elsize, 1, L_copy_ints);
1913     __ tbnz(r18_elsize, 0, L_copy_shorts);
1914     __ lea(from, Address(src, src_pos));// src_addr
1915     __ lea(to,   Address(dst, dst_pos));// dst_addr
1916     __ movw(count, scratch_length); // length
1917     __ b(RuntimeAddress(byte_copy_entry));
1918 
1919   __ BIND(L_copy_shorts);
1920     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
1921     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
1922     __ movw(count, scratch_length); // length
1923     __ b(RuntimeAddress(short_copy_entry));
1924 
1925   __ BIND(L_copy_ints);
1926     __ tbnz(r18_elsize, 0, L_copy_longs);
1927     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
1928     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
1929     __ movw(count, scratch_length); // length
1930     __ b(RuntimeAddress(int_copy_entry));
1931 
1932   __ BIND(L_copy_longs);
1933 #ifdef ASSERT
1934     {
1935       BLOCK_COMMENT("assert long copy {");
1936       Label L;
1937       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
1938       __ cmpw(r18_elsize, LogBytesPerLong);
1939       __ br(Assembler::EQ, L);
1940       __ stop("must be long copy, but elsize is wrong");
1941       __ bind(L);
1942       BLOCK_COMMENT("} assert long copy done");
1943     }
1944 #endif
1945     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
1946     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
1947     __ movw(count, scratch_length); // length
1948     __ b(RuntimeAddress(long_copy_entry));
1949 
1950     // ObjArrayKlass
1951   __ BIND(L_objArray);
1952     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
1953 
1954     Label L_plain_copy, L_checkcast_copy;
1955     //  test array classes for subtyping
1956     __ load_klass(r18, dst);
1957     __ cmp(scratch_src_klass, r18); // usual case is exact equality
1958     __ br(Assembler::NE, L_checkcast_copy);
1959 
1960     // Identically typed arrays can be copied without element-wise checks.
1961     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1962                            rscratch2, L_failed);
1963 
1964     __ lea(from, Address(src, src_pos, Address::lsl(3)));
1965     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1966     __ lea(to, Address(dst, dst_pos, Address::lsl(3)));
1967     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1968     __ movw(count, scratch_length); // length
1969   __ BIND(L_plain_copy);
1970     __ b(RuntimeAddress(oop_copy_entry));
1971 
1972   __ BIND(L_checkcast_copy);
1973     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
1974     {
1975       // Before looking at dst.length, make sure dst is also an objArray.
1976       __ ldrw(rscratch1, Address(r18, lh_offset));
1977       __ movw(rscratch2, objArray_lh);
1978       __ eorw(rscratch1, rscratch1, rscratch2);
1979       __ cbnzw(rscratch1, L_failed);
1980 
1981       // It is safe to examine both src.length and dst.length.
1982       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1983                              r18, L_failed);
1984 
1985       const Register rscratch2_dst_klass = rscratch2;
1986       __ load_klass(rscratch2_dst_klass, dst); // reload
1987 
1988       // Marshal the base address arguments now, freeing registers.
1989       __ lea(from, Address(src, src_pos, Address::lsl(3)));
1990       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1991       __ lea(to, Address(dst, dst_pos, Address::lsl(3)));
1992       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1993       __ movw(count, length);           // length (reloaded)
1994       Register sco_temp = c_rarg3;      // this register is free now
1995       assert_different_registers(from, to, count, sco_temp,
1996                                  rscratch2_dst_klass, scratch_src_klass);
1997       // assert_clean_int(count, sco_temp);
1998 
1999       // Generate the type check.
2000       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2001       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2002       // assert_clean_int(sco_temp, r18);
2003       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2004 
2005       // Fetch destination element klass from the ObjArrayKlass header.
2006       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2007       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2008       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2009 
2010       // the checkcast_copy loop needs two extra arguments:
2011       assert(c_rarg3 == sco_temp, "#3 already in place");
2012       // Set up arguments for checkcast_copy_entry.
2013       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2014       __ b(RuntimeAddress(checkcast_copy_entry));
2015     }
2016 
2017   __ BIND(L_failed);
2018     __ mov(r0, -1);
2019     __ leave();   // required for proper stackwalking of RuntimeStub frame
2020     __ ret(lr);
2021 
2022     return start;
2023   }
2024 
2025   void generate_arraycopy_stubs() {
2026     address entry;
2027     address entry_jbyte_arraycopy;
2028     address entry_jshort_arraycopy;
2029     address entry_jint_arraycopy;
2030     address entry_oop_arraycopy;
2031     address entry_jlong_arraycopy;
2032     address entry_checkcast_arraycopy;
2033 
2034     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2035     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2036 
2037     //*** jbyte
2038     // Always need aligned and unaligned versions
2039     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2040                                                                                   "jbyte_disjoint_arraycopy");
2041     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2042                                                                                   &entry_jbyte_arraycopy,
2043                                                                                   "jbyte_arraycopy");
2044     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2045                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2046     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2047                                                                                   "arrayof_jbyte_arraycopy");
2048 
2049     //*** jshort
2050     // Always need aligned and unaligned versions
2051     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2052                                                                                     "jshort_disjoint_arraycopy");
2053     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2054                                                                                     &entry_jshort_arraycopy,
2055                                                                                     "jshort_arraycopy");
2056     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2057                                                                                     "arrayof_jshort_disjoint_arraycopy");
2058     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2059                                                                                     "arrayof_jshort_arraycopy");
2060 
2061     //*** jint
2062     // Aligned versions
2063     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2064                                                                                 "arrayof_jint_disjoint_arraycopy");
2065     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2066                                                                                 "arrayof_jint_arraycopy");
2067     // On 64-bit we need both aligned and unaligned versions of jint arraycopy;
2068     // entry_jint_arraycopy always points to the unaligned version.
2069     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2070                                                                                 "jint_disjoint_arraycopy");
2071     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2072                                                                                 &entry_jint_arraycopy,
2073                                                                                 "jint_arraycopy");
2074 
2075     //*** jlong
2076     // It is always aligned
2077     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2078                                                                                   "arrayof_jlong_disjoint_arraycopy");
2079     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2080                                                                                   "arrayof_jlong_arraycopy");
2081     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2082     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2083 
2084     //*** oops
2085     {
2086       // With compressed oops we need unaligned versions; notice that
2087       // we overwrite entry_oop_arraycopy.
2088       bool aligned = !UseCompressedOops;
2089 
2090       StubRoutines::_arrayof_oop_disjoint_arraycopy
2091         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2092                                      /*dest_uninitialized*/false);
2093       StubRoutines::_arrayof_oop_arraycopy
2094         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2095                                      /*dest_uninitialized*/false);
2096       // Aligned versions without pre-barriers
2097       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2098         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2099                                      /*dest_uninitialized*/true);
2100       StubRoutines::_arrayof_oop_arraycopy_uninit
2101         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2102                                      /*dest_uninitialized*/true);
2103     }
2104 
2105     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2106     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2107     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2108     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2109 
2110     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2111     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2112                                                                         /*dest_uninitialized*/true);
2113 
2114     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2115                                                               entry_jbyte_arraycopy,
2116                                                               entry_jshort_arraycopy,
2117                                                               entry_jint_arraycopy,
2118                                                               entry_jlong_arraycopy);
2119 
2120     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2121                                                                entry_jbyte_arraycopy,
2122                                                                entry_jshort_arraycopy,
2123                                                                entry_jint_arraycopy,
2124                                                                entry_oop_arraycopy,
2125                                                                entry_jlong_arraycopy,
2126                                                                entry_checkcast_arraycopy);
2127 
2128   }
2129 
2130   void generate_math_stubs() { Unimplemented(); }
2131 
2132   // Arguments:
2133   //
2134   // Inputs:
2135   //   c_rarg0   - source byte array address
2136   //   c_rarg1   - destination byte array address
2137   //   c_rarg2   - K (key) in little endian int array
2138   //
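  // The expanded key length in ints determines the number of rounds:
  // 44 ints => AES-128 (10 rounds), 52 => AES-192 (12 rounds),
  // 60 => AES-256 (14 rounds), i.e. rounds = keylen/4 - 1.  The stub
  // compares keylen against 44 and 52 so the rounds that the shorter
  // key schedules do not have are skipped.
  //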
2139   address generate_aescrypt_encryptBlock() {
2140     __ align(CodeEntryAlignment);
2141     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2142 
2143     Label L_doLast;
2144 
2145     const Register from        = c_rarg0;  // source array address
2146     const Register to          = c_rarg1;  // destination array address
2147     const Register key         = c_rarg2;  // key array address
2148     const Register keylen      = rscratch1;
2149 
2150     address start = __ pc();
2151     __ enter();
2152 
2153     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2154 
2155     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2156 
2157     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2158     __ rev32(v1, __ T16B, v1);
2159     __ rev32(v2, __ T16B, v2);
2160     __ rev32(v3, __ T16B, v3);
2161     __ rev32(v4, __ T16B, v4);
2162     __ aese(v0, v1);
2163     __ aesmc(v0, v0);
2164     __ aese(v0, v2);
2165     __ aesmc(v0, v0);
2166     __ aese(v0, v3);
2167     __ aesmc(v0, v0);
2168     __ aese(v0, v4);
2169     __ aesmc(v0, v0);
2170 
2171     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2172     __ rev32(v1, __ T16B, v1);
2173     __ rev32(v2, __ T16B, v2);
2174     __ rev32(v3, __ T16B, v3);
2175     __ rev32(v4, __ T16B, v4);
2176     __ aese(v0, v1);
2177     __ aesmc(v0, v0);
2178     __ aese(v0, v2);
2179     __ aesmc(v0, v0);
2180     __ aese(v0, v3);
2181     __ aesmc(v0, v0);
2182     __ aese(v0, v4);
2183     __ aesmc(v0, v0);
2184 
2185     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2186     __ rev32(v1, __ T16B, v1);
2187     __ rev32(v2, __ T16B, v2);
2188 
2189     __ cmpw(keylen, 44);
2190     __ br(Assembler::EQ, L_doLast);
2191 
2192     __ aese(v0, v1);
2193     __ aesmc(v0, v0);
2194     __ aese(v0, v2);
2195     __ aesmc(v0, v0);
2196 
2197     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2198     __ rev32(v1, __ T16B, v1);
2199     __ rev32(v2, __ T16B, v2);
2200 
2201     __ cmpw(keylen, 52);
2202     __ br(Assembler::EQ, L_doLast);
2203 
2204     __ aese(v0, v1);
2205     __ aesmc(v0, v0);
2206     __ aese(v0, v2);
2207     __ aesmc(v0, v0);
2208 
2209     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2210     __ rev32(v1, __ T16B, v1);
2211     __ rev32(v2, __ T16B, v2);
2212 
2213     __ BIND(L_doLast);
2214 
2215     __ aese(v0, v1);
2216     __ aesmc(v0, v0);
2217     __ aese(v0, v2);
2218 
2219     __ ld1(v1, __ T16B, key);
2220     __ rev32(v1, __ T16B, v1);
2221     __ eor(v0, __ T16B, v0, v1);
2222 
2223     __ st1(v0, __ T16B, to);
2224 
2225     __ mov(r0, 0);
2226 
2227     __ leave();
2228     __ ret(lr);
2229 
2230     return start;
2231   }
2232 
2233   // Arguments:
2234   //
2235   // Inputs:
2236   //   c_rarg0   - source byte array address
2237   //   c_rarg1   - destination byte array address
2238   //   c_rarg2   - K (key) in little endian int array
2239   //
2240   address generate_aescrypt_decryptBlock() {
2241     assert(UseAES, "need AES instructions");
2242     __ align(CodeEntryAlignment);
2243     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2244     Label L_doLast;
2245 
2246     const Register from        = c_rarg0;  // source array address
2247     const Register to          = c_rarg1;  // destination array address
2248     const Register key         = c_rarg2;  // key array address
2249     const Register keylen      = rscratch1;
2250 
2251     address start = __ pc();
2252     __ enter(); // required for proper stackwalking of RuntimeStub frame
2253 
2254     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2255 
2256     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2257 
2258     __ ld1(v5, __ T16B, __ post(key, 16));
2259     __ rev32(v5, __ T16B, v5);
2260 
2261     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2262     __ rev32(v1, __ T16B, v1);
2263     __ rev32(v2, __ T16B, v2);
2264     __ rev32(v3, __ T16B, v3);
2265     __ rev32(v4, __ T16B, v4);
2266     __ aesd(v0, v1);
2267     __ aesimc(v0, v0);
2268     __ aesd(v0, v2);
2269     __ aesimc(v0, v0);
2270     __ aesd(v0, v3);
2271     __ aesimc(v0, v0);
2272     __ aesd(v0, v4);
2273     __ aesimc(v0, v0);
2274 
2275     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2276     __ rev32(v1, __ T16B, v1);
2277     __ rev32(v2, __ T16B, v2);
2278     __ rev32(v3, __ T16B, v3);
2279     __ rev32(v4, __ T16B, v4);
2280     __ aesd(v0, v1);
2281     __ aesimc(v0, v0);
2282     __ aesd(v0, v2);
2283     __ aesimc(v0, v0);
2284     __ aesd(v0, v3);
2285     __ aesimc(v0, v0);
2286     __ aesd(v0, v4);
2287     __ aesimc(v0, v0);
2288 
2289     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2290     __ rev32(v1, __ T16B, v1);
2291     __ rev32(v2, __ T16B, v2);
2292 
2293     __ cmpw(keylen, 44);
2294     __ br(Assembler::EQ, L_doLast);
2295 
2296     __ aesd(v0, v1);
2297     __ aesimc(v0, v0);
2298     __ aesd(v0, v2);
2299     __ aesimc(v0, v0);
2300 
2301     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2302     __ rev32(v1, __ T16B, v1);
2303     __ rev32(v2, __ T16B, v2);
2304 
2305     __ cmpw(keylen, 52);
2306     __ br(Assembler::EQ, L_doLast);
2307 
2308     __ aesd(v0, v1);
2309     __ aesimc(v0, v0);
2310     __ aesd(v0, v2);
2311     __ aesimc(v0, v0);
2312 
2313     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2314     __ rev32(v1, __ T16B, v1);
2315     __ rev32(v2, __ T16B, v2);
2316 
2317     __ BIND(L_doLast);
2318 
2319     __ aesd(v0, v1);
2320     __ aesimc(v0, v0);
2321     __ aesd(v0, v2);
2322 
2323     __ eor(v0, __ T16B, v0, v5);
2324 
2325     __ st1(v0, __ T16B, to);
2326 
2327     __ mov(r0, 0);
2328 
2329     __ leave();
2330     __ ret(lr);
2331 
2332     return start;
2333   }
2334 
2335   // Arguments:
2336   //
2337   // Inputs:
2338   //   c_rarg0   - source byte array address
2339   //   c_rarg1   - destination byte array address
2340   //   c_rarg2   - K (key) in little endian int array
2341   //   c_rarg3   - r vector byte array address
2342   //   c_rarg4   - input length
2343   //
2344   // Output:
2345   //   x0        - input length
2346   //
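  //  Rough C sketch of the per-block chaining (illustration only;
  //  aes_encrypt_block(), load16(), store16() and xor16() are placeholders
  //  for the vectorized code below):
  //
  //    block r = load16(rvec);
  //    for (size_t off = 0; off < len; off += 16) {
  //      r = aes_encrypt_block(xor16(load16(from + off), r), key);
  //      store16(to + off, r);
  //    }
  //    store16(rvec, r);   // chains into the next call
  //    return len;         // original byte count, in r0
  //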
2347   address generate_cipherBlockChaining_encryptAESCrypt() {
2348     assert(UseAES, "need AES instructions");
2349     __ align(CodeEntryAlignment);
2350     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2351 
2352     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2353 
2354     const Register from        = c_rarg0;  // source array address
2355     const Register to          = c_rarg1;  // destination array address
2356     const Register key         = c_rarg2;  // key array address
2357     const Register rvec        = c_rarg3;  // r vector byte array address, initialized from the init vector
2358                                            // and left holding the last encrypted block on exit
2359     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2360     const Register keylen      = rscratch1;
2361 
2362     address start = __ pc();
2363       __ enter();
2364 
2365       __ mov(rscratch2, len_reg);
2366       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2367 
2368       __ ld1(v0, __ T16B, rvec);
2369 
2370       __ cmpw(keylen, 52);
2371       __ br(Assembler::CC, L_loadkeys_44);
2372       __ br(Assembler::EQ, L_loadkeys_52);
2373 
2374       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2375       __ rev32(v17, __ T16B, v17);
2376       __ rev32(v18, __ T16B, v18);
2377     __ BIND(L_loadkeys_52);
2378       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2379       __ rev32(v19, __ T16B, v19);
2380       __ rev32(v20, __ T16B, v20);
2381     __ BIND(L_loadkeys_44);
2382       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2383       __ rev32(v21, __ T16B, v21);
2384       __ rev32(v22, __ T16B, v22);
2385       __ rev32(v23, __ T16B, v23);
2386       __ rev32(v24, __ T16B, v24);
2387       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2388       __ rev32(v25, __ T16B, v25);
2389       __ rev32(v26, __ T16B, v26);
2390       __ rev32(v27, __ T16B, v27);
2391       __ rev32(v28, __ T16B, v28);
2392       __ ld1(v29, v30, v31, __ T16B, key);
2393       __ rev32(v29, __ T16B, v29);
2394       __ rev32(v30, __ T16B, v30);
2395       __ rev32(v31, __ T16B, v31);
2396 
2397     __ BIND(L_aes_loop);
2398       __ ld1(v1, __ T16B, __ post(from, 16));
2399       __ eor(v0, __ T16B, v0, v1);
2400 
2401       __ br(Assembler::CC, L_rounds_44);
2402       __ br(Assembler::EQ, L_rounds_52);
2403 
2404       __ aese(v0, v17); __ aesmc(v0, v0);
2405       __ aese(v0, v18); __ aesmc(v0, v0);
2406     __ BIND(L_rounds_52);
2407       __ aese(v0, v19); __ aesmc(v0, v0);
2408       __ aese(v0, v20); __ aesmc(v0, v0);
2409     __ BIND(L_rounds_44);
2410       __ aese(v0, v21); __ aesmc(v0, v0);
2411       __ aese(v0, v22); __ aesmc(v0, v0);
2412       __ aese(v0, v23); __ aesmc(v0, v0);
2413       __ aese(v0, v24); __ aesmc(v0, v0);
2414       __ aese(v0, v25); __ aesmc(v0, v0);
2415       __ aese(v0, v26); __ aesmc(v0, v0);
2416       __ aese(v0, v27); __ aesmc(v0, v0);
2417       __ aese(v0, v28); __ aesmc(v0, v0);
2418       __ aese(v0, v29); __ aesmc(v0, v0);
2419       __ aese(v0, v30);
2420       __ eor(v0, __ T16B, v0, v31);
2421 
2422       __ st1(v0, __ T16B, __ post(to, 16));
2423       __ sub(len_reg, len_reg, 16);
2424       __ cbnz(len_reg, L_aes_loop);
2425 
2426       __ st1(v0, __ T16B, rvec);
2427 
2428       __ mov(r0, rscratch2);
2429 
2430       __ leave();
2431       __ ret(lr);
2432 
2433       return start;
2434   }
2435 
2436   // Arguments:
2437   //
2438   // Inputs:
2439   //   c_rarg0   - source byte array address
2440   //   c_rarg1   - destination byte array address
2441   //   c_rarg2   - K (key) in little endian int array
2442   //   c_rarg3   - r vector byte array address
2443   //   c_rarg4   - input length
2444   //
2445   // Output:
2446   //   r0        - input length
2447   //
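  //  Rough C sketch (illustration only; aes_decrypt_block() and friends are
  //  placeholders).  The chaining value is the previous *ciphertext* block,
  //  which is why the loop keeps a copy of each input block before writing
  //  the decrypted output:
  //
  //    block r = load16(rvec);
  //    for (size_t off = 0; off < len; off += 16) {
  //      block c = load16(from + off);
  //      store16(to + off, xor16(aes_decrypt_block(c, key), r));
  //      r = c;
  //    }
  //    store16(rvec, r);
  //    return len;   // original byte count, in r0
  //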
2448   address generate_cipherBlockChaining_decryptAESCrypt() {
2449     assert(UseAES, "need AES instructions");
2450     __ align(CodeEntryAlignment);
2451     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2452 
2453     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2454 
2455     const Register from        = c_rarg0;  // source array address
2456     const Register to          = c_rarg1;  // destination array address
2457     const Register key         = c_rarg2;  // key array address
2458     const Register rvec        = c_rarg3;  // r vector byte array address, initialized from the init vector
2459                                            // and left holding the last ciphertext block on exit
2460     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2461     const Register keylen      = rscratch1;
2462 
2463     address start = __ pc();
2464       __ enter();
2465 
2466       __ mov(rscratch2, len_reg);
2467       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2468 
2469       __ ld1(v2, __ T16B, rvec);
2470 
2471       __ ld1(v31, __ T16B, __ post(key, 16));
2472       __ rev32(v31, __ T16B, v31);
2473 
2474       __ cmpw(keylen, 52);
2475       __ br(Assembler::CC, L_loadkeys_44);
2476       __ br(Assembler::EQ, L_loadkeys_52);
2477 
2478       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2479       __ rev32(v17, __ T16B, v17);
2480       __ rev32(v18, __ T16B, v18);
2481     __ BIND(L_loadkeys_52);
2482       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2483       __ rev32(v19, __ T16B, v19);
2484       __ rev32(v20, __ T16B, v20);
2485     __ BIND(L_loadkeys_44);
2486       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2487       __ rev32(v21, __ T16B, v21);
2488       __ rev32(v22, __ T16B, v22);
2489       __ rev32(v23, __ T16B, v23);
2490       __ rev32(v24, __ T16B, v24);
2491       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2492       __ rev32(v25, __ T16B, v25);
2493       __ rev32(v26, __ T16B, v26);
2494       __ rev32(v27, __ T16B, v27);
2495       __ rev32(v28, __ T16B, v28);
2496       __ ld1(v29, v30, __ T16B, key);
2497       __ rev32(v29, __ T16B, v29);
2498       __ rev32(v30, __ T16B, v30);
2499 
2500     __ BIND(L_aes_loop);
2501       __ ld1(v0, __ T16B, __ post(from, 16));
2502       __ orr(v1, __ T16B, v0, v0);
2503 
2504       __ br(Assembler::CC, L_rounds_44);
2505       __ br(Assembler::EQ, L_rounds_52);
2506 
2507       __ aesd(v0, v17); __ aesimc(v0, v0);
2508       __ aesd(v0, v18); __ aesimc(v0, v0);
2509     __ BIND(L_rounds_52);
2510       __ aesd(v0, v19); __ aesimc(v0, v0);
2511       __ aesd(v0, v20); __ aesimc(v0, v0);
2512     __ BIND(L_rounds_44);
2513       __ aesd(v0, v21); __ aesimc(v0, v0);
2514       __ aesd(v0, v22); __ aesimc(v0, v0);
2515       __ aesd(v0, v23); __ aesimc(v0, v0);
2516       __ aesd(v0, v24); __ aesimc(v0, v0);
2517       __ aesd(v0, v25); __ aesimc(v0, v0);
2518       __ aesd(v0, v26); __ aesimc(v0, v0);
2519       __ aesd(v0, v27); __ aesimc(v0, v0);
2520       __ aesd(v0, v28); __ aesimc(v0, v0);
2521       __ aesd(v0, v29); __ aesimc(v0, v0);
2522       __ aesd(v0, v30);
2523       __ eor(v0, __ T16B, v0, v31);
2524       __ eor(v0, __ T16B, v0, v2);
2525 
2526       __ st1(v0, __ T16B, __ post(to, 16));
2527       __ orr(v2, __ T16B, v1, v1);
2528 
2529       __ sub(len_reg, len_reg, 16);
2530       __ cbnz(len_reg, L_aes_loop);
2531 
2532       __ st1(v2, __ T16B, rvec);
2533 
2534       __ mov(r0, rscratch2);
2535 
2536       __ leave();
2537       __ ret(lr);
2538 
2539     return start;
2540   }
2541 
2542   // Arguments:
2543   //
2544   // Inputs:
2545   //   c_rarg0   - byte[]  source+offset
2546   //   c_rarg1   - int[]   SHA.state
2547   //   c_rarg2   - int     offset
2548   //   c_rarg3   - int     limit
2549   //
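  // The four words emitted at the 'keys' label below are the standard SHA-1
  // round constants K0..K3.  Each iteration of the 20-pass round loop retires
  // four of the 80 rounds via sha1c/sha1p/sha1m, so 'round' counts groups of
  // four, with sha1su0/sha1su1 extending the message schedule in place.
  //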
2550   address generate_sha1_implCompress(bool multi_block, const char *name) {
2551     __ align(CodeEntryAlignment);
2552     StubCodeMark mark(this, "StubRoutines", name);
2553     address start = __ pc();
2554 
2555     Register buf   = c_rarg0;
2556     Register state = c_rarg1;
2557     Register ofs   = c_rarg2;
2558     Register limit = c_rarg3;
2559 
2560     Label keys;
2561     Label sha1_loop;
2562 
2563     // load the keys into v0..v3
2564     __ adr(rscratch1, keys);
2565     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2566     // load 5 words state into v6, v7
2567     __ ldrq(v6, Address(state, 0));
2568     __ ldrs(v7, Address(state, 16));
2569 
2570 
2571     __ BIND(sha1_loop);
2572     // load 64 bytes of data into v16..v19
2573     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2574     __ rev32(v16, __ T16B, v16);
2575     __ rev32(v17, __ T16B, v17);
2576     __ rev32(v18, __ T16B, v18);
2577     __ rev32(v19, __ T16B, v19);
2578 
2579     // do the sha1
2580     __ addv(v4, __ T4S, v16, v0);
2581     __ orr(v20, __ T16B, v6, v6);
2582 
2583     FloatRegister d0 = v16;
2584     FloatRegister d1 = v17;
2585     FloatRegister d2 = v18;
2586     FloatRegister d3 = v19;
2587 
2588     for (int round = 0; round < 20; round++) {
2589       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2590       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2591       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2592       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2593       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2594 
2595       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2596       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2597       __ sha1h(tmp2, __ T4S, v20);
2598       if (round < 5)
2599         __ sha1c(v20, __ T4S, tmp3, tmp4);
2600       else if (round < 10 || round >= 15)
2601         __ sha1p(v20, __ T4S, tmp3, tmp4);
2602       else
2603         __ sha1m(v20, __ T4S, tmp3, tmp4);
2604       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2605 
2606       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2607     }
2608 
2609     __ addv(v7, __ T2S, v7, v21);
2610     __ addv(v6, __ T4S, v6, v20);
2611 
2612     if (multi_block) {
2613       __ add(ofs, ofs, 64);
2614       __ cmp(ofs, limit);
2615       __ br(Assembler::LE, sha1_loop);
2616       __ mov(c_rarg0, ofs); // return ofs
2617     }
2618 
2619     __ strq(v6, Address(state, 0));
2620     __ strs(v7, Address(state, 16));
2621 
2622     __ ret(lr);
2623 
2624     __ bind(keys);
2625     __ emit_int32(0x5a827999);
2626     __ emit_int32(0x6ed9eba1);
2627     __ emit_int32(0x8f1bbcdc);
2628     __ emit_int32(0xca62c1d6);
2629 
2630     return start;
2631   }
2632 
2633 
2634   // Arguments:
2635   //
2636   // Inputs:
2637   //   c_rarg0   - byte[]  source+offset
2638   //   c_rarg1   - int[]   SHA.state
2639   //   c_rarg2   - int     offset
2640   //   c_rarg3   - int     limit
2641   //
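  // round_consts below is the standard SHA-256 K table.  Each iteration of
  // the 16-pass round loop retires four of the 64 rounds via a paired
  // sha256h/sha256h2, with sha256su0/sha256su1 extending the message
  // schedule in place.
  //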
2642   address generate_sha256_implCompress(bool multi_block, const char *name) {
2643     static const uint32_t round_consts[64] = {
2644       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
2645       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
2646       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
2647       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
2648       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
2649       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
2650       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
2651       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
2652       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
2653       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
2654       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
2655       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
2656       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
2657       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
2658       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
2659       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
2660     };
2661     __ align(CodeEntryAlignment);
2662     StubCodeMark mark(this, "StubRoutines", name);
2663     address start = __ pc();
2664 
2665     Register buf   = c_rarg0;
2666     Register state = c_rarg1;
2667     Register ofs   = c_rarg2;
2668     Register limit = c_rarg3;
2669 
2670     Label sha1_loop;
2671 
2672     __ stpd(v8, v9, __ pre(sp, -32));
2673     __ stpd(v10, v11, Address(sp, 16));
2674 
2675     // dga == v0
2676     // dgb == v1
2677     // dg0 == v2
2678     // dg1 == v3
2679     // dg2 == v4
2680     // t0 == v6
2681     // t1 == v7
2682 
2683     // load 16 keys to v16..v31
2684     __ lea(rscratch1, ExternalAddress((address)round_consts));
2685     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
2686     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
2687     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
2688     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
2689 
2690     // load 8 words (256 bits) state
2691     __ ldpq(v0, v1, state);
2692 
2693     __ BIND(sha1_loop);
2694     // load 64 bytes of data into v8..v11
2695     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
2696     __ rev32(v8, __ T16B, v8);
2697     __ rev32(v9, __ T16B, v9);
2698     __ rev32(v10, __ T16B, v10);
2699     __ rev32(v11, __ T16B, v11);
2700 
2701     __ addv(v6, __ T4S, v8, v16);
2702     __ orr(v2, __ T16B, v0, v0);
2703     __ orr(v3, __ T16B, v1, v1);
2704 
2705     FloatRegister d0 = v8;
2706     FloatRegister d1 = v9;
2707     FloatRegister d2 = v10;
2708     FloatRegister d3 = v11;
2709 
2710 
2711     for (int round = 0; round < 16; round++) {
2712       FloatRegister tmp1 = (round & 1) ? v6 : v7;
2713       FloatRegister tmp2 = (round & 1) ? v7 : v6;
2714       FloatRegister tmp3 = (round & 1) ? v2 : v4;
2715       FloatRegister tmp4 = (round & 1) ? v4 : v2;
2716 
2717       if (round < 12) __ sha256su0(d0, __ T4S, d1);
2718        __ orr(v4, __ T16B, v2, v2);
2719       if (round < 15)
2720         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
2721       __ sha256h(v2, __ T4S, v3, tmp2);
2722       __ sha256h2(v3, __ T4S, v4, tmp2);
2723       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
2724 
2725       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2726     }
2727 
2728     __ addv(v0, __ T4S, v0, v2);
2729     __ addv(v1, __ T4S, v1, v3);
2730 
2731     if (multi_block) {
2732       __ add(ofs, ofs, 64);
2733       __ cmp(ofs, limit);
2734       __ br(Assembler::LE, sha1_loop);
2735       __ mov(c_rarg0, ofs); // return ofs
2736     }
2737 
2738     __ ldpd(v10, v11, Address(sp, 16));
2739     __ ldpd(v8, v9, __ post(sp, 32));
2740 
2741     __ stpq(v0, v1, state);
2742 
2743     __ ret(lr);
2744 
2745     return start;
2746   }
2747 
2748 #ifndef BUILTIN_SIM
2749   // Safefetch stubs.
2750   void generate_safefetch(const char* name, int size, address* entry,
2751                           address* fault_pc, address* continuation_pc) {
2752     // safefetch signatures:
2753     //   int      SafeFetch32(int*      adr, int      errValue);
2754     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
2755     //
2756     // arguments:
2757     //   c_rarg0 = adr
2758     //   c_rarg1 = errValue
2759     //
2760     // result:
2761     //   r0 = *adr or errValue
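    //
    // Conceptually (illustration only) the stub behaves like
    //
    //   int SafeFetch32(int* adr, int errValue) {
    //     return *adr;   // may fault; the VM's signal handler then resumes
    //   }                // at continuation_pc with errValue still in c_rarg1
    //
    // so a faulting read turns into an ordinary return of errValue.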
2762 
2763     StubCodeMark mark(this, "StubRoutines", name);
2764 
2765     // Entry point, pc or function descriptor.
2766     *entry = __ pc();
2767 
2768     // Load *adr into c_rarg1, may fault.
2769     *fault_pc = __ pc();
2770     switch (size) {
2771       case 4:
2772         // int32_t
2773         __ ldrw(c_rarg1, Address(c_rarg0, 0));
2774         break;
2775       case 8:
2776         // int64_t
2777         __ ldr(c_rarg1, Address(c_rarg0, 0));
2778         break;
2779       default:
2780         ShouldNotReachHere();
2781     }
2782 
2783     // return errValue or *adr
2784     *continuation_pc = __ pc();
2785     __ mov(r0, c_rarg1);
2786     __ ret(lr);
2787   }
2788 #endif
2789 
2790   /**
2791    *  Arguments:
2792    *
2793    * Inputs:
2794    *   c_rarg0   - int crc
2795    *   c_rarg1   - byte* buf
2796    *   c_rarg2   - int length
2797    *
2798    * Output:
2799    *       r0    - int crc result
2800    */
2801   address generate_updateBytesCRC32() {
2802     assert(UseCRC32Intrinsics, "what are we doing here?");
2803 
2804     __ align(CodeEntryAlignment);
2805     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
2806 
2807     address start = __ pc();
2808 
2809     const Register crc   = c_rarg0;  // crc
2810     const Register buf   = c_rarg1;  // source java byte array address
2811     const Register len   = c_rarg2;  // length
2812     const Register table0 = c_rarg3; // crc_table address
2813     const Register table1 = c_rarg4;
2814     const Register table2 = c_rarg5;
2815     const Register table3 = c_rarg6;
2816     const Register tmp3 = c_rarg7;
2817 
2818     BLOCK_COMMENT("Entry:");
2819     __ enter(); // required for proper stackwalking of RuntimeStub frame
2820 
2821     __ kernel_crc32(crc, buf, len,
2822               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
2823 
2824     __ leave(); // required for proper stackwalking of RuntimeStub frame
2825     __ ret(lr);
2826 
2827     return start;
2828   }
2829 
2830   /**
2831    *  Arguments:
2832    *
2833    * Inputs:
2834    *   c_rarg0   - int crc
2835    *   c_rarg1   - byte* buf
2836    *   c_rarg2   - int length
2837    *   c_rarg3   - int* table
2838    *
2839    * Output:
2840    *       r0   - int crc result
2841    */
2842   address generate_updateBytesCRC32C() {
2843     assert(UseCRC32CIntrinsics, "what are we doing here?");
2844 
2845     __ align(CodeEntryAlignment);
2846     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
2847 
2848     address start = __ pc();
2849 
2850     const Register crc   = c_rarg0;  // crc
2851     const Register buf   = c_rarg1;  // source java byte array address
2852     const Register len   = c_rarg2;  // length
2853     const Register table0 = c_rarg3; // crc_table address
2854     const Register table1 = c_rarg4;
2855     const Register table2 = c_rarg5;
2856     const Register table3 = c_rarg6;
2857     const Register tmp3 = c_rarg7;
2858 
2859     BLOCK_COMMENT("Entry:");
2860     __ enter(); // required for proper stackwalking of RuntimeStub frame
2861 
2862     __ kernel_crc32c(crc, buf, len,
2863               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
2864 
2865     __ leave(); // required for proper stackwalking of RuntimeStub frame
2866     __ ret(lr);
2867 
2868     return start;
2869   }
2870 
2871   /**
2872    *  Arguments:
2873    *
2874    *  Inputs:
2875    *   c_rarg0   - int   adler
2876    *   c_rarg1   - byte* buff
2877    *   c_rarg2   - int   len
2878    *
2879    * Output:
2880    *   c_rarg0   - int adler result
2881    */
2882   address generate_updateBytesAdler32() {
2883     __ align(CodeEntryAlignment);
2884     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
2885     address start = __ pc();
2886 
2887     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
2888 
2889     // Aliases
2890     Register adler  = c_rarg0;
2891     Register s1     = c_rarg0;
2892     Register s2     = c_rarg3;
2893     Register buff   = c_rarg1;
2894     Register len    = c_rarg2;
2895     Register nmax  = r4;
2896     Register base = r5;
2897     Register count = r6;
2898     Register temp0 = rscratch1;
2899     Register temp1 = rscratch2;
2900     Register temp2 = r7;
2901 
2902     // Max number of bytes we can process before having to take the mod
2903     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
2904     unsigned long BASE = 0xfff1;
2905     unsigned long NMAX = 0x15B0;
2906 
2907     __ mov(base, BASE);
2908     __ mov(nmax, NMAX);
2909 
2910     // s1 is initialized to the lower 16 bits of adler
2911     // s2 is initialized to the upper 16 bits of adler
2912     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
2913     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
2914 
2915     // The pipelined loop needs at least 16 elements for one iteration.
2916     // It checks this itself, but it is cheaper to branch straight to the cleanup loop when len < 16.
2917     __ cmp(len, 16);
2918     __ br(Assembler::HS, L_nmax);
2919     __ cbz(len, L_combine);
2920 
2921     __ bind(L_simple_by1_loop);
2922     __ ldrb(temp0, Address(__ post(buff, 1)));
2923     __ add(s1, s1, temp0);
2924     __ add(s2, s2, s1);
2925     __ subs(len, len, 1);
2926     __ br(Assembler::HI, L_simple_by1_loop);
2927 
2928     // s1 = s1 % BASE
2929     __ subs(temp0, s1, base);
2930     __ csel(s1, temp0, s1, Assembler::HS);
2931 
2932     // s2 = s2 % BASE
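         // Reduction trick: 2^16 == 15 (mod BASE), so
         //   s2 % BASE == ((s2 >> 16) * 15 + (s2 & 0xffff)) % BASE
         // and at this point one conditional subtract of BASE finishes it.
         // The lsl-by-4/sub pair below computes the * 15; the larger
         // reductions further down apply the same fold twice.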
2933     __ lsr(temp0, s2, 16);
2934     __ lsl(temp1, temp0, 4);
2935     __ sub(temp1, temp1, temp0);
2936     __ add(s2, temp1, s2, ext::uxth);
2937 
2938     __ subs(temp0, s2, base);
2939     __ csel(s2, temp0, s2, Assembler::HS);
2940 
2941     __ b(L_combine);
2942 
2943     __ bind(L_nmax);
2944     __ subs(len, len, nmax);
2945     __ sub(count, nmax, 16);
2946     __ br(Assembler::LO, L_by16);
2947 
2948     __ bind(L_nmax_loop);
2949 
2950     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
2951 
2952     __ add(s1, s1, temp0, ext::uxtb);
2953     __ ubfx(temp2, temp0, 8, 8);
2954     __ add(s2, s2, s1);
2955     __ add(s1, s1, temp2);
2956     __ ubfx(temp2, temp0, 16, 8);
2957     __ add(s2, s2, s1);
2958     __ add(s1, s1, temp2);
2959     __ ubfx(temp2, temp0, 24, 8);
2960     __ add(s2, s2, s1);
2961     __ add(s1, s1, temp2);
2962     __ ubfx(temp2, temp0, 32, 8);
2963     __ add(s2, s2, s1);
2964     __ add(s1, s1, temp2);
2965     __ ubfx(temp2, temp0, 40, 8);
2966     __ add(s2, s2, s1);
2967     __ add(s1, s1, temp2);
2968     __ ubfx(temp2, temp0, 48, 8);
2969     __ add(s2, s2, s1);
2970     __ add(s1, s1, temp2);
2971     __ add(s2, s2, s1);
2972     __ add(s1, s1, temp0, Assembler::LSR, 56);
2973     __ add(s2, s2, s1);
2974 
2975     __ add(s1, s1, temp1, ext::uxtb);
2976     __ ubfx(temp2, temp1, 8, 8);
2977     __ add(s2, s2, s1);
2978     __ add(s1, s1, temp2);
2979     __ ubfx(temp2, temp1, 16, 8);
2980     __ add(s2, s2, s1);
2981     __ add(s1, s1, temp2);
2982     __ ubfx(temp2, temp1, 24, 8);
2983     __ add(s2, s2, s1);
2984     __ add(s1, s1, temp2);
2985     __ ubfx(temp2, temp1, 32, 8);
2986     __ add(s2, s2, s1);
2987     __ add(s1, s1, temp2);
2988     __ ubfx(temp2, temp1, 40, 8);
2989     __ add(s2, s2, s1);
2990     __ add(s1, s1, temp2);
2991     __ ubfx(temp2, temp1, 48, 8);
2992     __ add(s2, s2, s1);
2993     __ add(s1, s1, temp2);
2994     __ add(s2, s2, s1);
2995     __ add(s1, s1, temp1, Assembler::LSR, 56);
2996     __ add(s2, s2, s1);
2997 
2998     __ subs(count, count, 16);
2999     __ br(Assembler::HS, L_nmax_loop);
3000 
3001     // s1 = s1 % BASE
3002     __ lsr(temp0, s1, 16);
3003     __ lsl(temp1, temp0, 4);
3004     __ sub(temp1, temp1, temp0);
3005     __ add(temp1, temp1, s1, ext::uxth);
3006 
3007     __ lsr(temp0, temp1, 16);
3008     __ lsl(s1, temp0, 4);
3009     __ sub(s1, s1, temp0);
3010     __ add(s1, s1, temp1, ext::uxth);
3011 
3012     __ subs(temp0, s1, base);
3013     __ csel(s1, temp0, s1, Assembler::HS);
3014 
3015     // s2 = s2 % BASE
3016     __ lsr(temp0, s2, 16);
3017     __ lsl(temp1, temp0, 4);
3018     __ sub(temp1, temp1, temp0);
3019     __ add(temp1, temp1, s2, ext::uxth);
3020 
3021     __ lsr(temp0, temp1, 16);
3022     __ lsl(s2, temp0, 4);
3023     __ sub(s2, s2, temp0);
3024     __ add(s2, s2, temp1, ext::uxth);
3025 
3026     __ subs(temp0, s2, base);
3027     __ csel(s2, temp0, s2, Assembler::HS);
3028 
3029     __ subs(len, len, nmax);
3030     __ sub(count, nmax, 16);
3031     __ br(Assembler::HS, L_nmax_loop);
3032 
3033     __ bind(L_by16);
3034     __ adds(len, len, count);
3035     __ br(Assembler::LO, L_by1);
3036 
3037     __ bind(L_by16_loop);
3038 
3039     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3040 
3041     __ add(s1, s1, temp0, ext::uxtb);
3042     __ ubfx(temp2, temp0, 8, 8);
3043     __ add(s2, s2, s1);
3044     __ add(s1, s1, temp2);
3045     __ ubfx(temp2, temp0, 16, 8);
3046     __ add(s2, s2, s1);
3047     __ add(s1, s1, temp2);
3048     __ ubfx(temp2, temp0, 24, 8);
3049     __ add(s2, s2, s1);
3050     __ add(s1, s1, temp2);
3051     __ ubfx(temp2, temp0, 32, 8);
3052     __ add(s2, s2, s1);
3053     __ add(s1, s1, temp2);
3054     __ ubfx(temp2, temp0, 40, 8);
3055     __ add(s2, s2, s1);
3056     __ add(s1, s1, temp2);
3057     __ ubfx(temp2, temp0, 48, 8);
3058     __ add(s2, s2, s1);
3059     __ add(s1, s1, temp2);
3060     __ add(s2, s2, s1);
3061     __ add(s1, s1, temp0, Assembler::LSR, 56);
3062     __ add(s2, s2, s1);
3063 
3064     __ add(s1, s1, temp1, ext::uxtb);
3065     __ ubfx(temp2, temp1, 8, 8);
3066     __ add(s2, s2, s1);
3067     __ add(s1, s1, temp2);
3068     __ ubfx(temp2, temp1, 16, 8);
3069     __ add(s2, s2, s1);
3070     __ add(s1, s1, temp2);
3071     __ ubfx(temp2, temp1, 24, 8);
3072     __ add(s2, s2, s1);
3073     __ add(s1, s1, temp2);
3074     __ ubfx(temp2, temp1, 32, 8);
3075     __ add(s2, s2, s1);
3076     __ add(s1, s1, temp2);
3077     __ ubfx(temp2, temp1, 40, 8);
3078     __ add(s2, s2, s1);
3079     __ add(s1, s1, temp2);
3080     __ ubfx(temp2, temp1, 48, 8);
3081     __ add(s2, s2, s1);
3082     __ add(s1, s1, temp2);
3083     __ add(s2, s2, s1);
3084     __ add(s1, s1, temp1, Assembler::LSR, 56);
3085     __ add(s2, s2, s1);
3086 
3087     __ subs(len, len, 16);
3088     __ br(Assembler::HS, L_by16_loop);
3089 
3090     __ bind(L_by1);
3091     __ adds(len, len, 15);
3092     __ br(Assembler::LO, L_do_mod);
3093 
3094     __ bind(L_by1_loop);
3095     __ ldrb(temp0, Address(__ post(buff, 1)));
3096     __ add(s1, temp0, s1);
3097     __ add(s2, s2, s1);
3098     __ subs(len, len, 1);
3099     __ br(Assembler::HS, L_by1_loop);
3100 
3101     __ bind(L_do_mod);
3102     // s1 = s1 % BASE
3103     __ lsr(temp0, s1, 16);
3104     __ lsl(temp1, temp0, 4);
3105     __ sub(temp1, temp1, temp0);
3106     __ add(temp1, temp1, s1, ext::uxth);
3107 
3108     __ lsr(temp0, temp1, 16);
3109     __ lsl(s1, temp0, 4);
3110     __ sub(s1, s1, temp0);
3111     __ add(s1, s1, temp1, ext::uxth);
3112 
3113     __ subs(temp0, s1, base);
3114     __ csel(s1, temp0, s1, Assembler::HS);
3115 
3116     // s2 = s2 % BASE
3117     __ lsr(temp0, s2, 16);
3118     __ lsl(temp1, temp0, 4);
3119     __ sub(temp1, temp1, temp0);
3120     __ add(temp1, temp1, s2, ext::uxth);
3121 
3122     __ lsr(temp0, temp1, 16);
3123     __ lsl(s2, temp0, 4);
3124     __ sub(s2, s2, temp0);
3125     __ add(s2, s2, temp1, ext::uxth);
3126 
3127     __ subs(temp0, s2, base);
3128     __ csel(s2, temp0, s2, Assembler::HS);
3129 
3130     // Combine lower bits and higher bits
3131     __ bind(L_combine);
3132     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3133 
3134     __ ret(lr);
3135 
3136     return start;
3137   }
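
       // In C, approximately, the stub above computes the standard Adler-32:
       //
       //   unsigned long s1 = adler & 0xffff;
       //   unsigned long s2 = (adler >> 16) & 0xffff;
       //   while (len--) {
       //     s1 = (s1 + *buff++) % BASE;    // BASE == 65521
       //     s2 = (s2 + s1)      % BASE;
       //   }
       //   return (s2 << 16) | s1;
       //
       // except that the % BASE reductions are deferred: sums are
       // accumulated for up to NMAX bytes (the most that fits in 32 bits
       // without overflow) before reducing.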
3138 
3139   /**
3140    *  Arguments:
3141    *
3142    *  Input:
3143    *    c_rarg0   - x address
3144    *    c_rarg1   - x length
3145    *    c_rarg2   - y address
3146    *    c_rarg3   - y length
3147    *    c_rarg4   - z address
3148    *    c_rarg5   - z length
3149    */
3150   address generate_multiplyToLen() {
3151     __ align(CodeEntryAlignment);
3152     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3153 
3154     address start = __ pc();
3155     const Register x     = r0;
3156     const Register xlen  = r1;
3157     const Register y     = r2;
3158     const Register ylen  = r3;
3159     const Register z     = r4;
3160     const Register zlen  = r5;
3161 
3162     const Register tmp1  = r10;
3163     const Register tmp2  = r11;
3164     const Register tmp3  = r12;
3165     const Register tmp4  = r13;
3166     const Register tmp5  = r14;
3167     const Register tmp6  = r15;
3168     const Register tmp7  = r16;
3169 
3170     BLOCK_COMMENT("Entry:");
3171     __ enter(); // required for proper stackwalking of RuntimeStub frame
3172     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3173     __ leave(); // required for proper stackwalking of RuntimeStub frame
3174     __ ret(lr);
3175 
3176     return start;
3177   }
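
       // In C, approximately, multiply_to_len computes the schoolbook
       // product of two big-endian arrays of 32-bit limbs, as in
       // BigInteger.multiplyToLen.  A rough sketch of the semantics only:
       //
       //   for (int k = 0; k < xlen + ylen; k++) z[k] = 0;
       //   for (int i = xlen - 1; i >= 0; i--) {
       //     unsigned long carry = 0;
       //     for (int j = ylen - 1, k = ylen + i; j >= 0; j--, k--) {
       //       unsigned long p = (unsigned long)x[i] * y[j] + z[k] + carry;
       //       z[k] = (unsigned int)p;
       //       carry = p >> 32;
       //     }
       //     z[i] = (unsigned int)carry;
       //   }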
3178 
3179   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3180                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3181                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3182     // Karatsuba multiplication performs a 128*128 -> 256-bit carry-less
3183     // multiplication using three 64*64 -> 128-bit carry-less
3184     // multiplications and a few XORs.
3185     //
3186     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3187     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3188     //
3189     // Inputs:
3190     //
3191     // A0 in a.d[0]     (subkey)
3192     // A1 in a.d[1]
3193     // (A1+A0) in a1_xor_a0.d[0]
3194     //
3195     // B0 in b.d[0]     (state)
3196     // B1 in b.d[1]
3197 
3198     __ ext(tmp1, __ T16B, b, b, 0x08);
3199     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3200     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3201     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3202     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3203 
3204     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3205     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3206     __ eor(tmp2, __ T16B, tmp2, tmp4);
3207     __ eor(tmp2, __ T16B, tmp2, tmp3);
3208 
3209     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3210     __ ins(result_hi, __ D, tmp2, 0, 1);
3211     __ ins(result_lo, __ D, tmp2, 1, 0);
3212   }
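
       // In C-like pseudocode, with clmul() a 64x64 -> 128-bit carry-less
       // multiply, the recombination above is (sketch only):
       //
       //   C   = clmul(A1, B1);             // result_hi
       //   D   = clmul(A0, B0);             // result_lo
       //   E   = clmul(A1 ^ A0, B1 ^ B0);
       //   mid = C ^ D ^ E;                 // == A1*B0 + A0*B1 over GF(2)
       //   result_lo = D ^ (mid << 64);     // low  128 bits of the product
       //   result_hi = C ^ (mid >> 64);     // high 128 bits of the product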
3213 
3214   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3215                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3216     const FloatRegister t0 = result;
3217 
3218     // The GCM field polynomial f is z^128 + p(z), where p =
3219     // z^7+z^2+z+1.
3220     //
3221     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3222     //
3223     // so, given that the product we're reducing is
3224     //    a == lo + hi * z^128
3225     // substituting,
3226     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3227     //
3228     // we reduce by multiplying hi by p(z) and subtracting (i.e. XORing)
3229     // the result from lo.  Because p has no nonzero high bits, the
3230     // multiply hi*p can be done as two 64-bit carry-less multiplications,
3231     // one for each half of hi.
3232 
3233     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3234     __ ext(t1, __ T16B, t0, z, 8);
3235     __ eor(hi, __ T16B, hi, t1);
3236     __ ext(t1, __ T16B, z, t0, 8);
3237     __ eor(lo, __ T16B, lo, t1);
3238     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3239     __ eor(result, __ T16B, lo, t0);
3240   }
3241 
3242   /**
3243    *  Arguments:
3244    *
3245    *  Input:
3246    *  c_rarg0   - current state address
3247    *  c_rarg1   - H key address
3248    *  c_rarg2   - data address
3249    *  c_rarg3   - number of blocks
3250    *
3251    *  Output:
3252    *  Updated state at c_rarg0
3253    */
3254   address generate_ghash_processBlocks() {
3255     // Bafflingly, GCM uses little-endian for the byte order, but
3256     // big-endian for the bit order.  For example, the polynomial 1 is
3257     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3258     //
3259     // So, we must either reverse the bytes in each word and do
3260     // everything big-endian or reverse the bits in each byte and do
3261     // it little-endian.  On AArch64 it's more idiomatic to reverse
3262     // the bits in each byte (we have an instruction, RBIT, to do
3263     // that) and keep the data in little-endian bit order throughout the
3264     // calculation, bit-reversing the inputs and outputs.
3265 
3266     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3267     __ align(wordSize * 2);
3268     address p = __ pc();
3269     __ emit_int64(0x87);  // The low-order bits of the field
3270                           // polynomial (i.e. p = z^7+z^2+z+1)
3271                           // repeated in the low and high parts of a
3272                           // 128-bit vector
3273     __ emit_int64(0x87);
3274 
3275     __ align(CodeEntryAlignment);
3276     address start = __ pc();
3277 
3278     Register state   = c_rarg0;
3279     Register subkeyH = c_rarg1;
3280     Register data    = c_rarg2;
3281     Register blocks  = c_rarg3;
3282 
3283     FloatRegister vzr = v30;
3284     __ eor(vzr, __ T16B, vzr, vzr); // zero register
3285 
3286     __ ldrq(v0, Address(state));
3287     __ ldrq(v1, Address(subkeyH));
3288 
3289     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
3290     __ rbit(v0, __ T16B, v0);
3291     __ rev64(v1, __ T16B, v1);
3292     __ rbit(v1, __ T16B, v1);
3293 
3294     __ ldrq(v26, p);
3295 
3296     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v16
3297     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
3298 
3299     {
3300       Label L_ghash_loop;
3301       __ bind(L_ghash_loop);
3302 
3303       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
3304                                                  // reversing each byte
3305       __ rbit(v2, __ T16B, v2);
3306       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
3307 
3308       // Multiply state in v2 by subkey in v1
3309       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
3310                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
3311                      /*temps*/v6, v20, v18, v21);
3312       // Reduce v7:v5 by the field polynomial
3313       ghash_reduce(v0, v5, v7, v26, vzr, v20);
3314 
3315       __ sub(blocks, blocks, 1);
3316       __ cbnz(blocks, L_ghash_loop);
3317     }
3318 
3319     // The bit-reversed result is at this point in v0
3320     __ rev64(v1, __ T16B, v0);
3321     __ rbit(v1, __ T16B, v1);
3322 
3323     __ st1(v1, __ T16B, state);
3324     __ ret(lr);
3325 
3326     return start;
3327   }
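
       // In C, approximately, the loop above performs the standard GHASH
       // update over GF(2^128) (sketch of the semantics only):
       //
       //   for (int i = 0; i < blocks; i++)
       //     state = clmul_mod_p(state ^ data[i], H);
       //
       // where H is the hash subkey from c_rarg1, data[i] is the i-th
       // 16-byte block, and clmul_mod_p stands in for ghash_multiply
       // followed by ghash_reduce.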
3328 
3329   // Continuation point for throwing of implicit exceptions that are
3330   // not handled in the current activation. Fabricates an exception
3331   // oop and initiates normal exception dispatching in this
3332   // frame. Since we need to preserve callee-saved values (currently
3333   // only for C2, but done for C1 as well) we need a callee-saved oop
3334   // map and therefore have to make these stubs into RuntimeStubs
3335   // rather than BufferBlobs.  If the compiler needs all registers to
3336   // be preserved between the fault point and the exception handler
3337   // then it must assume responsibility for that in
3338   // AbstractCompiler::continuation_for_implicit_null_exception or
3339   // continuation_for_implicit_division_by_zero_exception. All other
3340   // implicit exceptions (e.g., NullPointerException or
3341   // AbstractMethodError on entry) are either at call sites or
3342   // otherwise assume that stack unwinding will be initiated, so
3343   // caller saved registers were assumed volatile in the compiler.
3344 
3345 #undef __
3346 #define __ masm->
3347 
3348   address generate_throw_exception(const char* name,
3349                                    address runtime_entry,
3350                                    Register arg1 = noreg,
3351                                    Register arg2 = noreg) {
3352     // Information about frame layout at time of blocking runtime call.
3353     // Note that we only have to preserve callee-saved registers since
3354     // the compilers are responsible for supplying a continuation point
3355     // if they expect all registers to be preserved.
3356     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
3357     enum layout {
3358       rfp_off = 0,
3359       rfp_off2,
3360       return_off,
3361       return_off2,
3362       framesize // inclusive of return address
3363     };
3364 
3365     int insts_size = 512;
3366     int locs_size  = 64;
3367 
3368     CodeBuffer code(name, insts_size, locs_size);
3369     OopMapSet* oop_maps  = new OopMapSet();
3370     MacroAssembler* masm = new MacroAssembler(&code);
3371 
3372     address start = __ pc();
3373 
3374     // This is an inlined and slightly modified version of call_VM
3375     // which has the ability to fetch the return PC out of
3376     // thread-local storage and also sets up last_Java_sp slightly
3377     // differently than the real call_VM
3378 
3379     __ enter(); // Save FP and LR before call
3380 
3381     assert(is_even(framesize/2), "sp not 16-byte aligned");
3382 
3383     // lr and fp are already in place
3384     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
3385 
3386     int frame_complete = __ pc() - start;
3387 
3388     // Set up last_Java_sp and last_Java_fp
3389     address the_pc = __ pc();
3390     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
3391 
3392     // Call runtime
3393     if (arg1 != noreg) {
3394       assert(arg2 != c_rarg1, "clobbered");
3395       __ mov(c_rarg1, arg1);
3396     }
3397     if (arg2 != noreg) {
3398       __ mov(c_rarg2, arg2);
3399     }
3400     __ mov(c_rarg0, rthread);
3401     BLOCK_COMMENT("call runtime_entry");
3402     __ mov(rscratch1, runtime_entry);
3403     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
3404 
3405     // Generate oop map
3406     OopMap* map = new OopMap(framesize, 0);
3407 
3408     oop_maps->add_gc_map(the_pc - start, map);
3409 
3410     __ reset_last_Java_frame(true, true);
3411     __ maybe_isb();
3412 
3413     __ leave();
3414 
3415     // check for pending exceptions
3416 #ifdef ASSERT
3417     Label L;
3418     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
3419     __ cbnz(rscratch1, L);
3420     __ should_not_reach_here();
3421     __ bind(L);
3422 #endif // ASSERT
3423     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3424 
3425 
3426     // codeBlob framesize is in words (not VMRegImpl::slot_size)
3427     RuntimeStub* stub =
3428       RuntimeStub::new_runtime_stub(name,
3429                                     &code,
3430                                     frame_complete,
3431                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3432                                     oop_maps, false);
3433     return stub->entry_point();
3434   }
3435 
3436   class MontgomeryMultiplyGenerator : public MacroAssembler {
3437 
3438     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3439       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
3440 
3441     RegSet _toSave;
3442     bool _squaring;
3443 
3444   public:
3445     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3446       : MacroAssembler(as->code()), _squaring(squaring) {
3447 
3448       // Register allocation
3449 
3450       Register reg = c_rarg0;
3451       Pa_base = reg;       // Argument registers
3452       if (squaring)
3453         Pb_base = Pa_base;
3454       else
3455         Pb_base = ++reg;
3456       Pn_base = ++reg;
3457       Rlen= ++reg;
3458       inv = ++reg;
3459       Pm_base = ++reg;
3460 
3461                           // Working registers:
3462       Ra =  ++reg;        // The current digit of a, b, n, and m.
3463       Rb =  ++reg;
3464       Rm =  ++reg;
3465       Rn =  ++reg;
3466 
3467       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
3468       Pb =  ++reg;
3469       Pm =  ++reg;
3470       Pn =  ++reg;
3471 
3472       t0 =  ++reg;        // Three registers which form a
3473       t1 =  ++reg;        // triple-precision accumulator.
3474       t2 =  ++reg;
3475 
3476       Ri =  ++reg;        // Inner and outer loop indexes.
3477       Rj =  ++reg;
3478 
3479       Rhi_ab = ++reg;     // Product registers: low and high parts
3480       Rlo_ab = ++reg;     // of a*b and m*n.
3481       Rhi_mn = ++reg;
3482       Rlo_mn = ++reg;
3483 
3484       // r19 and up are callee-saved.
3485       _toSave = RegSet::range(r19, reg) + Pm_base;
3486     }
3487 
3488   private:
3489     void save_regs() {
3490       push(_toSave, sp);
3491     }
3492 
3493     void restore_regs() {
3494       pop(_toSave, sp);
3495     }
3496 
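         // unroll_2: call the member function `block` exactly `count` times,
         // two calls per loop iteration.  If `count` is odd the loop is
         // entered at its second copy; `count` may be zero.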
3497     template <typename T>
3498     void unroll_2(Register count, T block) {
3499       Label loop, end, odd;
3500       tbnz(count, 0, odd);
3501       cbz(count, end);
3502       align(16);
3503       bind(loop);
3504       (this->*block)();
3505       bind(odd);
3506       (this->*block)();
3507       subs(count, count, 2);
3508       br(Assembler::GT, loop);
3509       bind(end);
3510     }
3511 
3512     template <typename T>
3513     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3514       Label loop, end, odd;
3515       tbnz(count, 0, odd);
3516       cbz(count, end);
3517       align(16);
3518       bind(loop);
3519       (this->*block)(d, s, tmp);
3520       bind(odd);
3521       (this->*block)(d, s, tmp);
3522       subs(count, count, 2);
3523       br(Assembler::GT, loop);
3524       bind(end);
3525     }
3526 
3527     void pre1(RegisterOrConstant i) {
3528       block_comment("pre1");
3529       // Pa = Pa_base;
3530       // Pb = Pb_base + i;
3531       // Pm = Pm_base;
3532       // Pn = Pn_base + i;
3533       // Ra = *Pa;
3534       // Rb = *Pb;
3535       // Rm = *Pm;
3536       // Rn = *Pn;
3537       ldr(Ra, Address(Pa_base));
3538       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3539       ldr(Rm, Address(Pm_base));
3540       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3541       lea(Pa, Address(Pa_base));
3542       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3543       lea(Pm, Address(Pm_base));
3544       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3545 
3546       // Zero the m*n result.
3547       mov(Rhi_mn, zr);
3548       mov(Rlo_mn, zr);
3549     }
3550 
3551     // The core multiply-accumulate step of a Montgomery
3552     // multiplication.  The idea is to schedule operations as a
3553     // pipeline so that instructions with long latencies (loads and
3554     // multiplies) have time to complete before their results are
3555     // used.  This most benefits in-order implementations of the
3556     // architecture but out-of-order ones also benefit.
3557     void step() {
3558       block_comment("step");
3559       // MACC(Ra, Rb, t0, t1, t2);
3560       // Ra = *++Pa;
3561       // Rb = *--Pb;
3562       umulh(Rhi_ab, Ra, Rb);
3563       mul(Rlo_ab, Ra, Rb);
3564       ldr(Ra, pre(Pa, wordSize));
3565       ldr(Rb, pre(Pb, -wordSize));
3566       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
3567                                        // previous iteration.
3568       // MACC(Rm, Rn, t0, t1, t2);
3569       // Rm = *++Pm;
3570       // Rn = *--Pn;
3571       umulh(Rhi_mn, Rm, Rn);
3572       mul(Rlo_mn, Rm, Rn);
3573       ldr(Rm, pre(Pm, wordSize));
3574       ldr(Rn, pre(Pn, -wordSize));
3575       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3576     }
3577 
3578     void post1() {
3579       block_comment("post1");
3580 
3581       // MACC(Ra, Rb, t0, t1, t2);
3582       // Ra = *++Pa;
3583       // Rb = *--Pb;
3584       umulh(Rhi_ab, Ra, Rb);
3585       mul(Rlo_ab, Ra, Rb);
3586       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3587       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3588 
3589       // *Pm = Rm = t0 * inv;
3590       mul(Rm, t0, inv);
3591       str(Rm, Address(Pm));
3592 
3593       // MACC(Rm, Rn, t0, t1, t2);
3594       // t0 = t1; t1 = t2; t2 = 0;
3595       umulh(Rhi_mn, Rm, Rn);
3596 
3597 #ifndef PRODUCT
3598       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
3599       {
3600         mul(Rlo_mn, Rm, Rn);
3601         add(Rlo_mn, t0, Rlo_mn);
3602         Label ok;
3603         cbz(Rlo_mn, ok); {
3604           stop("broken Montgomery multiply");
3605         } bind(ok);
3606       }
3607 #endif
3608       // We have very carefully set things up so that
3609       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
3610       // the lower half of Rm * Rn because we know the result already:
3611       // it must be -t0.  t0 + (-t0) must generate a carry iff
3612       // t0 != 0.  So, rather than do a mul and an adds we just set
3613       // the carry flag iff t0 is nonzero.
3614       //
3615       // mul(Rlo_mn, Rm, Rn);
3616       // adds(zr, t0, Rlo_mn);
3617       subs(zr, t0, 1); // Set carry iff t0 is nonzero
3618       adcs(t0, t1, Rhi_mn);
3619       adc(t1, t2, zr);
3620       mov(t2, zr);
3621     }
3622 
3623     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
3624       block_comment("pre2");
3625       // Pa = Pa_base + i-len;
3626       // Pb = Pb_base + len;
3627       // Pm = Pm_base + i-len;
3628       // Pn = Pn_base + len;
3629 
3630       if (i.is_register()) {
3631         sub(Rj, i.as_register(), len);
3632       } else {
3633         mov(Rj, i.as_constant());
3634         sub(Rj, Rj, len);
3635       }
3636       // Rj == i-len
3637 
3638       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
3639       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
3640       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
3641       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
3642 
3643       // Ra = *++Pa;
3644       // Rb = *--Pb;
3645       // Rm = *++Pm;
3646       // Rn = *--Pn;
3647       ldr(Ra, pre(Pa, wordSize));
3648       ldr(Rb, pre(Pb, -wordSize));
3649       ldr(Rm, pre(Pm, wordSize));
3650       ldr(Rn, pre(Pn, -wordSize));
3651 
3652       mov(Rhi_mn, zr);
3653       mov(Rlo_mn, zr);
3654     }
3655 
3656     void post2(RegisterOrConstant i, RegisterOrConstant len) {
3657       block_comment("post2");
3658       if (i.is_constant()) {
3659         mov(Rj, i.as_constant()-len.as_constant());
3660       } else {
3661         sub(Rj, i.as_register(), len);
3662       }
3663 
3664       adds(t0, t0, Rlo_mn); // The pending m*n, low part
3665 
3666       // As soon as we know the least significant digit of our result,
3667       // store it.
3668       // Pm_base[i-len] = t0;
3669       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
3670 
3671       // t0 = t1; t1 = t2; t2 = 0;
3672       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
3673       adc(t1, t2, zr);
3674       mov(t2, zr);
3675     }
3676 
3677     // A carry in t0 after Montgomery multiplication means that we
3678     // should subtract multiples of n from our result in m.  We'll
3679     // keep doing that until there is no carry.
3680     void normalize(RegisterOrConstant len) {
3681       block_comment("normalize");
3682       // while (t0)
3683       //   t0 = sub(Pm_base, Pn_base, t0, len);
3684       Label loop, post, again;
3685       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
3686       cbz(t0, post); {
3687         bind(again); {
3688           mov(i, zr);
3689           mov(cnt, len);
3690           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3691           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3692           subs(zr, zr, zr); // set carry flag, i.e. no borrow
3693           align(16);
3694           bind(loop); {
3695             sbcs(Rm, Rm, Rn);
3696             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3697             add(i, i, 1);
3698             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3699             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3700             sub(cnt, cnt, 1);
3701           } cbnz(cnt, loop);
3702           sbc(t0, t0, zr);
3703         } cbnz(t0, again);
3704       } bind(post);
3705     }
3706 
3707     // Move memory at s to d, reversing words.
3708     //    Increments d to end of copied memory
3709     //    Destroys tmp1, tmp2
3710     //    Preserves len
3711     //    Leaves s pointing to the address which was in d at start
3712     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
3713       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
3714 
3715       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
3716       mov(tmp1, len);
3717       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
3718       sub(s, d, len, ext::uxtw, LogBytesPerWord);
3719     }
3720     // where
3721     void reverse1(Register d, Register s, Register tmp) {
3722       ldr(tmp, pre(s, -wordSize));
3723       ror(tmp, tmp, 32);
3724       str(tmp, post(d, wordSize));
3725     }
3726 
3727     void step_squaring() {
3728       // An extra ACC
3729       step();
3730       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3731     }
3732 
3733     void last_squaring(RegisterOrConstant i) {
3734       Label dont;
3735       // if ((i & 1) == 0) {
3736       tbnz(i.as_register(), 0, dont); {
3737         // MACC(Ra, Rb, t0, t1, t2);
3738         // Ra = *++Pa;
3739         // Rb = *--Pb;
3740         umulh(Rhi_ab, Ra, Rb);
3741         mul(Rlo_ab, Ra, Rb);
3742         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3743       } bind(dont);
3744     }
3745 
3746     void extra_step_squaring() {
3747       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3748 
3749       // MACC(Rm, Rn, t0, t1, t2);
3750       // Rm = *++Pm;
3751       // Rn = *--Pn;
3752       umulh(Rhi_mn, Rm, Rn);
3753       mul(Rlo_mn, Rm, Rn);
3754       ldr(Rm, pre(Pm, wordSize));
3755       ldr(Rn, pre(Pn, -wordSize));
3756     }
3757 
3758     void post1_squaring() {
3759       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3760 
3761       // *Pm = Rm = t0 * inv;
3762       mul(Rm, t0, inv);
3763       str(Rm, Address(Pm));
3764 
3765       // MACC(Rm, Rn, t0, t1, t2);
3766       // t0 = t1; t1 = t2; t2 = 0;
3767       umulh(Rhi_mn, Rm, Rn);
3768 
3769 #ifndef PRODUCT
3770       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
3771       {
3772         mul(Rlo_mn, Rm, Rn);
3773         add(Rlo_mn, t0, Rlo_mn);
3774         Label ok;
3775         cbz(Rlo_mn, ok); {
3776           stop("broken Montgomery multiply");
3777         } bind(ok);
3778       }
3779 #endif
3780       // We have very carefully set things up so that
3781       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
3782       // the lower half of Rm * Rn because we know the result already:
3783       // it must be -t0.  t0 + (-t0) must generate a carry iff
3784       // t0 != 0.  So, rather than do a mul and an adds we just set
3785       // the carry flag iff t0 is nonzero.
3786       //
3787       // mul(Rlo_mn, Rm, Rn);
3788       // adds(zr, t0, Rlo_mn);
3789       subs(zr, t0, 1); // Set carry iff t0 is nonzero
3790       adcs(t0, t1, Rhi_mn);
3791       adc(t1, t2, zr);
3792       mov(t2, zr);
3793     }
3794 
3795     void acc(Register Rhi, Register Rlo,
3796              Register t0, Register t1, Register t2) {
3797       adds(t0, t0, Rlo);
3798       adcs(t1, t1, Rhi);
3799       adc(t2, t2, zr);
3800     }
3801 
3802   public:
3803     /**
3804      * Fast Montgomery multiplication.  The derivation of the
3805      * algorithm is in A Cryptographic Library for the Motorola
3806      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3807      *
3808      * Arguments:
3809      *
3810      * Inputs for multiplication:
3811      *   c_rarg0   - int array elements a
3812      *   c_rarg1   - int array elements b
3813      *   c_rarg2   - int array elements n (the modulus)
3814      *   c_rarg3   - int length
3815      *   c_rarg4   - int inv
3816      *   c_rarg5   - int array elements m (the result)
3817      *
3818      * Inputs for squaring:
3819      *   c_rarg0   - int array elements a
3820      *   c_rarg1   - int array elements n (the modulus)
3821      *   c_rarg2   - int length
3822      *   c_rarg3   - int inv
3823      *   c_rarg4   - int array elements m (the result)
3824      *
3825      */
3826     address generate_multiply() {
3827       Label argh, nothing;
3828       bind(argh);
3829       stop("MontgomeryMultiply total_allocation must be <= 8192");
3830 
3831       align(CodeEntryAlignment);
3832       address entry = pc();
3833 
3834       cbzw(Rlen, nothing);
3835 
3836       enter();
3837 
3838       // Make room.
3839       cmpw(Rlen, 512);
3840       br(Assembler::HI, argh);
3841       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
3842       andr(sp, Ra, -2 * wordSize);
3843 
3844       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
3845 
3846       {
3847         // Copy input args, reversing as we go.  We use Ra as a
3848         // temporary variable.
3849         reverse(Ra, Pa_base, Rlen, t0, t1);
3850         if (!_squaring)
3851           reverse(Ra, Pb_base, Rlen, t0, t1);
3852         reverse(Ra, Pn_base, Rlen, t0, t1);
3853       }
3854 
3855       // Push all callee-saved registers and also Pm_base which we'll need
3856       // at the end.
3857       save_regs();
3858 
3859 #ifndef PRODUCT
3860       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3861       {
3862         ldr(Rn, Address(Pn_base, 0));
3863         mul(Rlo_mn, Rn, inv);
3864         cmp(Rlo_mn, -1);
3865         Label ok;
3866         br(EQ, ok); {
3867           stop("broken inverse in Montgomery multiply");
3868         } bind(ok);
3869       }
3870 #endif
3871 
3872       mov(Pm_base, Ra);
3873 
3874       mov(t0, zr);
3875       mov(t1, zr);
3876       mov(t2, zr);
3877 
3878       block_comment("for (int i = 0; i < len; i++) {");
3879       mov(Ri, zr); {
3880         Label loop, end;
3881         cmpw(Ri, Rlen);
3882         br(Assembler::GE, end);
3883 
3884         bind(loop);
3885         pre1(Ri);
3886 
3887         block_comment("  for (j = i; j; j--) {"); {
3888           movw(Rj, Ri);
3889           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3890         } block_comment("  } // j");
3891 
3892         post1();
3893         addw(Ri, Ri, 1);
3894         cmpw(Ri, Rlen);
3895         br(Assembler::LT, loop);
3896         bind(end);
3897         block_comment("} // i");
3898       }
3899 
3900       block_comment("for (int i = len; i < 2*len; i++) {");
3901       mov(Ri, Rlen); {
3902         Label loop, end;
3903         cmpw(Ri, Rlen, Assembler::LSL, 1);
3904         br(Assembler::GE, end);
3905 
3906         bind(loop);
3907         pre2(Ri, Rlen);
3908 
3909         block_comment("  for (j = len*2-i-1; j; j--) {"); {
3910           lslw(Rj, Rlen, 1);
3911           subw(Rj, Rj, Ri);
3912           subw(Rj, Rj, 1);
3913           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3914         } block_comment("  } // j");
3915 
3916         post2(Ri, Rlen);
3917         addw(Ri, Ri, 1);
3918         cmpw(Ri, Rlen, Assembler::LSL, 1);
3919         br(Assembler::LT, loop);
3920         bind(end);
3921       }
3922       block_comment("} // i");
3923 
3924       normalize(Rlen);
3925 
3926       mov(Ra, Pm_base);  // Save Pm_base in Ra
3927       restore_regs();  // Restore caller's Pm_base
3928 
3929       // Copy our result into caller's Pm_base
3930       reverse(Pm_base, Ra, Rlen, t0, t1);
3931 
3932       leave();
3933       bind(nothing);
3934       ret(lr);
3935 
3936       return entry;
3937     }
3938     // In C, approximately:
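         // (MACC(A, B, t0, t1, t2) in this sketch is the multiply-accumulate
         //  implemented by step()/acc() above: the 128-bit product A*B is
         //  added into the triple-precision accumulator t0:t1:t2.  MACC2,
         //  used in the squaring sketch further down, adds the product twice.)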
3939 
3940     // void
3941     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
3942     //                     unsigned long Pn_base[], unsigned long Pm_base[],
3943     //                     unsigned long inv, int len) {
3944     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3945     //   unsigned long *Pa, *Pb, *Pn, *Pm;
3946     //   unsigned long Ra, Rb, Rn, Rm;
3947 
3948     //   int i;
3949 
3950     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
3951 
3952     //   for (i = 0; i < len; i++) {
3953     //     int j;
3954 
3955     //     Pa = Pa_base;
3956     //     Pb = Pb_base + i;
3957     //     Pm = Pm_base;
3958     //     Pn = Pn_base + i;
3959 
3960     //     Ra = *Pa;
3961     //     Rb = *Pb;
3962     //     Rm = *Pm;
3963     //     Rn = *Pn;
3964 
3965     //     int iters = i;
3966     //     for (j = 0; iters--; j++) {
3967     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
3968     //       MACC(Ra, Rb, t0, t1, t2);
3969     //       Ra = *++Pa;
3970     //       Rb = *--Pb;
3971     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
3972     //       MACC(Rm, Rn, t0, t1, t2);
3973     //       Rm = *++Pm;
3974     //       Rn = *--Pn;
3975     //     }
3976 
3977     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
3978     //     MACC(Ra, Rb, t0, t1, t2);
3979     //     *Pm = Rm = t0 * inv;
3980     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
3981     //     MACC(Rm, Rn, t0, t1, t2);
3982 
3983     //     assert(t0 == 0, "broken Montgomery multiply");
3984 
3985     //     t0 = t1; t1 = t2; t2 = 0;
3986     //   }
3987 
3988     //   for (i = len; i < 2*len; i++) {
3989     //     int j;
3990 
3991     //     Pa = Pa_base + i-len;
3992     //     Pb = Pb_base + len;
3993     //     Pm = Pm_base + i-len;
3994     //     Pn = Pn_base + len;
3995 
3996     //     Ra = *++Pa;
3997     //     Rb = *--Pb;
3998     //     Rm = *++Pm;
3999     //     Rn = *--Pn;
4000 
4001     //     int iters = len*2-i-1;
4002     //     for (j = i-len+1; iters--; j++) {
4003     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4004     //       MACC(Ra, Rb, t0, t1, t2);
4005     //       Ra = *++Pa;
4006     //       Rb = *--Pb;
4007     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4008     //       MACC(Rm, Rn, t0, t1, t2);
4009     //       Rm = *++Pm;
4010     //       Rn = *--Pn;
4011     //     }
4012 
4013     //     Pm_base[i-len] = t0;
4014     //     t0 = t1; t1 = t2; t2 = 0;
4015     //   }
4016 
4017     //   while (t0)
4018     //     t0 = sub(Pm_base, Pn_base, t0, len);
4019     // }
4020 
4021     /**
4022      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
4023      * multiplies than Montgomery multiplication so it should be up to
4024      * 25% faster.  However, its loop control is more complex and it
4025      * may actually run slower on some machines.
4026      *
4027      * Arguments:
4028      *
4029      * Inputs:
4030      *   c_rarg0   - int array elements a
4031      *   c_rarg1   - int array elements n (the modulus)
4032      *   c_rarg2   - int length
4033      *   c_rarg3   - int inv
4034      *   c_rarg4   - int array elements m (the result)
4035      *
4036      */
4037     address generate_square() {
4038       Label argh;
4039       bind(argh);
4040       stop("MontgomeryMultiply total_allocation must be <= 8192");
4041 
4042       align(CodeEntryAlignment);
4043       address entry = pc();
4044 
4045       enter();
4046 
4047       // Make room.
4048       cmpw(Rlen, 512);
4049       br(Assembler::HI, argh);
4050       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4051       andr(sp, Ra, -2 * wordSize);
4052 
4053       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4054 
4055       {
4056         // Copy input args, reversing as we go.  We use Ra as a
4057         // temporary variable.
4058         reverse(Ra, Pa_base, Rlen, t0, t1);
4059         reverse(Ra, Pn_base, Rlen, t0, t1);
4060       }
4061 
4062       // Push all callee-saved registers and also Pm_base which we'll need
4063       // at the end.
4064       save_regs();
4065 
4066       mov(Pm_base, Ra);
4067 
4068       mov(t0, zr);
4069       mov(t1, zr);
4070       mov(t2, zr);
4071 
4072       block_comment("for (int i = 0; i < len; i++) {");
4073       mov(Ri, zr); {
4074         Label loop, end;
4075         bind(loop);
4076         cmp(Ri, Rlen);
4077         br(Assembler::GE, end);
4078 
4079         pre1(Ri);
4080 
4081         block_comment("for (j = (i+1)/2; j; j--) {"); {
4082           add(Rj, Ri, 1);
4083           lsr(Rj, Rj, 1);
4084           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4085         } block_comment("  } // j");
4086 
4087         last_squaring(Ri);
4088 
4089         block_comment("  for (j = i/2; j; j--) {"); {
4090           lsr(Rj, Ri, 1);
4091           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4092         } block_comment("  } // j");
4093 
4094         post1_squaring();
4095         add(Ri, Ri, 1);
4096         cmp(Ri, Rlen);
4097         br(Assembler::LT, loop);
4098 
4099         bind(end);
4100         block_comment("} // i");
4101       }
4102 
4103       block_comment("for (int i = len; i < 2*len; i++) {");
4104       mov(Ri, Rlen); {
4105         Label loop, end;
4106         bind(loop);
4107         cmp(Ri, Rlen, Assembler::LSL, 1);
4108         br(Assembler::GE, end);
4109 
4110         pre2(Ri, Rlen);
4111 
4112         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4113           lsl(Rj, Rlen, 1);
4114           sub(Rj, Rj, Ri);
4115           sub(Rj, Rj, 1);
4116           lsr(Rj, Rj, 1);
4117           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4118         } block_comment("  } // j");
4119 
4120         last_squaring(Ri);
4121 
4122         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4123           lsl(Rj, Rlen, 1);
4124           sub(Rj, Rj, Ri);
4125           lsr(Rj, Rj, 1);
4126           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4127         } block_comment("  } // j");
4128 
4129         post2(Ri, Rlen);
4130         add(Ri, Ri, 1);
4131         cmp(Ri, Rlen, Assembler::LSL, 1);
4132 
4133         br(Assembler::LT, loop);
4134         bind(end);
4135         block_comment("} // i");
4136       }
4137 
4138       normalize(Rlen);
4139 
4140       mov(Ra, Pm_base);  // Save Pm_base in Ra
4141       restore_regs();  // Restore caller's Pm_base
4142 
4143       // Copy our result into caller's Pm_base
4144       reverse(Pm_base, Ra, Rlen, t0, t1);
4145 
4146       leave();
4147       ret(lr);
4148 
4149       return entry;
4150     }
4151     // In C, approximately:
4152 
4153     // void
4154     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4155     //                   unsigned long Pm_base[], unsigned long inv, int len) {
4156     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4157     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4158     //   unsigned long Ra, Rb, Rn, Rm;
4159 
4160     //   int i;
4161 
4162     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4163 
4164     //   for (i = 0; i < len; i++) {
4165     //     int j;
4166 
4167     //     Pa = Pa_base;
4168     //     Pb = Pa_base + i;
4169     //     Pm = Pm_base;
4170     //     Pn = Pn_base + i;
4171 
4172     //     Ra = *Pa;
4173     //     Rb = *Pb;
4174     //     Rm = *Pm;
4175     //     Rn = *Pn;
4176 
4177     //     int iters = (i+1)/2;
4178     //     for (j = 0; iters--; j++) {
4179     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4180     //       MACC2(Ra, Rb, t0, t1, t2);
4181     //       Ra = *++Pa;
4182     //       Rb = *--Pb;
4183     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4184     //       MACC(Rm, Rn, t0, t1, t2);
4185     //       Rm = *++Pm;
4186     //       Rn = *--Pn;
4187     //     }
4188     //     if ((i & 1) == 0) {
4189     //       assert(Ra == Pa_base[j], "must be");
4190     //       MACC(Ra, Ra, t0, t1, t2);
4191     //     }
4192     //     iters = i/2;
4193     //     assert(iters == i-j, "must be");
4194     //     for (; iters--; j++) {
4195     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4196     //       MACC(Rm, Rn, t0, t1, t2);
4197     //       Rm = *++Pm;
4198     //       Rn = *--Pn;
4199     //     }
4200 
4201     //     *Pm = Rm = t0 * inv;
4202     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4203     //     MACC(Rm, Rn, t0, t1, t2);
4204 
4205     //     assert(t0 == 0, "broken Montgomery multiply");
4206 
4207     //     t0 = t1; t1 = t2; t2 = 0;
4208     //   }
4209 
4210     //   for (i = len; i < 2*len; i++) {
4211     //     int start = i-len+1;
4212     //     int end = start + (len - start)/2;
4213     //     int j;
4214 
4215     //     Pa = Pa_base + i-len;
4216     //     Pb = Pa_base + len;
4217     //     Pm = Pm_base + i-len;
4218     //     Pn = Pn_base + len;
4219 
4220     //     Ra = *++Pa;
4221     //     Rb = *--Pb;
4222     //     Rm = *++Pm;
4223     //     Rn = *--Pn;
4224 
4225     //     int iters = (2*len-i-1)/2;
4226     //     assert(iters == end-start, "must be");
4227     //     for (j = start; iters--; j++) {
4228     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4229     //       MACC2(Ra, Rb, t0, t1, t2);
4230     //       Ra = *++Pa;
4231     //       Rb = *--Pb;
4232     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4233     //       MACC(Rm, Rn, t0, t1, t2);
4234     //       Rm = *++Pm;
4235     //       Rn = *--Pn;
4236     //     }
4237     //     if ((i & 1) == 0) {
4238     //       assert(Ra == Pa_base[j], "must be");
4239     //       MACC(Ra, Ra, t0, t1, t2);
4240     //     }
4241     //     iters =  (2*len-i)/2;
4242     //     assert(iters == len-j, "must be");
4243     //     for (; iters--; j++) {
4244     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4245     //       MACC(Rm, Rn, t0, t1, t2);
4246     //       Rm = *++Pm;
4247     //       Rn = *--Pn;
4248     //     }
4249     //     Pm_base[i-len] = t0;
4250     //     t0 = t1; t1 = t2; t2 = 0;
4251     //   }
4252 
4253     //   while (t0)
4254     //     t0 = sub(Pm_base, Pn_base, t0, len);
4255     // }
4256   };
4257 
4258   // Initialization
4259   void generate_initial() {
4260     // Generates the initial stubs and initializes the entry points
4261 
4262     // Entry points that exist on all platforms.  Note: this is code
4263     // that could be shared among different platforms - however the
4264     // benefit seems to be smaller than the disadvantage of having a
4265     // much more complicated generator structure. See also comment in
4266     // stubRoutines.hpp.
4267 
4268     StubRoutines::_forward_exception_entry = generate_forward_exception();
4269 
4270     StubRoutines::_call_stub_entry =
4271       generate_call_stub(StubRoutines::_call_stub_return_address);
4272 
4273     // is referenced by megamorphic call
4274     StubRoutines::_catch_exception_entry = generate_catch_exception();
4275 
4276     // Build this early so it's available for the interpreter.
4277     StubRoutines::_throw_StackOverflowError_entry =
4278       generate_throw_exception("StackOverflowError throw_exception",
4279                                CAST_FROM_FN_PTR(address,
4280                                                 SharedRuntime::
4281                                                 throw_StackOverflowError));
4282     if (UseCRC32Intrinsics) {
4283       // Set the table address before generating the stubs that use it.
4284       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4285       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4286     }
4287   }
4288 
4289   void generate_all() {
4290     // support for verify_oop (must happen after universe_init)
4291     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
4292     StubRoutines::_throw_AbstractMethodError_entry =
4293       generate_throw_exception("AbstractMethodError throw_exception",
4294                                CAST_FROM_FN_PTR(address,
4295                                                 SharedRuntime::
4296                                                 throw_AbstractMethodError));
4297 
4298     StubRoutines::_throw_IncompatibleClassChangeError_entry =
4299       generate_throw_exception("IncompatibleClassChangeError throw_exception",
4300                                CAST_FROM_FN_PTR(address,
4301                                                 SharedRuntime::
4302                                                 throw_IncompatibleClassChangeError));
4303 
4304     StubRoutines::_throw_NullPointerException_at_call_entry =
4305       generate_throw_exception("NullPointerException at call throw_exception",
4306                                CAST_FROM_FN_PTR(address,
4307                                                 SharedRuntime::
4308                                                 throw_NullPointerException_at_call));
4309 
4310     // arraycopy stubs used by compilers
4311     generate_arraycopy_stubs();
4312 
4313     if (UseMultiplyToLenIntrinsic) {
4314       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4315     }
4316 
4317     if (UseMontgomeryMultiplyIntrinsic) {
4318       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4319       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4320       StubRoutines::_montgomeryMultiply = g.generate_multiply();
4321     }
4322 
4323     if (UseMontgomerySquareIntrinsic) {
4324       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4325       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4326       // We use generate_multiply() rather than generate_square()
4327       // because it's faster for the sizes of modulus we care about.
4328       StubRoutines::_montgomerySquare = g.generate_multiply();
4329     }
4330 
4331 #ifndef BUILTIN_SIM
4332     // generate GHASH intrinsics code
4333     if (UseGHASHIntrinsics) {
4334       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
4335     }
4336 
4337     if (UseAESIntrinsics) {
4338       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4339       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4340       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4341       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
4342     }
4343 
4344     if (UseSHA1Intrinsics) {
4345       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
4346       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
4347     }
4348     if (UseSHA256Intrinsics) {
4349       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
4350       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
4351     }
4352 
4353     if (UseCRC32CIntrinsics) {
4354       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
4355     }
4356 
4357     // generate Adler32 intrinsics code
4358     if (UseAdler32Intrinsics) {
4359       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
4360     }
4361 
4362     // Safefetch stubs.
4363     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
4364                                                        &StubRoutines::_safefetch32_fault_pc,
4365                                                        &StubRoutines::_safefetch32_continuation_pc);
4366     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
4367                                                        &StubRoutines::_safefetchN_fault_pc,
4368                                                        &StubRoutines::_safefetchN_continuation_pc);
4369 #endif
4370   }
4371 
4372  public:
4373   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
4374     if (all) {
4375       generate_all();
4376     } else {
4377       generate_initial();
4378     }
4379   }
4380 }; // end class declaration
4381 
4382 void StubGenerator_generate(CodeBuffer* code, bool all) {
4383   StubGenerator g(code, all);
4384 }