1 /*
   2  * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "interpreter/interpreter.hpp"
  30 #include "nativeInst_aarch64.hpp"
  31 #include "oops/instanceOop.hpp"
  32 #include "oops/method.hpp"
  33 #include "oops/objArrayKlass.hpp"
  34 #include "oops/oop.inline.hpp"
  35 #include "prims/methodHandles.hpp"
  36 #include "runtime/frame.inline.hpp"
  37 #include "runtime/handles.inline.hpp"
  38 #include "runtime/sharedRuntime.hpp"
  39 #include "runtime/stubCodeGenerator.hpp"
  40 #include "runtime/stubRoutines.hpp"
  41 #include "runtime/thread.inline.hpp"
  42 #ifdef COMPILER2
  43 #include "opto/runtime.hpp"
  44 #endif
  45 
  46 #ifdef BUILTIN_SIM
  47 #include "../../../../../../simulator/simulator.hpp"
  48 #endif
  49 
  50 // Declaration and definition of StubGenerator (no .hpp file).
  51 // For a more detailed description of the stub routine structure
  52 // see the comment in stubRoutines.hpp
  53 
  54 #undef __
  55 #define __ _masm->
  56 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  57 
  58 #ifdef PRODUCT
  59 #define BLOCK_COMMENT(str) /* nothing */
  60 #else
  61 #define BLOCK_COMMENT(str) __ block_comment(str)
  62 #endif
  63 
  64 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  65 
  66 // Stub Code definitions
  67 
  68 class StubGenerator: public StubCodeGenerator {
  69  private:
  70 
  71 #ifdef PRODUCT
  72 #define inc_counter_np(counter) ((void)0)
  73 #else
  74   void inc_counter_np_(int& counter) {
  75     __ lea(rscratch2, ExternalAddress((address)&counter));
  76     __ ldrw(rscratch1, Address(rscratch2));
  77     __ addw(rscratch1, rscratch1, 1);
  78     __ strw(rscratch1, Address(rscratch2));
  79   }
  80 #define inc_counter_np(counter) \
  81   BLOCK_COMMENT("inc_counter " #counter); \
  82   inc_counter_np_(counter);
  83 #endif
  84 
  85   // Call stubs are used to call Java from C
  86   //
  87   // Arguments:
  88   //    c_rarg0:   call wrapper address                   address
  89   //    c_rarg1:   result                                 address
  90   //    c_rarg2:   result type                            BasicType
  91   //    c_rarg3:   method                                 Method*
  92   //    c_rarg4:   (interpreter) entry point              address
  93   //    c_rarg5:   parameters                             intptr_t*
  94   //    c_rarg6:   parameter size (in words)              int
  95   //    c_rarg7:   thread                                 Thread*
  96   //
  97   // There is no return from the stub itself as any Java result
  98   // is written to result
  99   //
  // we save r30 (lr) as the return PC at the base of the frame and
  // save r29 (fp) immediately below it, then install sp (r31) into fp
  // so that fp becomes the frame pointer.
 103   //
 104   // we save r0-r7, which accounts for all the c arguments.
 105   //
 106   // TODO: strictly do we need to save them all? they are treated as
 107   // volatile by C so could we omit saving the ones we are going to
 108   // place in global registers (thread? method?) or those we only use
 109   // during setup of the Java call?
 110   //
 111   // we don't need to save r8 which C uses as an indirect result location
 112   // return register.
 113   //
 114   // we don't need to save r9-r15 which both C and Java treat as
 115   // volatile
 116   //
 117   // we don't need to save r16-18 because Java does not use them
 118   //
 119   // we save r19-r28 which Java uses as scratch registers and C
 120   // expects to be callee-save
 121   //
 122   // we save the bottom 64 bits of each value stored in v8-v15; it is
 123   // the responsibility of the caller to preserve larger values.
 124   //
 125   // so the stub frame looks like this when we enter Java code
 126   //
 127   //     [ return_from_Java     ] <--- sp
 128   //     [ argument word n      ]
 129   //      ...
 130   // -27 [ argument word 1      ]
 131   // -26 [ saved v15            ] <--- sp_after_call
 132   // -25 [ saved v14            ]
 133   // -24 [ saved v13            ]
 134   // -23 [ saved v12            ]
 135   // -22 [ saved v11            ]
 136   // -21 [ saved v10            ]
 137   // -20 [ saved v9             ]
 138   // -19 [ saved v8             ]
 139   // -18 [ saved r28            ]
 140   // -17 [ saved r27            ]
 141   // -16 [ saved r26            ]
 142   // -15 [ saved r25            ]
 143   // -14 [ saved r24            ]
 144   // -13 [ saved r23            ]
 145   // -12 [ saved r22            ]
 146   // -11 [ saved r21            ]
 147   // -10 [ saved r20            ]
 148   //  -9 [ saved r19            ]
 149   //  -8 [ call wrapper    (r0) ]
 150   //  -7 [ result          (r1) ]
 151   //  -6 [ result type     (r2) ]
 152   //  -5 [ method          (r3) ]
 153   //  -4 [ entry point     (r4) ]
 154   //  -3 [ parameters      (r5) ]
 155   //  -2 [ parameter size  (r6) ]
 156   //  -1 [ thread (r7)          ]
 157   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 158   //   1 [ saved lr       (r30) ]
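  //
  // For reference, the C++ caller reaches this stub through a function
  // pointer whose eight arguments map one-to-one onto c_rarg0..c_rarg7
  // above.  A sketch of that signature (the authoritative version is the
  // CallStub typedef in stubRoutines.hpp):
  //
  //   typedef void (*CallStub)(address   link,               // c_rarg0
  //                            intptr_t* result,             // c_rarg1
  //                            BasicType result_type,        // c_rarg2
  //                            Method*   method,             // c_rarg3
  //                            address   entry_point,        // c_rarg4
  //                            intptr_t* parameters,         // c_rarg5
  //                            int       size_of_parameters, // c_rarg6
  //                            TRAPS);                       // c_rarg7 carries the thread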
 159 
 160   // Call stub stack layout word offsets from fp
 161   enum call_stub_layout {
 162     sp_after_call_off = -26,
 163 
 164     d15_off            = -26,
 165     d13_off            = -24,
 166     d11_off            = -22,
 167     d9_off             = -20,
 168 
 169     r28_off            = -18,
 170     r26_off            = -16,
 171     r24_off            = -14,
 172     r22_off            = -12,
 173     r20_off            = -10,
 174     call_wrapper_off   =  -8,
 175     result_off         =  -7,
 176     result_type_off    =  -6,
 177     method_off         =  -5,
 178     entry_point_off    =  -4,
 179     parameter_size_off =  -2,
 180     thread_off         =  -1,
 181     fp_f               =   0,
 182     retaddr_off        =   1,
 183   };
 184 
 185   address generate_call_stub(address& return_address) {
 186     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 187            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 188            "adjust this code");
 189 
 190     StubCodeMark mark(this, "StubRoutines", "call_stub");
 191     address start = __ pc();
 192 
 193     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 194 
 195     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 196     const Address result        (rfp, result_off         * wordSize);
 197     const Address result_type   (rfp, result_type_off    * wordSize);
 198     const Address method        (rfp, method_off         * wordSize);
 199     const Address entry_point   (rfp, entry_point_off    * wordSize);
 200     const Address parameter_size(rfp, parameter_size_off * wordSize);
 201 
 202     const Address thread        (rfp, thread_off         * wordSize);
 203 
 204     const Address d15_save      (rfp, d15_off * wordSize);
 205     const Address d13_save      (rfp, d13_off * wordSize);
 206     const Address d11_save      (rfp, d11_off * wordSize);
 207     const Address d9_save       (rfp, d9_off * wordSize);
 208 
 209     const Address r28_save      (rfp, r28_off * wordSize);
 210     const Address r26_save      (rfp, r26_off * wordSize);
 211     const Address r24_save      (rfp, r24_off * wordSize);
 212     const Address r22_save      (rfp, r22_off * wordSize);
 213     const Address r20_save      (rfp, r20_off * wordSize);
 214 
 215     // stub code
 216 
 217     // we need a C prolog to bootstrap the x86 caller into the sim
 218     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 219 
 220     address aarch64_entry = __ pc();
 221 
 222 #ifdef BUILTIN_SIM
 223     // Save sender's SP for stack traces.
 224     __ mov(rscratch1, sp);
 225     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 226 #endif
 227     // set up frame and move sp to end of save area
 228     __ enter();
 229     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 230 
 231     // save register parameters and Java scratch/global registers
 232     // n.b. we save thread even though it gets installed in
 233     // rthread because we want to sanity check rthread later
 234     __ str(c_rarg7,  thread);
 235     __ strw(c_rarg6, parameter_size);
 236     __ stp(c_rarg4, c_rarg5,  entry_point);
 237     __ stp(c_rarg2, c_rarg3,  result_type);
 238     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 239 
 240     __ stp(r20, r19,   r20_save);
 241     __ stp(r22, r21,   r22_save);
 242     __ stp(r24, r23,   r24_save);
 243     __ stp(r26, r25,   r26_save);
 244     __ stp(r28, r27,   r28_save);
 245 
 246     __ stpd(v9,  v8,   d9_save);
 247     __ stpd(v11, v10,  d11_save);
 248     __ stpd(v13, v12,  d13_save);
 249     __ stpd(v15, v14,  d15_save);
 250 
 251     // install Java thread in global register now we have saved
 252     // whatever value it held
 253     __ mov(rthread, c_rarg7);
 254     // And method
 255     __ mov(rmethod, c_rarg3);
 256 
 257     // set up the heapbase register
 258     __ reinit_heapbase();
 259 
 260 #ifdef ASSERT
 261     // make sure we have no pending exceptions
 262     {
 263       Label L;
 264       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 265       __ cmp(rscratch1, (unsigned)NULL_WORD);
 266       __ br(Assembler::EQ, L);
 267       __ stop("StubRoutines::call_stub: entered with pending exception");
 268       __ BIND(L);
 269     }
 270 #endif
 271     // pass parameters if any
 272     __ mov(esp, sp);
 273     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 274     __ andr(sp, rscratch1, -2 * wordSize);
 275 
 276     BLOCK_COMMENT("pass parameters if any");
 277     Label parameters_done;
 278     // parameter count is still in c_rarg6
 279     // and parameter pointer identifying param 1 is in c_rarg5
 280     __ cbzw(c_rarg6, parameters_done);
 281 
 282     address loop = __ pc();
 283     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 284     __ subsw(c_rarg6, c_rarg6, 1);
 285     __ push(rscratch1);
 286     __ br(Assembler::GT, loop);
 287 
 288     __ BIND(parameters_done);
 289 
    // call Java entry -- passing Method*, and current sp
 291     //      rmethod: Method*
 292     //      r13: sender sp
 293     BLOCK_COMMENT("call Java function");
 294     __ mov(r13, sp);
 295     __ blr(c_rarg4);
 296 
 297     // tell the simulator we have returned to the stub
 298 
 299     // we do this here because the notify will already have been done
 300     // if we get to the next instruction via an exception
 301     //
 302     // n.b. adding this instruction here affects the calculation of
 303     // whether or not a routine returns to the call stub (used when
 304     // doing stack walks) since the normal test is to check the return
 305     // pc against the address saved below. so we may need to allow for
 306     // this extra instruction in the check.
 307 
 308     if (NotifySimulator) {
 309       __ notify(Assembler::method_reentry);
 310     }
 311     // save current address for use by exception handling code
 312 
 313     return_address = __ pc();
 314 
 315     // store result depending on type (everything that is not
 316     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 317     // n.b. this assumes Java returns an integral result in r0
 318     // and a floating result in j_farg0
 319     __ ldr(j_rarg2, result);
 320     Label is_long, is_float, is_double, exit;
 321     __ ldr(j_rarg1, result_type);
 322     __ cmp(j_rarg1, T_OBJECT);
 323     __ br(Assembler::EQ, is_long);
 324     __ cmp(j_rarg1, T_LONG);
 325     __ br(Assembler::EQ, is_long);
 326     __ cmp(j_rarg1, T_FLOAT);
 327     __ br(Assembler::EQ, is_float);
 328     __ cmp(j_rarg1, T_DOUBLE);
 329     __ br(Assembler::EQ, is_double);
 330 
 331     // handle T_INT case
 332     __ strw(r0, Address(j_rarg2));
 333 
 334     __ BIND(exit);
 335 
 336     // pop parameters
 337     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 338 
 339 #ifdef ASSERT
 340     // verify that threads correspond
 341     {
 342       Label L, S;
 343       __ ldr(rscratch1, thread);
 344       __ cmp(rthread, rscratch1);
 345       __ br(Assembler::NE, S);
 346       __ get_thread(rscratch1);
 347       __ cmp(rthread, rscratch1);
 348       __ br(Assembler::EQ, L);
 349       __ BIND(S);
 350       __ stop("StubRoutines::call_stub: threads must correspond");
 351       __ BIND(L);
 352     }
 353 #endif
 354 
 355     // restore callee-save registers
 356     __ ldpd(v15, v14,  d15_save);
 357     __ ldpd(v13, v12,  d13_save);
 358     __ ldpd(v11, v10,  d11_save);
 359     __ ldpd(v9,  v8,   d9_save);
 360 
 361     __ ldp(r28, r27,   r28_save);
 362     __ ldp(r26, r25,   r26_save);
 363     __ ldp(r24, r23,   r24_save);
 364     __ ldp(r22, r21,   r22_save);
 365     __ ldp(r20, r19,   r20_save);
 366 
 367     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 368     __ ldrw(c_rarg2, result_type);
 369     __ ldr(c_rarg3,  method);
 370     __ ldp(c_rarg4, c_rarg5,  entry_point);
 371     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 372 
 373 #ifndef PRODUCT
 374     // tell the simulator we are about to end Java execution
 375     if (NotifySimulator) {
 376       __ notify(Assembler::method_exit);
 377     }
 378 #endif
 379     // leave frame and return to caller
 380     __ leave();
 381     __ ret(lr);
 382 
 383     // handle return types different from T_INT
 384 
 385     __ BIND(is_long);
 386     __ str(r0, Address(j_rarg2, 0));
 387     __ br(Assembler::AL, exit);
 388 
 389     __ BIND(is_float);
 390     __ strs(j_farg0, Address(j_rarg2, 0));
 391     __ br(Assembler::AL, exit);
 392 
 393     __ BIND(is_double);
 394     __ strd(j_farg0, Address(j_rarg2, 0));
 395     __ br(Assembler::AL, exit);
 396 
 397     return start;
 398   }
 399 
 400   // Return point for a Java call if there's an exception thrown in
 401   // Java code.  The exception is caught and transformed into a
 402   // pending exception stored in JavaThread that can be tested from
 403   // within the VM.
 404   //
 405   // Note: Usually the parameters are removed by the callee. In case
 406   // of an exception crossing an activation frame boundary, that is
 407   // not the case if the callee is compiled code => need to setup the
 408   // rsp.
 409   //
 410   // r0: exception oop
 411 
 412   // NOTE: this is used as a target from the signal handler so it
 413   // needs an x86 prolog which returns into the current simulator
 414   // executing the generated catch_exception code. so the prolog
 415   // needs to install rax in a sim register and adjust the sim's
 416   // restart pc to enter the generated code at the start position
 417   // then return from native to simulated execution.
 418 
 419   address generate_catch_exception() {
 420     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 421     address start = __ pc();
 422 
 423     // same as in generate_call_stub():
 424     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 425     const Address thread        (rfp, thread_off         * wordSize);
 426 
 427 #ifdef ASSERT
 428     // verify that threads correspond
 429     {
 430       Label L, S;
 431       __ ldr(rscratch1, thread);
 432       __ cmp(rthread, rscratch1);
 433       __ br(Assembler::NE, S);
 434       __ get_thread(rscratch1);
 435       __ cmp(rthread, rscratch1);
 436       __ br(Assembler::EQ, L);
 437       __ bind(S);
 438       __ stop("StubRoutines::catch_exception: threads must correspond");
 439       __ bind(L);
 440     }
 441 #endif
 442 
 443     // set pending exception
 444     __ verify_oop(r0);
 445 
 446     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 447     __ mov(rscratch1, (address)__FILE__);
 448     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 449     __ movw(rscratch1, (int)__LINE__);
 450     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 451 
 452     // complete return to VM
 453     assert(StubRoutines::_call_stub_return_address != NULL,
 454            "_call_stub_return_address must have been generated before");
 455     __ b(StubRoutines::_call_stub_return_address);
 456 
 457     return start;
 458   }
 459 
 460   // Continuation point for runtime calls returning with a pending
 461   // exception.  The pending exception check happened in the runtime
 462   // or native call stub.  The pending exception in Thread is
 463   // converted into a Java-level exception.
 464   //
 465   // Contract with Java-level exception handlers:
 466   // r0: exception
 467   // r3: throwing pc
 468   //
 469   // NOTE: At entry of this stub, exception-pc must be in LR !!
 470 
 471   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog
 473 
 474   address generate_forward_exception() {
 475     StubCodeMark mark(this, "StubRoutines", "forward exception");
 476     address start = __ pc();
 477 
 478     // Upon entry, LR points to the return address returning into
 479     // Java (interpreted or compiled) code; i.e., the return address
 480     // becomes the throwing pc.
 481     //
 482     // Arguments pushed before the runtime call are still on the stack
 483     // but the exception handler will reset the stack pointer ->
 484     // ignore them.  A potential result in registers can be ignored as
 485     // well.
 486 
 487 #ifdef ASSERT
 488     // make sure this code is only executed if there is a pending exception
 489     {
 490       Label L;
 491       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 492       __ cbnz(rscratch1, L);
 493       __ stop("StubRoutines::forward exception: no pending exception (1)");
 494       __ bind(L);
 495     }
 496 #endif
 497 
 498     // compute exception handler into r19
 499 
 500     // call the VM to find the handler address associated with the
 501     // caller address. pass thread in r0 and caller pc (ret address)
 502     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 503     // the stack.
 504     __ mov(c_rarg1, lr);
 505     // lr will be trashed by the VM call so we move it to R19
 506     // (callee-saved) because we also need to pass it to the handler
 507     // returned by this call.
 508     __ mov(r19, lr);
 509     BLOCK_COMMENT("call exception_handler_for_return_address");
 510     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 511                          SharedRuntime::exception_handler_for_return_address),
 512                     rthread, c_rarg1);
 513     // we should not really care that lr is no longer the callee
 514     // address. we saved the value the handler needs in r19 so we can
 515     // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
 517     // the PC for the frame above the handler belongs to a compiled
 518     // Java method. So, we restore lr here to satisfy that assert.
 519     __ mov(lr, r19);
 520     // setup r0 & r3 & clear pending exception
 521     __ mov(r3, r19);
 522     __ mov(r19, r0);
 523     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 524     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 525 
 526 #ifdef ASSERT
 527     // make sure exception is set
 528     {
 529       Label L;
 530       __ cbnz(r0, L);
 531       __ stop("StubRoutines::forward exception: no pending exception (2)");
 532       __ bind(L);
 533     }
 534 #endif
 535 
 536     // continue at exception handler
 537     // r0: exception
 538     // r3: throwing pc
 539     // r19: exception handler
 540     __ verify_oop(r0);
 541     __ br(r19);
 542 
 543     return start;
 544   }
 545 
 546   // Non-destructive plausibility checks for oops
 547   //
 548   // Arguments:
 549   //    r0: oop to verify
 550   //    rscratch1: error message
 551   //
 552   // Stack after saving c_rarg3:
 553   //    [tos + 0]: saved c_rarg3
 554   //    [tos + 1]: saved c_rarg2
 555   //    [tos + 2]: saved lr
 556   //    [tos + 3]: saved rscratch2
 557   //    [tos + 4]: saved r0
 558   //    [tos + 5]: saved rscratch1
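  //
  // In effect the stub performs the following check (a C sketch, not the
  // generated code):
  //
  //   bool looks_like_oop(uintptr_t obj) {
  //     if (obj == 0) return true;                       // NULL is always OK
  //     if ((obj & Universe::verify_oop_mask()) !=
  //         Universe::verify_oop_bits())  return false;  // wrong address range
  //     return klass_of(obj) != 0;                       // klass must be non-NULL
  //   }
  //
  // where klass_of stands in for the load_klass sequence below; eor/cbnz is
  // used rather than a flag-setting compare because the caller's flags are
  // live across this stub.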
 559   address generate_verify_oop() {
 560 
 561     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 562     address start = __ pc();
 563 
 564     Label exit, error;
 565 
 566     // save c_rarg2 and c_rarg3
 567     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 568 
 569     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 570     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 571     __ ldr(c_rarg3, Address(c_rarg2));
 572     __ add(c_rarg3, c_rarg3, 1);
 573     __ str(c_rarg3, Address(c_rarg2));
 574 
 575     // object is in r0
 576     // make sure object is 'reasonable'
 577     __ cbz(r0, exit); // if obj is NULL it is OK
 578 
 579     // Check if the oop is in the right area of memory
 580     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 581     __ andr(c_rarg2, r0, c_rarg3);
 582     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 583 
 584     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 585     // instruction here because the flags register is live.
 586     __ eor(c_rarg2, c_rarg2, c_rarg3);
 587     __ cbnz(c_rarg2, error);
 588 
    // make sure klass is 'reasonable', i.e. not NULL.
 590     __ load_klass(r0, r0);  // get klass
 591     __ cbz(r0, error);      // if klass is NULL it is broken
 592 
 593     // return if everything seems ok
 594     __ bind(exit);
 595 
 596     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 597     __ ret(lr);
 598 
 599     // handle errors
 600     __ bind(error);
 601     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 602 
 603     __ push(RegSet::range(r0, r29), sp);
 604     // debug(char* msg, int64_t pc, int64_t regs[])
 605     __ mov(c_rarg0, rscratch1);      // pass address of error message
 606     __ mov(c_rarg1, lr);             // pass return address
 607     __ mov(c_rarg2, sp);             // pass address of regs on stack
 608 #ifndef PRODUCT
 609     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 610 #endif
 611     BLOCK_COMMENT("call MacroAssembler::debug");
 612     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 613     __ blrt(rscratch1, 3, 0, 1);
 614 
 615     return start;
 616   }
 617 
 618   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 619 
 620   // Generate code for an array write pre barrier
 621   //
 622   //     addr    -  starting address
 623   //     count   -  element count
 624   //     tmp     - scratch register
 625   //
 626   //     Destroy no registers except rscratch1 and rscratch2
 627   //
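  //  For the G1 case the code below amounts to a single leaf call,
  //  roughly (a sketch; the real entry point is
  //  BarrierSet::static_write_ref_array_pre):
  //
  //     static_write_ref_array_pre(addr, count);  // c_rarg0 = addr, c_rarg1 = count
  //
  //  so the register shuffle only has to move the two values into the
  //  argument registers without clobbering either of them on the way.
  //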
 628   void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
 629     BarrierSet* bs = Universe::heap()->barrier_set();
 630     switch (bs->kind()) {
 631     case BarrierSet::G1SATBCTLogging:
      // With G1, don't generate the call if we statically know that the target is uninitialized
 633       if (!dest_uninitialized) {
 634         __ push_call_clobbered_registers();
 635         if (count == c_rarg0) {
 636           if (addr == c_rarg1) {
 637             // exactly backwards!!
 638             __ mov(rscratch1, c_rarg0);
 639             __ mov(c_rarg0, c_rarg1);
 640             __ mov(c_rarg1, rscratch1);
 641           } else {
 642             __ mov(c_rarg1, count);
 643             __ mov(c_rarg0, addr);
 644           }
 645         } else {
 646           __ mov(c_rarg0, addr);
 647           __ mov(c_rarg1, count);
 648         }
 649         __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
 650         __ pop_call_clobbered_registers();
 651         break;
 652       case BarrierSet::CardTableForRS:
 653       case BarrierSet::CardTableExtension:
 654       case BarrierSet::ModRef:
 655         break;
 656       default:
 657         ShouldNotReachHere();
 658 
 659       }
 660     }
 661   }
 662 
 663   //
 664   // Generate code for an array write post barrier
 665   //
 666   //  Input:
 667   //     start    - register containing starting address of destination array
 668   //     end      - register containing ending address of destination array
 669   //     scratch  - scratch register
 670   //
 671   //  The input registers are overwritten.
 672   //  The ending address is inclusive.
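  //
  //  For the card-table kinds the generated loop is equivalent to the
  //  following sketch (using the CardTableModRefBS names):
  //
  //     for (uintptr_t card  = (uintptr_t)start >> card_shift;
  //                    card <= (uintptr_t)end   >> card_shift; card++)
  //       byte_map_base[card] = 0;   // dirty; the stub stores zr
  //
  //  i.e. every card spanned by [start, end] is dirtied.  The G1 case
  //  instead makes a leaf call to BarrierSet::static_write_ref_array_post.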
 673   void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
 674     assert_different_registers(start, end, scratch);
 675     BarrierSet* bs = Universe::heap()->barrier_set();
 676     switch (bs->kind()) {
 677       case BarrierSet::G1SATBCTLogging:
 678 
 679         {
 680           __ push_call_clobbered_registers();
 681           // must compute element count unless barrier set interface is changed (other platforms supply count)
 682           assert_different_registers(start, end, scratch);
 683           __ lea(scratch, Address(end, BytesPerHeapOop));
 684           __ sub(scratch, scratch, start);               // subtract start to get #bytes
 685           __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
 686           __ mov(c_rarg0, start);
 687           __ mov(c_rarg1, scratch);
 688           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
 689           __ pop_call_clobbered_registers();
 690         }
 691         break;
 692       case BarrierSet::CardTableForRS:
 693       case BarrierSet::CardTableExtension:
 694         {
 695           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
 696           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
 697 
 698           Label L_loop;
 699 
 700            __ lsr(start, start, CardTableModRefBS::card_shift);
 701            __ lsr(end, end, CardTableModRefBS::card_shift);
 702            __ sub(end, end, start); // number of bytes to copy
 703 
 704           const Register count = end; // 'end' register contains bytes count now
 705           __ load_byte_map_base(scratch);
 706           __ add(start, start, scratch);
 707           if (UseConcMarkSweepGC) {
 708             __ membar(__ StoreStore);
 709           }
 710           __ BIND(L_loop);
 711           __ strb(zr, Address(start, count));
 712           __ subs(count, count, 1);
 713           __ br(Assembler::GE, L_loop);
 714         }
 715         break;
 716       default:
 717         ShouldNotReachHere();
 718 
 719     }
 720   }
 721 
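  // Zero words of memory using the DC ZVA instruction.
  //
  //   base - start address; must already be 16-byte aligned, otherwise
  //          the stub returns immediately and leaves all work to the caller
  //   cnt  - count of 8-byte words to zero
  //
  // The stub first zeroes enough leading words with stp(zr, zr) stores to
  // reach a ZVA-block boundary, then issues one "dc zva" per block.  On
  // return base has been advanced past the zeroed region and cnt holds the
  // number of trailing words not covered by DC ZVA.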
 722   address generate_zero_longs(Register base, Register cnt) {
 723     Register tmp = rscratch1;
 724     Register tmp2 = rscratch2;
 725     int zva_length = VM_Version::zva_length();
 726     Label initial_table_end, loop_zva;
 727     Label fini;
 728 
 729     __ align(CodeEntryAlignment);
 730     StubCodeMark mark(this, "StubRoutines", "zero_longs");
 731     address start = __ pc();
 732 
 733     // Base must be 16 byte aligned. If not just return and let caller handle it
 734     __ tst(base, 0x0f);
 735     __ br(Assembler::NE, fini);
 736     // Align base with ZVA length.
 737     __ neg(tmp, base);
 738     __ andr(tmp, tmp, zva_length - 1);
 739 
 740     // tmp: the number of bytes to be filled to align the base with ZVA length.
 741     __ add(base, base, tmp);
 742     __ sub(cnt, cnt, tmp, Assembler::ASR, 3);
 743     __ adr(tmp2, initial_table_end);
 744     __ sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
 745     __ br(tmp2);
 746 
 747     for (int i = -zva_length + 16; i < 0; i += 16)
 748       __ stp(zr, zr, Address(base, i));
 749     __ bind(initial_table_end);
 750 
 751     __ sub(cnt, cnt, zva_length >> 3);
 752     __ bind(loop_zva);
 753     __ dc(Assembler::ZVA, base);
 754     __ subs(cnt, cnt, zva_length >> 3);
 755     __ add(base, base, zva_length);
 756     __ br(Assembler::GE, loop_zva);
 757     __ add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
 758     __ bind(fini);
 759     __ ret(lr);
 760 
 761     return start;
 762   }
 763 
 764   typedef enum {
 765     copy_forwards = 1,
 766     copy_backwards = -1
 767   } copy_direction;
 768 
 769   // Bulk copy of blocks of 8 words.
 770   //
 771   // count is a count of words.
 772   //
 773   // Precondition: count >= 8
 774   //
 775   // Postconditions:
 776   //
 777   // The least significant bit of count contains the remaining count
 778   // of words to copy.  The rest of count is trash.
 779   //
 780   // s and d are adjusted to point to the remaining words to copy
 781   //
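  // In outline the generated code does (a sketch that ignores the
  // prefetching and the SIMD variant):
  //
  //   do { copy 8 words; count -= 8; } while (count >= 8);
  //   if (count & 4) copy 4 words;
  //   if (count & 2) copy 2 words;
  //   // a possible final word (count & 1) is left for the caller
  //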
 782   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 783                            copy_direction direction) {
 784     int unit = wordSize * direction;
 785     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 786 
 787     int offset;
 788     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 789       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 790     const Register stride = r13;
 791 
 792     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 793     assert_different_registers(s, d, count, rscratch1);
 794 
 795     Label again, drain;
 796     const char *stub_name;
 797     if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
 799     else
 800       stub_name = "backward_copy_longs";
 801     StubCodeMark mark(this, "StubRoutines", stub_name);
 802     __ align(CodeEntryAlignment);
 803     __ bind(start);
 804     if (direction == copy_forwards) {
 805       __ sub(s, s, bias);
 806       __ sub(d, d, bias);
 807     }
 808 
 809 #ifdef ASSERT
 810     // Make sure we are never given < 8 words
 811     {
 812       Label L;
 813       __ cmp(count, 8);
 814       __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
 816       __ bind(L);
 817     }
 818 #endif
 819 
 820     // Fill 8 registers
 821     if (UseSIMDForMemoryOps) {
 822       __ ldpq(v0, v1, Address(s, 4 * unit));
 823       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 824     } else {
 825       __ ldp(t0, t1, Address(s, 2 * unit));
 826       __ ldp(t2, t3, Address(s, 4 * unit));
 827       __ ldp(t4, t5, Address(s, 6 * unit));
 828       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 829     }
 830 
 831     __ subs(count, count, 16);
 832     __ br(Assembler::LO, drain);
 833 
 834     int prefetch = PrefetchCopyIntervalInBytes;
 835     bool use_stride = false;
 836     if (direction == copy_backwards) {
 837        use_stride = prefetch > 256;
 838        prefetch = -prefetch;
 839        if (use_stride) __ mov(stride, prefetch);
 840     }
 841 
 842     __ bind(again);
 843 
 844     if (PrefetchCopyIntervalInBytes > 0)
 845       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 846 
 847     if (UseSIMDForMemoryOps) {
 848       __ stpq(v0, v1, Address(d, 4 * unit));
 849       __ ldpq(v0, v1, Address(s, 4 * unit));
 850       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 851       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 852     } else {
 853       __ stp(t0, t1, Address(d, 2 * unit));
 854       __ ldp(t0, t1, Address(s, 2 * unit));
 855       __ stp(t2, t3, Address(d, 4 * unit));
 856       __ ldp(t2, t3, Address(s, 4 * unit));
 857       __ stp(t4, t5, Address(d, 6 * unit));
 858       __ ldp(t4, t5, Address(s, 6 * unit));
 859       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 860       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 861     }
 862 
 863     __ subs(count, count, 8);
 864     __ br(Assembler::HS, again);
 865 
 866     // Drain
 867     __ bind(drain);
 868     if (UseSIMDForMemoryOps) {
 869       __ stpq(v0, v1, Address(d, 4 * unit));
 870       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 871     } else {
 872       __ stp(t0, t1, Address(d, 2 * unit));
 873       __ stp(t2, t3, Address(d, 4 * unit));
 874       __ stp(t4, t5, Address(d, 6 * unit));
 875       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 876     }
 877 
 878     {
 879       Label L1, L2;
 880       __ tbz(count, exact_log2(4), L1);
 881       if (UseSIMDForMemoryOps) {
 882         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 883         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 884       } else {
 885         __ ldp(t0, t1, Address(s, 2 * unit));
 886         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 887         __ stp(t0, t1, Address(d, 2 * unit));
 888         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 889       }
 890       __ bind(L1);
 891 
 892       if (direction == copy_forwards) {
 893         __ add(s, s, bias);
 894         __ add(d, d, bias);
 895       }
 896 
 897       __ tbz(count, 1, L2);
 898       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 899       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 900       __ bind(L2);
 901     }
 902 
 903     __ ret(lr);
 904   }
 905 
 906   // Small copy: less than 16 bytes.
 907   //
 908   // NB: Ignores all of the bits of count which represent more than 15
 909   // bytes, so a caller doesn't have to mask them.
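  //
  // For example, a byte copy (granularity == 1) with count == 13 (0b1101)
  // moves one 8-byte word, then a 4-byte int, skips the 2-byte halfword,
  // and finishes with a single byte -- one test-and-branch per bit of count.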
 910 
 911   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
 912     bool is_backwards = step < 0;
 913     size_t granularity = uabs(step);
 914     int direction = is_backwards ? -1 : 1;
 915     int unit = wordSize * direction;
 916 
 917     Label Lpair, Lword, Lint, Lshort, Lbyte;
 918 
 919     assert(granularity
 920            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 921 
 922     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
 923 
 924     // ??? I don't know if this bit-test-and-branch is the right thing
 925     // to do.  It does a lot of jumping, resulting in several
 926     // mispredicted branches.  It might make more sense to do this
 927     // with something like Duff's device with a single computed branch.
 928 
 929     __ tbz(count, 3 - exact_log2(granularity), Lword);
 930     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
 931     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
 932     __ bind(Lword);
 933 
 934     if (granularity <= sizeof (jint)) {
 935       __ tbz(count, 2 - exact_log2(granularity), Lint);
 936       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 937       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 938       __ bind(Lint);
 939     }
 940 
 941     if (granularity <= sizeof (jshort)) {
 942       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 943       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 944       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 945       __ bind(Lshort);
 946     }
 947 
 948     if (granularity <= sizeof (jbyte)) {
 949       __ tbz(count, 0, Lbyte);
 950       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 951       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 952       __ bind(Lbyte);
 953     }
 954   }
 955 
 956   Label copy_f, copy_b;
 957 
 958   // All-singing all-dancing memory copy.
 959   //
 960   // Copy count units of memory from s to d.  The size of a unit is
 961   // step, which can be positive or negative depending on the direction
 962   // of copy.  If is_aligned is false, we align the source address.
 963   //
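  // Dispatch, in outline (a sketch):
  //
  //   if (count * granularity <= 96 bytes (80 without SIMD))
  //     copy everything inline, loading all of the data before storing
  //     any of it, so the direction of overlap does not matter;
  //   else
  //     align s to a 2-word boundary (copy_memory_small), bulk-copy
  //     8-word blocks through the shared copy_f/copy_b stubs, then
  //     finish the tail with copy_memory_small again.
  //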
 964 
 965   void copy_memory(bool is_aligned, Register s, Register d,
 966                    Register count, Register tmp, int step) {
 967     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 968     bool is_backwards = step < 0;
 969     int granularity = uabs(step);
 970     const Register t0 = r3, t1 = r4;
 971 
 972     // <= 96 bytes do inline. Direction doesn't matter because we always
 973     // load all the data before writing anything
 974     Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
 975     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
 976     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
 977     const Register send = r17, dend = r18;
 978 
 979     if (PrefetchCopyIntervalInBytes > 0)
 980       __ prfm(Address(s, 0), PLDL1KEEP);
 981     __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
 982     __ br(Assembler::HI, copy_big);
 983 
 984     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 985     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
 986 
 987     __ cmp(count, 16/granularity);
 988     __ br(Assembler::LS, copy16);
 989 
 990     __ cmp(count, 64/granularity);
 991     __ br(Assembler::HI, copy80);
 992 
 993     __ cmp(count, 32/granularity);
 994     __ br(Assembler::LS, copy32);
 995 
 996     // 33..64 bytes
 997     if (UseSIMDForMemoryOps) {
 998       __ ldpq(v0, v1, Address(s, 0));
 999       __ ldpq(v2, v3, Address(send, -32));
1000       __ stpq(v0, v1, Address(d, 0));
1001       __ stpq(v2, v3, Address(dend, -32));
1002     } else {
1003       __ ldp(t0, t1, Address(s, 0));
1004       __ ldp(t2, t3, Address(s, 16));
1005       __ ldp(t4, t5, Address(send, -32));
1006       __ ldp(t6, t7, Address(send, -16));
1007 
1008       __ stp(t0, t1, Address(d, 0));
1009       __ stp(t2, t3, Address(d, 16));
1010       __ stp(t4, t5, Address(dend, -32));
1011       __ stp(t6, t7, Address(dend, -16));
1012     }
1013     __ b(finish);
1014 
1015     // 17..32 bytes
1016     __ bind(copy32);
1017     __ ldp(t0, t1, Address(s, 0));
1018     __ ldp(t2, t3, Address(send, -16));
1019     __ stp(t0, t1, Address(d, 0));
1020     __ stp(t2, t3, Address(dend, -16));
1021     __ b(finish);
1022 
1023     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
1025     __ bind(copy80);
1026     if (UseSIMDForMemoryOps) {
1027       __ ldpq(v0, v1, Address(s, 0));
1028       __ ldpq(v2, v3, Address(s, 32));
1029       __ ldpq(v4, v5, Address(send, -32));
1030       __ stpq(v0, v1, Address(d, 0));
1031       __ stpq(v2, v3, Address(d, 32));
1032       __ stpq(v4, v5, Address(dend, -32));
1033     } else {
1034       __ ldp(t0, t1, Address(s, 0));
1035       __ ldp(t2, t3, Address(s, 16));
1036       __ ldp(t4, t5, Address(s, 32));
1037       __ ldp(t6, t7, Address(s, 48));
1038       __ ldp(t8, t9, Address(send, -16));
1039 
1040       __ stp(t0, t1, Address(d, 0));
1041       __ stp(t2, t3, Address(d, 16));
1042       __ stp(t4, t5, Address(d, 32));
1043       __ stp(t6, t7, Address(d, 48));
1044       __ stp(t8, t9, Address(dend, -16));
1045     }
1046     __ b(finish);
1047 
1048     // 0..16 bytes
1049     __ bind(copy16);
1050     __ cmp(count, 8/granularity);
1051     __ br(Assembler::LO, copy8);
1052 
1053     // 8..16 bytes
1054     __ ldr(t0, Address(s, 0));
1055     __ ldr(t1, Address(send, -8));
1056     __ str(t0, Address(d, 0));
1057     __ str(t1, Address(dend, -8));
1058     __ b(finish);
1059 
1060     if (granularity < 8) {
1061       // 4..7 bytes
1062       __ bind(copy8);
1063       __ tbz(count, 2 - exact_log2(granularity), copy4);
1064       __ ldrw(t0, Address(s, 0));
1065       __ ldrw(t1, Address(send, -4));
1066       __ strw(t0, Address(d, 0));
1067       __ strw(t1, Address(dend, -4));
1068       __ b(finish);
1069       if (granularity < 4) {
1070         // 0..3 bytes
1071         __ bind(copy4);
1072         __ cbz(count, finish); // get rid of 0 case
1073         if (granularity == 2) {
1074           __ ldrh(t0, Address(s, 0));
1075           __ strh(t0, Address(d, 0));
1076         } else { // granularity == 1
1077           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1078           // the first and last byte.
1079           // Handle the 3 byte case by loading and storing base + count/2
1080           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean that in the 1 byte case we load/store the same
1082           // byte 3 times.
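          // Worked example: count == 3 copies s[0]->d[0], s[2]->d[2]
          // (via the end pointers) and s[1]->d[1] (via count >> 1);
          // count == 1 simply stores s[0] three times over.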
1083           __ lsr(count, count, 1);
1084           __ ldrb(t0, Address(s, 0));
1085           __ ldrb(t1, Address(send, -1));
1086           __ ldrb(t2, Address(s, count));
1087           __ strb(t0, Address(d, 0));
1088           __ strb(t1, Address(dend, -1));
1089           __ strb(t2, Address(d, count));
1090         }
1091         __ b(finish);
1092       }
1093     }
1094 
1095     __ bind(copy_big);
1096     if (is_backwards) {
1097       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1098       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1099     }
1100 
    // Now that we've got the small case out of the way, we can align
    // the source address on a 2-word boundary.
1103 
1104     Label aligned;
1105 
1106     if (is_aligned) {
1107       // We may have to adjust by 1 word to get s 2-word-aligned.
1108       __ tbz(s, exact_log2(wordSize), aligned);
1109       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1110       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1111       __ sub(count, count, wordSize/granularity);
1112     } else {
1113       if (is_backwards) {
1114         __ andr(rscratch2, s, 2 * wordSize - 1);
1115       } else {
1116         __ neg(rscratch2, s);
1117         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1118       }
1119       // rscratch2 is the byte adjustment needed to align s.
1120       __ cbz(rscratch2, aligned);
1121       int shift = exact_log2(granularity);
1122       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1123       __ sub(count, count, rscratch2);
1124 
1125 #if 0
1126       // ?? This code is only correct for a disjoint copy.  It may or
1127       // may not make sense to use it in that case.
1128 
1129       // Copy the first pair; s and d may not be aligned.
1130       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1131       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1132 
1133       // Align s and d, adjust count
1134       if (is_backwards) {
1135         __ sub(s, s, rscratch2);
1136         __ sub(d, d, rscratch2);
1137       } else {
1138         __ add(s, s, rscratch2);
1139         __ add(d, d, rscratch2);
1140       }
1141 #else
1142       copy_memory_small(s, d, rscratch2, rscratch1, step);
1143 #endif
1144     }
1145 
1146     __ bind(aligned);
1147 
1148     // s is now 2-word-aligned.
1149 
1150     // We have a count of units and some trailing bytes.  Adjust the
1151     // count and do a bulk copy of words.
1152     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1153     if (direction == copy_forwards)
1154       __ bl(copy_f);
1155     else
1156       __ bl(copy_b);
1157 
1158     // And the tail.
1159     copy_memory_small(s, d, count, tmp, step);
1160 
1161     if (granularity >= 8) __ bind(copy8);
1162     if (granularity >= 4) __ bind(copy4);
1163     __ bind(finish);
1164   }
1165 
1166 
1167   void clobber_registers() {
1168 #ifdef ASSERT
1169     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1170     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1171     for (Register r = r3; r <= r18; r++)
1172       if (r != rscratch1) __ mov(r, rscratch1);
1173 #endif
1174   }
1175 
1176   // Scan over array at a for count oops, verifying each one.
1177   // Preserves a and count, clobbers rscratch1 and rscratch2.
1178   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1179     Label loop, end;
1180     __ mov(rscratch1, a);
1181     __ mov(rscratch2, zr);
1182     __ bind(loop);
1183     __ cmp(rscratch2, count);
1184     __ br(Assembler::HS, end);
1185     if (size == (size_t)wordSize) {
1186       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1187       __ verify_oop(temp);
1188     } else {
1189       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1190       __ decode_heap_oop(temp); // calls verify_oop
1191     }
1192     __ add(rscratch2, rscratch2, size);
1193     __ b(loop);
1194     __ bind(end);
1195   }
1196 
1197   // Arguments:
1198   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1199   //             ignored
1200   //   is_oop  - true => oop array, so generate store check code
1201   //   name    - stub name string
1202   //
1203   // Inputs:
1204   //   c_rarg0   - source array address
1205   //   c_rarg1   - destination array address
1206   //   c_rarg2   - element count, treated as ssize_t, can be zero
1207   //
1208   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1209   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1211   //
1212   // Side Effects:
1213   //   disjoint_int_copy_entry is set to the no-overlap entry point
1214   //   used by generate_conjoint_int_oop_copy().
1215   //
1216   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1217                                   const char *name, bool dest_uninitialized = false) {
1218     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1219     __ align(CodeEntryAlignment);
1220     StubCodeMark mark(this, "StubRoutines", name);
1221     address start = __ pc();
1222     __ enter();
1223 
1224     if (entry != NULL) {
1225       *entry = __ pc();
1226       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1227       BLOCK_COMMENT("Entry:");
1228     }
1229 
1230     if (is_oop) {
1231       __ push(RegSet::of(d, count), sp);
1232       // no registers are destroyed by this call
1233       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1234     }
1235     copy_memory(aligned, s, d, count, rscratch1, size);
1236     if (is_oop) {
1237       __ pop(RegSet::of(d, count), sp);
1238       if (VerifyOops)
1239         verify_oop_array(size, d, count, r16);
1240       __ sub(count, count, 1); // make an inclusive end pointer
1241       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1242       gen_write_ref_array_post_barrier(d, count, rscratch1);
1243     }
1244     __ leave();
1245     __ mov(r0, zr); // return 0
1246     __ ret(lr);
1247 #ifdef BUILTIN_SIM
1248     {
1249       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1250       sim->notifyCompile(const_cast<char*>(name), start);
1251     }
1252 #endif
1253     return start;
1254   }
1255 
1256   // Arguments:
1257   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1258   //             ignored
1259   //   is_oop  - true => oop array, so generate store check code
1260   //   name    - stub name string
1261   //
1262   // Inputs:
1263   //   c_rarg0   - source array address
1264   //   c_rarg1   - destination array address
1265   //   c_rarg2   - element count, treated as ssize_t, can be zero
1266   //
1267   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1268   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1270   //
1271   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1272                                  address *entry, const char *name,
1273                                  bool dest_uninitialized = false) {
1274     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1275 
1276     StubCodeMark mark(this, "StubRoutines", name);
1277     address start = __ pc();
1278     __ enter();
1279 
1280     if (entry != NULL) {
1281       *entry = __ pc();
1282       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1283       BLOCK_COMMENT("Entry:");
1284     }
1285 
1286     // use fwd copy when (d-s) above_equal (count*size)
1287     __ sub(rscratch1, d, s);
1288     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1289     __ br(Assembler::HS, nooverlap_target);
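    // e.g. a byte copy with s = 0x1000, d = 0x1008, count = 16:
    // d - s == 8 is below 16 so the regions overlap and we fall through
    // to the backwards copy; with d at 0x1010 or above we would have
    // branched to the no-overlap (forwards) entry point instead.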
1290 
1291     if (is_oop) {
1292       __ push(RegSet::of(d, count), sp);
1293       // no registers are destroyed by this call
1294       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1295     }
1296     copy_memory(aligned, s, d, count, rscratch1, -size);
1297     if (is_oop) {
1298       __ pop(RegSet::of(d, count), sp);
1299       if (VerifyOops)
1300         verify_oop_array(size, d, count, r16);
1301       __ sub(count, count, 1); // make an inclusive end pointer
1302       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1303       gen_write_ref_array_post_barrier(d, count, rscratch1);
1304     }
1305     __ leave();
1306     __ mov(r0, zr); // return 0
1307     __ ret(lr);
1308 #ifdef BUILTIN_SIM
1309     {
1310       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1311       sim->notifyCompile(const_cast<char*>(name), start);
1312     }
1313 #endif
1314     return start;
  }
1316 
1317   // Arguments:
1318   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1319   //             ignored
1320   //   name    - stub name string
1321   //
1322   // Inputs:
1323   //   c_rarg0   - source array address
1324   //   c_rarg1   - destination array address
1325   //   c_rarg2   - element count, treated as ssize_t, can be zero
1326   //
1327   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1328   // we let the hardware handle it.  The one to eight bytes within words,
1329   // dwords or qwords that span cache line boundaries will still be loaded
1330   // and stored atomically.
1331   //
1339   // Side Effects:
1340   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1341   //   used by generate_conjoint_byte_copy().
1342   //
1343   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1344     const bool not_oop = false;
1345     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1346   }
1347 
1348   // Arguments:
1349   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1350   //             ignored
1351   //   name    - stub name string
1352   //
1353   // Inputs:
1354   //   c_rarg0   - source array address
1355   //   c_rarg1   - destination array address
1356   //   c_rarg2   - element count, treated as ssize_t, can be zero
1357   //
1358   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1359   // we let the hardware handle it.  The one to eight bytes within words,
1360   // dwords or qwords that span cache line boundaries will still be loaded
1361   // and stored atomically.
1362   //
1363   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1364                                       address* entry, const char *name) {
1365     const bool not_oop = false;
1366     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1367   }
1368 
1369   // Arguments:
1370   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1371   //             ignored
1372   //   name    - stub name string
1373   //
1374   // Inputs:
1375   //   c_rarg0   - source array address
1376   //   c_rarg1   - destination array address
1377   //   c_rarg2   - element count, treated as ssize_t, can be zero
1378   //
1379   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1380   // let the hardware handle it.  The two or four words within dwords
1381   // or qwords that span cache line boundaries will still be loaded
1382   // and stored atomically.
1383   //
1384   // Side Effects:
1385   //   disjoint_short_copy_entry is set to the no-overlap entry point
1386   //   used by generate_conjoint_short_copy().
1387   //
1388   address generate_disjoint_short_copy(bool aligned,
1389                                        address* entry, const char *name) {
1390     const bool not_oop = false;
1391     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1392   }
1393 
1394   // Arguments:
1395   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1396   //             ignored
1397   //   name    - stub name string
1398   //
1399   // Inputs:
1400   //   c_rarg0   - source array address
1401   //   c_rarg1   - destination array address
1402   //   c_rarg2   - element count, treated as ssize_t, can be zero
1403   //
1404   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1405   // let the hardware handle it.  The two or four words within dwords
1406   // or qwords that span cache line boundaries will still be loaded
1407   // and stored atomically.
1408   //
1409   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1410                                        address *entry, const char *name) {
1411     const bool not_oop = false;
1412     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1413 
1414   }
1415   // Arguments:
1416   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1417   //             ignored
1418   //   name    - stub name string
1419   //
1420   // Inputs:
1421   //   c_rarg0   - source array address
1422   //   c_rarg1   - destination array address
1423   //   c_rarg2   - element count, treated as ssize_t, can be zero
1424   //
1425   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1426   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1428   //
1429   // Side Effects:
1430   //   disjoint_int_copy_entry is set to the no-overlap entry point
1431   //   used by generate_conjoint_int_oop_copy().
1432   //
1433   address generate_disjoint_int_copy(bool aligned, address *entry,
1434                                          const char *name, bool dest_uninitialized = false) {
1435     const bool not_oop = false;
1436     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1437   }
1438 
1439   // Arguments:
1440   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1441   //             ignored
1442   //   name    - stub name string
1443   //
1444   // Inputs:
1445   //   c_rarg0   - source array address
1446   //   c_rarg1   - destination array address
1447   //   c_rarg2   - element count, treated as ssize_t, can be zero
1448   //
1449   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1450   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1452   //
1453   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1454                                      address *entry, const char *name,
1455                                      bool dest_uninitialized = false) {
1456     const bool not_oop = false;
1457     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1458   }
1459 
1460 
1461   // Arguments:
1462   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1463   //             ignored
1464   //   name    - stub name string
1465   //
1466   // Inputs:
1467   //   c_rarg0   - source array address
1468   //   c_rarg1   - destination array address
1469   //   c_rarg2   - element count, treated as size_t, can be zero
1470   //
1471   // Side Effects:
1472   //   disjoint_long_copy_entry is set to the no-overlap entry point
1473   //   used by generate_conjoint_long_copy().
1474   //
1475   address generate_disjoint_long_copy(bool aligned, address *entry,
1476                                           const char *name, bool dest_uninitialized = false) {
1477     const bool not_oop = false;
1478     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1479   }
1480 
1481   // Arguments:
1482   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1483   //             ignored
1484   //   name    - stub name string
1485   //
1486   // Inputs:
1487   //   c_rarg0   - source array address
1488   //   c_rarg1   - destination array address
1489   //   c_rarg2   - element count, treated as size_t, can be zero
1490   //
1491   address generate_conjoint_long_copy(bool aligned,
1492                                       address nooverlap_target, address *entry,
1493                                       const char *name, bool dest_uninitialized = false) {
1494     const bool not_oop = false;
1495     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1496   }
1497 
1498   // Arguments:
1499   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1500   //             ignored
1501   //   name    - stub name string
1502   //
1503   // Inputs:
1504   //   c_rarg0   - source array address
1505   //   c_rarg1   - destination array address
1506   //   c_rarg2   - element count, treated as size_t, can be zero
1507   //
1508   // Side Effects:
1509   //   disjoint_oop_copy_entry is set to the no-overlap entry point
1510   //   used by generate_conjoint_oop_copy().
1511   //
1512   address generate_disjoint_oop_copy(bool aligned, address *entry,
1513                                      const char *name, bool dest_uninitialized) {
1514     const bool is_oop = true;
1515     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1516     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1517   }
1518 
1519   // Arguments:
1520   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1521   //             ignored
1522   //   name    - stub name string
1523   //
1524   // Inputs:
1525   //   c_rarg0   - source array address
1526   //   c_rarg1   - destination array address
1527   //   c_rarg2   - element count, treated as size_t, can be zero
1528   //
1529   address generate_conjoint_oop_copy(bool aligned,
1530                                      address nooverlap_target, address *entry,
1531                                      const char *name, bool dest_uninitialized) {
1532     const bool is_oop = true;
1533     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1534     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1535                                   name, dest_uninitialized);
1536   }
1537 
1538 
1539   // Helper for generating a dynamic type check.
1540   // Smashes rscratch1.
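       //
       // A rough C-like sketch of the check emitted below (a reading aid,
       // assuming the usual HotSpot super_check_offset convention; it is not
       // the exact instruction sequence):
       //
       //   if (*(Klass**)(sub_klass + super_check_offset) == super_klass)
       //     goto L_success;                        // fast path hit
       //   if (super_check_offset == secondary_super_cache_offset &&
       //       linear scan of sub_klass->secondary_supers() finds super_klass)
       //     goto L_success;                        // slow path hit
       //   // otherwise fall through to L_miss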
1541   void generate_type_check(Register sub_klass,
1542                            Register super_check_offset,
1543                            Register super_klass,
1544                            Label& L_success) {
1545     assert_different_registers(sub_klass, super_check_offset, super_klass);
1546 
1547     BLOCK_COMMENT("type_check:");
1548 
1549     Label L_miss;
1550 
1551     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1552                                      super_check_offset);
1553     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1554 
1555     // Fall through on failure!
1556     __ BIND(L_miss);
1557   }
1558 
1559   //
1560   //  Generate checkcasting array copy stub
1561   //
1562   //  Input:
1563   //    c_rarg0   - source array address
1564   //    c_rarg1   - destination array address
1565   //    c_rarg2   - element count, treated as ssize_t, can be zero
1566   //    c_rarg3   - size_t ckoff (super_check_offset)
1567   //    c_rarg4   - oop ckval (super_klass)
1568   //
1569   //  Output:
1570   //    r0 ==  0  -  success
1571   //    r0 == -1^K - failure, where K is partial transfer count
1572   //
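       //  For example (just spelling out the encoding): if 3 of 10 elements
       //  were copied before a failing element was hit, K == 3 and the stub
       //  returns -1^3 == ~3 == -4; the caller can recover K as ~r0.
       //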
1573   address generate_checkcast_copy(const char *name, address *entry,
1574                                   bool dest_uninitialized = false) {
1575 
1576     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1577 
1578     // Input registers (after setup_arg_regs)
1579     const Register from        = c_rarg0;   // source array address
1580     const Register to          = c_rarg1;   // destination array address
1581     const Register count       = c_rarg2;   // elements count
1582     const Register ckoff       = c_rarg3;   // super_check_offset
1583     const Register ckval       = c_rarg4;   // super_klass
1584 
1585     // Registers used as temps (r18, r19, r20 are save-on-entry)
1586     const Register count_save  = r21;       // original elements count
1587     const Register start_to    = r20;       // destination array start address
1588     const Register copied_oop  = r18;       // actual oop copied
1589     const Register r19_klass   = r19;       // oop._klass
1590 
1591     //---------------------------------------------------------------
1592     // Assembler stub will be used for this call to arraycopy
1593     // if the two arrays are subtypes of Object[] but the
1594     // destination array type is not equal to or a supertype
1595     // of the source type.  Each element must be separately
1596     // checked.
1597 
1598     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1599                                copied_oop, r19_klass, count_save);
1600 
1601     __ align(CodeEntryAlignment);
1602     StubCodeMark mark(this, "StubRoutines", name);
1603     address start = __ pc();
1604 
1605     __ enter(); // required for proper stackwalking of RuntimeStub frame
1606 
1607 #ifdef ASSERT
1608     // caller guarantees that the arrays really are different
1609     // otherwise, we would have to make conjoint checks
1610     { Label L;
1611       array_overlap_test(L, TIMES_OOP);
1612       __ stop("checkcast_copy within a single array");
1613       __ bind(L);
1614     }
1615 #endif //ASSERT
1616 
1617     // Caller of this entry point must set up the argument registers.
1618     if (entry != NULL) {
1619       *entry = __ pc();
1620       BLOCK_COMMENT("Entry:");
1621     }
1622 
1623      // Empty array:  Nothing to do.
1624     __ cbz(count, L_done);
1625 
1626     __ push(RegSet::of(r18, r19, r20, r21), sp);
1627 
1628 #ifdef ASSERT
1629     BLOCK_COMMENT("assert consistent ckoff/ckval");
1630     // The ckoff and ckval must be mutually consistent,
1631     // even though caller generates both.
1632     { Label L;
1633       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1634       __ ldrw(start_to, Address(ckval, sco_offset));
1635       __ cmpw(ckoff, start_to);
1636       __ br(Assembler::EQ, L);
1637       __ stop("super_check_offset inconsistent");
1638       __ bind(L);
1639     }
1640 #endif //ASSERT
1641 
1642     gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1643 
1644     // save the original count
1645     __ mov(count_save, count);
1646 
1647     // Copy from low to high addresses
1648     __ mov(start_to, to);              // Save destination array start address
1649     __ b(L_load_element);
1650 
1651     // ======== begin loop ========
1652     // (Loop is rotated; its entry is L_load_element.)
1653     // Loop control:
1654     //   for (; count != 0; count--) {
1655     //     copied_oop = load_heap_oop(from++);
1656     //     ... generate_type_check ...;
1657     //     store_heap_oop(to++, copied_oop);
1658     //   }
1659     __ align(OptoLoopAlignment);
1660 
1661     __ BIND(L_store_element);
1662     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1663     __ sub(count, count, 1);
1664     __ cbz(count, L_do_card_marks);
1665 
1666     // ======== loop entry is here ========
1667     __ BIND(L_load_element);
1668     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1669     __ cbz(copied_oop, L_store_element);
1670 
1671     __ load_klass(r19_klass, copied_oop);// query the object klass
1672     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1673     // ======== end loop ========
1674 
1675     // It was a real error; we must depend on the caller to finish the job.
1676     // Register count = remaining oops, count_orig = total oops.
1677     // Emit GC store barriers for the oops we have copied and report
1678     // their number to the caller.
1679 
1680     __ subs(count, count_save, count);     // K = partially copied oop count
1681     __ eon(count, count, zr);                   // report (-1^K) to caller
1682     __ br(Assembler::EQ, L_done_pop);
1683 
1684     __ BIND(L_do_card_marks);
1685     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1686     gen_write_ref_array_post_barrier(start_to, to, rscratch1);
1687 
1688     __ bind(L_done_pop);
1689     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1690     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1691 
1692     __ bind(L_done);
1693     __ mov(r0, count);
1694     __ leave();
1695     __ ret(lr);
1696 
1697     return start;
1698   }
1699 
1700   // Perform range checks on the proposed arraycopy.
1701   // Kills temp, but nothing else.
1702   // Also, clean the sign bits of src_pos and dst_pos.
1703   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1704                               Register src_pos, // source position (c_rarg1)
1705                               Register dst,     // destination array oop (c_rarg2)
1706                               Register dst_pos, // destination position (c_rarg3)
1707                               Register length,
1708                               Register temp,
1709                               Label& L_failed) {
1710     BLOCK_COMMENT("arraycopy_range_checks:");
1711 
1712     assert_different_registers(rscratch1, temp);
1713 
1714     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1715     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1716     __ addw(temp, length, src_pos);
1717     __ cmpw(temp, rscratch1);
1718     __ br(Assembler::HI, L_failed);
1719 
1720     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1721     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1722     __ addw(temp, length, dst_pos);
1723     __ cmpw(temp, rscratch1);
1724     __ br(Assembler::HI, L_failed);
1725 
1726     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1727     __ movw(src_pos, src_pos);
1728     __ movw(dst_pos, dst_pos);
1729 
1730     BLOCK_COMMENT("arraycopy_range_checks done");
1731   }
1732 
1733   // These stubs are currently only reached from a trivial test routine,
1734   // so this placeholder just asserts the degenerate (count == 0) case.
1735   // It can be fleshed out once a real caller needs it.
1736   static void fake_arraycopy_stub(address src, address dst, int count) {
1737     assert(count == 0, "huh?");
1738   }
1739 
1740 
1741   //
1742   //  Generate 'unsafe' array copy stub
1743   //  Though just as safe as the other stubs, it takes an unscaled
1744   //  size_t argument instead of an element count.
1745   //
1746   //  Input:
1747   //    c_rarg0   - source array address
1748   //    c_rarg1   - destination array address
1749   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1750   //
1751   // Examines the alignment of the operands and dispatches
1752   // to a long, int, short, or byte copy loop.
1753   //
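       // A C-like sketch of that dispatch (a reading aid, not the emitted
       // code; 'count' arrives as a byte count):
       //
       //   size_t bits = (size_t)s | (size_t)d | (size_t)count;
       //   if      ((bits & (BytesPerLong  - 1)) == 0) { count >>= LogBytesPerLong;  goto long_copy;  }
       //   else if ((bits & (BytesPerInt   - 1)) == 0) { count >>= LogBytesPerInt;   goto int_copy;   }
       //   else if ((bits & (BytesPerShort - 1)) == 0) { count >>= LogBytesPerShort; goto short_copy; }
       //   else                                          goto byte_copy;
       //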
1754   address generate_unsafe_copy(const char *name,
1755                                address byte_copy_entry,
1756                                address short_copy_entry,
1757                                address int_copy_entry,
1758                                address long_copy_entry) {
1759     Label L_long_aligned, L_int_aligned, L_short_aligned;
1760     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1761 
1762     __ align(CodeEntryAlignment);
1763     StubCodeMark mark(this, "StubRoutines", name);
1764     address start = __ pc();
1765     __ enter(); // required for proper stackwalking of RuntimeStub frame
1766 
1767     // bump this on entry, not on exit:
1768     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1769 
1770     __ orr(rscratch1, s, d);
1771     __ orr(rscratch1, rscratch1, count);
1772 
1773     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1774     __ cbz(rscratch1, L_long_aligned);
1775     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1776     __ cbz(rscratch1, L_int_aligned);
1777     __ tbz(rscratch1, 0, L_short_aligned);
1778     __ b(RuntimeAddress(byte_copy_entry));
1779 
1780     __ BIND(L_short_aligned);
1781     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1782     __ b(RuntimeAddress(short_copy_entry));
1783     __ BIND(L_int_aligned);
1784     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1785     __ b(RuntimeAddress(int_copy_entry));
1786     __ BIND(L_long_aligned);
1787     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1788     __ b(RuntimeAddress(long_copy_entry));
1789 
1790     return start;
1791   }
1792 
1793   //
1794   //  Generate generic array copy stubs
1795   //
1796   //  Input:
1797   //    c_rarg0    -  src oop
1798   //    c_rarg1    -  src_pos (32-bits)
1799   //    c_rarg2    -  dst oop
1800   //    c_rarg3    -  dst_pos (32-bits)
1801   //    c_rarg4    -  element count (32-bits)
1802   //
1803   //  Output:
1804   //    r0 ==  0  -  success
1805   //    r0 == -1^K - failure, where K is partial transfer count
1806   //
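       //  A rough outline of the dispatch implemented below:
       //    1. validate src/dst, the positions and the length
       //       (any failure returns -1);
       //    2. same-klass typeArrays: jump to the byte/short/int/long copy
       //       entry selected by log2(element size) from the layout helper;
       //    3. same-klass objArrays: jump to the oop copy entry;
       //    4. objArrays of different klasses: fetch the destination element
       //       klass and jump to the checkcast copy entry.
       //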
1807   address generate_generic_copy(const char *name,
1808                                 address byte_copy_entry, address short_copy_entry,
1809                                 address int_copy_entry, address oop_copy_entry,
1810                                 address long_copy_entry, address checkcast_copy_entry) {
1811 
1812     Label L_failed, L_failed_0, L_objArray;
1813     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1814 
1815     // Input registers
1816     const Register src        = c_rarg0;  // source array oop
1817     const Register src_pos    = c_rarg1;  // source position
1818     const Register dst        = c_rarg2;  // destination array oop
1819     const Register dst_pos    = c_rarg3;  // destination position
1820     const Register length     = c_rarg4;
1821 
1822     StubCodeMark mark(this, "StubRoutines", name);
1823 
1824     __ align(CodeEntryAlignment);
1825     address start = __ pc();
1826 
1827     __ enter(); // required for proper stackwalking of RuntimeStub frame
1828 
1829     // bump this on entry, not on exit:
1830     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1831 
1832     //-----------------------------------------------------------------------
1833     // Assembler stub will be used for this call to arraycopy
1834     // if the following conditions are met:
1835     //
1836     // (1) src and dst must not be null.
1837     // (2) src_pos must not be negative.
1838     // (3) dst_pos must not be negative.
1839     // (4) length  must not be negative.
1840     // (5) src klass and dst klass should be the same and not NULL.
1841     // (6) src and dst should be arrays.
1842     // (7) src_pos + length must not exceed length of src.
1843     // (8) dst_pos + length must not exceed length of dst.
1844     //
1845 
1846     //  if (src == NULL) return -1;
1847     __ cbz(src, L_failed);
1848 
1849     //  if (src_pos < 0) return -1;
1850     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
1851 
1852     //  if (dst == NULL) return -1;
1853     __ cbz(dst, L_failed);
1854 
1855     //  if (dst_pos < 0) return -1;
1856     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
1857 
1858     // registers used as temp
1859     const Register scratch_length    = r16; // elements count to copy
1860     const Register scratch_src_klass = r17; // array klass
1861     const Register lh                = r18; // layout helper
1862 
1863     //  if (length < 0) return -1;
1864     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
1865     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
1866 
1867     __ load_klass(scratch_src_klass, src);
1868 #ifdef ASSERT
1869     //  assert(src->klass() != NULL);
1870     {
1871       BLOCK_COMMENT("assert klasses not null {");
1872       Label L1, L2;
1873       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
1874       __ bind(L1);
1875       __ stop("broken null klass");
1876       __ bind(L2);
1877       __ load_klass(rscratch1, dst);
1878       __ cbz(rscratch1, L1);     // this would be broken also
1879       BLOCK_COMMENT("} assert klasses not null done");
1880     }
1881 #endif
1882 
1883     // Load layout helper (32-bits)
1884     //
1885     //  |array_tag|     | header_size | element_type |     |log2_element_size|
1886     // 32        30    24            16              8     2                 0
1887     //
1888     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
1889     //
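         // For example, an int[] klass has array_tag == _lh_array_tag_type_value,
         // element_type == T_INT and log2_element_size == 2, while every objArray
         // klass carries exactly the value Klass::array_layout_helper(T_OBJECT),
         // which is how L_objArray is selected just below.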
1890 
1891     const int lh_offset = in_bytes(Klass::layout_helper_offset());
1892 
1893     // Handle objArrays completely differently...
1894     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1895     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
1896     __ movw(rscratch1, objArray_lh);
1897     __ eorw(rscratch2, lh, rscratch1);
1898     __ cbzw(rscratch2, L_objArray);
1899 
1900     //  if (src->klass() != dst->klass()) return -1;
1901     __ load_klass(rscratch2, dst);
1902     __ eor(rscratch2, rscratch2, scratch_src_klass);
1903     __ cbnz(rscratch2, L_failed);
1904 
1905     //  if (!src->is_Array()) return -1;
1906     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
1907 
1908     // At this point, it is known to be a typeArray (array_tag 0x3).
1909 #ifdef ASSERT
1910     {
1911       BLOCK_COMMENT("assert primitive array {");
1912       Label L;
1913       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
1914       __ cmpw(lh, rscratch2);
1915       __ br(Assembler::GE, L);
1916       __ stop("must be a primitive array");
1917       __ bind(L);
1918       BLOCK_COMMENT("} assert primitive array done");
1919     }
1920 #endif
1921 
1922     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1923                            rscratch2, L_failed);
1924 
1925     // TypeArrayKlass
1926     //
1927     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
1928     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
1929     //
1930 
1931     const Register rscratch1_offset = rscratch1;    // array offset
1932     const Register r18_elsize = lh; // element size
1933 
1934     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
1935            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
1936     __ add(src, src, rscratch1_offset);           // src array offset
1937     __ add(dst, dst, rscratch1_offset);           // dst array offset
1938     BLOCK_COMMENT("choose copy loop based on element size");
1939 
1940     // next registers should be set before the jump to corresponding stub
1941     const Register from     = c_rarg0;  // source array address
1942     const Register to       = c_rarg1;  // destination array address
1943     const Register count    = c_rarg2;  // elements count
1944 
1945     // 'from', 'to' and 'count' must be set in this order
1946     // since they alias 'src', 'src_pos' and 'dst' respectively.
1947 
1948     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
1949 
1950     // The possible values of elsize are 0-3, i.e. exact_log2(element
1951     // size in bytes).  We do a simple bitwise binary search.
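         // In outline, the two tbnz tests below decode elsize like this:
         //   elsize == 0 (bytes)  : bit1 == 0, bit0 == 0  -> byte_copy_entry
         //   elsize == 1 (shorts) : bit1 == 0, bit0 == 1  -> short_copy_entry
         //   elsize == 2 (ints)   : bit1 == 1, bit0 == 0  -> int_copy_entry
         //   elsize == 3 (longs)  : bit1 == 1, bit0 == 1  -> long_copy_entry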
1952   __ BIND(L_copy_bytes);
1953     __ tbnz(r18_elsize, 1, L_copy_ints);
1954     __ tbnz(r18_elsize, 0, L_copy_shorts);
1955     __ lea(from, Address(src, src_pos));// src_addr
1956     __ lea(to,   Address(dst, dst_pos));// dst_addr
1957     __ movw(count, scratch_length); // length
1958     __ b(RuntimeAddress(byte_copy_entry));
1959 
1960   __ BIND(L_copy_shorts);
1961     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
1962     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
1963     __ movw(count, scratch_length); // length
1964     __ b(RuntimeAddress(short_copy_entry));
1965 
1966   __ BIND(L_copy_ints);
1967     __ tbnz(r18_elsize, 0, L_copy_longs);
1968     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
1969     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
1970     __ movw(count, scratch_length); // length
1971     __ b(RuntimeAddress(int_copy_entry));
1972 
1973   __ BIND(L_copy_longs);
1974 #ifdef ASSERT
1975     {
1976       BLOCK_COMMENT("assert long copy {");
1977       Label L;
1978       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
1979       __ cmpw(r18_elsize, LogBytesPerLong);
1980       __ br(Assembler::EQ, L);
1981       __ stop("must be long copy, but elsize is wrong");
1982       __ bind(L);
1983       BLOCK_COMMENT("} assert long copy done");
1984     }
1985 #endif
1986     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
1987     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
1988     __ movw(count, scratch_length); // length
1989     __ b(RuntimeAddress(long_copy_entry));
1990 
1991     // ObjArrayKlass
1992   __ BIND(L_objArray);
1993     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
1994 
1995     Label L_plain_copy, L_checkcast_copy;
1996     //  test array classes for subtyping
1997     __ load_klass(r18, dst);
1998     __ cmp(scratch_src_klass, r18); // usual case is exact equality
1999     __ br(Assembler::NE, L_checkcast_copy);
2000 
2001     // Identically typed arrays can be copied without element-wise checks.
2002     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2003                            rscratch2, L_failed);
2004 
2005     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2006     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2007     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2008     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2009     __ movw(count, scratch_length); // length
2010   __ BIND(L_plain_copy);
2011     __ b(RuntimeAddress(oop_copy_entry));
2012 
2013   __ BIND(L_checkcast_copy);
2014     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2015     {
2016       // Before looking at dst.length, make sure dst is also an objArray.
2017       __ ldrw(rscratch1, Address(r18, lh_offset));
2018       __ movw(rscratch2, objArray_lh);
2019       __ eorw(rscratch1, rscratch1, rscratch2);
2020       __ cbnzw(rscratch1, L_failed);
2021 
2022       // It is safe to examine both src.length and dst.length.
2023       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2024                              r18, L_failed);
2025 
2026       const Register rscratch2_dst_klass = rscratch2;
2027       __ load_klass(rscratch2_dst_klass, dst); // reload
2028 
2029       // Marshal the base address arguments now, freeing registers.
2030       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2031       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2032       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2033       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2034       __ movw(count, length);           // length (reloaded)
2035       Register sco_temp = c_rarg3;      // this register is free now
2036       assert_different_registers(from, to, count, sco_temp,
2037                                  rscratch2_dst_klass, scratch_src_klass);
2038       // assert_clean_int(count, sco_temp);
2039 
2040       // Generate the type check.
2041       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2042       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2043       // assert_clean_int(sco_temp, r18);
2044       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2045 
2046       // Fetch destination element klass from the ObjArrayKlass header.
2047       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2048       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2049       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2050 
2051       // the checkcast_copy loop needs two extra arguments:
2052       assert(c_rarg3 == sco_temp, "#3 already in place");
2053       // Set up arguments for checkcast_copy_entry.
2054       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2055       __ b(RuntimeAddress(checkcast_copy_entry));
2056     }
2057 
2058   __ BIND(L_failed);
2059     __ mov(r0, -1);
2060     __ leave();   // required for proper stackwalking of RuntimeStub frame
2061     __ ret(lr);
2062 
2063     return start;
2064   }
2065 
2066   //
2067   // Generate stub for array fill. If "aligned" is true, the
2068   // "to" address is assumed to be heapword aligned.
2069   //
2070   // Arguments for generated stub:
2071   //   to:    c_rarg0
2072   //   value: c_rarg1
2073   //   count: c_rarg2 treated as signed
2074   //
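       // A rough sketch of the strategy below (a reading aid, not the emitted
       // code):
       //   1. widen 'value' into a 64-bit pattern by repeated duplication,
       //      e.g. 0xAB -> 0xABAB -> 0xABABABAB -> 0xABABABABABABABAB;
       //   2. fills shorter than 8 bytes in total are done element by element;
       //   3. otherwise align 'to' to 8 bytes with at most one byte, one
       //      halfword and one word store, then fill whole 8-byte words
       //      (block_zero is used instead when the value is zero and the run
       //      is long enough);
       //   4. any sub-8-byte tail is finished with one overlapping 8-byte store.
       //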
2075   address generate_fill(BasicType t, bool aligned, const char *name) {
2076     __ align(CodeEntryAlignment);
2077     StubCodeMark mark(this, "StubRoutines", name);
2078     address start = __ pc();
2079 
2080     BLOCK_COMMENT("Entry:");
2081 
2082     const Register to        = c_rarg0;  // destination array address
2083     const Register value     = c_rarg1;  // value
2084     const Register count     = c_rarg2;  // elements count
2085 
2086     const Register bz_base = r10;        // base for block_zero routine
2087     const Register cnt_words = r11;      // temp register
2088 
2089     __ enter();
2090 
2091     Label L_fill_elements, L_exit1;
2092 
2093     int shift = -1;
2094     switch (t) {
2095       case T_BYTE:
2096         shift = 0;
2097         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2098         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2099         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2100         __ br(Assembler::LO, L_fill_elements);
2101         break;
2102       case T_SHORT:
2103         shift = 1;
2104         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2105         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2106         __ br(Assembler::LO, L_fill_elements);
2107         break;
2108       case T_INT:
2109         shift = 2;
2110         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2111         __ br(Assembler::LO, L_fill_elements);
2112         break;
2113       default: ShouldNotReachHere();
2114     }
2115 
2116     // Align the target address to an 8-byte boundary.
2117     Label L_skip_align1, L_skip_align2, L_skip_align4;
2118     if (!aligned) {
2119       switch (t) {
2120         case T_BYTE:
2121           // One byte misalignment happens only for byte arrays.
2122           __ tbz(to, 0, L_skip_align1);
2123           __ strb(value, Address(__ post(to, 1)));
2124           __ subw(count, count, 1);
2125           __ bind(L_skip_align1);
2126           // Fallthrough
2127         case T_SHORT:
2128           // Two bytes misalignment happens only for byte and short (char) arrays.
2129           __ tbz(to, 1, L_skip_align2);
2130           __ strh(value, Address(__ post(to, 2)));
2131           __ subw(count, count, 2 >> shift);
2132           __ bind(L_skip_align2);
2133           // Fallthrough
2134         case T_INT:
2135           // Align to 8 bytes; at this point we are already 4-byte aligned.
2136           __ tbz(to, 2, L_skip_align4);
2137           __ strw(value, Address(__ post(to, 4)));
2138           __ subw(count, count, 4 >> shift);
2139           __ bind(L_skip_align4);
2140           break;
2141         default: ShouldNotReachHere();
2142       }
2143     }
2144 
2145     //
2146     //  Fill large chunks
2147     //
2148     __ lsrw(cnt_words, count, 3 - shift); // number of words
2149     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2150     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2151     if (UseBlockZeroing) {
2152       Label non_block_zeroing, rest;
2153       // count >= BlockZeroingLowLimit && value == 0
2154       __ cmp(cnt_words, BlockZeroingLowLimit >> 3);
2155       __ ccmp(value, 0 /* comparing value */, 0 /* NZCV */, Assembler::GE);
2156       __ br(Assembler::NE, non_block_zeroing);
2157       __ mov(bz_base, to);
2158       __ block_zero(bz_base, cnt_words, true);
2159       __ mov(to, bz_base);
2160       __ b(rest);
2161       __ bind(non_block_zeroing);
2162       __ fill_words(to, cnt_words, value);
2163       __ bind(rest);
2164     }
2165     else {
2166       __ fill_words(to, cnt_words, value);
2167     }
2168 
2169     // Remaining count is less than 8 bytes. Fill it by a single store.
2170     // Note that the total length is no less than 8 bytes.
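         // For example, with 3 trailing bytes the store below starts 5 bytes
         // before the current end and simply rewrites 5 bytes that were
         // already filled with the same pattern.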
2171     if (t == T_BYTE || t == T_SHORT) {
2172       Label L_exit1;
2173       __ cbzw(count, L_exit1);
2174       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2175       __ str(value, Address(to, -8));    // overwrite some elements
2176       __ bind(L_exit1);
2177       __ leave();
2178       __ ret(lr);
2179     }
2180 
2181     // Handle fills of less than 8 bytes.
2182     Label L_fill_2, L_fill_4, L_exit2;
2183     __ bind(L_fill_elements);
2184     switch (t) {
2185       case T_BYTE:
2186         __ tbz(count, 0, L_fill_2);
2187         __ strb(value, Address(__ post(to, 1)));
2188         __ bind(L_fill_2);
2189         __ tbz(count, 1, L_fill_4);
2190         __ strh(value, Address(__ post(to, 2)));
2191         __ bind(L_fill_4);
2192         __ tbz(count, 2, L_exit2);
2193         __ strw(value, Address(to));
2194         break;
2195       case T_SHORT:
2196         __ tbz(count, 0, L_fill_4);
2197         __ strh(value, Address(__ post(to, 2)));
2198         __ bind(L_fill_4);
2199         __ tbz(count, 1, L_exit2);
2200         __ strw(value, Address(to));
2201         break;
2202       case T_INT:
2203         __ cbzw(count, L_exit2);
2204         __ strw(value, Address(to));
2205         break;
2206       default: ShouldNotReachHere();
2207     }
2208     __ bind(L_exit2);
2209     __ leave();
2210     __ ret(lr);
2211     return start;
2212   }
2213 
2214   void generate_arraycopy_stubs() {
2215     address entry;
2216     address entry_jbyte_arraycopy;
2217     address entry_jshort_arraycopy;
2218     address entry_jint_arraycopy;
2219     address entry_oop_arraycopy;
2220     address entry_jlong_arraycopy;
2221     address entry_checkcast_arraycopy;
2222 
2223     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2224     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2225 
2226     StubRoutines::aarch64::_zero_longs = generate_zero_longs(r10, r11);
2227 
2228     //*** jbyte
2229     // Always need aligned and unaligned versions
2230     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2231                                                                                   "jbyte_disjoint_arraycopy");
2232     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2233                                                                                   &entry_jbyte_arraycopy,
2234                                                                                   "jbyte_arraycopy");
2235     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2236                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2237     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2238                                                                                   "arrayof_jbyte_arraycopy");
2239 
2240     //*** jshort
2241     // Always need aligned and unaligned versions
2242     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2243                                                                                     "jshort_disjoint_arraycopy");
2244     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2245                                                                                     &entry_jshort_arraycopy,
2246                                                                                     "jshort_arraycopy");
2247     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2248                                                                                     "arrayof_jshort_disjoint_arraycopy");
2249     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2250                                                                                     "arrayof_jshort_arraycopy");
2251 
2252     //*** jint
2253     // Aligned versions
2254     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2255                                                                                 "arrayof_jint_disjoint_arraycopy");
2256     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2257                                                                                 "arrayof_jint_arraycopy");
2258     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2259     // entry_jint_arraycopy always points to the unaligned version
2260     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2261                                                                                 "jint_disjoint_arraycopy");
2262     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2263                                                                                 &entry_jint_arraycopy,
2264                                                                                 "jint_arraycopy");
2265 
2266     //*** jlong
2267     // It is always aligned
2268     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2269                                                                                   "arrayof_jlong_disjoint_arraycopy");
2270     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2271                                                                                   "arrayof_jlong_arraycopy");
2272     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2273     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2274 
2275     //*** oops
2276     {
2277       // With compressed oops we need unaligned versions; notice that
2278       // we overwrite entry_oop_arraycopy.
2279       bool aligned = !UseCompressedOops;
2280 
2281       StubRoutines::_arrayof_oop_disjoint_arraycopy
2282         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2283                                      /*dest_uninitialized*/false);
2284       StubRoutines::_arrayof_oop_arraycopy
2285         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2286                                      /*dest_uninitialized*/false);
2287       // Aligned versions without pre-barriers
2288       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2289         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2290                                      /*dest_uninitialized*/true);
2291       StubRoutines::_arrayof_oop_arraycopy_uninit
2292         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2293                                      /*dest_uninitialized*/true);
2294     }
2295 
2296     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2297     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2298     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2299     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2300 
2301     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2302     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2303                                                                         /*dest_uninitialized*/true);
2304 
2305     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2306                                                               entry_jbyte_arraycopy,
2307                                                               entry_jshort_arraycopy,
2308                                                               entry_jint_arraycopy,
2309                                                               entry_jlong_arraycopy);
2310 
2311     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2312                                                                entry_jbyte_arraycopy,
2313                                                                entry_jshort_arraycopy,
2314                                                                entry_jint_arraycopy,
2315                                                                entry_oop_arraycopy,
2316                                                                entry_jlong_arraycopy,
2317                                                                entry_checkcast_arraycopy);
2318 
2319     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2320     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2321     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2322     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2323     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2324     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2325   }
2326 
2327   void generate_math_stubs() { Unimplemented(); }
2328 
2329   // Arguments:
2330   //
2331   // Inputs:
2332   //   c_rarg0   - source byte array address
2333   //   c_rarg1   - destination byte array address
2334   //   c_rarg2   - K (key) in little endian int array
2335   //
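       // A sketch of the flow below (assuming the usual AESCrypt session key
       // layout, where the key array length in ints is 44, 52 or 60 for
       // 128/192/256-bit keys):
       //
       //   state = 16 bytes loaded from 'from';
       //   for every round key except the last two:
       //     state = AESMC(AESE(state, rk));       // one full AES round
       //   state = AESE(state, next-to-last rk);   // final round, no MixColumns
       //   state ^= last rk;
       //   store state to 'to';
       //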
2336   address generate_aescrypt_encryptBlock() {
2337     __ align(CodeEntryAlignment);
2338     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2339 
2340     Label L_doLast;
2341 
2342     const Register from        = c_rarg0;  // source array address
2343     const Register to          = c_rarg1;  // destination array address
2344     const Register key         = c_rarg2;  // key array address
2345     const Register keylen      = rscratch1;
2346 
2347     address start = __ pc();
2348     __ enter();
2349 
2350     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2351 
2352     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2353 
2354     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2355     __ rev32(v1, __ T16B, v1);
2356     __ rev32(v2, __ T16B, v2);
2357     __ rev32(v3, __ T16B, v3);
2358     __ rev32(v4, __ T16B, v4);
2359     __ aese(v0, v1);
2360     __ aesmc(v0, v0);
2361     __ aese(v0, v2);
2362     __ aesmc(v0, v0);
2363     __ aese(v0, v3);
2364     __ aesmc(v0, v0);
2365     __ aese(v0, v4);
2366     __ aesmc(v0, v0);
2367 
2368     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2369     __ rev32(v1, __ T16B, v1);
2370     __ rev32(v2, __ T16B, v2);
2371     __ rev32(v3, __ T16B, v3);
2372     __ rev32(v4, __ T16B, v4);
2373     __ aese(v0, v1);
2374     __ aesmc(v0, v0);
2375     __ aese(v0, v2);
2376     __ aesmc(v0, v0);
2377     __ aese(v0, v3);
2378     __ aesmc(v0, v0);
2379     __ aese(v0, v4);
2380     __ aesmc(v0, v0);
2381 
2382     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2383     __ rev32(v1, __ T16B, v1);
2384     __ rev32(v2, __ T16B, v2);
2385 
2386     __ cmpw(keylen, 44);
2387     __ br(Assembler::EQ, L_doLast);
2388 
2389     __ aese(v0, v1);
2390     __ aesmc(v0, v0);
2391     __ aese(v0, v2);
2392     __ aesmc(v0, v0);
2393 
2394     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2395     __ rev32(v1, __ T16B, v1);
2396     __ rev32(v2, __ T16B, v2);
2397 
2398     __ cmpw(keylen, 52);
2399     __ br(Assembler::EQ, L_doLast);
2400 
2401     __ aese(v0, v1);
2402     __ aesmc(v0, v0);
2403     __ aese(v0, v2);
2404     __ aesmc(v0, v0);
2405 
2406     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2407     __ rev32(v1, __ T16B, v1);
2408     __ rev32(v2, __ T16B, v2);
2409 
2410     __ BIND(L_doLast);
2411 
2412     __ aese(v0, v1);
2413     __ aesmc(v0, v0);
2414     __ aese(v0, v2);
2415 
2416     __ ld1(v1, __ T16B, key);
2417     __ rev32(v1, __ T16B, v1);
2418     __ eor(v0, __ T16B, v0, v1);
2419 
2420     __ st1(v0, __ T16B, to);
2421 
2422     __ mov(r0, 0);
2423 
2424     __ leave();
2425     __ ret(lr);
2426 
2427     return start;
2428   }
2429 
2430   // Arguments:
2431   //
2432   // Inputs:
2433   //   c_rarg0   - source byte array address
2434   //   c_rarg1   - destination byte array address
2435   //   c_rarg2   - K (key) in little endian int array
2436   //
2437   address generate_aescrypt_decryptBlock() {
2438     assert(UseAES, "need AES instructions");
2439     __ align(CodeEntryAlignment);
2440     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2441     Label L_doLast;
2442 
2443     const Register from        = c_rarg0;  // source array address
2444     const Register to          = c_rarg1;  // destination array address
2445     const Register key         = c_rarg2;  // key array address
2446     const Register keylen      = rscratch1;
2447 
2448     address start = __ pc();
2449     __ enter(); // required for proper stackwalking of RuntimeStub frame
2450 
2451     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2452 
2453     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2454 
2455     __ ld1(v5, __ T16B, __ post(key, 16));
2456     __ rev32(v5, __ T16B, v5);
2457 
2458     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2459     __ rev32(v1, __ T16B, v1);
2460     __ rev32(v2, __ T16B, v2);
2461     __ rev32(v3, __ T16B, v3);
2462     __ rev32(v4, __ T16B, v4);
2463     __ aesd(v0, v1);
2464     __ aesimc(v0, v0);
2465     __ aesd(v0, v2);
2466     __ aesimc(v0, v0);
2467     __ aesd(v0, v3);
2468     __ aesimc(v0, v0);
2469     __ aesd(v0, v4);
2470     __ aesimc(v0, v0);
2471 
2472     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2473     __ rev32(v1, __ T16B, v1);
2474     __ rev32(v2, __ T16B, v2);
2475     __ rev32(v3, __ T16B, v3);
2476     __ rev32(v4, __ T16B, v4);
2477     __ aesd(v0, v1);
2478     __ aesimc(v0, v0);
2479     __ aesd(v0, v2);
2480     __ aesimc(v0, v0);
2481     __ aesd(v0, v3);
2482     __ aesimc(v0, v0);
2483     __ aesd(v0, v4);
2484     __ aesimc(v0, v0);
2485 
2486     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2487     __ rev32(v1, __ T16B, v1);
2488     __ rev32(v2, __ T16B, v2);
2489 
2490     __ cmpw(keylen, 44);
2491     __ br(Assembler::EQ, L_doLast);
2492 
2493     __ aesd(v0, v1);
2494     __ aesimc(v0, v0);
2495     __ aesd(v0, v2);
2496     __ aesimc(v0, v0);
2497 
2498     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2499     __ rev32(v1, __ T16B, v1);
2500     __ rev32(v2, __ T16B, v2);
2501 
2502     __ cmpw(keylen, 52);
2503     __ br(Assembler::EQ, L_doLast);
2504 
2505     __ aesd(v0, v1);
2506     __ aesimc(v0, v0);
2507     __ aesd(v0, v2);
2508     __ aesimc(v0, v0);
2509 
2510     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2511     __ rev32(v1, __ T16B, v1);
2512     __ rev32(v2, __ T16B, v2);
2513 
2514     __ BIND(L_doLast);
2515 
2516     __ aesd(v0, v1);
2517     __ aesimc(v0, v0);
2518     __ aesd(v0, v2);
2519 
2520     __ eor(v0, __ T16B, v0, v5);
2521 
2522     __ st1(v0, __ T16B, to);
2523 
2524     __ mov(r0, 0);
2525 
2526     __ leave();
2527     __ ret(lr);
2528 
2529     return start;
2530   }
2531 
2532   // Arguments:
2533   //
2534   // Inputs:
2535   //   c_rarg0   - source byte array address
2536   //   c_rarg1   - destination byte array address
2537   //   c_rarg2   - K (key) in little endian int array
2538   //   c_rarg3   - r vector byte array address
2539   //   c_rarg4   - input length
2540   //
2541   // Output:
2542   //   r0        - input length
2543   //
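       // CBC encryption chains every block through the previous ciphertext.
       // Roughly (a sketch, not the emitted code):
       //
       //   r = 16 bytes from rvec;                  // initial vector
       //   for each 16-byte block P of the input:
       //     r = AES_encrypt(P ^ r, key);
       //     append r to the output;
       //   store r back to rvec;                    // so the caller can continue the chain
       //   return the original input length;
       //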
2544   address generate_cipherBlockChaining_encryptAESCrypt() {
2545     assert(UseAES, "need AES instructions");
2546     __ align(CodeEntryAlignment);
2547     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2548 
2549     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2550 
2551     const Register from        = c_rarg0;  // source array address
2552     const Register to          = c_rarg1;  // destination array address
2553     const Register key         = c_rarg2;  // key array address
2554     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector,
2555                                            // and left holding the last encrypted block
2556     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2557     const Register keylen      = rscratch1;
2558 
2559     address start = __ pc();
2560       __ enter();
2561 
2562       __ mov(rscratch2, len_reg);
2563       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2564 
2565       __ ld1(v0, __ T16B, rvec);
2566 
2567       __ cmpw(keylen, 52);
2568       __ br(Assembler::CC, L_loadkeys_44);
2569       __ br(Assembler::EQ, L_loadkeys_52);
2570 
2571       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2572       __ rev32(v17, __ T16B, v17);
2573       __ rev32(v18, __ T16B, v18);
2574     __ BIND(L_loadkeys_52);
2575       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2576       __ rev32(v19, __ T16B, v19);
2577       __ rev32(v20, __ T16B, v20);
2578     __ BIND(L_loadkeys_44);
2579       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2580       __ rev32(v21, __ T16B, v21);
2581       __ rev32(v22, __ T16B, v22);
2582       __ rev32(v23, __ T16B, v23);
2583       __ rev32(v24, __ T16B, v24);
2584       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2585       __ rev32(v25, __ T16B, v25);
2586       __ rev32(v26, __ T16B, v26);
2587       __ rev32(v27, __ T16B, v27);
2588       __ rev32(v28, __ T16B, v28);
2589       __ ld1(v29, v30, v31, __ T16B, key);
2590       __ rev32(v29, __ T16B, v29);
2591       __ rev32(v30, __ T16B, v30);
2592       __ rev32(v31, __ T16B, v31);
2593 
2594     __ BIND(L_aes_loop);
2595       __ ld1(v1, __ T16B, __ post(from, 16));
2596       __ eor(v0, __ T16B, v0, v1);
2597 
2598       __ br(Assembler::CC, L_rounds_44);
2599       __ br(Assembler::EQ, L_rounds_52);
2600 
2601       __ aese(v0, v17); __ aesmc(v0, v0);
2602       __ aese(v0, v18); __ aesmc(v0, v0);
2603     __ BIND(L_rounds_52);
2604       __ aese(v0, v19); __ aesmc(v0, v0);
2605       __ aese(v0, v20); __ aesmc(v0, v0);
2606     __ BIND(L_rounds_44);
2607       __ aese(v0, v21); __ aesmc(v0, v0);
2608       __ aese(v0, v22); __ aesmc(v0, v0);
2609       __ aese(v0, v23); __ aesmc(v0, v0);
2610       __ aese(v0, v24); __ aesmc(v0, v0);
2611       __ aese(v0, v25); __ aesmc(v0, v0);
2612       __ aese(v0, v26); __ aesmc(v0, v0);
2613       __ aese(v0, v27); __ aesmc(v0, v0);
2614       __ aese(v0, v28); __ aesmc(v0, v0);
2615       __ aese(v0, v29); __ aesmc(v0, v0);
2616       __ aese(v0, v30);
2617       __ eor(v0, __ T16B, v0, v31);
2618 
2619       __ st1(v0, __ T16B, __ post(to, 16));
2620       __ sub(len_reg, len_reg, 16);
2621       __ cbnz(len_reg, L_aes_loop);
2622 
2623       __ st1(v0, __ T16B, rvec);
2624 
2625       __ mov(r0, rscratch2);
2626 
2627       __ leave();
2628       __ ret(lr);
2629 
2630       return start;
2631   }
2632 
2633   // Arguments:
2634   //
2635   // Inputs:
2636   //   c_rarg0   - source byte array address
2637   //   c_rarg1   - destination byte array address
2638   //   c_rarg2   - K (key) in little endian int array
2639   //   c_rarg3   - r vector byte array address
2640   //   c_rarg4   - input length
2641   //
2642   // Output:
2643   //   r0        - input length
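       // CBC decryption is the inverse chain; note that the value carried
       // from one iteration to the next is the previous *ciphertext* block.
       // Roughly:
       //
       //   r = 16 bytes from rvec;
       //   for each 16-byte block C of the input:
       //     append (AES_decrypt(C, key) ^ r) to the output;
       //     r = C;
       //   store r back to rvec;
       //   return the original input length;
       //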
2644   //
2645   address generate_cipherBlockChaining_decryptAESCrypt() {
2646     assert(UseAES, "need AES instructions");
2647     __ align(CodeEntryAlignment);
2648     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2649 
2650     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2651 
2652     const Register from        = c_rarg0;  // source array address
2653     const Register to          = c_rarg1;  // destination array address
2654     const Register key         = c_rarg2;  // key array address
2655     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector,
2656                                            // and left holding the last input (ciphertext) block
2657     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2658     const Register keylen      = rscratch1;
2659 
2660     address start = __ pc();
2661       __ enter();
2662 
2663       __ mov(rscratch2, len_reg);
2664       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2665 
2666       __ ld1(v2, __ T16B, rvec);
2667 
2668       __ ld1(v31, __ T16B, __ post(key, 16));
2669       __ rev32(v31, __ T16B, v31);
2670 
2671       __ cmpw(keylen, 52);
2672       __ br(Assembler::CC, L_loadkeys_44);
2673       __ br(Assembler::EQ, L_loadkeys_52);
2674 
2675       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2676       __ rev32(v17, __ T16B, v17);
2677       __ rev32(v18, __ T16B, v18);
2678     __ BIND(L_loadkeys_52);
2679       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2680       __ rev32(v19, __ T16B, v19);
2681       __ rev32(v20, __ T16B, v20);
2682     __ BIND(L_loadkeys_44);
2683       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2684       __ rev32(v21, __ T16B, v21);
2685       __ rev32(v22, __ T16B, v22);
2686       __ rev32(v23, __ T16B, v23);
2687       __ rev32(v24, __ T16B, v24);
2688       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2689       __ rev32(v25, __ T16B, v25);
2690       __ rev32(v26, __ T16B, v26);
2691       __ rev32(v27, __ T16B, v27);
2692       __ rev32(v28, __ T16B, v28);
2693       __ ld1(v29, v30, __ T16B, key);
2694       __ rev32(v29, __ T16B, v29);
2695       __ rev32(v30, __ T16B, v30);
2696 
2697     __ BIND(L_aes_loop);
2698       __ ld1(v0, __ T16B, __ post(from, 16));
2699       __ orr(v1, __ T16B, v0, v0);
2700 
2701       __ br(Assembler::CC, L_rounds_44);
2702       __ br(Assembler::EQ, L_rounds_52);
2703 
2704       __ aesd(v0, v17); __ aesimc(v0, v0);
2705       __ aesd(v0, v18); __ aesimc(v0, v0);
2706     __ BIND(L_rounds_52);
2707       __ aesd(v0, v19); __ aesimc(v0, v0);
2708       __ aesd(v0, v20); __ aesimc(v0, v0);
2709     __ BIND(L_rounds_44);
2710       __ aesd(v0, v21); __ aesimc(v0, v0);
2711       __ aesd(v0, v22); __ aesimc(v0, v0);
2712       __ aesd(v0, v23); __ aesimc(v0, v0);
2713       __ aesd(v0, v24); __ aesimc(v0, v0);
2714       __ aesd(v0, v25); __ aesimc(v0, v0);
2715       __ aesd(v0, v26); __ aesimc(v0, v0);
2716       __ aesd(v0, v27); __ aesimc(v0, v0);
2717       __ aesd(v0, v28); __ aesimc(v0, v0);
2718       __ aesd(v0, v29); __ aesimc(v0, v0);
2719       __ aesd(v0, v30);
2720       __ eor(v0, __ T16B, v0, v31);
2721       __ eor(v0, __ T16B, v0, v2);
2722 
2723       __ st1(v0, __ T16B, __ post(to, 16));
2724       __ orr(v2, __ T16B, v1, v1);
2725 
2726       __ sub(len_reg, len_reg, 16);
2727       __ cbnz(len_reg, L_aes_loop);
2728 
2729       __ st1(v2, __ T16B, rvec);
2730 
2731       __ mov(r0, rscratch2);
2732 
2733       __ leave();
2734       __ ret(lr);
2735 
2736     return start;
2737   }
2738 
2739   // Arguments:
2740   //
2741   // Inputs:
2742   //   c_rarg0   - byte[]  source+offset
2743   //   c_rarg1   - int[]   SHA.state
2744   //   c_rarg2   - int     offset
2745   //   c_rarg3   - int     limit
2746   //
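       // The 80 SHA-1 rounds are processed four at a time, so the main loop
       // below runs 20 iterations; the per-group function and constant follow
       // the SHA-1 specification:
       //
       //   rounds  0..19 : Ch     (sha1c), K = 0x5a827999
       //   rounds 20..39 : Parity (sha1p), K = 0x6ed9eba1
       //   rounds 40..59 : Maj    (sha1m), K = 0x8f1bbcdc
       //   rounds 60..79 : Parity (sha1p), K = 0xca62c1d6
       //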
2747   address generate_sha1_implCompress(bool multi_block, const char *name) {
2748     __ align(CodeEntryAlignment);
2749     StubCodeMark mark(this, "StubRoutines", name);
2750     address start = __ pc();
2751 
2752     Register buf   = c_rarg0;
2753     Register state = c_rarg1;
2754     Register ofs   = c_rarg2;
2755     Register limit = c_rarg3;
2756 
2757     Label keys;
2758     Label sha1_loop;
2759 
2760     // load the keys into v0..v3
2761     __ adr(rscratch1, keys);
2762     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2763     // load the 5-word state into v6, v7
2764     __ ldrq(v6, Address(state, 0));
2765     __ ldrs(v7, Address(state, 16));
2766 
2767 
2768     __ BIND(sha1_loop);
2769     // load 64 bytes of data into v16..v19
2770     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2771     __ rev32(v16, __ T16B, v16);
2772     __ rev32(v17, __ T16B, v17);
2773     __ rev32(v18, __ T16B, v18);
2774     __ rev32(v19, __ T16B, v19);
2775 
2776     // do the sha1
2777     __ addv(v4, __ T4S, v16, v0);
2778     __ orr(v20, __ T16B, v6, v6);
2779 
2780     FloatRegister d0 = v16;
2781     FloatRegister d1 = v17;
2782     FloatRegister d2 = v18;
2783     FloatRegister d3 = v19;
2784 
2785     for (int round = 0; round < 20; round++) {
2786       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2787       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2788       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2789       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2790       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2791 
2792       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2793       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2794       __ sha1h(tmp2, __ T4S, v20);
2795       if (round < 5)
2796         __ sha1c(v20, __ T4S, tmp3, tmp4);
2797       else if (round < 10 || round >= 15)
2798         __ sha1p(v20, __ T4S, tmp3, tmp4);
2799       else
2800         __ sha1m(v20, __ T4S, tmp3, tmp4);
2801       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2802 
2803       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2804     }
2805 
2806     __ addv(v7, __ T2S, v7, v21);
2807     __ addv(v6, __ T4S, v6, v20);
2808 
2809     if (multi_block) {
2810       __ add(ofs, ofs, 64);
2811       __ cmp(ofs, limit);
2812       __ br(Assembler::LE, sha1_loop);
2813       __ mov(c_rarg0, ofs); // return ofs
2814     }
2815 
2816     __ strq(v6, Address(state, 0));
2817     __ strs(v7, Address(state, 16));
2818 
2819     __ ret(lr);
2820 
2821     __ bind(keys);
2822     __ emit_int32(0x5a827999);
2823     __ emit_int32(0x6ed9eba1);
2824     __ emit_int32(0x8f1bbcdc);
2825     __ emit_int32(0xca62c1d6);
2826 
2827     return start;
2828   }
2829 
2830 
2831   // Arguments:
2832   //
2833   // Inputs:
2834   //   c_rarg0   - byte[]  source+offset
2835   //   c_rarg1   - int[]   SHA.state
2836   //   c_rarg2   - int     offset
2837   //   c_rarg3   - int     limit
2838   //
2839   address generate_sha256_implCompress(bool multi_block, const char *name) {
2840     static const uint32_t round_consts[64] = {
2841       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
2842       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
2843       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
2844       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
2845       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
2846       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
2847       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
2848       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
2849       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
2850       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
2851       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
2852       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
2853       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
2854       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
2855       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
2856       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
2857     };
2858     __ align(CodeEntryAlignment);
2859     StubCodeMark mark(this, "StubRoutines", name);
2860     address start = __ pc();
2861 
2862     Register buf   = c_rarg0;
2863     Register state = c_rarg1;
2864     Register ofs   = c_rarg2;
2865     Register limit = c_rarg3;
2866 
2867     Label sha256_loop;
2868 
2869     __ stpd(v8, v9, __ pre(sp, -32));
2870     __ stpd(v10, v11, Address(sp, 16));
2871 
2872     // dga == v0
2873     // dgb == v1
2874     // dg0 == v2
2875     // dg1 == v3
2876     // dg2 == v4
2877     // t0 == v6
2878     // t1 == v7
2879 
2880     // load the 64 round constants into v16..v31, four per register
2881     __ lea(rscratch1, ExternalAddress((address)round_consts));
2882     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
2883     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
2884     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
2885     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
2886 
2887     // load the 8-word (256-bit) state
2888     __ ldpq(v0, v1, state);
2889 
2890     __ BIND(sha256_loop);
2891     // load 64 bytes of data into v8..v11
2892     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
2893     __ rev32(v8, __ T16B, v8);
2894     __ rev32(v9, __ T16B, v9);
2895     __ rev32(v10, __ T16B, v10);
2896     __ rev32(v11, __ T16B, v11);
2897 
2898     __ addv(v6, __ T4S, v8, v16);
2899     __ orr(v2, __ T16B, v0, v0);
2900     __ orr(v3, __ T16B, v1, v1);
2901 
2902     FloatRegister d0 = v8;
2903     FloatRegister d1 = v9;
2904     FloatRegister d2 = v10;
2905     FloatRegister d3 = v11;
2906 
2907 
2908     for (int round = 0; round < 16; round++) {
2909       FloatRegister tmp1 = (round & 1) ? v6 : v7;
2910       FloatRegister tmp2 = (round & 1) ? v7 : v6;
2911       FloatRegister tmp3 = (round & 1) ? v2 : v4;
2912       FloatRegister tmp4 = (round & 1) ? v4 : v2;
2913 
2914       if (round < 12) __ sha256su0(d0, __ T4S, d1);
2915       __ orr(v4, __ T16B, v2, v2);
2916       if (round < 15)
2917         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
2918       __ sha256h(v2, __ T4S, v3, tmp2);
2919       __ sha256h2(v3, __ T4S, v4, tmp2);
2920       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
2921 
2922       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2923     }
2924 
2925     __ addv(v0, __ T4S, v0, v2);
2926     __ addv(v1, __ T4S, v1, v3);
2927 
2928     if (multi_block) {
2929       __ add(ofs, ofs, 64);
2930       __ cmp(ofs, limit);
2931       __ br(Assembler::LE, sha256_loop);
2932       __ mov(c_rarg0, ofs); // return ofs
2933     }
2934 
2935     __ ldpd(v10, v11, Address(sp, 16));
2936     __ ldpd(v8, v9, __ post(sp, 32));
2937 
2938     __ stpq(v0, v1, state);
2939 
2940     __ ret(lr);
2941 
2942     return start;
2943   }
2944 
2945 #ifndef BUILTIN_SIM
2946   // Safefetch stubs.
2947   void generate_safefetch(const char* name, int size, address* entry,
2948                           address* fault_pc, address* continuation_pc) {
2949     // safefetch signatures:
2950     //   int      SafeFetch32(int*      adr, int      errValue);
2951     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
2952     //
2953     // arguments:
2954     //   c_rarg0 = adr
2955     //   c_rarg1 = errValue
2956     //
2957     // result:
2958     //   r0 = *adr or errValue
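         //
         // A minimal usage sketch (an assumption about the usual VM wiring:
         // a faulting load is redirected by the signal handler to
         // continuation_pc, so the stub returns errValue instead):
         //
         //   int v = SafeFetch32((int*) possibly_unmapped, -1);
         //   // v == *possibly_unmapped if readable, -1 if the load faulted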
2959 
2960     StubCodeMark mark(this, "StubRoutines", name);
2961 
2962     // Entry point, pc or function descriptor.
2963     *entry = __ pc();
2964 
2965     // Load *adr into c_rarg1, may fault.
2966     *fault_pc = __ pc();
2967     switch (size) {
2968       case 4:
2969         // int32_t
2970         __ ldrw(c_rarg1, Address(c_rarg0, 0));
2971         break;
2972       case 8:
2973         // int64_t
2974         __ ldr(c_rarg1, Address(c_rarg0, 0));
2975         break;
2976       default:
2977         ShouldNotReachHere();
2978     }
2979 
2980     // return errValue or *adr
2981     *continuation_pc = __ pc();
2982     __ mov(r0, c_rarg1);
2983     __ ret(lr);
2984   }
2985 #endif
2986 
2987   /**
2988    *  Arguments:
2989    *
2990    * Inputs:
2991    *   c_rarg0   - int crc
2992    *   c_rarg1   - byte* buf
2993    *   c_rarg2   - int length
2994    *
2995    * Output:
2996    *       r0   - int crc result
2997    */
2998   address generate_updateBytesCRC32() {
2999     assert(UseCRC32Intrinsics, "what are we doing here?");
3000 
3001     __ align(CodeEntryAlignment);
3002     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3003 
3004     address start = __ pc();
3005 
3006     const Register crc   = c_rarg0;  // crc
3007     const Register buf   = c_rarg1;  // source java byte array address
3008     const Register len   = c_rarg2;  // length
3009     const Register table0 = c_rarg3; // crc_table address
3010     const Register table1 = c_rarg4;
3011     const Register table2 = c_rarg5;
3012     const Register table3 = c_rarg6;
3013     const Register tmp3 = c_rarg7;
3014 
3015     BLOCK_COMMENT("Entry:");
3016     __ enter(); // required for proper stackwalking of RuntimeStub frame
3017 
3018     __ kernel_crc32(crc, buf, len,
3019               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3020 
3021     __ leave(); // required for proper stackwalking of RuntimeStub frame
3022     __ ret(lr);
3023 
3024     return start;
3025   }
3026 
3027   /**
3028    *  Arguments:
3029    *
3030    * Inputs:
3031    *   c_rarg0   - int crc
3032    *   c_rarg1   - byte* buf
3033    *   c_rarg2   - int length
3034    *   c_rarg3   - int* table
3035    *
3036    * Output:
3037    *       r0   - int crc result
3038    */
3039   address generate_updateBytesCRC32C() {
3040     assert(UseCRC32CIntrinsics, "what are we doing here?");
3041 
3042     __ align(CodeEntryAlignment);
3043     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3044 
3045     address start = __ pc();
3046 
3047     const Register crc   = c_rarg0;  // crc
3048     const Register buf   = c_rarg1;  // source java byte array address
3049     const Register len   = c_rarg2;  // length
3050     const Register table0 = c_rarg3; // crc_table address
3051     const Register table1 = c_rarg4;
3052     const Register table2 = c_rarg5;
3053     const Register table3 = c_rarg6;
3054     const Register tmp3 = c_rarg7;
3055 
3056     BLOCK_COMMENT("Entry:");
3057     __ enter(); // required for proper stackwalking of RuntimeStub frame
3058 
3059     __ kernel_crc32c(crc, buf, len,
3060               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3061 
3062     __ leave(); // required for proper stackwalking of RuntimeStub frame
3063     __ ret(lr);
3064 
3065     return start;
3066   }
3067 
3068   /**
3069    *  Arguments:
3070    *
3071    *  Inputs:
3072    *   c_rarg0   - int   adler
3073    *   c_rarg1   - byte* buff
3074    *   c_rarg2   - int   len
3075    *
3076    * Output:
3077    *   c_rarg0   - int adler result
3078    */
3079   address generate_updateBytesAdler32() {
3080     __ align(CodeEntryAlignment);
3081     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3082     address start = __ pc();
3083 
3084     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3085 
3086     // Aliases
3087     Register adler  = c_rarg0;
3088     Register s1     = c_rarg0;
3089     Register s2     = c_rarg3;
3090     Register buff   = c_rarg1;
3091     Register len    = c_rarg2;
3092     Register nmax  = r4;
3093     Register base = r5;
3094     Register count = r6;
3095     Register temp0 = rscratch1;
3096     Register temp1 = rscratch2;
3097     Register temp2 = r7;
3098 
3099     // Max number of bytes we can process before having to take the mod
3100     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3101     unsigned long BASE = 0xfff1;
3102     unsigned long NMAX = 0x15B0;
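         // The reduction sequences below rely on 2^16 == 15 (mod BASE), so
         // x mod BASE can be folded as (x & 0xffff) + 15 * (x >> 16).
         // Roughly, in C (a sketch of the lsr/lsl/sub/add/subs/csel patterns
         // below, folding once or twice depending on how large x may have grown):
         //
         //   x = (x & 0xffff) + 15 * (x >> 16);
         //   x = (x & 0xffff) + 15 * (x >> 16);   // now x < 2 * BASE
         //   if (x >= BASE) x -= BASE;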
3103 
3104     __ mov(base, BASE);
3105     __ mov(nmax, NMAX);
3106 
3107     // s1 is initialized to the lower 16 bits of adler
3108     // s2 is initialized to the upper 16 bits of adler
3109     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3110     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3111 
3112     // The pipelined loop needs at least 16 bytes per iteration; for shorter
3113     // inputs it is more efficient to branch straight to the cleanup loop
3114     __ cmp(len, 16);
3115     __ br(Assembler::HS, L_nmax);
3116     __ cbz(len, L_combine);
3117 
3118     __ bind(L_simple_by1_loop);
3119     __ ldrb(temp0, Address(__ post(buff, 1)));
3120     __ add(s1, s1, temp0);
3121     __ add(s2, s2, s1);
3122     __ subs(len, len, 1);
3123     __ br(Assembler::HI, L_simple_by1_loop);
3124 
3125     // s1 = s1 % BASE
3126     __ subs(temp0, s1, base);
3127     __ csel(s1, temp0, s1, Assembler::HS);
3128 
3129     // s2 = s2 % BASE
3130     __ lsr(temp0, s2, 16);
3131     __ lsl(temp1, temp0, 4);
3132     __ sub(temp1, temp1, temp0);
3133     __ add(s2, temp1, s2, ext::uxth);
3134 
3135     __ subs(temp0, s2, base);
3136     __ csel(s2, temp0, s2, Assembler::HS);
3137 
3138     __ b(L_combine);
3139 
3140     __ bind(L_nmax);
3141     __ subs(len, len, nmax);
3142     __ sub(count, nmax, 16);
3143     __ br(Assembler::LO, L_by16);
3144 
3145     __ bind(L_nmax_loop);
3146 
3147     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3148 
3149     __ add(s1, s1, temp0, ext::uxtb);
3150     __ ubfx(temp2, temp0, 8, 8);
3151     __ add(s2, s2, s1);
3152     __ add(s1, s1, temp2);
3153     __ ubfx(temp2, temp0, 16, 8);
3154     __ add(s2, s2, s1);
3155     __ add(s1, s1, temp2);
3156     __ ubfx(temp2, temp0, 24, 8);
3157     __ add(s2, s2, s1);
3158     __ add(s1, s1, temp2);
3159     __ ubfx(temp2, temp0, 32, 8);
3160     __ add(s2, s2, s1);
3161     __ add(s1, s1, temp2);
3162     __ ubfx(temp2, temp0, 40, 8);
3163     __ add(s2, s2, s1);
3164     __ add(s1, s1, temp2);
3165     __ ubfx(temp2, temp0, 48, 8);
3166     __ add(s2, s2, s1);
3167     __ add(s1, s1, temp2);
3168     __ add(s2, s2, s1);
3169     __ add(s1, s1, temp0, Assembler::LSR, 56);
3170     __ add(s2, s2, s1);
3171 
3172     __ add(s1, s1, temp1, ext::uxtb);
3173     __ ubfx(temp2, temp1, 8, 8);
3174     __ add(s2, s2, s1);
3175     __ add(s1, s1, temp2);
3176     __ ubfx(temp2, temp1, 16, 8);
3177     __ add(s2, s2, s1);
3178     __ add(s1, s1, temp2);
3179     __ ubfx(temp2, temp1, 24, 8);
3180     __ add(s2, s2, s1);
3181     __ add(s1, s1, temp2);
3182     __ ubfx(temp2, temp1, 32, 8);
3183     __ add(s2, s2, s1);
3184     __ add(s1, s1, temp2);
3185     __ ubfx(temp2, temp1, 40, 8);
3186     __ add(s2, s2, s1);
3187     __ add(s1, s1, temp2);
3188     __ ubfx(temp2, temp1, 48, 8);
3189     __ add(s2, s2, s1);
3190     __ add(s1, s1, temp2);
3191     __ add(s2, s2, s1);
3192     __ add(s1, s1, temp1, Assembler::LSR, 56);
3193     __ add(s2, s2, s1);
3194 
3195     __ subs(count, count, 16);
3196     __ br(Assembler::HS, L_nmax_loop);
3197 
3198     // s1 = s1 % BASE
3199     __ lsr(temp0, s1, 16);
3200     __ lsl(temp1, temp0, 4);
3201     __ sub(temp1, temp1, temp0);
3202     __ add(temp1, temp1, s1, ext::uxth);
3203 
3204     __ lsr(temp0, temp1, 16);
3205     __ lsl(s1, temp0, 4);
3206     __ sub(s1, s1, temp0);
3207     __ add(s1, s1, temp1, ext::uxth);
3208 
3209     __ subs(temp0, s1, base);
3210     __ csel(s1, temp0, s1, Assembler::HS);
3211 
3212     // s2 = s2 % BASE
3213     __ lsr(temp0, s2, 16);
3214     __ lsl(temp1, temp0, 4);
3215     __ sub(temp1, temp1, temp0);
3216     __ add(temp1, temp1, s2, ext::uxth);
3217 
3218     __ lsr(temp0, temp1, 16);
3219     __ lsl(s2, temp0, 4);
3220     __ sub(s2, s2, temp0);
3221     __ add(s2, s2, temp1, ext::uxth);
3222 
3223     __ subs(temp0, s2, base);
3224     __ csel(s2, temp0, s2, Assembler::HS);
3225 
3226     __ subs(len, len, nmax);
3227     __ sub(count, nmax, 16);
3228     __ br(Assembler::HS, L_nmax_loop);
3229 
3230     __ bind(L_by16);
3231     __ adds(len, len, count);
3232     __ br(Assembler::LO, L_by1);
3233 
3234     __ bind(L_by16_loop);
3235 
3236     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3237 
3238     __ add(s1, s1, temp0, ext::uxtb);
3239     __ ubfx(temp2, temp0, 8, 8);
3240     __ add(s2, s2, s1);
3241     __ add(s1, s1, temp2);
3242     __ ubfx(temp2, temp0, 16, 8);
3243     __ add(s2, s2, s1);
3244     __ add(s1, s1, temp2);
3245     __ ubfx(temp2, temp0, 24, 8);
3246     __ add(s2, s2, s1);
3247     __ add(s1, s1, temp2);
3248     __ ubfx(temp2, temp0, 32, 8);
3249     __ add(s2, s2, s1);
3250     __ add(s1, s1, temp2);
3251     __ ubfx(temp2, temp0, 40, 8);
3252     __ add(s2, s2, s1);
3253     __ add(s1, s1, temp2);
3254     __ ubfx(temp2, temp0, 48, 8);
3255     __ add(s2, s2, s1);
3256     __ add(s1, s1, temp2);
3257     __ add(s2, s2, s1);
3258     __ add(s1, s1, temp0, Assembler::LSR, 56);
3259     __ add(s2, s2, s1);
3260 
3261     __ add(s1, s1, temp1, ext::uxtb);
3262     __ ubfx(temp2, temp1, 8, 8);
3263     __ add(s2, s2, s1);
3264     __ add(s1, s1, temp2);
3265     __ ubfx(temp2, temp1, 16, 8);
3266     __ add(s2, s2, s1);
3267     __ add(s1, s1, temp2);
3268     __ ubfx(temp2, temp1, 24, 8);
3269     __ add(s2, s2, s1);
3270     __ add(s1, s1, temp2);
3271     __ ubfx(temp2, temp1, 32, 8);
3272     __ add(s2, s2, s1);
3273     __ add(s1, s1, temp2);
3274     __ ubfx(temp2, temp1, 40, 8);
3275     __ add(s2, s2, s1);
3276     __ add(s1, s1, temp2);
3277     __ ubfx(temp2, temp1, 48, 8);
3278     __ add(s2, s2, s1);
3279     __ add(s1, s1, temp2);
3280     __ add(s2, s2, s1);
3281     __ add(s1, s1, temp1, Assembler::LSR, 56);
3282     __ add(s2, s2, s1);
3283 
3284     __ subs(len, len, 16);
3285     __ br(Assembler::HS, L_by16_loop);
3286 
3287     __ bind(L_by1);
3288     __ adds(len, len, 15);
3289     __ br(Assembler::LO, L_do_mod);
3290 
3291     __ bind(L_by1_loop);
3292     __ ldrb(temp0, Address(__ post(buff, 1)));
3293     __ add(s1, temp0, s1);
3294     __ add(s2, s2, s1);
3295     __ subs(len, len, 1);
3296     __ br(Assembler::HS, L_by1_loop);
3297 
3298     __ bind(L_do_mod);
3299     // s1 = s1 % BASE
3300     __ lsr(temp0, s1, 16);
3301     __ lsl(temp1, temp0, 4);
3302     __ sub(temp1, temp1, temp0);
3303     __ add(temp1, temp1, s1, ext::uxth);
3304 
3305     __ lsr(temp0, temp1, 16);
3306     __ lsl(s1, temp0, 4);
3307     __ sub(s1, s1, temp0);
3308     __ add(s1, s1, temp1, ext::uxth);
3309 
3310     __ subs(temp0, s1, base);
3311     __ csel(s1, temp0, s1, Assembler::HS);
3312 
3313     // s2 = s2 % BASE
3314     __ lsr(temp0, s2, 16);
3315     __ lsl(temp1, temp0, 4);
3316     __ sub(temp1, temp1, temp0);
3317     __ add(temp1, temp1, s2, ext::uxth);
3318 
3319     __ lsr(temp0, temp1, 16);
3320     __ lsl(s2, temp0, 4);
3321     __ sub(s2, s2, temp0);
3322     __ add(s2, s2, temp1, ext::uxth);
3323 
3324     __ subs(temp0, s2, base);
3325     __ csel(s2, temp0, s2, Assembler::HS);
3326 
3327     // Combine lower bits and higher bits
3328     __ bind(L_combine);
3329     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3330 
3331     __ ret(lr);
3332 
3333     return start;
3334   }
3335 
3336   /**
3337    *  Arguments:
3338    *
3339    *  Input:
3340    *    c_rarg0   - x address
3341    *    c_rarg1   - x length
3342    *    c_rarg2   - y address
3343    *    c_rarg3   - y length
3344    *    c_rarg4   - z address
3345    *    c_rarg5   - z length
3346    */
3347   address generate_multiplyToLen() {
3348     __ align(CodeEntryAlignment);
3349     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3350 
3351     address start = __ pc();
3352     const Register x     = r0;
3353     const Register xlen  = r1;
3354     const Register y     = r2;
3355     const Register ylen  = r3;
3356     const Register z     = r4;
3357     const Register zlen  = r5;
3358 
3359     const Register tmp1  = r10;
3360     const Register tmp2  = r11;
3361     const Register tmp3  = r12;
3362     const Register tmp4  = r13;
3363     const Register tmp5  = r14;
3364     const Register tmp6  = r15;
3365     const Register tmp7  = r16;
3366 
3367     BLOCK_COMMENT("Entry:");
3368     __ enter(); // required for proper stackwalking of RuntimeStub frame
3369     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3370     __ leave(); // required for proper stackwalking of RuntimeStub frame
3371     __ ret(lr);
3372 
3373     return start;
3374   }
3375 
3376   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3377                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3378                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3379     // Karatsuba multiplication performs a 128*128 -> 256-bit
3380     // multiplication in three 128-bit multiplications and a few
3381     // additions.
3382     //
3383     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3384     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3385     //
3386     // Inputs:
3387     //
3388     // A0 in a.d[0]     (subkey)
3389     // A1 in a.d[1]
3390     // (A1+A0) in a1_xor_a0.d[0]
3391     //
3392     // B0 in b.d[0]     (state)
3393     // B1 in b.d[1]
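         //
         // A rough C-style sketch, where clmul(x, y) stands in for a
         // 64x64->128-bit carry-less multiply (PMULL/PMULL2) and addition
         // is XOR; this shows the algebra, not the exact schedule below:
         //
         //   uint128 C = clmul(A1, B1);
         //   uint128 D = clmul(A0, B0);
         //   uint128 E = clmul(A0 ^ A1, B0 ^ B1);
         //   uint128 mid = C ^ D ^ E;             // the middle 128 bits
         //   result_hi = C ^ (mid >> 64);
         //   result_lo = D ^ (mid << 64);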
3394 
3395     __ ext(tmp1, __ T16B, b, b, 0x08);
3396     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3397     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3398     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3399     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3400 
3401     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3402     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3403     __ eor(tmp2, __ T16B, tmp2, tmp4);
3404     __ eor(tmp2, __ T16B, tmp2, tmp3);
3405 
3406     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3407     __ ins(result_hi, __ D, tmp2, 0, 1);
3408     __ ins(result_lo, __ D, tmp2, 1, 0);
3409   }
3410 
3411   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3412                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3413     const FloatRegister t0 = result;
3414 
3415     // The GCM field polynomial f is z^128 + p(z), where p =
3416     // z^7+z^2+z+1.
3417     //
3418     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3419     //
3420     // so, given that the product we're reducing is
3421     //    a == lo + hi * z^128
3422     // substituting,
3423     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3424     //
3425     // we reduce by multiplying hi by p(z) and subtracting the result
3426     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3427     // bits we can do this with two 64-bit multiplications, lo*p and
3428     // hi*p.
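         //
         // Roughly, in the same C-style notation as above (clmul for PMULL,
         // XOR for addition), the two folding steps are:
         //
         //   t   = clmul(hi.d[1], p);             // fold the top 64 bits
         //   hi ^= t >> 64;
         //   lo ^= t << 64;
         //   result = lo ^ clmul(hi.d[0], p);     // fold the remaining bits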
3429 
3430     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3431     __ ext(t1, __ T16B, t0, z, 8);
3432     __ eor(hi, __ T16B, hi, t1);
3433     __ ext(t1, __ T16B, z, t0, 8);
3434     __ eor(lo, __ T16B, lo, t1);
3435     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3436     __ eor(result, __ T16B, lo, t0);
3437   }
3438 
3439   /**
3440    *  Arguments:
3441    *
3442    *  Input:
3443    *  c_rarg0   - current state address
3444    *  c_rarg1   - H key address
3445    *  c_rarg2   - data address
3446    *  c_rarg3   - number of blocks
3447    *
3448    *  Output:
3449    *  Updated state at c_rarg0
3450    */
3451   address generate_ghash_processBlocks() {
3452     // Bafflingly, GCM uses little-endian for the byte order, but
3453     // big-endian for the bit order.  For example, the polynomial 1 is
3454     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3455     //
3456     // So, we must either reverse the bytes in each word and do
3457     // everything big-endian or reverse the bits in each byte and do
3458     // it little-endian.  On AArch64 it's more idiomatic to reverse
3459     // the bits in each byte (we have an instruction, RBIT, to do
3460     // that) and keep the data in little-endian bit order throughout the
3461     // calculation, bit-reversing the inputs and outputs.
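         //
         // For example, the polynomial 1 (the byte string 80 00 .. 00 in
         // GCM's convention) becomes 01 00 .. 00 after RBIT, i.e. the
         // ordinary little-endian encoding of the value 1.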
3462 
3463     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3464     __ align(wordSize * 2);
3465     address p = __ pc();
3466     __ emit_int64(0x87);  // The low-order bits of the field
3467                           // polynomial (i.e. p = z^7+z^2+z+1)
3468                           // repeated in the low and high parts of a
3469                           // 128-bit vector
3470     __ emit_int64(0x87);
3471 
3472     __ align(CodeEntryAlignment);
3473     address start = __ pc();
3474 
3475     Register state   = c_rarg0;
3476     Register subkeyH = c_rarg1;
3477     Register data    = c_rarg2;
3478     Register blocks  = c_rarg3;
3479 
3480     FloatRegister vzr = v30;
3481     __ eor(vzr, __ T16B, vzr, vzr); // zero register
3482 
3483     __ ldrq(v0, Address(state));
3484     __ ldrq(v1, Address(subkeyH));
3485 
3486     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
3487     __ rbit(v0, __ T16B, v0);
3488     __ rev64(v1, __ T16B, v1);
3489     __ rbit(v1, __ T16B, v1);
3490 
3491     __ ldrq(v26, p);
3492 
3493     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH (v1) into v16
3494     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
3495 
3496     {
3497       Label L_ghash_loop;
3498       __ bind(L_ghash_loop);
3499 
3500       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
3501                                                  // reversing each byte
3502       __ rbit(v2, __ T16B, v2);
3503       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
3504 
3505       // Multiply state in v2 by subkey in v1
3506       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
3507                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
3508                      /*temps*/v6, v20, v18, v21);
3509       // Reduce v7:v5 by the field polynomial
3510       ghash_reduce(v0, v5, v7, v26, vzr, v20);
3511 
3512       __ sub(blocks, blocks, 1);
3513       __ cbnz(blocks, L_ghash_loop);
3514     }
3515 
3516     // The bit-reversed result is at this point in v0
3517     __ rev64(v1, __ T16B, v0);
3518     __ rbit(v1, __ T16B, v1);
3519 
3520     __ st1(v1, __ T16B, state);
3521     __ ret(lr);
3522 
3523     return start;
3524   }
3525 
3526   // Continuation point for throwing of implicit exceptions that are
3527   // not handled in the current activation. Fabricates an exception
3528   // oop and initiates normal exception dispatching in this
3529   // frame. Since we need to preserve callee-saved values (currently
3530   // only for C2, but done for C1 as well) we need a callee-saved oop
3531   // map and therefore have to make these stubs into RuntimeStubs
3532   // rather than BufferBlobs.  If the compiler needs all registers to
3533   // be preserved between the fault point and the exception handler
3534   // then it must assume responsibility for that in
3535   // AbstractCompiler::continuation_for_implicit_null_exception or
3536   // continuation_for_implicit_division_by_zero_exception. All other
3537   // implicit exceptions (e.g., NullPointerException or
3538   // AbstractMethodError on entry) are either at call sites or
3539   // otherwise assume that stack unwinding will be initiated, so
3540   // caller saved registers were assumed volatile in the compiler.
3541 
3542 #undef __
3543 #define __ masm->
3544 
3545   address generate_throw_exception(const char* name,
3546                                    address runtime_entry,
3547                                    Register arg1 = noreg,
3548                                    Register arg2 = noreg) {
3549     // Information about frame layout at time of blocking runtime call.
3550     // Note that we only have to preserve callee-saved registers since
3551     // the compilers are responsible for supplying a continuation point
3552     // if they expect all registers to be preserved.
3553     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
3554     enum layout {
3555       rfp_off = 0,
3556       rfp_off2,
3557       return_off,
3558       return_off2,
3559       framesize // inclusive of return address
3560     };
3561 
3562     int insts_size = 512;
3563     int locs_size  = 64;
3564 
3565     CodeBuffer code(name, insts_size, locs_size);
3566     OopMapSet* oop_maps  = new OopMapSet();
3567     MacroAssembler* masm = new MacroAssembler(&code);
3568 
3569     address start = __ pc();
3570 
3571     // This is an inlined and slightly modified version of call_VM
3572     // which has the ability to fetch the return PC out of
3573     // thread-local storage and also sets up last_Java_sp slightly
3574     // differently than the real call_VM
3575 
3576     __ enter(); // Save FP and LR before call
3577 
3578     assert(is_even(framesize/2), "sp not 16-byte aligned");
3579 
3580     // lr and fp are already in place
3581     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
3582 
3583     int frame_complete = __ pc() - start;
3584 
3585     // Set up last_Java_sp and last_Java_fp
3586     address the_pc = __ pc();
3587     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
3588 
3589     // Call runtime
3590     if (arg1 != noreg) {
3591       assert(arg2 != c_rarg1, "clobbered");
3592       __ mov(c_rarg1, arg1);
3593     }
3594     if (arg2 != noreg) {
3595       __ mov(c_rarg2, arg2);
3596     }
3597     __ mov(c_rarg0, rthread);
3598     BLOCK_COMMENT("call runtime_entry");
3599     __ mov(rscratch1, runtime_entry);
3600     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
3601 
3602     // Generate oop map
3603     OopMap* map = new OopMap(framesize, 0);
3604 
3605     oop_maps->add_gc_map(the_pc - start, map);
3606 
3607     __ reset_last_Java_frame(true, true);
3608     __ maybe_isb();
3609 
3610     __ leave();
3611 
3612     // check for pending exceptions
3613 #ifdef ASSERT
3614     Label L;
3615     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
3616     __ cbnz(rscratch1, L);
3617     __ should_not_reach_here();
3618     __ bind(L);
3619 #endif // ASSERT
3620     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3621 
3622 
3623     // codeBlob framesize is in words (not VMRegImpl::slot_size)
3624     RuntimeStub* stub =
3625       RuntimeStub::new_runtime_stub(name,
3626                                     &code,
3627                                     frame_complete,
3628                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3629                                     oop_maps, false);
3630     return stub->entry_point();
3631   }
3632 
3633   class MontgomeryMultiplyGenerator : public MacroAssembler {
3634 
3635     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3636       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
3637 
3638     RegSet _toSave;
3639     bool _squaring;
3640 
3641   public:
3642     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3643       : MacroAssembler(as->code()), _squaring(squaring) {
3644 
3645       // Register allocation
3646 
3647       Register reg = c_rarg0;
3648       Pa_base = reg;       // Argument registers
3649       if (squaring)
3650         Pb_base = Pa_base;
3651       else
3652         Pb_base = ++reg;
3653       Pn_base = ++reg;
3654       Rlen= ++reg;
3655       inv = ++reg;
3656       Pm_base = ++reg;
3657 
3658                           // Working registers:
3659       Ra =  ++reg;        // The current digit of a, b, n, and m.
3660       Rb =  ++reg;
3661       Rm =  ++reg;
3662       Rn =  ++reg;
3663 
3664       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
3665       Pb =  ++reg;
3666       Pm =  ++reg;
3667       Pn =  ++reg;
3668 
3669       t0 =  ++reg;        // Three registers which form a
3670       t1 =  ++reg;        // triple-precision accumulator.
3671       t2 =  ++reg;
3672 
3673       Ri =  ++reg;        // Inner and outer loop indexes.
3674       Rj =  ++reg;
3675 
3676       Rhi_ab = ++reg;     // Product registers: low and high parts
3677       Rlo_ab = ++reg;     // of a*b and m*n.
3678       Rhi_mn = ++reg;
3679       Rlo_mn = ++reg;
3680 
3681       // r19 and up are callee-saved.
3682       _toSave = RegSet::range(r19, reg) + Pm_base;
3683     }
3684 
3685   private:
3686     void save_regs() {
3687       push(_toSave, sp);
3688     }
3689 
3690     void restore_regs() {
3691       pop(_toSave, sp);
3692     }
3693 
3694     template <typename T>
3695     void unroll_2(Register count, T block) {
3696       Label loop, end, odd;
3697       tbnz(count, 0, odd);
3698       cbz(count, end);
3699       align(16);
3700       bind(loop);
3701       (this->*block)();
3702       bind(odd);
3703       (this->*block)();
3704       subs(count, count, 2);
3705       br(Assembler::GT, loop);
3706       bind(end);
3707     }
3708 
3709     template <typename T>
3710     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3711       Label loop, end, odd;
3712       tbnz(count, 0, odd);
3713       cbz(count, end);
3714       align(16);
3715       bind(loop);
3716       (this->*block)(d, s, tmp);
3717       bind(odd);
3718       (this->*block)(d, s, tmp);
3719       subs(count, count, 2);
3720       br(Assembler::GT, loop);
3721       bind(end);
3722     }
3723 
3724     void pre1(RegisterOrConstant i) {
3725       block_comment("pre1");
3726       // Pa = Pa_base;
3727       // Pb = Pb_base + i;
3728       // Pm = Pm_base;
3729       // Pn = Pn_base + i;
3730       // Ra = *Pa;
3731       // Rb = *Pb;
3732       // Rm = *Pm;
3733       // Rn = *Pn;
3734       ldr(Ra, Address(Pa_base));
3735       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3736       ldr(Rm, Address(Pm_base));
3737       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3738       lea(Pa, Address(Pa_base));
3739       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3740       lea(Pm, Address(Pm_base));
3741       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3742 
3743       // Zero the m*n result.
3744       mov(Rhi_mn, zr);
3745       mov(Rlo_mn, zr);
3746     }
3747 
3748     // The core multiply-accumulate step of a Montgomery
3749     // multiplication.  The idea is to schedule operations as a
3750     // pipeline so that instructions with long latencies (loads and
3751     // multiplies) have time to complete before their results are
3752     // used.  This most benefits in-order implementations of the
3753     // architecture but out-of-order ones also benefit.
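         //
         // In the C-style comments below, MACC(A, B, t0, t1, t2) denotes,
         // approximately, a 64x64->128-bit multiply accumulated with carry
         // into the triple-precision accumulator <t2:t1:t0>, which is what
         // the umulh/mul pairs followed by acc() implement:
         //
         //   uint128 p = (uint128) A * B;
         //   t0 += (uint64_t) p;                  // adds
         //   t1 += (uint64_t)(p >> 64) + carry;   // adcs
         //   t2 += carry;                         // adc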
3754     void step() {
3755       block_comment("step");
3756       // MACC(Ra, Rb, t0, t1, t2);
3757       // Ra = *++Pa;
3758       // Rb = *--Pb;
3759       umulh(Rhi_ab, Ra, Rb);
3760       mul(Rlo_ab, Ra, Rb);
3761       ldr(Ra, pre(Pa, wordSize));
3762       ldr(Rb, pre(Pb, -wordSize));
3763       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
3764                                        // previous iteration.
3765       // MACC(Rm, Rn, t0, t1, t2);
3766       // Rm = *++Pm;
3767       // Rn = *--Pn;
3768       umulh(Rhi_mn, Rm, Rn);
3769       mul(Rlo_mn, Rm, Rn);
3770       ldr(Rm, pre(Pm, wordSize));
3771       ldr(Rn, pre(Pn, -wordSize));
3772       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3773     }
3774 
3775     void post1() {
3776       block_comment("post1");
3777 
3778       // MACC(Ra, Rb, t0, t1, t2);
3779       // Ra = *++Pa;
3780       // Rb = *--Pb;
3781       umulh(Rhi_ab, Ra, Rb);
3782       mul(Rlo_ab, Ra, Rb);
3783       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3784       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3785 
3786       // *Pm = Rm = t0 * inv;
3787       mul(Rm, t0, inv);
3788       str(Rm, Address(Pm));
3789 
3790       // MACC(Rm, Rn, t0, t1, t2);
3791       // t0 = t1; t1 = t2; t2 = 0;
3792       umulh(Rhi_mn, Rm, Rn);
3793 
3794 #ifndef PRODUCT
3795       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
3796       {
3797         mul(Rlo_mn, Rm, Rn);
3798         add(Rlo_mn, t0, Rlo_mn);
3799         Label ok;
3800         cbz(Rlo_mn, ok); {
3801           stop("broken Montgomery multiply");
3802         } bind(ok);
3803       }
3804 #endif
3805       // We have very carefully set things up so that
3806       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
3807       // the lower half of Rm * Rn because we know the result already:
3808       // it must be -t0.  t0 + (-t0) must generate a carry iff
3809       // t0 != 0.  So, rather than do a mul and an adds we just set
3810       // the carry flag iff t0 is nonzero.
3811       //
3812       // mul(Rlo_mn, Rm, Rn);
3813       // adds(zr, t0, Rlo_mn);
3814       subs(zr, t0, 1); // Set carry iff t0 is nonzero
3815       adcs(t0, t1, Rhi_mn);
3816       adc(t1, t2, zr);
3817       mov(t2, zr);
3818     }
3819 
3820     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
3821       block_comment("pre2");
3822       // Pa = Pa_base + i-len;
3823       // Pb = Pb_base + len;
3824       // Pm = Pm_base + i-len;
3825       // Pn = Pn_base + len;
3826 
3827       if (i.is_register()) {
3828         sub(Rj, i.as_register(), len);
3829       } else {
3830         mov(Rj, i.as_constant());
3831         sub(Rj, Rj, len);
3832       }
3833       // Rj == i-len
3834 
3835       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
3836       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
3837       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
3838       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
3839 
3840       // Ra = *++Pa;
3841       // Rb = *--Pb;
3842       // Rm = *++Pm;
3843       // Rn = *--Pn;
3844       ldr(Ra, pre(Pa, wordSize));
3845       ldr(Rb, pre(Pb, -wordSize));
3846       ldr(Rm, pre(Pm, wordSize));
3847       ldr(Rn, pre(Pn, -wordSize));
3848 
3849       mov(Rhi_mn, zr);
3850       mov(Rlo_mn, zr);
3851     }
3852 
3853     void post2(RegisterOrConstant i, RegisterOrConstant len) {
3854       block_comment("post2");
3855       if (i.is_constant()) {
3856         mov(Rj, i.as_constant()-len.as_constant());
3857       } else {
3858         sub(Rj, i.as_register(), len);
3859       }
3860 
3861       adds(t0, t0, Rlo_mn); // The pending m*n, low part
3862 
3863       // As soon as we know the least significant digit of our result,
3864       // store it.
3865       // Pm_base[i-len] = t0;
3866       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
3867 
3868       // t0 = t1; t1 = t2; t2 = 0;
3869       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
3870       adc(t1, t2, zr);
3871       mov(t2, zr);
3872     }
3873 
3874     // A carry in t0 after Montgomery multiplication means that we
3875     // should subtract multiples of n from our result in m.  We'll
3876     // keep doing that until there is no carry.
3877     void normalize(RegisterOrConstant len) {
3878       block_comment("normalize");
3879       // while (t0)
3880       //   t0 = sub(Pm_base, Pn_base, t0, len);
3881       Label loop, post, again;
3882       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
3883       cbz(t0, post); {
3884         bind(again); {
3885           mov(i, zr);
3886           mov(cnt, len);
3887           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3888           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3889           subs(zr, zr, zr); // set carry flag, i.e. no borrow
3890           align(16);
3891           bind(loop); {
3892             sbcs(Rm, Rm, Rn);
3893             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3894             add(i, i, 1);
3895             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3896             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3897             sub(cnt, cnt, 1);
3898           } cbnz(cnt, loop);
3899           sbc(t0, t0, zr);
3900         } cbnz(t0, again);
3901       } bind(post);
3902     }
3903 
3904     // Move memory at s to d, reversing words.
3905     //    Increments d to end of copied memory
3906     //    Destroys tmp1, tmp2
3907     //    Preserves len
3908     //    Leaves s pointing to the address which was in d at start
3909     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
3910       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
3911 
3912       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
3913       mov(tmp1, len);
3914       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
3915       sub(s, d, len, ext::uxtw, LogBytesPerWord);
3916     }
3917     // where reverse1 copies one word, swapping its 32-bit halves:
3918     void reverse1(Register d, Register s, Register tmp) {
3919       ldr(tmp, pre(s, -wordSize));
3920       ror(tmp, tmp, 32);
3921       str(tmp, post(d, wordSize));
3922     }
3923 
3924     void step_squaring() {
3925       // An extra ACC
3926       step();
3927       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3928     }
3929 
3930     void last_squaring(RegisterOrConstant i) {
3931       Label dont;
3932       // if ((i & 1) == 0) {
3933       tbnz(i.as_register(), 0, dont); {
3934         // MACC(Ra, Rb, t0, t1, t2);
3935         // Ra = *++Pa;
3936         // Rb = *--Pb;
3937         umulh(Rhi_ab, Ra, Rb);
3938         mul(Rlo_ab, Ra, Rb);
3939         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3940       } bind(dont);
3941     }
3942 
3943     void extra_step_squaring() {
3944       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3945 
3946       // MACC(Rm, Rn, t0, t1, t2);
3947       // Rm = *++Pm;
3948       // Rn = *--Pn;
3949       umulh(Rhi_mn, Rm, Rn);
3950       mul(Rlo_mn, Rm, Rn);
3951       ldr(Rm, pre(Pm, wordSize));
3952       ldr(Rn, pre(Pn, -wordSize));
3953     }
3954 
3955     void post1_squaring() {
3956       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3957 
3958       // *Pm = Rm = t0 * inv;
3959       mul(Rm, t0, inv);
3960       str(Rm, Address(Pm));
3961 
3962       // MACC(Rm, Rn, t0, t1, t2);
3963       // t0 = t1; t1 = t2; t2 = 0;
3964       umulh(Rhi_mn, Rm, Rn);
3965 
3966 #ifndef PRODUCT
3967       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
3968       {
3969         mul(Rlo_mn, Rm, Rn);
3970         add(Rlo_mn, t0, Rlo_mn);
3971         Label ok;
3972         cbz(Rlo_mn, ok); {
3973           stop("broken Montgomery multiply");
3974         } bind(ok);
3975       }
3976 #endif
3977       // We have very carefully set things up so that
3978       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
3979       // the lower half of Rm * Rn because we know the result already:
3980       // it must be -t0.  t0 + (-t0) must generate a carry iff
3981       // t0 != 0.  So, rather than do a mul and an adds we just set
3982       // the carry flag iff t0 is nonzero.
3983       //
3984       // mul(Rlo_mn, Rm, Rn);
3985       // adds(zr, t0, Rlo_mn);
3986       subs(zr, t0, 1); // Set carry iff t0 is nonzero
3987       adcs(t0, t1, Rhi_mn);
3988       adc(t1, t2, zr);
3989       mov(t2, zr);
3990     }
3991 
3992     void acc(Register Rhi, Register Rlo,
3993              Register t0, Register t1, Register t2) {
3994       adds(t0, t0, Rlo);
3995       adcs(t1, t1, Rhi);
3996       adc(t2, t2, zr);
3997     }
3998 
3999   public:
4000     /**
4001      * Fast Montgomery multiplication.  The derivation of the
4002      * algorithm is in A Cryptographic Library for the Motorola
4003      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
4004      *
4005      * Arguments:
4006      *
4007      * Inputs for multiplication:
4008      *   c_rarg0   - int array elements a
4009      *   c_rarg1   - int array elements b
4010      *   c_rarg2   - int array elements n (the modulus)
4011      *   c_rarg3   - int length
4012      *   c_rarg4   - int inv
4013      *   c_rarg5   - int array elements m (the result)
4014      *
4015      * Inputs for squaring:
4016      *   c_rarg0   - int array elements a
4017      *   c_rarg1   - int array elements n (the modulus)
4018      *   c_rarg2   - int length
4019      *   c_rarg3   - int inv
4020      *   c_rarg4   - int array elements m (the result)
4021      *
4022      */
4023     address generate_multiply() {
4024       Label argh, nothing;
4025       bind(argh);
4026       stop("MontgomeryMultiply total_allocation must be <= 8192");
4027 
4028       align(CodeEntryAlignment);
4029       address entry = pc();
4030 
4031       cbzw(Rlen, nothing);
4032 
4033       enter();
4034 
4035       // Make room.
4036       cmpw(Rlen, 512);
4037       br(Assembler::HI, argh);
4038       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4039       andr(sp, Ra, -2 * wordSize);
4040 
4041       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4042 
4043       {
4044         // Copy input args, reversing as we go.  We use Ra as a
4045         // temporary variable.
4046         reverse(Ra, Pa_base, Rlen, t0, t1);
4047         if (!_squaring)
4048           reverse(Ra, Pb_base, Rlen, t0, t1);
4049         reverse(Ra, Pn_base, Rlen, t0, t1);
4050       }
4051 
4052       // Push all call-saved registers and also Pm_base which we'll need
4053       // at the end.
4054       save_regs();
4055 
4056 #ifndef PRODUCT
4057       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
4058       {
4059         ldr(Rn, Address(Pn_base, 0));
4060         mul(Rlo_mn, Rn, inv);
4061         cmp(Rlo_mn, -1);
4062         Label ok;
4063         br(EQ, ok); {
4064           stop("broken inverse in Montgomery multiply");
4065         } bind(ok);
4066       }
4067 #endif
4068 
4069       mov(Pm_base, Ra);
4070 
4071       mov(t0, zr);
4072       mov(t1, zr);
4073       mov(t2, zr);
4074 
4075       block_comment("for (int i = 0; i < len; i++) {");
4076       mov(Ri, zr); {
4077         Label loop, end;
4078         cmpw(Ri, Rlen);
4079         br(Assembler::GE, end);
4080 
4081         bind(loop);
4082         pre1(Ri);
4083 
4084         block_comment("  for (j = i; j; j--) {"); {
4085           movw(Rj, Ri);
4086           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4087         } block_comment("  } // j");
4088 
4089         post1();
4090         addw(Ri, Ri, 1);
4091         cmpw(Ri, Rlen);
4092         br(Assembler::LT, loop);
4093         bind(end);
4094         block_comment("} // i");
4095       }
4096 
4097       block_comment("for (int i = len; i < 2*len; i++) {");
4098       mov(Ri, Rlen); {
4099         Label loop, end;
4100         cmpw(Ri, Rlen, Assembler::LSL, 1);
4101         br(Assembler::GE, end);
4102 
4103         bind(loop);
4104         pre2(Ri, Rlen);
4105 
4106         block_comment("  for (j = len*2-i-1; j; j--) {"); {
4107           lslw(Rj, Rlen, 1);
4108           subw(Rj, Rj, Ri);
4109           subw(Rj, Rj, 1);
4110           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4111         } block_comment("  } // j");
4112 
4113         post2(Ri, Rlen);
4114         addw(Ri, Ri, 1);
4115         cmpw(Ri, Rlen, Assembler::LSL, 1);
4116         br(Assembler::LT, loop);
4117         bind(end);
4118       }
4119       block_comment("} // i");
4120 
4121       normalize(Rlen);
4122 
4123       mov(Ra, Pm_base);  // Save Pm_base in Ra
4124       restore_regs();  // Restore caller's Pm_base
4125 
4126       // Copy our result into caller's Pm_base
4127       reverse(Pm_base, Ra, Rlen, t0, t1);
4128 
4129       leave();
4130       bind(nothing);
4131       ret(lr);
4132 
4133       return entry;
4134     }
4135     // In C, approximately:
4136 
4137     // void
4138     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
4139     //                     unsigned long Pn_base[], unsigned long Pm_base[],
4140     //                     unsigned long inv, int len) {
4141     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4142     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4143     //   unsigned long Ra, Rb, Rn, Rm;
4144 
4145     //   int i;
4146 
4147     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4148 
4149     //   for (i = 0; i < len; i++) {
4150     //     int j;
4151 
4152     //     Pa = Pa_base;
4153     //     Pb = Pb_base + i;
4154     //     Pm = Pm_base;
4155     //     Pn = Pn_base + i;
4156 
4157     //     Ra = *Pa;
4158     //     Rb = *Pb;
4159     //     Rm = *Pm;
4160     //     Rn = *Pn;
4161 
4162     //     int iters = i;
4163     //     for (j = 0; iters--; j++) {
4164     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4165     //       MACC(Ra, Rb, t0, t1, t2);
4166     //       Ra = *++Pa;
4167     //       Rb = *--Pb;
4168     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4169     //       MACC(Rm, Rn, t0, t1, t2);
4170     //       Rm = *++Pm;
4171     //       Rn = *--Pn;
4172     //     }
4173 
4174     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
4175     //     MACC(Ra, Rb, t0, t1, t2);
4176     //     *Pm = Rm = t0 * inv;
4177     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4178     //     MACC(Rm, Rn, t0, t1, t2);
4179 
4180     //     assert(t0 == 0, "broken Montgomery multiply");
4181 
4182     //     t0 = t1; t1 = t2; t2 = 0;
4183     //   }
4184 
4185     //   for (i = len; i < 2*len; i++) {
4186     //     int j;
4187 
4188     //     Pa = Pa_base + i-len;
4189     //     Pb = Pb_base + len;
4190     //     Pm = Pm_base + i-len;
4191     //     Pn = Pn_base + len;
4192 
4193     //     Ra = *++Pa;
4194     //     Rb = *--Pb;
4195     //     Rm = *++Pm;
4196     //     Rn = *--Pn;
4197 
4198     //     int iters = len*2-i-1;
4199     //     for (j = i-len+1; iters--; j++) {
4200     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4201     //       MACC(Ra, Rb, t0, t1, t2);
4202     //       Ra = *++Pa;
4203     //       Rb = *--Pb;
4204     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4205     //       MACC(Rm, Rn, t0, t1, t2);
4206     //       Rm = *++Pm;
4207     //       Rn = *--Pn;
4208     //     }
4209 
4210     //     Pm_base[i-len] = t0;
4211     //     t0 = t1; t1 = t2; t2 = 0;
4212     //   }
4213 
4214     //   while (t0)
4215     //     t0 = sub(Pm_base, Pn_base, t0, len);
4216     // }
4217 
4218     /**
4219      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
4220      * multiplies than Montgomery multiplication so it should be up to
4221      * 25% faster.  However, its loop control is more complex and it
4222      * may actually run slower on some machines.
4223      *
4224      * Arguments:
4225      *
4226      * Inputs:
4227      *   c_rarg0   - int array elements a
4228      *   c_rarg1   - int array elements n (the modulus)
4229      *   c_rarg2   - int length
4230      *   c_rarg3   - int inv
4231      *   c_rarg4   - int array elements m (the result)
4232      *
4233      */
4234     address generate_square() {
4235       Label argh;
4236       bind(argh);
4237       stop("MontgomeryMultiply total_allocation must be <= 8192");
4238 
4239       align(CodeEntryAlignment);
4240       address entry = pc();
4241 
4242       enter();
4243 
4244       // Make room.
4245       cmpw(Rlen, 512);
4246       br(Assembler::HI, argh);
4247       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4248       andr(sp, Ra, -2 * wordSize);
4249 
4250       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4251 
4252       {
4253         // Copy input args, reversing as we go.  We use Ra as a
4254         // temporary variable.
4255         reverse(Ra, Pa_base, Rlen, t0, t1);
4256         reverse(Ra, Pn_base, Rlen, t0, t1);
4257       }
4258 
4259       // Push all call-saved registers and also Pm_base which we'll need
4260       // at the end.
4261       save_regs();
4262 
4263       mov(Pm_base, Ra);
4264 
4265       mov(t0, zr);
4266       mov(t1, zr);
4267       mov(t2, zr);
4268 
4269       block_comment("for (int i = 0; i < len; i++) {");
4270       mov(Ri, zr); {
4271         Label loop, end;
4272         bind(loop);
4273         cmp(Ri, Rlen);
4274         br(Assembler::GE, end);
4275 
4276         pre1(Ri);
4277 
4278         block_comment("for (j = (i+1)/2; j; j--) {"); {
4279           add(Rj, Ri, 1);
4280           lsr(Rj, Rj, 1);
4281           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4282         } block_comment("  } // j");
4283 
4284         last_squaring(Ri);
4285 
4286         block_comment("  for (j = i/2; j; j--) {"); {
4287           lsr(Rj, Ri, 1);
4288           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4289         } block_comment("  } // j");
4290 
4291         post1_squaring();
4292         add(Ri, Ri, 1);
4293         cmp(Ri, Rlen);
4294         br(Assembler::LT, loop);
4295 
4296         bind(end);
4297         block_comment("} // i");
4298       }
4299 
4300       block_comment("for (int i = len; i < 2*len; i++) {");
4301       mov(Ri, Rlen); {
4302         Label loop, end;
4303         bind(loop);
4304         cmp(Ri, Rlen, Assembler::LSL, 1);
4305         br(Assembler::GE, end);
4306 
4307         pre2(Ri, Rlen);
4308 
4309         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4310           lsl(Rj, Rlen, 1);
4311           sub(Rj, Rj, Ri);
4312           sub(Rj, Rj, 1);
4313           lsr(Rj, Rj, 1);
4314           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4315         } block_comment("  } // j");
4316 
4317         last_squaring(Ri);
4318 
4319         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4320           lsl(Rj, Rlen, 1);
4321           sub(Rj, Rj, Ri);
4322           lsr(Rj, Rj, 1);
4323           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4324         } block_comment("  } // j");
4325 
4326         post2(Ri, Rlen);
4327         add(Ri, Ri, 1);
4328         cmp(Ri, Rlen, Assembler::LSL, 1);
4329 
4330         br(Assembler::LT, loop);
4331         bind(end);
4332         block_comment("} // i");
4333       }
4334 
4335       normalize(Rlen);
4336 
4337       mov(Ra, Pm_base);  // Save Pm_base in Ra
4338       restore_regs();  // Restore caller's Pm_base
4339 
4340       // Copy our result into caller's Pm_base
4341       reverse(Pm_base, Ra, Rlen, t0, t1);
4342 
4343       leave();
4344       ret(lr);
4345 
4346       return entry;
4347     }
4348     // In C, approximately:
4349 
4350     // void
4351     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4352     //                   unsigned long Pm_base[], unsigned long inv, int len) {
4353     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4354     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4355     //   unsigned long Ra, Rb, Rn, Rm;
4356 
4357     //   int i;
4358 
4359     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4360 
4361     //   for (i = 0; i < len; i++) {
4362     //     int j;
4363 
4364     //     Pa = Pa_base;
4365     //     Pb = Pa_base + i;
4366     //     Pm = Pm_base;
4367     //     Pn = Pn_base + i;
4368 
4369     //     Ra = *Pa;
4370     //     Rb = *Pb;
4371     //     Rm = *Pm;
4372     //     Rn = *Pn;
4373 
4374     //     int iters = (i+1)/2;
4375     //     for (j = 0; iters--; j++) {
4376     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4377     //       MACC2(Ra, Rb, t0, t1, t2);
4378     //       Ra = *++Pa;
4379     //       Rb = *--Pb;
4380     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4381     //       MACC(Rm, Rn, t0, t1, t2);
4382     //       Rm = *++Pm;
4383     //       Rn = *--Pn;
4384     //     }
4385     //     if ((i & 1) == 0) {
4386     //       assert(Ra == Pa_base[j], "must be");
4387     //       MACC(Ra, Ra, t0, t1, t2);
4388     //     }
4389     //     iters = i/2;
4390     //     assert(iters == i-j, "must be");
4391     //     for (; iters--; j++) {
4392     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4393     //       MACC(Rm, Rn, t0, t1, t2);
4394     //       Rm = *++Pm;
4395     //       Rn = *--Pn;
4396     //     }
4397 
4398     //     *Pm = Rm = t0 * inv;
4399     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4400     //     MACC(Rm, Rn, t0, t1, t2);
4401 
4402     //     assert(t0 == 0, "broken Montgomery multiply");
4403 
4404     //     t0 = t1; t1 = t2; t2 = 0;
4405     //   }
4406 
4407     //   for (i = len; i < 2*len; i++) {
4408     //     int start = i-len+1;
4409     //     int end = start + (len - start)/2;
4410     //     int j;
4411 
4412     //     Pa = Pa_base + i-len;
4413     //     Pb = Pa_base + len;
4414     //     Pm = Pm_base + i-len;
4415     //     Pn = Pn_base + len;
4416 
4417     //     Ra = *++Pa;
4418     //     Rb = *--Pb;
4419     //     Rm = *++Pm;
4420     //     Rn = *--Pn;
4421 
4422     //     int iters = (2*len-i-1)/2;
4423     //     assert(iters == end-start, "must be");
4424     //     for (j = start; iters--; j++) {
4425     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4426     //       MACC2(Ra, Rb, t0, t1, t2);
4427     //       Ra = *++Pa;
4428     //       Rb = *--Pb;
4429     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4430     //       MACC(Rm, Rn, t0, t1, t2);
4431     //       Rm = *++Pm;
4432     //       Rn = *--Pn;
4433     //     }
4434     //     if ((i & 1) == 0) {
4435     //       assert(Ra == Pa_base[j], "must be");
4436     //       MACC(Ra, Ra, t0, t1, t2);
4437     //     }
4438     //     iters =  (2*len-i)/2;
4439     //     assert(iters == len-j, "must be");
4440     //     for (; iters--; j++) {
4441     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4442     //       MACC(Rm, Rn, t0, t1, t2);
4443     //       Rm = *++Pm;
4444     //       Rn = *--Pn;
4445     //     }
4446     //     Pm_base[i-len] = t0;
4447     //     t0 = t1; t1 = t2; t2 = 0;
4448     //   }
4449 
4450     //   while (t0)
4451     //     t0 = sub(Pm_base, Pn_base, t0, len);
4452     // }
4453   };
4454 
4455   // Initialization
4456   void generate_initial() {
4457     // Generates the initial stubs and initializes the entry points
4458 
4459     // Entry points that exist on all platforms.  Note: this is code
4460     // that could be shared among different platforms - however the
4461     // benefit seems to be smaller than the disadvantage of having a
4462     // much more complicated generator structure. See also comment in
4463     // stubRoutines.hpp.
4464 
4465     StubRoutines::_forward_exception_entry = generate_forward_exception();
4466 
4467     StubRoutines::_call_stub_entry =
4468       generate_call_stub(StubRoutines::_call_stub_return_address);
4469 
4470     // This is referenced by megamorphic calls.
4471     StubRoutines::_catch_exception_entry = generate_catch_exception();
4472 
4473     // Build this early so it's available for the interpreter.
4474     StubRoutines::_throw_StackOverflowError_entry =
4475       generate_throw_exception("StackOverflowError throw_exception",
4476                                CAST_FROM_FN_PTR(address,
4477                                                 SharedRuntime::
4478                                                 throw_StackOverflowError));
4479     if (UseCRC32Intrinsics) {
4480       // Set the CRC table address before generating the stub that uses it.
4481       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4482       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4483     }
4484   }
4485 
4486   void generate_all() {
4487     // support for verify_oop (must happen after universe_init)
4488     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
4489     StubRoutines::_throw_AbstractMethodError_entry =
4490       generate_throw_exception("AbstractMethodError throw_exception",
4491                                CAST_FROM_FN_PTR(address,
4492                                                 SharedRuntime::
4493                                                 throw_AbstractMethodError));
4494 
4495     StubRoutines::_throw_IncompatibleClassChangeError_entry =
4496       generate_throw_exception("IncompatibleClassChangeError throw_exception",
4497                                CAST_FROM_FN_PTR(address,
4498                                                 SharedRuntime::
4499                                                 throw_IncompatibleClassChangeError));
4500 
4501     StubRoutines::_throw_NullPointerException_at_call_entry =
4502       generate_throw_exception("NullPointerException at call throw_exception",
4503                                CAST_FROM_FN_PTR(address,
4504                                                 SharedRuntime::
4505                                                 throw_NullPointerException_at_call));
4506 
4507     // arraycopy stubs used by compilers
4508     generate_arraycopy_stubs();
4509 
4510     if (UseMultiplyToLenIntrinsic) {
4511       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4512     }
4513 
4514     if (UseMontgomeryMultiplyIntrinsic) {
4515       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4516       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4517       StubRoutines::_montgomeryMultiply = g.generate_multiply();
4518     }
4519 
4520     if (UseMontgomerySquareIntrinsic) {
4521       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4522       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4523       // We use generate_multiply() rather than generate_square()
4524       // because it's faster for the sizes of modulus we care about.
4525       StubRoutines::_montgomerySquare = g.generate_multiply();
4526     }
4527 
4528 #ifndef BUILTIN_SIM
4529     // generate GHASH intrinsics code
4530     if (UseGHASHIntrinsics) {
4531       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
4532     }
4533 
4534     if (UseAESIntrinsics) {
4535       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4536       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4537       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4538       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
4539     }
4540 
4541     if (UseSHA1Intrinsics) {
4542       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
4543       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
4544     }
4545     if (UseSHA256Intrinsics) {
4546       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
4547       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
4548     }
4549 
4550     if (UseCRC32CIntrinsics) {
4551       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
4552     }
4553 
4554     // generate Adler32 intrinsics code
4555     if (UseAdler32Intrinsics) {
4556       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
4557     }
4558 
4559     // Safefetch stubs.
4560     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
4561                                                        &StubRoutines::_safefetch32_fault_pc,
4562                                                        &StubRoutines::_safefetch32_continuation_pc);
4563     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
4564                                                        &StubRoutines::_safefetchN_fault_pc,
4565                                                        &StubRoutines::_safefetchN_continuation_pc);
4566 #endif
4567   }
4568 
4569  public:
4570   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
4571     if (all) {
4572       generate_all();
4573     } else {
4574       generate_initial();
4575     }
4576   }
4577 }; // end class declaration
4578 
4579 void StubGenerator_generate(CodeBuffer* code, bool all) {
4580   StubGenerator g(code, all);
4581 }