1 /*
   2  * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "interpreter/interpreter.hpp"
  30 #include "nativeInst_aarch64.hpp"
  31 #include "oops/instanceOop.hpp"
  32 #include "oops/method.hpp"
  33 #include "oops/objArrayKlass.hpp"
  34 #include "oops/oop.inline.hpp"
  35 #include "prims/methodHandles.hpp"
  36 #include "runtime/frame.inline.hpp"
  37 #include "runtime/handles.inline.hpp"
  38 #include "runtime/sharedRuntime.hpp"
  39 #include "runtime/stubCodeGenerator.hpp"
  40 #include "runtime/stubRoutines.hpp"
  41 #include "runtime/thread.inline.hpp"
  42 #include "utilities/align.hpp"
  43 #ifdef COMPILER2
  44 #include "opto/runtime.hpp"
  45 #endif
  46 
  47 #ifdef BUILTIN_SIM
  48 #include "../../../../../../simulator/simulator.hpp"
  49 #endif
  50 
  51 // Declaration and definition of StubGenerator (no .hpp file).
  52 // For a more detailed description of the stub routine structure
  53 // see the comment in stubRoutines.hpp
  54 
  55 #undef __
  56 #define __ _masm->
  57 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  58 
  59 #ifdef PRODUCT
  60 #define BLOCK_COMMENT(str) /* nothing */
  61 #else
  62 #define BLOCK_COMMENT(str) __ block_comment(str)
  63 #endif
  64 
  65 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  66 
  67 // Stub Code definitions
  68 
  69 class StubGenerator: public StubCodeGenerator {
  70  private:
  71 
  72 #ifdef PRODUCT
  73 #define inc_counter_np(counter) ((void)0)
  74 #else
  75   void inc_counter_np_(int& counter) {
  76     __ lea(rscratch2, ExternalAddress((address)&counter));
  77     __ ldrw(rscratch1, Address(rscratch2));
  78     __ addw(rscratch1, rscratch1, 1);
  79     __ strw(rscratch1, Address(rscratch2));
  80   }
  81 #define inc_counter_np(counter) \
  82   BLOCK_COMMENT("inc_counter " #counter); \
  83   inc_counter_np_(counter);
  84 #endif
  85 
  86   // Call stubs are used to call Java from C
  87   //
  88   // Arguments:
  89   //    c_rarg0:   call wrapper address                   address
  90   //    c_rarg1:   result                                 address
  91   //    c_rarg2:   result type                            BasicType
  92   //    c_rarg3:   method                                 Method*
  93   //    c_rarg4:   (interpreter) entry point              address
  94   //    c_rarg5:   parameters                             intptr_t*
  95   //    c_rarg6:   parameter size (in words)              int
  96   //    c_rarg7:   thread                                 Thread*
  97   //
  98   // There is no return from the stub itself as any Java result
  99   // is written to result
 100   //
 101   // we save r30 (lr) as the return PC at the base of the frame and
 102   // link r29 (fp) below it as the frame pointer installing sp (r31)
 103   // into fp.
 104   //
 105   // we save r0-r7, which accounts for all the c arguments.
 106   //
 107   // TODO: strictly do we need to save them all? they are treated as
 108   // volatile by C so could we omit saving the ones we are going to
 109   // place in global registers (thread? method?) or those we only use
 110   // during setup of the Java call?
 111   //
 112   // we don't need to save r8 which C uses as an indirect result location
 113   // return register.
 114   //
 115   // we don't need to save r9-r15 which both C and Java treat as
 116   // volatile
 117   //
 118   // we don't need to save r16-r18 because Java does not use them
 119   //
 120   // we save r19-r28 which Java uses as scratch registers and C
 121   // expects to be callee-save
 122   //
 123   // we save the bottom 64 bits of each value stored in v8-v15; it is
 124   // the responsibility of the caller to preserve larger values.
 125   //
 126   // so the stub frame looks like this when we enter Java code
 127   //
 128   //     [ return_from_Java     ] <--- sp
 129   //     [ argument word n      ]
 130   //      ...
 131   // -27 [ argument word 1      ]
 132   // -26 [ saved v15            ] <--- sp_after_call
 133   // -25 [ saved v14            ]
 134   // -24 [ saved v13            ]
 135   // -23 [ saved v12            ]
 136   // -22 [ saved v11            ]
 137   // -21 [ saved v10            ]
 138   // -20 [ saved v9             ]
 139   // -19 [ saved v8             ]
 140   // -18 [ saved r28            ]
 141   // -17 [ saved r27            ]
 142   // -16 [ saved r26            ]
 143   // -15 [ saved r25            ]
 144   // -14 [ saved r24            ]
 145   // -13 [ saved r23            ]
 146   // -12 [ saved r22            ]
 147   // -11 [ saved r21            ]
 148   // -10 [ saved r20            ]
 149   //  -9 [ saved r19            ]
 150   //  -8 [ call wrapper    (r0) ]
 151   //  -7 [ result          (r1) ]
 152   //  -6 [ result type     (r2) ]
 153   //  -5 [ method          (r3) ]
 154   //  -4 [ entry point     (r4) ]
 155   //  -3 [ parameters      (r5) ]
 156   //  -2 [ parameter size  (r6) ]
 157   //  -1 [ thread (r7)          ]
 158   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 159   //   1 [ saved lr       (r30) ]
 160 
 161   // Call stub stack layout word offsets from fp
 162   enum call_stub_layout {
 163     sp_after_call_off = -26,
 164 
 165     d15_off            = -26,
 166     d13_off            = -24,
 167     d11_off            = -22,
 168     d9_off             = -20,
 169 
 170     r28_off            = -18,
 171     r26_off            = -16,
 172     r24_off            = -14,
 173     r22_off            = -12,
 174     r20_off            = -10,
 175     call_wrapper_off   =  -8,
 176     result_off         =  -7,
 177     result_type_off    =  -6,
 178     method_off         =  -5,
 179     entry_point_off    =  -4,
 180     parameter_size_off =  -2,
 181     thread_off         =  -1,
 182     fp_f               =   0,
 183     retaddr_off        =   1,
 184   };
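       // n.b. only the lower (even) offset of each saved pair is named here:
       // the registers are saved and restored in pairs with stp/stpd, so e.g.
       // r19 lives at r20_off + 1 and d14 at d15_off + 1 (see the layout above).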
 185 
 186   address generate_call_stub(address& return_address) {
 187     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 188            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 189            "adjust this code");
 190 
 191     StubCodeMark mark(this, "StubRoutines", "call_stub");
 192     address start = __ pc();
 193 
 194     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 195 
 196     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 197     const Address result        (rfp, result_off         * wordSize);
 198     const Address result_type   (rfp, result_type_off    * wordSize);
 199     const Address method        (rfp, method_off         * wordSize);
 200     const Address entry_point   (rfp, entry_point_off    * wordSize);
 201     const Address parameter_size(rfp, parameter_size_off * wordSize);
 202 
 203     const Address thread        (rfp, thread_off         * wordSize);
 204 
 205     const Address d15_save      (rfp, d15_off * wordSize);
 206     const Address d13_save      (rfp, d13_off * wordSize);
 207     const Address d11_save      (rfp, d11_off * wordSize);
 208     const Address d9_save       (rfp, d9_off * wordSize);
 209 
 210     const Address r28_save      (rfp, r28_off * wordSize);
 211     const Address r26_save      (rfp, r26_off * wordSize);
 212     const Address r24_save      (rfp, r24_off * wordSize);
 213     const Address r22_save      (rfp, r22_off * wordSize);
 214     const Address r20_save      (rfp, r20_off * wordSize);
 215 
 216     // stub code
 217 
 218     // we need a C prolog to bootstrap the x86 caller into the sim
 219     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 220 
 221     address aarch64_entry = __ pc();
 222 
 223 #ifdef BUILTIN_SIM
 224     // Save sender's SP for stack traces.
 225     __ mov(rscratch1, sp);
 226     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 227 #endif
 228     // set up frame and move sp to end of save area
 229     __ enter();
 230     __ sub(sp, rfp, -sp_after_call_off * wordSize);
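         // sp now equals rfp + sp_after_call_off * wordSize, i.e. sp_after_call,
         // the d15 save slot at the bottom of the register save area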
 231 
 232     // save register parameters and Java scratch/global registers
 233     // n.b. we save thread even though it gets installed in
 234     // rthread because we want to sanity check rthread later
 235     __ str(c_rarg7,  thread);
 236     __ strw(c_rarg6, parameter_size);
 237     __ stp(c_rarg4, c_rarg5,  entry_point);
 238     __ stp(c_rarg2, c_rarg3,  result_type);
 239     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 240 
 241     __ stp(r20, r19,   r20_save);
 242     __ stp(r22, r21,   r22_save);
 243     __ stp(r24, r23,   r24_save);
 244     __ stp(r26, r25,   r26_save);
 245     __ stp(r28, r27,   r28_save);
 246 
 247     __ stpd(v9,  v8,   d9_save);
 248     __ stpd(v11, v10,  d11_save);
 249     __ stpd(v13, v12,  d13_save);
 250     __ stpd(v15, v14,  d15_save);
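         // n.b. stp/stpd store their first operand at the lower address, so e.g.
         // stp(r20, r19, r20_save) puts r20 at offset -10 and r19 at -9,
         // matching the frame layout comment above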
 251 
 252     // install Java thread in global register now that we have saved
 253     // whatever value it held
 254     __ mov(rthread, c_rarg7);
 255     // And method
 256     __ mov(rmethod, c_rarg3);
 257 
 258     // set up the heapbase register
 259     __ reinit_heapbase();
 260 
 261 #ifdef ASSERT
 262     // make sure we have no pending exceptions
 263     {
 264       Label L;
 265       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 266       __ cmp(rscratch1, (unsigned)NULL_WORD);
 267       __ br(Assembler::EQ, L);
 268       __ stop("StubRoutines::call_stub: entered with pending exception");
 269       __ BIND(L);
 270     }
 271 #endif
 272     // pass parameters if any
 273     __ mov(esp, sp);
 274     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 275     __ andr(sp, rscratch1, -2 * wordSize);
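         // sp is rounded down to a 16-byte boundary after reserving room for the
         // parameters, since AArch64 requires sp to stay 16-byte aligned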
 276 
 277     BLOCK_COMMENT("pass parameters if any");
 278     Label parameters_done;
 279     // parameter count is still in c_rarg6
 280     // and parameter pointer identifying param 1 is in c_rarg5
 281     __ cbzw(c_rarg6, parameters_done);
 282 
 283     address loop = __ pc();
 284     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 285     __ subsw(c_rarg6, c_rarg6, 1);
 286     __ push(rscratch1);
 287     __ br(Assembler::GT, loop);
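         // n.b. parameter 1 is pushed first and so ends up highest in memory
         // (slot -27, just below the register save area) with the last parameter
         // lowest, as in the frame layout comment above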
 288 
 289     __ BIND(parameters_done);
 290 
 291     // call Java entry -- passing Method* and current sp
 292     //      rmethod: Method*
 293     //      r13: sender sp
 294     BLOCK_COMMENT("call Java function");
 295     __ mov(r13, sp);
 296     __ blr(c_rarg4);
 297 
 298     // tell the simulator we have returned to the stub
 299 
 300     // we do this here because the notify will already have been done
 301     // if we get to the next instruction via an exception
 302     //
 303     // n.b. adding this instruction here affects the calculation of
 304     // whether or not a routine returns to the call stub (used when
 305     // doing stack walks) since the normal test is to check the return
 306     // pc against the address saved below. so we may need to allow for
 307     // this extra instruction in the check.
 308 
 309     if (NotifySimulator) {
 310       __ notify(Assembler::method_reentry);
 311     }
 312     // save current address for use by exception handling code
 313 
 314     return_address = __ pc();
 315 
 316     // store result depending on type (everything that is not
 317     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 318     // n.b. this assumes Java returns an integral result in r0
 319     // and a floating result in j_farg0
 320     __ ldr(j_rarg2, result);
 321     Label is_long, is_float, is_double, exit;
 322     __ ldr(j_rarg1, result_type);
 323     __ cmp(j_rarg1, T_OBJECT);
 324     __ br(Assembler::EQ, is_long);
 325     __ cmp(j_rarg1, T_LONG);
 326     __ br(Assembler::EQ, is_long);
 327     __ cmp(j_rarg1, T_FLOAT);
 328     __ br(Assembler::EQ, is_float);
 329     __ cmp(j_rarg1, T_DOUBLE);
 330     __ br(Assembler::EQ, is_double);
 331 
 332     // handle T_INT case
 333     __ strw(r0, Address(j_rarg2));
 334 
 335     __ BIND(exit);
 336 
 337     // pop parameters
 338     __ sub(esp, rfp, -sp_after_call_off * wordSize);
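         // i.e. esp == sp_after_call, dropping the argument words pushed above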
 339 
 340 #ifdef ASSERT
 341     // verify that threads correspond
 342     {
 343       Label L, S;
 344       __ ldr(rscratch1, thread);
 345       __ cmp(rthread, rscratch1);
 346       __ br(Assembler::NE, S);
 347       __ get_thread(rscratch1);
 348       __ cmp(rthread, rscratch1);
 349       __ br(Assembler::EQ, L);
 350       __ BIND(S);
 351       __ stop("StubRoutines::call_stub: threads must correspond");
 352       __ BIND(L);
 353     }
 354 #endif
 355 
 356     // restore callee-save registers
 357     __ ldpd(v15, v14,  d15_save);
 358     __ ldpd(v13, v12,  d13_save);
 359     __ ldpd(v11, v10,  d11_save);
 360     __ ldpd(v9,  v8,   d9_save);
 361 
 362     __ ldp(r28, r27,   r28_save);
 363     __ ldp(r26, r25,   r26_save);
 364     __ ldp(r24, r23,   r24_save);
 365     __ ldp(r22, r21,   r22_save);
 366     __ ldp(r20, r19,   r20_save);
 367 
 368     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 369     __ ldrw(c_rarg2, result_type);
 370     __ ldr(c_rarg3,  method);
 371     __ ldp(c_rarg4, c_rarg5,  entry_point);
 372     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 373 
 374 #ifndef PRODUCT
 375     // tell the simulator we are about to end Java execution
 376     if (NotifySimulator) {
 377       __ notify(Assembler::method_exit);
 378     }
 379 #endif
 380     // leave frame and return to caller
 381     __ leave();
 382     __ ret(lr);
 383 
 384     // handle return types different from T_INT
 385 
 386     __ BIND(is_long);
 387     __ str(r0, Address(j_rarg2, 0));
 388     __ br(Assembler::AL, exit);
 389 
 390     __ BIND(is_float);
 391     __ strs(j_farg0, Address(j_rarg2, 0));
 392     __ br(Assembler::AL, exit);
 393 
 394     __ BIND(is_double);
 395     __ strd(j_farg0, Address(j_rarg2, 0));
 396     __ br(Assembler::AL, exit);
 397 
 398     return start;
 399   }
 400 
 401   // Return point for a Java call if there's an exception thrown in
 402   // Java code.  The exception is caught and transformed into a
 403   // pending exception stored in JavaThread that can be tested from
 404   // within the VM.
 405   //
 406   // Note: Usually the parameters are removed by the callee. In case
 407   // of an exception crossing an activation frame boundary, that is
 408   // not the case if the callee is compiled code => need to setup the
 409   // rsp.
 410   //
 411   // r0: exception oop
 412 
 413   // NOTE: this is used as a target from the signal handler so it
 414   // needs an x86 prolog which returns into the current simulator
 415   // executing the generated catch_exception code. so the prolog
 416   // needs to install rax in a sim register and adjust the sim's
 417   // restart pc to enter the generated code at the start position
 418   // then return from native to simulated execution.
 419 
 420   address generate_catch_exception() {
 421     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 422     address start = __ pc();
 423 
 424     // same as in generate_call_stub():
 425     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 426     const Address thread        (rfp, thread_off         * wordSize);
 427 
 428 #ifdef ASSERT
 429     // verify that threads correspond
 430     {
 431       Label L, S;
 432       __ ldr(rscratch1, thread);
 433       __ cmp(rthread, rscratch1);
 434       __ br(Assembler::NE, S);
 435       __ get_thread(rscratch1);
 436       __ cmp(rthread, rscratch1);
 437       __ br(Assembler::EQ, L);
 438       __ bind(S);
 439       __ stop("StubRoutines::catch_exception: threads must correspond");
 440       __ bind(L);
 441     }
 442 #endif
 443 
 444     // set pending exception
 445     __ verify_oop(r0);
 446 
 447     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 448     __ mov(rscratch1, (address)__FILE__);
 449     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 450     __ movw(rscratch1, (int)__LINE__);
 451     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 452 
 453     // complete return to VM
 454     assert(StubRoutines::_call_stub_return_address != NULL,
 455            "_call_stub_return_address must have been generated before");
 456     __ b(StubRoutines::_call_stub_return_address);
 457 
 458     return start;
 459   }
 460 
 461   // Continuation point for runtime calls returning with a pending
 462   // exception.  The pending exception check happened in the runtime
 463   // or native call stub.  The pending exception in Thread is
 464   // converted into a Java-level exception.
 465   //
 466   // Contract with Java-level exception handlers:
 467   // r0: exception
 468   // r3: throwing pc
 469   //
 470   // NOTE: At entry of this stub, exception-pc must be in LR !!
 471 
 472   // NOTE: this is always used as a jump target within generated code
 473   // so it just needs to be generated code with no x86 prolog
 474 
 475   address generate_forward_exception() {
 476     StubCodeMark mark(this, "StubRoutines", "forward exception");
 477     address start = __ pc();
 478 
 479     // Upon entry, LR points to the return address returning into
 480     // Java (interpreted or compiled) code; i.e., the return address
 481     // becomes the throwing pc.
 482     //
 483     // Arguments pushed before the runtime call are still on the stack
 484     // but the exception handler will reset the stack pointer ->
 485     // ignore them.  A potential result in registers can be ignored as
 486     // well.
 487 
 488 #ifdef ASSERT
 489     // make sure this code is only executed if there is a pending exception
 490     {
 491       Label L;
 492       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 493       __ cbnz(rscratch1, L);
 494       __ stop("StubRoutines::forward exception: no pending exception (1)");
 495       __ bind(L);
 496     }
 497 #endif
 498 
 499     // compute exception handler into r19
 500 
 501     // call the VM to find the handler address associated with the
 502     // caller address. pass thread in r0 and caller pc (ret address)
 503     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 504     // the stack.
 505     __ mov(c_rarg1, lr);
 506     // lr will be trashed by the VM call so we move it to R19
 507     // (callee-saved) because we also need to pass it to the handler
 508     // returned by this call.
 509     __ mov(r19, lr);
 510     BLOCK_COMMENT("call exception_handler_for_return_address");
 511     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 512                          SharedRuntime::exception_handler_for_return_address),
 513                     rthread, c_rarg1);
 514     // we should not really care that lr is no longer the callee
 515     // address. we saved the value the handler needs in r19 so we can
 516     // just copy it to r3. however, the C2 handler will push its own
 517     // frame and then call into the VM, and the VM code asserts that
 518     // the PC for the frame above the handler belongs to a compiled
 519     // Java method. So, we restore lr here to satisfy that assert.
 520     __ mov(lr, r19);
 521     // setup r0 & r3 & clear pending exception
 522     __ mov(r3, r19);
 523     __ mov(r19, r0);
 524     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 525     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 526 
 527 #ifdef ASSERT
 528     // make sure exception is set
 529     {
 530       Label L;
 531       __ cbnz(r0, L);
 532       __ stop("StubRoutines::forward exception: no pending exception (2)");
 533       __ bind(L);
 534     }
 535 #endif
 536 
 537     // continue at exception handler
 538     // r0: exception
 539     // r3: throwing pc
 540     // r19: exception handler
 541     __ verify_oop(r0);
 542     __ br(r19);
 543 
 544     return start;
 545   }
 546 
 547   // Non-destructive plausibility checks for oops
 548   //
 549   // Arguments:
 550   //    r0: oop to verify
 551   //    rscratch1: error message
 552   //
 553   // Stack after saving c_rarg3:
 554   //    [tos + 0]: saved c_rarg3
 555   //    [tos + 1]: saved c_rarg2
 556   //    [tos + 2]: saved lr
 557   //    [tos + 3]: saved rscratch2
 558   //    [tos + 4]: saved r0
 559   //    [tos + 5]: saved rscratch1
 560   address generate_verify_oop() {
 561 
 562     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 563     address start = __ pc();
 564 
 565     Label exit, error;
 566 
 567     // save c_rarg2 and c_rarg3
 568     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 569 
 570     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 571     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 572     __ ldr(c_rarg3, Address(c_rarg2));
 573     __ add(c_rarg3, c_rarg3, 1);
 574     __ str(c_rarg3, Address(c_rarg2));
 575 
 576     // object is in r0
 577     // make sure object is 'reasonable'
 578     __ cbz(r0, exit); // if obj is NULL it is OK
 579 
 580     // Check if the oop is in the right area of memory
 581     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 582     __ andr(c_rarg2, r0, c_rarg3);
 583     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 584 
 585     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 586     // instruction here because the flags register is live.
 587     __ eor(c_rarg2, c_rarg2, c_rarg3);
 588     __ cbnz(c_rarg2, error);
 589 
 590     // make sure klass is 'reasonable', i.e. not null.
 591     __ load_klass(r0, r0);  // get klass
 592     __ cbz(r0, error);      // if klass is NULL it is broken
 593 
 594     // return if everything seems ok
 595     __ bind(exit);
 596 
 597     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 598     __ ret(lr);
 599 
 600     // handle errors
 601     __ bind(error);
 602     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 603 
 604     __ push(RegSet::range(r0, r29), sp);
 605     // debug(char* msg, int64_t pc, int64_t regs[])
 606     __ mov(c_rarg0, rscratch1);      // pass address of error message
 607     __ mov(c_rarg1, lr);             // pass return address
 608     __ mov(c_rarg2, sp);             // pass address of regs on stack
 609 #ifndef PRODUCT
 610     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 611 #endif
 612     BLOCK_COMMENT("call MacroAssembler::debug");
 613     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 614     __ blrt(rscratch1, 3, 0, 1);
 615 
 616     return start;
 617   }
 618 
 619   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 620 
 621   // Generate code for an array write pre barrier
 622   //
 623   //     addr       - starting address
 624   //     count      - element count
 625   //     tmp        - scratch register
 626   //     saved_regs - registers to be saved before calling static_write_ref_array_pre
 627   //
 628   //     Callers must specify which registers to preserve in saved_regs.
 629   //     Clobbers: r0-r18, v0-v7, v16-v31, except saved_regs.
 630   //
 631   void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized, RegSet saved_regs) {
 632     BarrierSet* bs = Universe::heap()->barrier_set();
 633     switch (bs->kind()) {
 634     case BarrierSet::G1SATBCTLogging:
 635       // With G1, don't generate the call if we statically know that the target is uninitialized
 636       if (!dest_uninitialized) {
 637         __ push(saved_regs, sp);
 638         if (count == c_rarg0) {
 639           if (addr == c_rarg1) {
 640             // exactly backwards!!
 641             __ mov(rscratch1, c_rarg0);
 642             __ mov(c_rarg0, c_rarg1);
 643             __ mov(c_rarg1, rscratch1);
 644           } else {
 645             __ mov(c_rarg1, count);
 646             __ mov(c_rarg0, addr);
 647           }
 648         } else {
 649           __ mov(c_rarg0, addr);
 650           __ mov(c_rarg1, count);
 651         }
 652         __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
 653         __ pop(saved_regs, sp);
 654         break;
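           // n.b. the case labels below are nested inside the if above; this is
           // legal C++ and the switch dispatch still jumps directly to them for
           // non-G1 barrier sets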
 655       case BarrierSet::CardTableForRS:
 656       case BarrierSet::CardTableExtension:
 657       case BarrierSet::ModRef:
 658         break;
 659       default:
 660         ShouldNotReachHere();
 661 
 662       }
 663     }
 664   }
 665 
 666   //
 667   // Generate code for an array write post barrier
 668   //
 669   //  Input:
 670   //     start      - register containing starting address of destination array
 671   //     end        - register containing ending address of destination array
 672   //     scratch    - scratch register
 673   //     saved_regs - registers to be saved before calling static_write_ref_array_post
 674   //
 675   //  The input registers are overwritten.
 676   //  The ending address is inclusive.
 677   //  Callers must specify which registers to preserve in saved_regs.
 678   //  Clobbers: r0-r18, v0-v7, v16-v31, except saved_regs.
 679   void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch, RegSet saved_regs) {
 680     assert_different_registers(start, end, scratch);
 681     BarrierSet* bs = Universe::heap()->barrier_set();
 682     switch (bs->kind()) {
 683       case BarrierSet::G1SATBCTLogging:
 684 
 685         {
 686           __ push(saved_regs, sp);
 687           // must compute element count unless barrier set interface is changed (other platforms supply count)
 688           assert_different_registers(start, end, scratch);
 689           __ lea(scratch, Address(end, BytesPerHeapOop));
 690           __ sub(scratch, scratch, start);               // subtract start to get #bytes
 691           __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
 692           __ mov(c_rarg0, start);
 693           __ mov(c_rarg1, scratch);
 694           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
 695           __ pop(saved_regs, sp);
 696         }
 697         break;
 698       case BarrierSet::CardTableForRS:
 699       case BarrierSet::CardTableExtension:
 700         {
 701           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
 702           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
 703 
 704           Label L_loop;
 705 
 706            __ lsr(start, start, CardTableModRefBS::card_shift);
 707            __ lsr(end, end, CardTableModRefBS::card_shift);
 708            __ sub(end, end, start); // number of card table bytes to dirty, minus one
 709 
 710           const Register count = end; // 'end' now holds the number of cards to dirty, minus one
 711           __ load_byte_map_base(scratch);
 712           __ add(start, start, scratch);
 713           if (UseConcMarkSweepGC) {
 714             __ membar(__ StoreStore);
 715           }
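               // (the StoreStore above orders the reference stores before the
               // card marks, which CMS relies on)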
 716           __ BIND(L_loop);
 717           __ strb(zr, Address(start, count));
 718           __ subs(count, count, 1);
 719           __ br(Assembler::GE, L_loop);
 720         }
 721         break;
 722       default:
 723         ShouldNotReachHere();
 724 
 725     }
 726   }
 727 
 728   // The inner part of zero_words().  This is the bulk operation,
 729   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 730   // caller is responsible for zeroing the last few words.
 731   //
 732   // Inputs:
 733   // r10: the HeapWord-aligned base address of an array to zero.
 734   // r11: the count in HeapWords, r11 > 0.
 735   //
 736   // Returns r10 and r11, adjusted for the caller to clear.
 737   // r10: the base address of the tail of words left to clear.
 738   // r11: the number of words in the tail.
 739   //      r11 < MacroAssembler::zero_words_block_size.
 740 
 741   address generate_zero_blocks() {
 742     Label store_pair, loop_store_pair, done;
 743     Label base_aligned;
 744 
 745     Register base = r10, cnt = r11;
 746 
 747     __ align(CodeEntryAlignment);
 748     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 749     address start = __ pc();
 750 
 751     if (UseBlockZeroing) {
 752       int zva_length = VM_Version::zva_length();
 753 
 754       // Ensure ZVA length can be divided by 16. This is required by
 755       // the subsequent operations.
 756       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 757 
 758       __ tbz(base, 3, base_aligned);
 759       __ str(zr, Address(__ post(base, 8)));
 760       __ sub(cnt, cnt, 1);
 761       __ bind(base_aligned);
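           // the str/sub above align base to 16 bytes if needed (base is already
           // 8-byte aligned); the ZVA path can then rely on at least 16-byte
           // alignment, which is presumably why zva_length must be a multiple of 16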
 762 
 763       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 764       // alignment.
 765       Label small;
 766       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
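           // low_limit is in bytes while cnt is in words, hence the >> 3 in the
           // comparison below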
 767       __ subs(rscratch1, cnt, low_limit >> 3);
 768       __ br(Assembler::LT, small);
 769       __ zero_dcache_blocks(base, cnt);
 770       __ bind(small);
 771     }
 772 
 773     {
 774       // Number of stp instructions we'll unroll
 775       const int unroll =
 776         MacroAssembler::zero_words_block_size / 2;
 777       // Clear the remaining blocks.
 778       Label loop;
 779       __ subs(cnt, cnt, unroll * 2);
 780       __ br(Assembler::LT, done);
 781       __ bind(loop);
 782       for (int i = 0; i < unroll; i++)
 783         __ stp(zr, zr, __ post(base, 16));
 784       __ subs(cnt, cnt, unroll * 2);
 785       __ br(Assembler::GE, loop);
 786       __ bind(done);
 787       __ add(cnt, cnt, unroll * 2);
 788     }
 789 
 790     __ ret(lr);
 791 
 792     return start;
 793   }
 794 
 795 
 796   typedef enum {
 797     copy_forwards = 1,
 798     copy_backwards = -1
 799   } copy_direction;
 800 
 801   // Bulk copy of blocks of 8 words.
 802   //
 803   // count is a count of words.
 804   //
 805   // Precondition: count >= 8
 806   //
 807   // Postconditions:
 808   //
 809   // The least significant bit of count contains the remaining count
 810   // of words to copy.  The rest of count is trash.
 811   //
 812   // s and d are adjusted to point to the remaining words to copy
 813   //
 814   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 815                            copy_direction direction) {
 816     int unit = wordSize * direction;
 817     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
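         // for a forward copy s and d are biased downwards so that each 8-word
         // block can be addressed with positive offsets and stepped past with a
         // single pre-indexed update; a backward copy uses negative offsets directly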
 818 
 819     int offset;
 820     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 821       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 822     const Register stride = r13;
 823 
 824     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 825     assert_different_registers(s, d, count, rscratch1);
 826 
 827     Label again, drain;
 828     const char *stub_name;
 829     if (direction == copy_forwards)
 830       stub_name = "forward_copy_longs";
 831     else
 832       stub_name = "backward_copy_longs";
 833     StubCodeMark mark(this, "StubRoutines", stub_name);
 834     __ align(CodeEntryAlignment);
 835     __ bind(start);
 836 
 837     Label unaligned_copy_long;
 838     if (AvoidUnalignedAccesses) {
 839       __ tbnz(d, 3, unaligned_copy_long);
 840     }
 841 
 842     if (direction == copy_forwards) {
 843       __ sub(s, s, bias);
 844       __ sub(d, d, bias);
 845     }
 846 
 847 #ifdef ASSERT
 848     // Make sure we are never given < 8 words
 849     {
 850       Label L;
 851       __ cmp(count, 8);
 852       __ br(Assembler::GE, L);
 853       __ stop("generate_copy_longs called with < 8 words");
 854       __ bind(L);
 855     }
 856 #endif
 857 
 858     // Fill 8 registers
 859     if (UseSIMDForMemoryOps) {
 860       __ ldpq(v0, v1, Address(s, 4 * unit));
 861       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 862     } else {
 863       __ ldp(t0, t1, Address(s, 2 * unit));
 864       __ ldp(t2, t3, Address(s, 4 * unit));
 865       __ ldp(t4, t5, Address(s, 6 * unit));
 866       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 867     }
 868 
 869     __ subs(count, count, 16);
 870     __ br(Assembler::LO, drain);
 871 
 872     int prefetch = PrefetchCopyIntervalInBytes;
 873     bool use_stride = false;
 874     if (direction == copy_backwards) {
 875        use_stride = prefetch > 256;
 876        prefetch = -prefetch;
 877        if (use_stride) __ mov(stride, prefetch);
 878     }
 879 
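         // main copy loop: each iteration stores the 8 words loaded by the
         // previous iteration (or by the initial fill above) and loads the next 8;
         // the drain code below writes out the final 8 words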
 880     __ bind(again);
 881 
 882     if (PrefetchCopyIntervalInBytes > 0)
 883       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 884 
 885     if (UseSIMDForMemoryOps) {
 886       __ stpq(v0, v1, Address(d, 4 * unit));
 887       __ ldpq(v0, v1, Address(s, 4 * unit));
 888       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 889       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 890     } else {
 891       __ stp(t0, t1, Address(d, 2 * unit));
 892       __ ldp(t0, t1, Address(s, 2 * unit));
 893       __ stp(t2, t3, Address(d, 4 * unit));
 894       __ ldp(t2, t3, Address(s, 4 * unit));
 895       __ stp(t4, t5, Address(d, 6 * unit));
 896       __ ldp(t4, t5, Address(s, 6 * unit));
 897       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 898       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 899     }
 900 
 901     __ subs(count, count, 8);
 902     __ br(Assembler::HS, again);
 903 
 904     // Drain
 905     __ bind(drain);
 906     if (UseSIMDForMemoryOps) {
 907       __ stpq(v0, v1, Address(d, 4 * unit));
 908       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 909     } else {
 910       __ stp(t0, t1, Address(d, 2 * unit));
 911       __ stp(t2, t3, Address(d, 4 * unit));
 912       __ stp(t4, t5, Address(d, 6 * unit));
 913       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 914     }
 915 
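         // copy any remaining 4-word and/or 2-word subblock; bits 2 and 1 of
         // count tell us which of them are present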
 916     {
 917       Label L1, L2;
 918       __ tbz(count, exact_log2(4), L1);
 919       if (UseSIMDForMemoryOps) {
 920         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 921         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 922       } else {
 923         __ ldp(t0, t1, Address(s, 2 * unit));
 924         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 925         __ stp(t0, t1, Address(d, 2 * unit));
 926         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 927       }
 928       __ bind(L1);
 929 
 930       if (direction == copy_forwards) {
 931         __ add(s, s, bias);
 932         __ add(d, d, bias);
 933       }
 934 
 935       __ tbz(count, 1, L2);
 936       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 937       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 938       __ bind(L2);
 939     }
 940 
 941     __ ret(lr);
 942 
 943     if (AvoidUnalignedAccesses) {
 944       Label drain, again;
 945       // Register order for storing. Order is different for backward copy.
 946 
 947       __ bind(unaligned_copy_long);
 948 
 949       // source address is even aligned (a multiple of 16 bytes), target odd aligned (8 mod 16)
 950       //
 951       // when forward copying word pairs we read long pairs at offsets
 952       // {0, 2, 4, 6} (in long words). when backwards copying we read
 953       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 954       // address by -2 in the forwards case so we can compute the
 955       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 956       // or -1.
 957       //
 958       // when forward copying we need to store 1 word, 3 pairs and
 959       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 960       // zero offset we adjust the destination by -1, which means we
 961       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 962       //
 963       // When backwards copying we need to store 1 word, 3 pairs and
 964       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 965       // offsets {1, 3, 5, 7, 8} * unit.
 966 
 967       if (direction == copy_forwards) {
 968         __ sub(s, s, 16);
 969         __ sub(d, d, 8);
 970       }
 971 
 972       // Fill 8 registers
 973       //
 974       // for forwards copy s was offset by -16 from the original input
 975       // value of s so the register contents are at these offsets
 976       // relative to the 64 bit block addressed by that original input
 977       // and so on for each successive 64 byte block when s is updated
 978       //
 979       // t0 at offset 0,  t1 at offset 8
 980       // t2 at offset 16, t3 at offset 24
 981       // t4 at offset 32, t5 at offset 40
 982       // t6 at offset 48, t7 at offset 56
 983 
 984       // for backwards copy s was not offset so the register contents
 985       // are at these offsets into the preceding 64 byte block
 986       // relative to that original input and so on for each successive
 987       // preceding 64 byte block when s is updated. this explains the
 988       // slightly counter-intuitive looking pattern of register usage
 989       // in the stp instructions for backwards copy.
 990       //
 991       // t0 at offset -16, t1 at offset -8
 992       // t2 at offset -32, t3 at offset -24
 993       // t4 at offset -48, t5 at offset -40
 994       // t6 at offset -64, t7 at offset -56
 995 
 996       __ ldp(t0, t1, Address(s, 2 * unit));
 997       __ ldp(t2, t3, Address(s, 4 * unit));
 998       __ ldp(t4, t5, Address(s, 6 * unit));
 999       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1000 
1001       __ subs(count, count, 16);
1002       __ br(Assembler::LO, drain);
1003 
1004       int prefetch = PrefetchCopyIntervalInBytes;
1005       bool use_stride = false;
1006       if (direction == copy_backwards) {
1007          use_stride = prefetch > 256;
1008          prefetch = -prefetch;
1009          if (use_stride) __ mov(stride, prefetch);
1010       }
1011 
1012       __ bind(again);
1013 
1014       if (PrefetchCopyIntervalInBytes > 0)
1015         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1016 
1017       if (direction == copy_forwards) {
1018        // allowing for the offset of -8 the store instructions place
1019        // registers into the target 64 bit block at the following
1020        // offsets
1021        //
1022        // t0 at offset 0
1023        // t1 at offset 8,  t2 at offset 16
1024        // t3 at offset 24, t4 at offset 32
1025        // t5 at offset 40, t6 at offset 48
1026        // t7 at offset 56
1027 
1028         __ str(t0, Address(d, 1 * unit));
1029         __ stp(t1, t2, Address(d, 2 * unit));
1030         __ ldp(t0, t1, Address(s, 2 * unit));
1031         __ stp(t3, t4, Address(d, 4 * unit));
1032         __ ldp(t2, t3, Address(s, 4 * unit));
1033         __ stp(t5, t6, Address(d, 6 * unit));
1034         __ ldp(t4, t5, Address(s, 6 * unit));
1035         __ str(t7, Address(__ pre(d, 8 * unit)));
1036         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1037       } else {
1038        // d was not offset when we started so the registers are
1039        // written into the 64 bit block preceding d with the following
1040        // offsets
1041        //
1042        // t1 at offset -8
1043        // t3 at offset -24, t0 at offset -16
1044        // t5 at offset -40, t2 at offset -32
1045        // t7 at offset -56, t4 at offset -48
1046        //                   t6 at offset -64
1047        //
1048        // note that this matches the offsets previously noted for the
1049        // loads
1050 
1051         __ str(t1, Address(d, 1 * unit));
1052         __ stp(t3, t0, Address(d, 3 * unit));
1053         __ ldp(t0, t1, Address(s, 2 * unit));
1054         __ stp(t5, t2, Address(d, 5 * unit));
1055         __ ldp(t2, t3, Address(s, 4 * unit));
1056         __ stp(t7, t4, Address(d, 7 * unit));
1057         __ ldp(t4, t5, Address(s, 6 * unit));
1058         __ str(t6, Address(__ pre(d, 8 * unit)));
1059         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1060       }
1061 
1062       __ subs(count, count, 8);
1063       __ br(Assembler::HS, again);
1064 
1065       // Drain
1066       //
1067       // this uses the same pattern of offsets and register arguments
1068       // as above
1069       __ bind(drain);
1070       if (direction == copy_forwards) {
1071         __ str(t0, Address(d, 1 * unit));
1072         __ stp(t1, t2, Address(d, 2 * unit));
1073         __ stp(t3, t4, Address(d, 4 * unit));
1074         __ stp(t5, t6, Address(d, 6 * unit));
1075         __ str(t7, Address(__ pre(d, 8 * unit)));
1076       } else {
1077         __ str(t1, Address(d, 1 * unit));
1078         __ stp(t3, t0, Address(d, 3 * unit));
1079         __ stp(t5, t2, Address(d, 5 * unit));
1080         __ stp(t7, t4, Address(d, 7 * unit));
1081         __ str(t6, Address(__ pre(d, 8 * unit)));
1082       }
1083       // now we need to copy any remaining part block, which may
1084       // include a 4 word subblock and/or a 2 word subblock.
1085       // bits 2 and 1 in the count are the tell-tale for whether we
1086       // have each such subblock
1087       {
1088         Label L1, L2;
1089         __ tbz(count, exact_log2(4), L1);
1090        // this is the same as above but copying only 4 longs hence
1091        // with only one intervening stp between the str instructions
1092        // but note that the offsets and registers still follow the
1093        // same pattern
1094         __ ldp(t0, t1, Address(s, 2 * unit));
1095         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1096         if (direction == copy_forwards) {
1097           __ str(t0, Address(d, 1 * unit));
1098           __ stp(t1, t2, Address(d, 2 * unit));
1099           __ str(t3, Address(__ pre(d, 4 * unit)));
1100         } else {
1101           __ str(t1, Address(d, 1 * unit));
1102           __ stp(t3, t0, Address(d, 3 * unit));
1103           __ str(t2, Address(__ pre(d, 4 * unit)));
1104         }
1105         __ bind(L1);
1106 
1107         __ tbz(count, 1, L2);
1108        // this is the same as above but copying only 2 longs hence
1109        // there is no intervening stp between the str instructions
1110        // but note that the offset and register patterns are still
1111        // the same
1112         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1113         if (direction == copy_forwards) {
1114           __ str(t0, Address(d, 1 * unit));
1115           __ str(t1, Address(__ pre(d, 2 * unit)));
1116         } else {
1117           __ str(t1, Address(d, 1 * unit));
1118           __ str(t0, Address(__ pre(d, 2 * unit)));
1119         }
1120         __ bind(L2);
1121 
1122        // for forwards copy we need to re-adjust the offsets we
1123        // applied so that s and d follow the last words written
1124 
1125        if (direction == copy_forwards) {
1126          __ add(s, s, 16);
1127          __ add(d, d, 8);
1128        }
1129 
1130       }
1131 
1132       __ ret(lr);
1133       }
1134   }
1135 
1136   // Small copy: less than 16 bytes.
1137   //
1138   // NB: Ignores all of the bits of count which represent more than 15
1139   // bytes, so a caller doesn't have to mask them.
1140 
1141   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1142     bool is_backwards = step < 0;
1143     size_t granularity = uabs(step);
1144     int direction = is_backwards ? -1 : 1;
1145     int unit = wordSize * direction;
1146 
1147     Label Lpair, Lword, Lint, Lshort, Lbyte;
1148 
1149     assert(granularity
1150            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1151 
1152     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1153 
1154     // ??? I don't know if this bit-test-and-branch is the right thing
1155     // to do.  It does a lot of jumping, resulting in several
1156     // mispredicted branches.  It might make more sense to do this
1157     // with something like Duff's device with a single computed branch.
1158 
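         // count is in elements of size 'granularity', so bit (3 - log2(granularity))
         // says whether a stray 8-byte word remains, the next lower bit whether a
         // 4-byte int remains, and so on down to a single byte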
1159     __ tbz(count, 3 - exact_log2(granularity), Lword);
1160     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1161     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1162     __ bind(Lword);
1163 
1164     if (granularity <= sizeof (jint)) {
1165       __ tbz(count, 2 - exact_log2(granularity), Lint);
1166       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1167       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1168       __ bind(Lint);
1169     }
1170 
1171     if (granularity <= sizeof (jshort)) {
1172       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1173       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1174       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1175       __ bind(Lshort);
1176     }
1177 
1178     if (granularity <= sizeof (jbyte)) {
1179       __ tbz(count, 0, Lbyte);
1180       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1181       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1182       __ bind(Lbyte);
1183     }
1184   }
1185 
1186   Label copy_f, copy_b;
1187 
1188   // All-singing all-dancing memory copy.
1189   //
1190   // Copy count units of memory from s to d.  The size of a unit is
1191   // step, which can be positive or negative depending on the direction
1192   // of copy.  If is_aligned is false, we align the source address.
1193   //
1194 
1195   void copy_memory(bool is_aligned, Register s, Register d,
1196                    Register count, Register tmp, int step) {
1197     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1198     bool is_backwards = step < 0;
1199     int granularity = uabs(step);
1200     const Register t0 = r3, t1 = r4;
1201 
1202     // copies of at most 80 bytes (96 with SIMD) are done inline. Direction
1203     // doesn't matter because we always load all the data before writing anything
1204     Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
1205     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1206     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1207     const Register send = r17, dend = r18;
1208 
1209     if (PrefetchCopyIntervalInBytes > 0)
1210       __ prfm(Address(s, 0), PLDL1KEEP);
1211     __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
1212     __ br(Assembler::HI, copy_big);
1213 
1214     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1215     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
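         // send/dend point one element past the end of the source/destination;
         // the small cases below copy from both ends towards the middle (all loads
         // before any store), so any overlapping middle bytes are simply written twice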
1216 
1217     __ cmp(count, 16/granularity);
1218     __ br(Assembler::LS, copy16);
1219 
1220     __ cmp(count, 64/granularity);
1221     __ br(Assembler::HI, copy80);
1222 
1223     __ cmp(count, 32/granularity);
1224     __ br(Assembler::LS, copy32);
1225 
1226     // 33..64 bytes
1227     if (UseSIMDForMemoryOps) {
1228       __ ldpq(v0, v1, Address(s, 0));
1229       __ ldpq(v2, v3, Address(send, -32));
1230       __ stpq(v0, v1, Address(d, 0));
1231       __ stpq(v2, v3, Address(dend, -32));
1232     } else {
1233       __ ldp(t0, t1, Address(s, 0));
1234       __ ldp(t2, t3, Address(s, 16));
1235       __ ldp(t4, t5, Address(send, -32));
1236       __ ldp(t6, t7, Address(send, -16));
1237 
1238       __ stp(t0, t1, Address(d, 0));
1239       __ stp(t2, t3, Address(d, 16));
1240       __ stp(t4, t5, Address(dend, -32));
1241       __ stp(t6, t7, Address(dend, -16));
1242     }
1243     __ b(finish);
1244 
1245     // 17..32 bytes
1246     __ bind(copy32);
1247     __ ldp(t0, t1, Address(s, 0));
1248     __ ldp(t2, t3, Address(send, -16));
1249     __ stp(t0, t1, Address(d, 0));
1250     __ stp(t2, t3, Address(dend, -16));
1251     __ b(finish);
1252 
1253     // 65..80/96 bytes
1254     // (96 bytes if SIMD because we do 32 bytes per instruction)
1255     __ bind(copy80);
1256     if (UseSIMDForMemoryOps) {
1257       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1258       __ ldpq(v4, v5, Address(send, -32));
1259       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1260       __ stpq(v4, v5, Address(dend, -32));
1261     } else {
1262       __ ldp(t0, t1, Address(s, 0));
1263       __ ldp(t2, t3, Address(s, 16));
1264       __ ldp(t4, t5, Address(s, 32));
1265       __ ldp(t6, t7, Address(s, 48));
1266       __ ldp(t8, t9, Address(send, -16));
1267 
1268       __ stp(t0, t1, Address(d, 0));
1269       __ stp(t2, t3, Address(d, 16));
1270       __ stp(t4, t5, Address(d, 32));
1271       __ stp(t6, t7, Address(d, 48));
1272       __ stp(t8, t9, Address(dend, -16));
1273     }
1274     __ b(finish);
1275 
1276     // 0..16 bytes
1277     __ bind(copy16);
1278     __ cmp(count, 8/granularity);
1279     __ br(Assembler::LO, copy8);
1280 
1281     // 8..16 bytes
1282     __ ldr(t0, Address(s, 0));
1283     __ ldr(t1, Address(send, -8));
1284     __ str(t0, Address(d, 0));
1285     __ str(t1, Address(dend, -8));
1286     __ b(finish);
1287 
1288     if (granularity < 8) {
1289       // 4..7 bytes
1290       __ bind(copy8);
1291       __ tbz(count, 2 - exact_log2(granularity), copy4);
1292       __ ldrw(t0, Address(s, 0));
1293       __ ldrw(t1, Address(send, -4));
1294       __ strw(t0, Address(d, 0));
1295       __ strw(t1, Address(dend, -4));
1296       __ b(finish);
1297       if (granularity < 4) {
1298         // 0..3 bytes
1299         __ bind(copy4);
1300         __ cbz(count, finish); // get rid of 0 case
1301         if (granularity == 2) {
1302           __ ldrh(t0, Address(s, 0));
1303           __ strh(t0, Address(d, 0));
1304         } else { // granularity == 1
1305           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1306           // the first and last byte.
1307           // Handle the 3 byte case by loading and storing base + count/2
1308           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1309           // This does mean in the 1 byte case we load/store the same
1310           // byte 3 times.
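               // e.g. for count == 3 (send == s + 3, dend == d + 3) we copy
               // s[0] -> d[0], s[2] -> d[2] and s[1] -> d[1]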
1311           __ lsr(count, count, 1);
1312           __ ldrb(t0, Address(s, 0));
1313           __ ldrb(t1, Address(send, -1));
1314           __ ldrb(t2, Address(s, count));
1315           __ strb(t0, Address(d, 0));
1316           __ strb(t1, Address(dend, -1));
1317           __ strb(t2, Address(d, count));
1318         }
1319         __ b(finish);
1320       }
1321     }
1322 
1323     __ bind(copy_big);
1324     if (is_backwards) {
1325       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1326       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1327     }
1328 
1329     // Now that we've got the small case out of the way we can align the
1330     // source address on a 2-word boundary.
1331 
1332     Label aligned;
1333 
1334     if (is_aligned) {
1335       // We may have to adjust by 1 word to get s 2-word-aligned.
1336       __ tbz(s, exact_log2(wordSize), aligned);
1337       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1338       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1339       __ sub(count, count, wordSize/granularity);
1340     } else {
1341       if (is_backwards) {
1342         __ andr(rscratch2, s, 2 * wordSize - 1);
1343       } else {
1344         __ neg(rscratch2, s);
1345         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1346       }
1347       // rscratch2 is the byte adjustment needed to align s.
1348       __ cbz(rscratch2, aligned);
1349       int shift = exact_log2(granularity);
1350       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1351       __ sub(count, count, rscratch2);
1352 
1353 #if 0
1354       // ?? This code is only correct for a disjoint copy.  It may or
1355       // may not make sense to use it in that case.
1356 
1357       // Copy the first pair; s and d may not be aligned.
1358       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1359       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1360 
1361       // Align s and d, adjust count
1362       if (is_backwards) {
1363         __ sub(s, s, rscratch2);
1364         __ sub(d, d, rscratch2);
1365       } else {
1366         __ add(s, s, rscratch2);
1367         __ add(d, d, rscratch2);
1368       }
1369 #else
1370       copy_memory_small(s, d, rscratch2, rscratch1, step);
1371 #endif
1372     }
1373 
1374     __ bind(aligned);
1375 
1376     // s is now 2-word-aligned.
1377 
1378     // We have a count of units and some trailing bytes.  Adjust the
1379     // count and do a bulk copy of words.
1380     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1381     if (direction == copy_forwards)
1382       __ bl(copy_f);
1383     else
1384       __ bl(copy_b);
1385 
1386     // And the tail.
1387     copy_memory_small(s, d, count, tmp, step);
1388 
1389     if (granularity >= 8) __ bind(copy8);
1390     if (granularity >= 4) __ bind(copy4);
1391     __ bind(finish);
1392   }
1393 
1394 
1395   void clobber_registers() {
1396 #ifdef ASSERT
1397     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1398     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1399     for (Register r = r3; r <= r18; r++)
1400       if (r != rscratch1) __ mov(r, rscratch1);
1401 #endif
1402   }
1403 
1404   // Scan over array at a for count oops, verifying each one.
1405   // Preserves a and count, clobbers rscratch1 and rscratch2.
1406   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1407     Label loop, end;
1408     __ mov(rscratch1, a);
1409     __ mov(rscratch2, zr);
1410     __ bind(loop);
1411     __ cmp(rscratch2, count);
1412     __ br(Assembler::HS, end);
1413     if (size == (size_t)wordSize) {
1414       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1415       __ verify_oop(temp);
1416     } else {
1417       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1418       __ decode_heap_oop(temp); // calls verify_oop
1419     }
1420     __ add(rscratch2, rscratch2, size);
1421     __ b(loop);
1422     __ bind(end);
1423   }
1424 
1425   // Arguments:
1426   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1427   //             ignored
1428   //   is_oop  - true => oop array, so generate store check code
1429   //   name    - stub name string
1430   //
1431   // Inputs:
1432   //   c_rarg0   - source array address
1433   //   c_rarg1   - destination array address
1434   //   c_rarg2   - element count, treated as ssize_t, can be zero
1435   //
1436   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1437   // the hardware handle it.  The two dwords within qwords that span
1438   // cache line boundaries will still be loaded and stored atomically.
1439   //
1440   // Side Effects:
1441   //   *entry (if non-NULL) is set to the no-overlap entry point
1442   //   used by generate_conjoint_copy().
1443   //
1444   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1445                                   const char *name, bool dest_uninitialized = false) {
1446     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1447     RegSet saved_reg = RegSet::of(s, d, count);
1448     __ align(CodeEntryAlignment);
1449     StubCodeMark mark(this, "StubRoutines", name);
1450     address start = __ pc();
1451     __ enter();
1452 
1453     if (entry != NULL) {
1454       *entry = __ pc();
1455       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1456       BLOCK_COMMENT("Entry:");
1457     }
1458 
1459     if (is_oop) {
1460       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized, saved_reg);
1461       // save regs before copy_memory
1462       __ push(RegSet::of(d, count), sp);
1463     }
1464     copy_memory(aligned, s, d, count, rscratch1, size);
1465     if (is_oop) {
1466       __ pop(RegSet::of(d, count), sp);
1467       if (VerifyOops)
1468         verify_oop_array(size, d, count, r16);
1469       __ sub(count, count, 1); // make an inclusive end pointer
1470       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1471       gen_write_ref_array_post_barrier(d, count, rscratch1, RegSet());
1472     }
1473     __ leave();
1474     __ mov(r0, zr); // return 0
1475     __ ret(lr);
1476 #ifdef BUILTIN_SIM
1477     {
1478       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1479       sim->notifyCompile(const_cast<char*>(name), start);
1480     }
1481 #endif
1482     return start;
1483   }
1484 
1485   // Arguments:
1486   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1487   //             ignored
1488   //   is_oop  - true => oop array, so generate store check code
1489   //   name    - stub name string
1490   //
1491   // Inputs:
1492   //   c_rarg0   - source array address
1493   //   c_rarg1   - destination array address
1494   //   c_rarg2   - element count, treated as ssize_t, can be zero
1495   //
1496   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1497   // the hardware handle it.  The two dwords within qwords that span
1498   // cache line boundaries will still be loaded and stored atomically.
1499   //
1500   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1501                                  address *entry, const char *name,
1502                                  bool dest_uninitialized = false) {
1503     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1504     RegSet saved_regs = RegSet::of(s, d, count);
1505     StubCodeMark mark(this, "StubRoutines", name);
1506     address start = __ pc();
1507     __ enter();
1508 
1509     if (entry != NULL) {
1510       *entry = __ pc();
1511       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1512       BLOCK_COMMENT("Entry:");
1513     }
1514 
1515     // use fwd copy when (d-s) above_equal (count*size)
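         // The test is unsigned: if d precedes s the subtraction wraps to a
         // large value, so HS is taken both when the destination starts at or
         // beyond the end of the source region and when d < s; in either case
         // a forward (ascending) copy cannot clobber unread source elements.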
1516     __ sub(rscratch1, d, s);
1517     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1518     __ br(Assembler::HS, nooverlap_target);
1519 
1520     if (is_oop) {
1521       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized, saved_regs);
1522       // save regs before copy_memory
1523       __ push(RegSet::of(d, count), sp);
1524     }
1525     copy_memory(aligned, s, d, count, rscratch1, -size);
1526     if (is_oop) {
1527       __ pop(RegSet::of(d, count), sp);
1528       if (VerifyOops)
1529         verify_oop_array(size, d, count, r16);
1530       __ sub(count, count, 1); // make an inclusive end pointer
1531       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1532       gen_write_ref_array_post_barrier(d, count, rscratch1, RegSet());
1533     }
1534     __ leave();
1535     __ mov(r0, zr); // return 0
1536     __ ret(lr);
1537 #ifdef BUILTIN_SIM
1538     {
1539       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1540       sim->notifyCompile(const_cast<char*>(name), start);
1541     }
1542 #endif
1543     return start;
1544   }
1545 
1546   // Arguments:
1547   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1548   //             ignored
1549   //   name    - stub name string
1550   //
1551   // Inputs:
1552   //   c_rarg0   - source array address
1553   //   c_rarg1   - destination array address
1554   //   c_rarg2   - element count, treated as ssize_t, can be zero
1555   //
1563   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1564   // we let the hardware handle it.  The one to eight bytes within words,
1565   // dwords or qwords that span cache line boundaries will still be loaded
1566   // and stored atomically.
1567   //
1568   // Side Effects:
1569   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1570   //   used by generate_conjoint_byte_copy().
1571   //
1572   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1573     const bool not_oop = false;
1574     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1575   }
1576 
1577   // Arguments:
1578   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1579   //             ignored
1580   //   name    - stub name string
1581   //
1582   // Inputs:
1583   //   c_rarg0   - source array address
1584   //   c_rarg1   - destination array address
1585   //   c_rarg2   - element count, treated as ssize_t, can be zero
1586   //
1587   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1588   // we let the hardware handle it.  The one to eight bytes within words,
1589   // dwords or qwords that span cache line boundaries will still be loaded
1590   // and stored atomically.
1591   //
1592   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1593                                       address* entry, const char *name) {
1594     const bool not_oop = false;
1595     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1596   }
1597 
1598   // Arguments:
1599   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1600   //             ignored
1601   //   name    - stub name string
1602   //
1603   // Inputs:
1604   //   c_rarg0   - source array address
1605   //   c_rarg1   - destination array address
1606   //   c_rarg2   - element count, treated as ssize_t, can be zero
1607   //
1608   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1609   // let the hardware handle it.  The two or four words within dwords
1610   // or qwords that span cache line boundaries will still be loaded
1611   // and stored atomically.
1612   //
1613   // Side Effects:
1614   //   disjoint_short_copy_entry is set to the no-overlap entry point
1615   //   used by generate_conjoint_short_copy().
1616   //
1617   address generate_disjoint_short_copy(bool aligned,
1618                                        address* entry, const char *name) {
1619     const bool not_oop = false;
1620     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1621   }
1622 
1623   // Arguments:
1624   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1625   //             ignored
1626   //   name    - stub name string
1627   //
1628   // Inputs:
1629   //   c_rarg0   - source array address
1630   //   c_rarg1   - destination array address
1631   //   c_rarg2   - element count, treated as ssize_t, can be zero
1632   //
1633   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1634   // let the hardware handle it.  The two or four words within dwords
1635   // or qwords that span cache line boundaries will still be loaded
1636   // and stored atomically.
1637   //
1638   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1639                                        address *entry, const char *name) {
1640     const bool not_oop = false;
1641     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1642   }
1643 
1644   // Arguments:
1645   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1646   //             ignored
1647   //   name    - stub name string
1648   //
1649   // Inputs:
1650   //   c_rarg0   - source array address
1651   //   c_rarg1   - destination array address
1652   //   c_rarg2   - element count, treated as ssize_t, can be zero
1653   //
1654   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1655   // the hardware handle it.  The two dwords within qwords that span
1656   // cache line boundaries will still be loaded and stored atomically.
1657   //
1658   // Side Effects:
1659   //   disjoint_int_copy_entry is set to the no-overlap entry point
1660   //   used by generate_conjoint_int_oop_copy().
1661   //
1662   address generate_disjoint_int_copy(bool aligned, address *entry,
1663                                          const char *name, bool dest_uninitialized = false) {
1664     const bool not_oop = false;
1665     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1666   }
1667 
1668   // Arguments:
1669   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1670   //             ignored
1671   //   name    - stub name string
1672   //
1673   // Inputs:
1674   //   c_rarg0   - source array address
1675   //   c_rarg1   - destination array address
1676   //   c_rarg2   - element count, treated as ssize_t, can be zero
1677   //
1678   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1679   // the hardware handle it.  The two dwords within qwords that span
1680   // cache line boundaries will still be loaded and stored atomically.
1681   //
1682   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1683                                      address *entry, const char *name,
1684                                      bool dest_uninitialized = false) {
1685     const bool not_oop = false;
1686     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1687   }
1688 
1689 
1690   // Arguments:
1691   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1692   //             ignored
1693   //   name    - stub name string
1694   //
1695   // Inputs:
1696   //   c_rarg0   - source array address
1697   //   c_rarg1   - destination array address
1698   //   c_rarg2   - element count, treated as size_t, can be zero
1699   //
1700   // Side Effects:
1701   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1702   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1703   //
1704   address generate_disjoint_long_copy(bool aligned, address *entry,
1705                                           const char *name, bool dest_uninitialized = false) {
1706     const bool not_oop = false;
1707     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1708   }
1709 
1710   // Arguments:
1711   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1712   //             ignored
1713   //   name    - stub name string
1714   //
1715   // Inputs:
1716   //   c_rarg0   - source array address
1717   //   c_rarg1   - destination array address
1718   //   c_rarg2   - element count, treated as size_t, can be zero
1719   //
1720   address generate_conjoint_long_copy(bool aligned,
1721                                       address nooverlap_target, address *entry,
1722                                       const char *name, bool dest_uninitialized = false) {
1723     const bool not_oop = false;
1724     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1725   }
1726 
1727   // Arguments:
1728   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1729   //             ignored
1730   //   name    - stub name string
1731   //
1732   // Inputs:
1733   //   c_rarg0   - source array address
1734   //   c_rarg1   - destination array address
1735   //   c_rarg2   - element count, treated as size_t, can be zero
1736   //
1737   // Side Effects:
1738   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1739   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1740   //
1741   address generate_disjoint_oop_copy(bool aligned, address *entry,
1742                                      const char *name, bool dest_uninitialized) {
1743     const bool is_oop = true;
1744     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1745     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1746   }
1747 
1748   // Arguments:
1749   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1750   //             ignored
1751   //   name    - stub name string
1752   //
1753   // Inputs:
1754   //   c_rarg0   - source array address
1755   //   c_rarg1   - destination array address
1756   //   c_rarg2   - element count, treated as size_t, can be zero
1757   //
1758   address generate_conjoint_oop_copy(bool aligned,
1759                                      address nooverlap_target, address *entry,
1760                                      const char *name, bool dest_uninitialized) {
1761     const bool is_oop = true;
1762     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1763     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1764                                   name, dest_uninitialized);
1765   }
1766 
1767 
1768   // Helper for generating a dynamic type check.
1769   // Smashes rscratch1.
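       // Control transfers to L_success if sub_klass is a subtype of
       // super_klass; otherwise execution falls through to L_miss below.
       // Roughly, the fast path consults the superclass display slot at
       // super_check_offset and the secondary-super cache, and the slow path
       // scans the secondary supers list when the fast path is inconclusive.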
1770   void generate_type_check(Register sub_klass,
1771                            Register super_check_offset,
1772                            Register super_klass,
1773                            Label& L_success) {
1774     assert_different_registers(sub_klass, super_check_offset, super_klass);
1775 
1776     BLOCK_COMMENT("type_check:");
1777 
1778     Label L_miss;
1779 
1780     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1781                                      super_check_offset);
1782     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1783 
1784     // Fall through on failure!
1785     __ BIND(L_miss);
1786   }
1787 
1788   //
1789   //  Generate checkcasting array copy stub
1790   //
1791   //  Input:
1792   //    c_rarg0   - source array address
1793   //    c_rarg1   - destination array address
1794   //    c_rarg2   - element count, treated as ssize_t, can be zero
1795   //    c_rarg3   - size_t ckoff (super_check_offset)
1796   //    c_rarg4   - oop ckval (super_klass)
1797   //
1798   //  Output:
1799   //    r0 ==  0  -  success
1800   //    r0 == -1^K - failure, where K is partial transfer count
1801   //
1802   address generate_checkcast_copy(const char *name, address *entry,
1803                                   bool dest_uninitialized = false) {
1804 
1805     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1806 
1807     // Input registers (after setup_arg_regs)
1808     const Register from        = c_rarg0;   // source array address
1809     const Register to          = c_rarg1;   // destination array address
1810     const Register count       = c_rarg2;   // elements count
1811     const Register ckoff       = c_rarg3;   // super_check_offset
1812     const Register ckval       = c_rarg4;   // super_klass
1813 
1814     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1815     RegSet wb_post_saved_regs = RegSet::of(count);
1816 
1817     // Registers used as temps (r18, r19, r20 are save-on-entry)
1818     const Register count_save  = r21;       // orig elements count
1819     const Register start_to    = r20;       // destination array start address
1820     const Register copied_oop  = r18;       // actual oop copied
1821     const Register r19_klass   = r19;       // oop._klass
1822 
1823     //---------------------------------------------------------------
1824     // Assembler stub will be used for this call to arraycopy
1825     // if the two arrays are subtypes of Object[] but the
1826     // destination array type is not equal to or a supertype
1827     // of the source type.  Each element must be separately
1828     // checked.
1829 
1830     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1831                                copied_oop, r19_klass, count_save);
1832 
1833     __ align(CodeEntryAlignment);
1834     StubCodeMark mark(this, "StubRoutines", name);
1835     address start = __ pc();
1836 
1837     __ enter(); // required for proper stackwalking of RuntimeStub frame
1838 
1839 #ifdef ASSERT
1840     // caller guarantees that the arrays really are different
1841     // otherwise, we would have to make conjoint checks
1842     { Label L;
1843       array_overlap_test(L, TIMES_OOP);
1844       __ stop("checkcast_copy within a single array");
1845       __ bind(L);
1846     }
1847 #endif //ASSERT
1848 
1849     // Caller of this entry point must set up the argument registers.
1850     if (entry != NULL) {
1851       *entry = __ pc();
1852       BLOCK_COMMENT("Entry:");
1853     }
1854 
1855     // Empty array:  Nothing to do.
1856     __ cbz(count, L_done);
1857 
1858     __ push(RegSet::of(r18, r19, r20, r21), sp);
1859 
1860 #ifdef ASSERT
1861     BLOCK_COMMENT("assert consistent ckoff/ckval");
1862     // The ckoff and ckval must be mutually consistent,
1863     // even though caller generates both.
1864     { Label L;
1865       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1866       __ ldrw(start_to, Address(ckval, sco_offset));
1867       __ cmpw(ckoff, start_to);
1868       __ br(Assembler::EQ, L);
1869       __ stop("super_check_offset inconsistent");
1870       __ bind(L);
1871     }
1872 #endif //ASSERT
1873 
1874     gen_write_ref_array_pre_barrier(to, count, dest_uninitialized, wb_pre_saved_regs);
1875 
1876     // save the original count
1877     __ mov(count_save, count);
1878 
1879     // Copy from low to high addresses
1880     __ mov(start_to, to);              // Save destination array start address
1881     __ b(L_load_element);
1882 
1883     // ======== begin loop ========
1884     // (Loop is rotated; its entry is L_load_element.)
1885     // Loop control:
1886     //   for (; count != 0; count--) {
1887     //     copied_oop = load_heap_oop(from++);
1888     //     ... generate_type_check ...;
1889     //     store_heap_oop(to++, copied_oop);
1890     //   }
1891     __ align(OptoLoopAlignment);
1892 
1893     __ BIND(L_store_element);
1894     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1895     __ sub(count, count, 1);
1896     __ cbz(count, L_do_card_marks);
1897 
1898     // ======== loop entry is here ========
1899     __ BIND(L_load_element);
1900     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1901     __ cbz(copied_oop, L_store_element);
1902 
1903     __ load_klass(r19_klass, copied_oop);// query the object klass
1904     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1905     // ======== end loop ========
1906 
1907     // It was a real error; we must depend on the caller to finish the job.
1908     // Register count = remaining oops, count_orig = total oops.
1909     // Emit GC store barriers for the oops we have copied and report
1910     // their number to the caller.
1911 
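         // The result is encoded as -1 ^ K (i.e. ~K): eon against zr below
         // computes the bitwise complement, so the caller can recover the
         // number of oops actually copied as ~r0.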
1912     __ subs(count, count_save, count);     // K = partially copied oop count
1913     __ eon(count, count, zr);                   // report (-1^K) to caller
1914     __ br(Assembler::EQ, L_done_pop);
1915 
1916     __ BIND(L_do_card_marks);
1917     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1918     gen_write_ref_array_post_barrier(start_to, to, rscratch1, wb_post_saved_regs);
1919 
1920     __ bind(L_done_pop);
1921     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1922     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1923 
1924     __ bind(L_done);
1925     __ mov(r0, count);
1926     __ leave();
1927     __ ret(lr);
1928 
1929     return start;
1930   }
1931 
1932   // Perform range checks on the proposed arraycopy.
1933   // Kills temp, but nothing else.
1934   // Also, clean the sign bits of src_pos and dst_pos.
1935   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1936                               Register src_pos, // source position (c_rarg1)
1937                               Register dst,     // destination array oop (c_rarg2)
1938                               Register dst_pos, // destination position (c_rarg3)
1939                               Register length,
1940                               Register temp,
1941                               Label& L_failed) {
1942     BLOCK_COMMENT("arraycopy_range_checks:");
1943 
1944     assert_different_registers(rscratch1, temp);
1945 
1946     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1947     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1948     __ addw(temp, length, src_pos);
1949     __ cmpw(temp, rscratch1);
1950     __ br(Assembler::HI, L_failed);
1951 
1952     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1953     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1954     __ addw(temp, length, dst_pos);
1955     __ cmpw(temp, rscratch1);
1956     __ br(Assembler::HI, L_failed);
1957 
1958     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
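         // A write to a W register zero-extends into the upper 32 bits, so
         // movw(reg, reg) leaves a clean unsigned index for the 64-bit
         // address arithmetic done by the callers.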
1959     __ movw(src_pos, src_pos);
1960     __ movw(dst_pos, dst_pos);
1961 
1962     BLOCK_COMMENT("arraycopy_range_checks done");
1963   }
1964 
1965   // These stubs get called from some dumb test routine.
1966   // I'll write them properly when they're called from
1967   // something that's actually doing something.
1968   static void fake_arraycopy_stub(address src, address dst, int count) {
1969     assert(count == 0, "huh?");
1970   }
1971 
1972 
1973   //
1974   //  Generate 'unsafe' array copy stub
1975   //  Though just as safe as the other stubs, it takes an unscaled
1976   //  size_t argument instead of an element count.
1977   //
1978   //  Input:
1979   //    c_rarg0   - source array address
1980   //    c_rarg1   - destination array address
1981   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1982   //
1983   // Examines the alignment of the operands and dispatches
1984   // to a long, int, short, or byte copy loop.
1985   //
1986   address generate_unsafe_copy(const char *name,
1987                                address byte_copy_entry,
1988                                address short_copy_entry,
1989                                address int_copy_entry,
1990                                address long_copy_entry) {
1991     Label L_long_aligned, L_int_aligned, L_short_aligned;
1992     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1993 
1994     __ align(CodeEntryAlignment);
1995     StubCodeMark mark(this, "StubRoutines", name);
1996     address start = __ pc();
1997     __ enter(); // required for proper stackwalking of RuntimeStub frame
1998 
1999     // bump this on entry, not on exit:
2000     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2001 
2002     __ orr(rscratch1, s, d);
2003     __ orr(rscratch1, rscratch1, count);
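         // A low bit is clear in the OR of s, d and count only if it is clear
         // in all three, so the tests below find the coarsest element size at
         // which source, destination and byte count are all aligned.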
2004 
2005     __ andr(rscratch1, rscratch1, BytesPerLong-1);
2006     __ cbz(rscratch1, L_long_aligned);
2007     __ andr(rscratch1, rscratch1, BytesPerInt-1);
2008     __ cbz(rscratch1, L_int_aligned);
2009     __ tbz(rscratch1, 0, L_short_aligned);
2010     __ b(RuntimeAddress(byte_copy_entry));
2011 
2012     __ BIND(L_short_aligned);
2013     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2014     __ b(RuntimeAddress(short_copy_entry));
2015     __ BIND(L_int_aligned);
2016     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2017     __ b(RuntimeAddress(int_copy_entry));
2018     __ BIND(L_long_aligned);
2019     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2020     __ b(RuntimeAddress(long_copy_entry));
2021 
2022     return start;
2023   }
2024 
2025   //
2026   //  Generate generic array copy stubs
2027   //
2028   //  Input:
2029   //    c_rarg0    -  src oop
2030   //    c_rarg1    -  src_pos (32-bits)
2031   //    c_rarg2    -  dst oop
2032   //    c_rarg3    -  dst_pos (32-bits)
2033   //    c_rarg4    -  element count (32-bits)
2034   //
2035   //  Output:
2036   //    r0 ==  0  -  success
2037   //    r0 == -1^K - failure, where K is partial transfer count
2038   //
2039   address generate_generic_copy(const char *name,
2040                                 address byte_copy_entry, address short_copy_entry,
2041                                 address int_copy_entry, address oop_copy_entry,
2042                                 address long_copy_entry, address checkcast_copy_entry) {
2043 
2044     Label L_failed, L_failed_0, L_objArray;
2045     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2046 
2047     // Input registers
2048     const Register src        = c_rarg0;  // source array oop
2049     const Register src_pos    = c_rarg1;  // source position
2050     const Register dst        = c_rarg2;  // destination array oop
2051     const Register dst_pos    = c_rarg3;  // destination position
2052     const Register length     = c_rarg4;
2053 
2054     StubCodeMark mark(this, "StubRoutines", name);
2055 
2056     __ align(CodeEntryAlignment);
2057     address start = __ pc();
2058 
2059     __ enter(); // required for proper stackwalking of RuntimeStub frame
2060 
2061     // bump this on entry, not on exit:
2062     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2063 
2064     //-----------------------------------------------------------------------
2065     // Assembler stub will be used for this call to arraycopy
2066     // if the following conditions are met:
2067     //
2068     // (1) src and dst must not be null.
2069     // (2) src_pos must not be negative.
2070     // (3) dst_pos must not be negative.
2071     // (4) length  must not be negative.
2072     // (5) src klass and dst klass should be the same and not NULL.
2073     // (6) src and dst should be arrays.
2074     // (7) src_pos + length must not exceed length of src.
2075     // (8) dst_pos + length must not exceed length of dst.
2076     //
2077 
2078     //  if (src == NULL) return -1;
2079     __ cbz(src, L_failed);
2080 
2081     //  if (src_pos < 0) return -1;
2082     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2083 
2084     //  if (dst == NULL) return -1;
2085     __ cbz(dst, L_failed);
2086 
2087     //  if (dst_pos < 0) return -1;
2088     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2089 
2090     // registers used as temp
2091     const Register scratch_length    = r16; // elements count to copy
2092     const Register scratch_src_klass = r17; // array klass
2093     const Register lh                = r18; // layout helper
2094 
2095     //  if (length < 0) return -1;
2096     __ movw(scratch_length, length);        // length (elements count, 32-bit value)
2097     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2098 
2099     __ load_klass(scratch_src_klass, src);
2100 #ifdef ASSERT
2101     //  assert(src->klass() != NULL);
2102     {
2103       BLOCK_COMMENT("assert klasses not null {");
2104       Label L1, L2;
2105       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2106       __ bind(L1);
2107       __ stop("broken null klass");
2108       __ bind(L2);
2109       __ load_klass(rscratch1, dst);
2110       __ cbz(rscratch1, L1);     // this would be broken also
2111       BLOCK_COMMENT("} assert klasses not null done");
2112     }
2113 #endif
2114 
2115     // Load layout helper (32-bits)
2116     //
2117     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2118     // 32        30    24            16              8     2                 0
2119     //
2120     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2121     //
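         // For example, a typeArray has array_tag 0x3 in the top bits (making
         // lh negative, which the sign-bit tests below rely on), while every
         // objArray shares the single value objArray_lh; the low bits hold
         // log2_element_size (0 for bytes up to 3 for longs).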
2122 
2123     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2124 
2125     // Handle objArrays completely differently...
2126     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2127     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2128     __ movw(rscratch1, objArray_lh);
2129     __ eorw(rscratch2, lh, rscratch1);
2130     __ cbzw(rscratch2, L_objArray);
2131 
2132     //  if (src->klass() != dst->klass()) return -1;
2133     __ load_klass(rscratch2, dst);
2134     __ eor(rscratch2, rscratch2, scratch_src_klass);
2135     __ cbnz(rscratch2, L_failed);
2136 
2137     //  if (!src->is_Array()) return -1;
2138     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2139 
2140     // At this point, it is known to be a typeArray (array_tag 0x3).
2141 #ifdef ASSERT
2142     {
2143       BLOCK_COMMENT("assert primitive array {");
2144       Label L;
2145       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2146       __ cmpw(lh, rscratch2);
2147       __ br(Assembler::GE, L);
2148       __ stop("must be a primitive array");
2149       __ bind(L);
2150       BLOCK_COMMENT("} assert primitive array done");
2151     }
2152 #endif
2153 
2154     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2155                            rscratch2, L_failed);
2156 
2157     // TypeArrayKlass
2158     //
2159     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2160     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2161     //
2162 
2163     const Register rscratch1_offset = rscratch1;    // array offset
2164     const Register r18_elsize = lh; // element size
2165 
2166     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2167            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2168     __ add(src, src, rscratch1_offset);           // src array offset
2169     __ add(dst, dst, rscratch1_offset);           // dst array offset
2170     BLOCK_COMMENT("choose copy loop based on element size");
2171 
2172     // next registers should be set before the jump to corresponding stub
2173     const Register from     = c_rarg0;  // source array address
2174     const Register to       = c_rarg1;  // destination array address
2175     const Register count    = c_rarg2;  // elements count
2176 
2177     // 'from', 'to' and 'count' must be set up in this order because they
2178     // alias 'src', 'src_pos' and 'dst' respectively.
2179 
2180     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2181 
2182     // The possible values of elsize are 0-3, i.e. exact_log2(element
2183     // size in bytes).  We do a simple bitwise binary search.
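         // Concretely: bit 1 of elsize separates {byte, short} from
         // {int, long}, and bit 0 then picks within each pair, so the tbnz
         // tests dispatch 0 -> bytes, 1 -> shorts, 2 -> ints, 3 -> longs.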
2184   __ BIND(L_copy_bytes);
2185     __ tbnz(r18_elsize, 1, L_copy_ints);
2186     __ tbnz(r18_elsize, 0, L_copy_shorts);
2187     __ lea(from, Address(src, src_pos));// src_addr
2188     __ lea(to,   Address(dst, dst_pos));// dst_addr
2189     __ movw(count, scratch_length); // length
2190     __ b(RuntimeAddress(byte_copy_entry));
2191 
2192   __ BIND(L_copy_shorts);
2193     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2194     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2195     __ movw(count, scratch_length); // length
2196     __ b(RuntimeAddress(short_copy_entry));
2197 
2198   __ BIND(L_copy_ints);
2199     __ tbnz(r18_elsize, 0, L_copy_longs);
2200     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2201     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2202     __ movw(count, scratch_length); // length
2203     __ b(RuntimeAddress(int_copy_entry));
2204 
2205   __ BIND(L_copy_longs);
2206 #ifdef ASSERT
2207     {
2208       BLOCK_COMMENT("assert long copy {");
2209       Label L;
2210       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2211       __ cmpw(r18_elsize, LogBytesPerLong);
2212       __ br(Assembler::EQ, L);
2213       __ stop("must be long copy, but elsize is wrong");
2214       __ bind(L);
2215       BLOCK_COMMENT("} assert long copy done");
2216     }
2217 #endif
2218     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2219     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2220     __ movw(count, scratch_length); // length
2221     __ b(RuntimeAddress(long_copy_entry));
2222 
2223     // ObjArrayKlass
2224   __ BIND(L_objArray);
2225     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2226 
2227     Label L_plain_copy, L_checkcast_copy;
2228     //  test array classes for subtyping
2229     __ load_klass(r18, dst);
2230     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2231     __ br(Assembler::NE, L_checkcast_copy);
2232 
2233     // Identically typed arrays can be copied without element-wise checks.
2234     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2235                            rscratch2, L_failed);
2236 
2237     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2238     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2239     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2240     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2241     __ movw(count, scratch_length); // length
2242   __ BIND(L_plain_copy);
2243     __ b(RuntimeAddress(oop_copy_entry));
2244 
2245   __ BIND(L_checkcast_copy);
2246     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2247     {
2248       // Before looking at dst.length, make sure dst is also an objArray.
2249       __ ldrw(rscratch1, Address(r18, lh_offset));
2250       __ movw(rscratch2, objArray_lh);
2251       __ eorw(rscratch1, rscratch1, rscratch2);
2252       __ cbnzw(rscratch1, L_failed);
2253 
2254       // It is safe to examine both src.length and dst.length.
2255       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2256                              r18, L_failed);
2257 
2258       const Register rscratch2_dst_klass = rscratch2;
2259       __ load_klass(rscratch2_dst_klass, dst); // reload
2260 
2261       // Marshal the base address arguments now, freeing registers.
2262       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2263       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2264       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2265       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2266       __ movw(count, length);           // length (reloaded)
2267       Register sco_temp = c_rarg3;      // this register is free now
2268       assert_different_registers(from, to, count, sco_temp,
2269                                  rscratch2_dst_klass, scratch_src_klass);
2270       // assert_clean_int(count, sco_temp);
2271 
2272       // Generate the type check.
2273       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2274       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2275       // assert_clean_int(sco_temp, r18);
2276       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2277 
2278       // Fetch destination element klass from the ObjArrayKlass header.
2279       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2280       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2281       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2282 
2283       // the checkcast_copy loop needs two extra arguments:
2284       assert(c_rarg3 == sco_temp, "#3 already in place");
2285       // Set up arguments for checkcast_copy_entry.
2286       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2287       __ b(RuntimeAddress(checkcast_copy_entry));
2288     }
2289 
2290   __ BIND(L_failed);
2291     __ mov(r0, -1);
2292     __ leave();   // required for proper stackwalking of RuntimeStub frame
2293     __ ret(lr);
2294 
2295     return start;
2296   }
2297 
2298   //
2299   // Generate stub for array fill. If "aligned" is true, the
2300   // "to" address is assumed to be heapword aligned.
2301   //
2302   // Arguments for generated stub:
2303   //   to:    c_rarg0
2304   //   value: c_rarg1
2305   //   count: c_rarg2 treated as signed
2306   //
2307   address generate_fill(BasicType t, bool aligned, const char *name) {
2308     __ align(CodeEntryAlignment);
2309     StubCodeMark mark(this, "StubRoutines", name);
2310     address start = __ pc();
2311 
2312     BLOCK_COMMENT("Entry:");
2313 
2314     const Register to        = c_rarg0;  // destination array address
2315     const Register value     = c_rarg1;  // value
2316     const Register count     = c_rarg2;  // elements count
2317 
2318     const Register bz_base = r10;        // base for block_zero routine
2319     const Register cnt_words = r11;      // temp register
2320 
2321     __ enter();
2322 
2323     Label L_fill_elements, L_exit1;
2324 
2325     int shift = -1;
2326     switch (t) {
2327       case T_BYTE:
2328         shift = 0;
2329         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2330         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2331         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2332         __ br(Assembler::LO, L_fill_elements);
2333         break;
2334       case T_SHORT:
2335         shift = 1;
2336         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2337         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2338         __ br(Assembler::LO, L_fill_elements);
2339         break;
2340       case T_INT:
2341         shift = 2;
2342         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2343         __ br(Assembler::LO, L_fill_elements);
2344         break;
2345       default: ShouldNotReachHere();
2346     }
2347 
2348     // Align the destination address to an 8-byte boundary.
2349     Label L_skip_align1, L_skip_align2, L_skip_align4;
2350     if (!aligned) {
2351       switch (t) {
2352         case T_BYTE:
2353           // One byte misalignment happens only for byte arrays.
2354           __ tbz(to, 0, L_skip_align1);
2355           __ strb(value, Address(__ post(to, 1)));
2356           __ subw(count, count, 1);
2357           __ bind(L_skip_align1);
2358           // Fallthrough
2359         case T_SHORT:
2360           // Two bytes misalignment happens only for byte and short (char) arrays.
2361           __ tbz(to, 1, L_skip_align2);
2362           __ strh(value, Address(__ post(to, 2)));
2363           __ subw(count, count, 2 >> shift);
2364           __ bind(L_skip_align2);
2365           // Fallthrough
2366         case T_INT:
2367           // Align to 8 bytes, we know we are 4 byte aligned to start.
2368           __ tbz(to, 2, L_skip_align4);
2369           __ strw(value, Address(__ post(to, 4)));
2370           __ subw(count, count, 4 >> shift);
2371           __ bind(L_skip_align4);
2372           break;
2373         default: ShouldNotReachHere();
2374       }
2375     }
2376 
2377     //
2378     //  Fill large chunks
2379     //
2380     __ lsrw(cnt_words, count, 3 - shift); // number of words
2381     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2382     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
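         // For example, filling 13 jshorts (shift == 1): cnt_words = 13 >> 2
         // = 3 doublewords (12 elements) are filled in bulk, the bfi above
         // widens the 32-bit pattern to 64 bits, and count is reduced to the
         // single element handled by the tail store below.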
2383     if (UseBlockZeroing) {
2384       Label non_block_zeroing, rest;
2385       // If the fill value is zero we can use the fast zero_words().
2386       __ cbnz(value, non_block_zeroing);
2387       __ mov(bz_base, to);
2388       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2389       __ zero_words(bz_base, cnt_words);
2390       __ b(rest);
2391       __ bind(non_block_zeroing);
2392       __ fill_words(to, cnt_words, value);
2393       __ bind(rest);
2394     } else {
2395       __ fill_words(to, cnt_words, value);
2396     }
2397 
2398     // Remaining count is less than 8 bytes. Fill it by a single store.
2399     // Note that the total length is no less than 8 bytes.
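         // Backing up to (end - 8) and storing a full doubleword may rewrite
         // a few already-filled elements, but with the same replicated value,
         // so the result is unchanged.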
2400     if (t == T_BYTE || t == T_SHORT) {
2401       Label L_exit1;
2402       __ cbzw(count, L_exit1);
2403       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2404       __ str(value, Address(to, -8));    // overwrite some elements
2405       __ bind(L_exit1);
2406       __ leave();
2407       __ ret(lr);
2408     }
2409 
2410     // Handle fills of fewer than 8 bytes.
2411     Label L_fill_2, L_fill_4, L_exit2;
2412     __ bind(L_fill_elements);
2413     switch (t) {
2414       case T_BYTE:
2415         __ tbz(count, 0, L_fill_2);
2416         __ strb(value, Address(__ post(to, 1)));
2417         __ bind(L_fill_2);
2418         __ tbz(count, 1, L_fill_4);
2419         __ strh(value, Address(__ post(to, 2)));
2420         __ bind(L_fill_4);
2421         __ tbz(count, 2, L_exit2);
2422         __ strw(value, Address(to));
2423         break;
2424       case T_SHORT:
2425         __ tbz(count, 0, L_fill_4);
2426         __ strh(value, Address(__ post(to, 2)));
2427         __ bind(L_fill_4);
2428         __ tbz(count, 1, L_exit2);
2429         __ strw(value, Address(to));
2430         break;
2431       case T_INT:
2432         __ cbzw(count, L_exit2);
2433         __ strw(value, Address(to));
2434         break;
2435       default: ShouldNotReachHere();
2436     }
2437     __ bind(L_exit2);
2438     __ leave();
2439     __ ret(lr);
2440     return start;
2441   }
2442 
2443   void generate_arraycopy_stubs() {
2444     address entry;
2445     address entry_jbyte_arraycopy;
2446     address entry_jshort_arraycopy;
2447     address entry_jint_arraycopy;
2448     address entry_oop_arraycopy;
2449     address entry_jlong_arraycopy;
2450     address entry_checkcast_arraycopy;
2451 
2452     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2453     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2454 
2455     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2456 
2457     //*** jbyte
2458     // Always need aligned and unaligned versions
2459     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2460                                                                                   "jbyte_disjoint_arraycopy");
2461     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2462                                                                                   &entry_jbyte_arraycopy,
2463                                                                                   "jbyte_arraycopy");
2464     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2465                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2466     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2467                                                                                   "arrayof_jbyte_arraycopy");
2468 
2469     //*** jshort
2470     // Always need aligned and unaligned versions
2471     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2472                                                                                     "jshort_disjoint_arraycopy");
2473     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2474                                                                                     &entry_jshort_arraycopy,
2475                                                                                     "jshort_arraycopy");
2476     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2477                                                                                     "arrayof_jshort_disjoint_arraycopy");
2478     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2479                                                                                     "arrayof_jshort_arraycopy");
2480 
2481     //*** jint
2482     // Aligned versions
2483     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2484                                                                                 "arrayof_jint_disjoint_arraycopy");
2485     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2486                                                                                 "arrayof_jint_arraycopy");
2487     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2488     // entry_jint_arraycopy always points to the unaligned version
2489     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2490                                                                                 "jint_disjoint_arraycopy");
2491     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2492                                                                                 &entry_jint_arraycopy,
2493                                                                                 "jint_arraycopy");
2494 
2495     //*** jlong
2496     // It is always aligned
2497     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2498                                                                                   "arrayof_jlong_disjoint_arraycopy");
2499     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2500                                                                                   "arrayof_jlong_arraycopy");
2501     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2502     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2503 
2504     //*** oops
2505     {
2506       // With compressed oops we need unaligned versions; notice that
2507       // we overwrite entry_oop_arraycopy.
2508       bool aligned = !UseCompressedOops;
2509 
2510       StubRoutines::_arrayof_oop_disjoint_arraycopy
2511         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2512                                      /*dest_uninitialized*/false);
2513       StubRoutines::_arrayof_oop_arraycopy
2514         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2515                                      /*dest_uninitialized*/false);
2516       // Aligned versions without pre-barriers
2517       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2518         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2519                                      /*dest_uninitialized*/true);
2520       StubRoutines::_arrayof_oop_arraycopy_uninit
2521         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2522                                      /*dest_uninitialized*/true);
2523     }
2524 
2525     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2526     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2527     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2528     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2529 
2530     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2531     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2532                                                                         /*dest_uninitialized*/true);
2533 
2534     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2535                                                               entry_jbyte_arraycopy,
2536                                                               entry_jshort_arraycopy,
2537                                                               entry_jint_arraycopy,
2538                                                               entry_jlong_arraycopy);
2539 
2540     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2541                                                                entry_jbyte_arraycopy,
2542                                                                entry_jshort_arraycopy,
2543                                                                entry_jint_arraycopy,
2544                                                                entry_oop_arraycopy,
2545                                                                entry_jlong_arraycopy,
2546                                                                entry_checkcast_arraycopy);
2547 
2548     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2549     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2550     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2551     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2552     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2553     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2554   }
2555 
2556   void generate_math_stubs() { Unimplemented(); }
2557 
2558   // Arguments:
2559   //
2560   // Inputs:
2561   //   c_rarg0   - source byte array address
2562   //   c_rarg1   - destination byte array address
2563   //   c_rarg2   - K (key) in little endian int array
2564   //
2565   address generate_aescrypt_encryptBlock() {
2566     __ align(CodeEntryAlignment);
2567     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2568 
2569     Label L_doLast;
2570 
2571     const Register from        = c_rarg0;  // source array address
2572     const Register to          = c_rarg1;  // destination array address
2573     const Register key         = c_rarg2;  // key array address
2574     const Register keylen      = rscratch1;
2575 
2576     address start = __ pc();
2577     __ enter();
2578 
2579     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
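         // keylen is the expanded key length in 32-bit words: 44 for AES-128,
         // 52 for AES-192 and 60 for AES-256; the comparisons below use it to
         // select 10, 12 or 14 rounds.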
2580 
2581     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2582 
2583     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2584     __ rev32(v1, __ T16B, v1);
2585     __ rev32(v2, __ T16B, v2);
2586     __ rev32(v3, __ T16B, v3);
2587     __ rev32(v4, __ T16B, v4);
2588     __ aese(v0, v1);
2589     __ aesmc(v0, v0);
2590     __ aese(v0, v2);
2591     __ aesmc(v0, v0);
2592     __ aese(v0, v3);
2593     __ aesmc(v0, v0);
2594     __ aese(v0, v4);
2595     __ aesmc(v0, v0);
2596 
2597     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2598     __ rev32(v1, __ T16B, v1);
2599     __ rev32(v2, __ T16B, v2);
2600     __ rev32(v3, __ T16B, v3);
2601     __ rev32(v4, __ T16B, v4);
2602     __ aese(v0, v1);
2603     __ aesmc(v0, v0);
2604     __ aese(v0, v2);
2605     __ aesmc(v0, v0);
2606     __ aese(v0, v3);
2607     __ aesmc(v0, v0);
2608     __ aese(v0, v4);
2609     __ aesmc(v0, v0);
2610 
2611     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2612     __ rev32(v1, __ T16B, v1);
2613     __ rev32(v2, __ T16B, v2);
2614 
2615     __ cmpw(keylen, 44);
2616     __ br(Assembler::EQ, L_doLast);
2617 
2618     __ aese(v0, v1);
2619     __ aesmc(v0, v0);
2620     __ aese(v0, v2);
2621     __ aesmc(v0, v0);
2622 
2623     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2624     __ rev32(v1, __ T16B, v1);
2625     __ rev32(v2, __ T16B, v2);
2626 
2627     __ cmpw(keylen, 52);
2628     __ br(Assembler::EQ, L_doLast);
2629 
2630     __ aese(v0, v1);
2631     __ aesmc(v0, v0);
2632     __ aese(v0, v2);
2633     __ aesmc(v0, v0);
2634 
2635     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2636     __ rev32(v1, __ T16B, v1);
2637     __ rev32(v2, __ T16B, v2);
2638 
2639     __ BIND(L_doLast);
2640 
2641     __ aese(v0, v1);
2642     __ aesmc(v0, v0);
2643     __ aese(v0, v2);
2644 
2645     __ ld1(v1, __ T16B, key);
2646     __ rev32(v1, __ T16B, v1);
2647     __ eor(v0, __ T16B, v0, v1);
2648 
2649     __ st1(v0, __ T16B, to);
2650 
2651     __ mov(r0, 0);
2652 
2653     __ leave();
2654     __ ret(lr);
2655 
2656     return start;
2657   }
2658 
2659   // Arguments:
2660   //
2661   // Inputs:
2662   //   c_rarg0   - source byte array address
2663   //   c_rarg1   - destination byte array address
2664   //   c_rarg2   - K (key) in little endian int array
2665   //
2666   address generate_aescrypt_decryptBlock() {
2667     assert(UseAES, "need AES instruction support");
2668     __ align(CodeEntryAlignment);
2669     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2670     Label L_doLast;
2671 
2672     const Register from        = c_rarg0;  // source array address
2673     const Register to          = c_rarg1;  // destination array address
2674     const Register key         = c_rarg2;  // key array address
2675     const Register keylen      = rscratch1;
2676 
2677     address start = __ pc();
2678     __ enter(); // required for proper stackwalking of RuntimeStub frame
2679 
2680     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2681 
2682     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2683 
2684     __ ld1(v5, __ T16B, __ post(key, 16));
2685     __ rev32(v5, __ T16B, v5);
2686 
2687     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2688     __ rev32(v1, __ T16B, v1);
2689     __ rev32(v2, __ T16B, v2);
2690     __ rev32(v3, __ T16B, v3);
2691     __ rev32(v4, __ T16B, v4);
2692     __ aesd(v0, v1);
2693     __ aesimc(v0, v0);
2694     __ aesd(v0, v2);
2695     __ aesimc(v0, v0);
2696     __ aesd(v0, v3);
2697     __ aesimc(v0, v0);
2698     __ aesd(v0, v4);
2699     __ aesimc(v0, v0);
2700 
2701     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2702     __ rev32(v1, __ T16B, v1);
2703     __ rev32(v2, __ T16B, v2);
2704     __ rev32(v3, __ T16B, v3);
2705     __ rev32(v4, __ T16B, v4);
2706     __ aesd(v0, v1);
2707     __ aesimc(v0, v0);
2708     __ aesd(v0, v2);
2709     __ aesimc(v0, v0);
2710     __ aesd(v0, v3);
2711     __ aesimc(v0, v0);
2712     __ aesd(v0, v4);
2713     __ aesimc(v0, v0);
2714 
2715     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2716     __ rev32(v1, __ T16B, v1);
2717     __ rev32(v2, __ T16B, v2);
2718 
2719     __ cmpw(keylen, 44);
2720     __ br(Assembler::EQ, L_doLast);
2721 
2722     __ aesd(v0, v1);
2723     __ aesimc(v0, v0);
2724     __ aesd(v0, v2);
2725     __ aesimc(v0, v0);
2726 
2727     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2728     __ rev32(v1, __ T16B, v1);
2729     __ rev32(v2, __ T16B, v2);
2730 
2731     __ cmpw(keylen, 52);
2732     __ br(Assembler::EQ, L_doLast);
2733 
2734     __ aesd(v0, v1);
2735     __ aesimc(v0, v0);
2736     __ aesd(v0, v2);
2737     __ aesimc(v0, v0);
2738 
2739     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2740     __ rev32(v1, __ T16B, v1);
2741     __ rev32(v2, __ T16B, v2);
2742 
2743     __ BIND(L_doLast);
2744 
2745     __ aesd(v0, v1);
2746     __ aesimc(v0, v0);
2747     __ aesd(v0, v2);
2748 
2749     __ eor(v0, __ T16B, v0, v5);
2750 
2751     __ st1(v0, __ T16B, to);
2752 
2753     __ mov(r0, 0);
2754 
2755     __ leave();
2756     __ ret(lr);
2757 
2758     return start;
2759   }
2760 
2761   // Arguments:
2762   //
2763   // Inputs:
2764   //   c_rarg0   - source byte array address
2765   //   c_rarg1   - destination byte array address
2766   //   c_rarg2   - K (key) in little endian int array
2767   //   c_rarg3   - r vector byte array address
2768   //   c_rarg4   - input length
2769   //
2770   // Output:
2771   //   x0        - input length
2772   //
2773   address generate_cipherBlockChaining_encryptAESCrypt() {
2774     assert(UseAES, "need AES instruction support");
2775     __ align(CodeEntryAlignment);
2776     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2777 
2778     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2779 
2780     const Register from        = c_rarg0;  // source array address
2781     const Register to          = c_rarg1;  // destination array address
2782     const Register key         = c_rarg2;  // key array address
2783     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address,
2784                                            // and left holding the last ciphertext block on exit
2785     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2786     const Register keylen      = rscratch1;
2787 
2788     address start = __ pc();
2789 
2790       __ enter();
2791 
2792       __ movw(rscratch2, len_reg);
2793 
2794       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2795 
2796       __ ld1(v0, __ T16B, rvec);
2797 
2798       __ cmpw(keylen, 52);
2799       __ br(Assembler::CC, L_loadkeys_44);
2800       __ br(Assembler::EQ, L_loadkeys_52);
2801 
2802       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2803       __ rev32(v17, __ T16B, v17);
2804       __ rev32(v18, __ T16B, v18);
2805     __ BIND(L_loadkeys_52);
2806       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2807       __ rev32(v19, __ T16B, v19);
2808       __ rev32(v20, __ T16B, v20);
2809     __ BIND(L_loadkeys_44);
2810       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2811       __ rev32(v21, __ T16B, v21);
2812       __ rev32(v22, __ T16B, v22);
2813       __ rev32(v23, __ T16B, v23);
2814       __ rev32(v24, __ T16B, v24);
2815       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2816       __ rev32(v25, __ T16B, v25);
2817       __ rev32(v26, __ T16B, v26);
2818       __ rev32(v27, __ T16B, v27);
2819       __ rev32(v28, __ T16B, v28);
2820       __ ld1(v29, v30, v31, __ T16B, key);
2821       __ rev32(v29, __ T16B, v29);
2822       __ rev32(v30, __ T16B, v30);
2823       __ rev32(v31, __ T16B, v31);
2824 
2825     __ BIND(L_aes_loop);
2826       __ ld1(v1, __ T16B, __ post(from, 16));
2827       __ eor(v0, __ T16B, v0, v1);
2828 
2829       __ br(Assembler::CC, L_rounds_44);
2830       __ br(Assembler::EQ, L_rounds_52);
2831 
2832       __ aese(v0, v17); __ aesmc(v0, v0);
2833       __ aese(v0, v18); __ aesmc(v0, v0);
2834     __ BIND(L_rounds_52);
2835       __ aese(v0, v19); __ aesmc(v0, v0);
2836       __ aese(v0, v20); __ aesmc(v0, v0);
2837     __ BIND(L_rounds_44);
2838       __ aese(v0, v21); __ aesmc(v0, v0);
2839       __ aese(v0, v22); __ aesmc(v0, v0);
2840       __ aese(v0, v23); __ aesmc(v0, v0);
2841       __ aese(v0, v24); __ aesmc(v0, v0);
2842       __ aese(v0, v25); __ aesmc(v0, v0);
2843       __ aese(v0, v26); __ aesmc(v0, v0);
2844       __ aese(v0, v27); __ aesmc(v0, v0);
2845       __ aese(v0, v28); __ aesmc(v0, v0);
2846       __ aese(v0, v29); __ aesmc(v0, v0);
2847       __ aese(v0, v30);
2848       __ eor(v0, __ T16B, v0, v31);
2849 
2850       __ st1(v0, __ T16B, __ post(to, 16));
2851 
2852       __ subw(len_reg, len_reg, 16);
2853       __ cbnzw(len_reg, L_aes_loop);
2854 
2855       __ st1(v0, __ T16B, rvec);
2856 
2857       __ mov(r0, rscratch2);
2858 
2859       __ leave();
2860       __ ret(lr);
2861 
2862       return start;
2863   }
2864 
2865   // Arguments:
2866   //
2867   // Inputs:
2868   //   c_rarg0   - source byte array address
2869   //   c_rarg1   - destination byte array address
2870   //   c_rarg2   - K (key) in little endian int array
2871   //   c_rarg3   - r vector byte array address
2872   //   c_rarg4   - input length
2873   //
2874   // Output:
2875   //   r0        - input length
2876   //
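  // What the stub computes, as a rough illustrative sketch (not the emitted
  // code; `prev`, `c` and `AES_decrypt` are just descriptive names):
  //
  //   prev = rvec;                               // previous ciphertext block (IV initially)
  //   for ( ; len > 0; len -= 16) {
  //     c = *from++;
  //     *to++ = AES_decrypt(key, c) ^ prev;
  //     prev = c;
  //   }
  //   rvec = prev;  return original len;
  //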
2877   address generate_cipherBlockChaining_decryptAESCrypt() {
2878     assert(UseAES, "need AES instructions");
2879     __ align(CodeEntryAlignment);
2880     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2881 
2882     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2883 
2884     const Register from        = c_rarg0;  // source array address
2885     const Register to          = c_rarg1;  // destination array address
2886     const Register key         = c_rarg2;  // key array address
2887     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address,
2888                                            // and left holding the last ciphertext block on exit
2889     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2890     const Register keylen      = rscratch1;
2891 
2892     address start = __ pc();
2893 
2894       __ enter();
2895 
2896       __ movw(rscratch2, len_reg);
2897 
2898       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2899 
2900       __ ld1(v2, __ T16B, rvec);
2901 
2902       __ ld1(v31, __ T16B, __ post(key, 16));
2903       __ rev32(v31, __ T16B, v31);
2904 
2905       __ cmpw(keylen, 52);
2906       __ br(Assembler::CC, L_loadkeys_44);
2907       __ br(Assembler::EQ, L_loadkeys_52);
2908 
2909       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2910       __ rev32(v17, __ T16B, v17);
2911       __ rev32(v18, __ T16B, v18);
2912     __ BIND(L_loadkeys_52);
2913       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2914       __ rev32(v19, __ T16B, v19);
2915       __ rev32(v20, __ T16B, v20);
2916     __ BIND(L_loadkeys_44);
2917       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2918       __ rev32(v21, __ T16B, v21);
2919       __ rev32(v22, __ T16B, v22);
2920       __ rev32(v23, __ T16B, v23);
2921       __ rev32(v24, __ T16B, v24);
2922       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2923       __ rev32(v25, __ T16B, v25);
2924       __ rev32(v26, __ T16B, v26);
2925       __ rev32(v27, __ T16B, v27);
2926       __ rev32(v28, __ T16B, v28);
2927       __ ld1(v29, v30, __ T16B, key);
2928       __ rev32(v29, __ T16B, v29);
2929       __ rev32(v30, __ T16B, v30);
2930 
2931     __ BIND(L_aes_loop);
2932       __ ld1(v0, __ T16B, __ post(from, 16));
2933       __ orr(v1, __ T16B, v0, v0);
2934 
2935       __ br(Assembler::CC, L_rounds_44);
2936       __ br(Assembler::EQ, L_rounds_52);
2937 
2938       __ aesd(v0, v17); __ aesimc(v0, v0);
2939       __ aesd(v0, v18); __ aesimc(v0, v0);
2940     __ BIND(L_rounds_52);
2941       __ aesd(v0, v19); __ aesimc(v0, v0);
2942       __ aesd(v0, v20); __ aesimc(v0, v0);
2943     __ BIND(L_rounds_44);
2944       __ aesd(v0, v21); __ aesimc(v0, v0);
2945       __ aesd(v0, v22); __ aesimc(v0, v0);
2946       __ aesd(v0, v23); __ aesimc(v0, v0);
2947       __ aesd(v0, v24); __ aesimc(v0, v0);
2948       __ aesd(v0, v25); __ aesimc(v0, v0);
2949       __ aesd(v0, v26); __ aesimc(v0, v0);
2950       __ aesd(v0, v27); __ aesimc(v0, v0);
2951       __ aesd(v0, v28); __ aesimc(v0, v0);
2952       __ aesd(v0, v29); __ aesimc(v0, v0);
2953       __ aesd(v0, v30);
2954       __ eor(v0, __ T16B, v0, v31);
2955       __ eor(v0, __ T16B, v0, v2);
2956 
2957       __ st1(v0, __ T16B, __ post(to, 16));
2958       __ orr(v2, __ T16B, v1, v1);
2959 
2960       __ subw(len_reg, len_reg, 16);
2961       __ cbnzw(len_reg, L_aes_loop);
2962 
2963       __ st1(v2, __ T16B, rvec);
2964 
2965       __ mov(r0, rscratch2);
2966 
2967       __ leave();
2968       __ ret(lr);
2969 
2970     return start;
2971   }
2972 
2973   // Arguments:
2974   //
2975   // Inputs:
2976   //   c_rarg0   - byte[]  source+offset
2977   //   c_rarg1   - int[]   SHA.state
2978   //   c_rarg2   - int     offset
2979   //   c_rarg3   - int     limit
2980   //
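  // When multi_block is true the stub keeps consuming 64-byte blocks,
  // advancing ofs by 64 until it passes limit, and returns the updated ofs
  // (the multi-block variant of the intrinsic).
  //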
2981   address generate_sha1_implCompress(bool multi_block, const char *name) {
2982     __ align(CodeEntryAlignment);
2983     StubCodeMark mark(this, "StubRoutines", name);
2984     address start = __ pc();
2985 
2986     Register buf   = c_rarg0;
2987     Register state = c_rarg1;
2988     Register ofs   = c_rarg2;
2989     Register limit = c_rarg3;
2990 
2991     Label keys;
2992     Label sha1_loop;
2993 
2994     // load the keys into v0..v3
2995     __ adr(rscratch1, keys);
2996     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2997     // load the 5-word state into v6, v7
2998     __ ldrq(v6, Address(state, 0));
2999     __ ldrs(v7, Address(state, 16));
3000 
3001 
3002     __ BIND(sha1_loop);
3003     // load 64 bytes of data into v16..v19
3004     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3005     __ rev32(v16, __ T16B, v16);
3006     __ rev32(v17, __ T16B, v17);
3007     __ rev32(v18, __ T16B, v18);
3008     __ rev32(v19, __ T16B, v19);
3009 
3010     // do the sha1
3011     __ addv(v4, __ T4S, v16, v0);
3012     __ orr(v20, __ T16B, v6, v6);
3013 
3014     FloatRegister d0 = v16;
3015     FloatRegister d1 = v17;
3016     FloatRegister d2 = v18;
3017     FloatRegister d3 = v19;
3018 
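    // Each iteration of the loop below performs four SHA-1 rounds, so the 20
    // iterations cover all 80 rounds.  sha1c/sha1p/sha1m supply the round
    // function for each group of 20 rounds, and v0..v3 hold the four round
    // constants; the message-plus-constant sums are computed one iteration
    // ahead, hence the off-by-one bounds on `round`.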
3019     for (int round = 0; round < 20; round++) {
3020       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3021       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3022       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3023       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3024       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3025 
3026       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3027       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3028       __ sha1h(tmp2, __ T4S, v20);
3029       if (round < 5)
3030         __ sha1c(v20, __ T4S, tmp3, tmp4);
3031       else if (round < 10 || round >= 15)
3032         __ sha1p(v20, __ T4S, tmp3, tmp4);
3033       else
3034         __ sha1m(v20, __ T4S, tmp3, tmp4);
3035       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3036 
3037       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3038     }
3039 
3040     __ addv(v7, __ T2S, v7, v21);
3041     __ addv(v6, __ T4S, v6, v20);
3042 
3043     if (multi_block) {
3044       __ add(ofs, ofs, 64);
3045       __ cmp(ofs, limit);
3046       __ br(Assembler::LE, sha1_loop);
3047       __ mov(c_rarg0, ofs); // return ofs
3048     }
3049 
3050     __ strq(v6, Address(state, 0));
3051     __ strs(v7, Address(state, 16));
3052 
3053     __ ret(lr);
3054 
3055     __ bind(keys);
3056     __ emit_int32(0x5a827999);
3057     __ emit_int32(0x6ed9eba1);
3058     __ emit_int32(0x8f1bbcdc);
3059     __ emit_int32(0xca62c1d6);
3060 
3061     return start;
3062   }
3063 
3064 
3065   // Arguments:
3066   //
3067   // Inputs:
3068   //   c_rarg0   - byte[]  source+offset
3069   //   c_rarg1   - int[]   SHA.state
3070   //   c_rarg2   - int     offset
3071   //   c_rarg3   - int     limit
3072   //
3073   address generate_sha256_implCompress(bool multi_block, const char *name) {
3074     static const uint32_t round_consts[64] = {
3075       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3076       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3077       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3078       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3079       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3080       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3081       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3082       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3083       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3084       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3085       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3086       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3087       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3088       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3089       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3090       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3091     };
3092     __ align(CodeEntryAlignment);
3093     StubCodeMark mark(this, "StubRoutines", name);
3094     address start = __ pc();
3095 
3096     Register buf   = c_rarg0;
3097     Register state = c_rarg1;
3098     Register ofs   = c_rarg2;
3099     Register limit = c_rarg3;
3100 
3101     Label sha1_loop;
3102 
3103     __ stpd(v8, v9, __ pre(sp, -32));
3104     __ stpd(v10, v11, Address(sp, 16));
3105 
3106 // dga == v0
3107 // dgb == v1
3108 // dg0 == v2
3109 // dg1 == v3
3110 // dg2 == v4
3111 // t0 == v6
3112 // t1 == v7
3113 
3114     // load the 64 round constants into v16..v31
3115     __ lea(rscratch1, ExternalAddress((address)round_consts));
3116     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3117     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3118     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3119     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3120 
3121     // load the 8-word (256-bit) state
3122     __ ldpq(v0, v1, state);
3123 
3124     __ BIND(sha1_loop);
3125     // load 64 bytes of data into v8..v11
3126     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3127     __ rev32(v8, __ T16B, v8);
3128     __ rev32(v9, __ T16B, v9);
3129     __ rev32(v10, __ T16B, v10);
3130     __ rev32(v11, __ T16B, v11);
3131 
3132     __ addv(v6, __ T4S, v8, v16);
3133     __ orr(v2, __ T16B, v0, v0);
3134     __ orr(v3, __ T16B, v1, v1);
3135 
3136     FloatRegister d0 = v8;
3137     FloatRegister d1 = v9;
3138     FloatRegister d2 = v10;
3139     FloatRegister d3 = v11;
3140 
3141 
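    // Each iteration of the loop below performs four SHA-256 rounds via
    // SHA256H/SHA256H2, so 16 iterations cover all 64 rounds; the next
    // iteration's message-plus-constant sum (using v17..v31) is computed one
    // step ahead, the v16 sum having been formed above.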
3142     for (int round = 0; round < 16; round++) {
3143       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3144       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3145       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3146       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3147 
3148       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3149        __ orr(v4, __ T16B, v2, v2);
3150       if (round < 15)
3151         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3152       __ sha256h(v2, __ T4S, v3, tmp2);
3153       __ sha256h2(v3, __ T4S, v4, tmp2);
3154       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3155 
3156       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3157     }
3158 
3159     __ addv(v0, __ T4S, v0, v2);
3160     __ addv(v1, __ T4S, v1, v3);
3161 
3162     if (multi_block) {
3163       __ add(ofs, ofs, 64);
3164       __ cmp(ofs, limit);
3165       __ br(Assembler::LE, sha1_loop);
3166       __ mov(c_rarg0, ofs); // return ofs
3167     }
3168 
3169     __ ldpd(v10, v11, Address(sp, 16));
3170     __ ldpd(v8, v9, __ post(sp, 32));
3171 
3172     __ stpq(v0, v1, state);
3173 
3174     __ ret(lr);
3175 
3176     return start;
3177   }
3178 
3179 #ifndef BUILTIN_SIM
3180   // Safefetch stubs.
3181   void generate_safefetch(const char* name, int size, address* entry,
3182                           address* fault_pc, address* continuation_pc) {
3183     // safefetch signatures:
3184     //   int      SafeFetch32(int*      adr, int      errValue);
3185     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3186     //
3187     // arguments:
3188     //   c_rarg0 = adr
3189     //   c_rarg1 = errValue
3190     //
3191     // result:
3192     //   r0 = *adr or errValue
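    //
    // If the load at *fault_pc faults, the signal handler resumes execution
    // at *continuation_pc, so the errValue already sitting in c_rarg1 is what
    // ends up being returned.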
3193 
3194     StubCodeMark mark(this, "StubRoutines", name);
3195 
3196     // Entry point, pc or function descriptor.
3197     *entry = __ pc();
3198 
3199     // Load *adr into c_rarg1, may fault.
3200     *fault_pc = __ pc();
3201     switch (size) {
3202       case 4:
3203         // int32_t
3204         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3205         break;
3206       case 8:
3207         // int64_t
3208         __ ldr(c_rarg1, Address(c_rarg0, 0));
3209         break;
3210       default:
3211         ShouldNotReachHere();
3212     }
3213 
3214     // return errValue or *adr
3215     *continuation_pc = __ pc();
3216     __ mov(r0, c_rarg1);
3217     __ ret(lr);
3218   }
3219 #endif
3220 
3221   /**
3222    *  Arguments:
3223    *
3224    * Inputs:
3225    *   c_rarg0   - int crc
3226    *   c_rarg1   - byte* buf
3227    *   c_rarg2   - int length
3228    *
3229    * Output:
3230    *       r0   - int crc result
3231    */
3232   address generate_updateBytesCRC32() {
3233     assert(UseCRC32Intrinsics, "what are we doing here?");
3234 
3235     __ align(CodeEntryAlignment);
3236     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3237 
3238     address start = __ pc();
3239 
3240     const Register crc   = c_rarg0;  // crc
3241     const Register buf   = c_rarg1;  // source java byte array address
3242     const Register len   = c_rarg2;  // length
3243     const Register table0 = c_rarg3; // crc_table address
3244     const Register table1 = c_rarg4;
3245     const Register table2 = c_rarg5;
3246     const Register table3 = c_rarg6;
3247     const Register tmp3 = c_rarg7;
3248 
3249     BLOCK_COMMENT("Entry:");
3250     __ enter(); // required for proper stackwalking of RuntimeStub frame
3251 
3252     __ kernel_crc32(crc, buf, len,
3253               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3254 
3255     __ leave(); // required for proper stackwalking of RuntimeStub frame
3256     __ ret(lr);
3257 
3258     return start;
3259   }
3260 
3261   /**
3262    *  Arguments:
3263    *
3264    * Inputs:
3265    *   c_rarg0   - int crc
3266    *   c_rarg1   - byte* buf
3267    *   c_rarg2   - int length
3268    *   c_rarg3   - int* table
3269    *
3270    * Output:
3271    *       r0   - int crc result
3272    */
3273   address generate_updateBytesCRC32C() {
3274     assert(UseCRC32CIntrinsics, "what are we doing here?");
3275 
3276     __ align(CodeEntryAlignment);
3277     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3278 
3279     address start = __ pc();
3280 
3281     const Register crc   = c_rarg0;  // crc
3282     const Register buf   = c_rarg1;  // source java byte array address
3283     const Register len   = c_rarg2;  // length
3284     const Register table0 = c_rarg3; // crc_table address
3285     const Register table1 = c_rarg4;
3286     const Register table2 = c_rarg5;
3287     const Register table3 = c_rarg6;
3288     const Register tmp3 = c_rarg7;
3289 
3290     BLOCK_COMMENT("Entry:");
3291     __ enter(); // required for proper stackwalking of RuntimeStub frame
3292 
3293     __ kernel_crc32c(crc, buf, len,
3294               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3295 
3296     __ leave(); // required for proper stackwalking of RuntimeStub frame
3297     __ ret(lr);
3298 
3299     return start;
3300   }
3301 
3302   /**
3303    *  Arguments:
3304    *
3305    *  Inputs:
3306    *   c_rarg0   - int   adler
3307    *   c_rarg1   - byte* buff
3308    *   c_rarg2   - int   len
3309    *
3310    * Output:
3311    *   c_rarg0   - int adler result
3312    */
3313   address generate_updateBytesAdler32() {
3314     __ align(CodeEntryAlignment);
3315     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3316     address start = __ pc();
3317 
3318     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3319 
3320     // Aliases
3321     Register adler  = c_rarg0;
3322     Register s1     = c_rarg0;
3323     Register s2     = c_rarg3;
3324     Register buff   = c_rarg1;
3325     Register len    = c_rarg2;
3326     Register nmax  = r4;
3327     Register base = r5;
3328     Register count = r6;
3329     Register temp0 = rscratch1;
3330     Register temp1 = rscratch2;
3331     Register temp2 = r7;
3332 
3333     // Max number of bytes we can process before having to take the mod
3334     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3335     unsigned long BASE = 0xfff1;
3336     unsigned long NMAX = 0x15B0;
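    // The reductions below avoid division by using 2^16 == 15 (mod BASE):
    // a value x is reduced as
    //
    //   x = (x >> 16) * 15 + (x & 0xffff);   // applied once or twice
    //   if (x >= BASE) x -= BASE;
    //
    // and the lsl-by-4-minus-self sequences below are the "* 15".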
3337 
3338     __ mov(base, BASE);
3339     __ mov(nmax, NMAX);
3340 
3341     // s1 is initialized to the lower 16 bits of adler
3342     // s2 is initialized to the upper 16 bits of adler
3343     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3344     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3345 
3346     // The pipelined loop needs at least 16 elements for one iteration.
3347     // It does check this itself, but it is cheaper to branch straight to the cleanup loop here.
3348     __ cmp(len, 16);
3349     __ br(Assembler::HS, L_nmax);
3350     __ cbz(len, L_combine);
3351 
3352     __ bind(L_simple_by1_loop);
3353     __ ldrb(temp0, Address(__ post(buff, 1)));
3354     __ add(s1, s1, temp0);
3355     __ add(s2, s2, s1);
3356     __ subs(len, len, 1);
3357     __ br(Assembler::HI, L_simple_by1_loop);
3358 
3359     // s1 = s1 % BASE
3360     __ subs(temp0, s1, base);
3361     __ csel(s1, temp0, s1, Assembler::HS);
3362 
3363     // s2 = s2 % BASE
3364     __ lsr(temp0, s2, 16);
3365     __ lsl(temp1, temp0, 4);
3366     __ sub(temp1, temp1, temp0);
3367     __ add(s2, temp1, s2, ext::uxth);
3368 
3369     __ subs(temp0, s2, base);
3370     __ csel(s2, temp0, s2, Assembler::HS);
3371 
3372     __ b(L_combine);
3373 
3374     __ bind(L_nmax);
3375     __ subs(len, len, nmax);
3376     __ sub(count, nmax, 16);
3377     __ br(Assembler::LO, L_by16);
3378 
3379     __ bind(L_nmax_loop);
3380 
3381     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3382 
3383     __ add(s1, s1, temp0, ext::uxtb);
3384     __ ubfx(temp2, temp0, 8, 8);
3385     __ add(s2, s2, s1);
3386     __ add(s1, s1, temp2);
3387     __ ubfx(temp2, temp0, 16, 8);
3388     __ add(s2, s2, s1);
3389     __ add(s1, s1, temp2);
3390     __ ubfx(temp2, temp0, 24, 8);
3391     __ add(s2, s2, s1);
3392     __ add(s1, s1, temp2);
3393     __ ubfx(temp2, temp0, 32, 8);
3394     __ add(s2, s2, s1);
3395     __ add(s1, s1, temp2);
3396     __ ubfx(temp2, temp0, 40, 8);
3397     __ add(s2, s2, s1);
3398     __ add(s1, s1, temp2);
3399     __ ubfx(temp2, temp0, 48, 8);
3400     __ add(s2, s2, s1);
3401     __ add(s1, s1, temp2);
3402     __ add(s2, s2, s1);
3403     __ add(s1, s1, temp0, Assembler::LSR, 56);
3404     __ add(s2, s2, s1);
3405 
3406     __ add(s1, s1, temp1, ext::uxtb);
3407     __ ubfx(temp2, temp1, 8, 8);
3408     __ add(s2, s2, s1);
3409     __ add(s1, s1, temp2);
3410     __ ubfx(temp2, temp1, 16, 8);
3411     __ add(s2, s2, s1);
3412     __ add(s1, s1, temp2);
3413     __ ubfx(temp2, temp1, 24, 8);
3414     __ add(s2, s2, s1);
3415     __ add(s1, s1, temp2);
3416     __ ubfx(temp2, temp1, 32, 8);
3417     __ add(s2, s2, s1);
3418     __ add(s1, s1, temp2);
3419     __ ubfx(temp2, temp1, 40, 8);
3420     __ add(s2, s2, s1);
3421     __ add(s1, s1, temp2);
3422     __ ubfx(temp2, temp1, 48, 8);
3423     __ add(s2, s2, s1);
3424     __ add(s1, s1, temp2);
3425     __ add(s2, s2, s1);
3426     __ add(s1, s1, temp1, Assembler::LSR, 56);
3427     __ add(s2, s2, s1);
3428 
3429     __ subs(count, count, 16);
3430     __ br(Assembler::HS, L_nmax_loop);
3431 
3432     // s1 = s1 % BASE
3433     __ lsr(temp0, s1, 16);
3434     __ lsl(temp1, temp0, 4);
3435     __ sub(temp1, temp1, temp0);
3436     __ add(temp1, temp1, s1, ext::uxth);
3437 
3438     __ lsr(temp0, temp1, 16);
3439     __ lsl(s1, temp0, 4);
3440     __ sub(s1, s1, temp0);
3441     __ add(s1, s1, temp1, ext::uxth);
3442 
3443     __ subs(temp0, s1, base);
3444     __ csel(s1, temp0, s1, Assembler::HS);
3445 
3446     // s2 = s2 % BASE
3447     __ lsr(temp0, s2, 16);
3448     __ lsl(temp1, temp0, 4);
3449     __ sub(temp1, temp1, temp0);
3450     __ add(temp1, temp1, s2, ext::uxth);
3451 
3452     __ lsr(temp0, temp1, 16);
3453     __ lsl(s2, temp0, 4);
3454     __ sub(s2, s2, temp0);
3455     __ add(s2, s2, temp1, ext::uxth);
3456 
3457     __ subs(temp0, s2, base);
3458     __ csel(s2, temp0, s2, Assembler::HS);
3459 
3460     __ subs(len, len, nmax);
3461     __ sub(count, nmax, 16);
3462     __ br(Assembler::HS, L_nmax_loop);
3463 
3464     __ bind(L_by16);
3465     __ adds(len, len, count);
3466     __ br(Assembler::LO, L_by1);
3467 
3468     __ bind(L_by16_loop);
3469 
3470     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3471 
3472     __ add(s1, s1, temp0, ext::uxtb);
3473     __ ubfx(temp2, temp0, 8, 8);
3474     __ add(s2, s2, s1);
3475     __ add(s1, s1, temp2);
3476     __ ubfx(temp2, temp0, 16, 8);
3477     __ add(s2, s2, s1);
3478     __ add(s1, s1, temp2);
3479     __ ubfx(temp2, temp0, 24, 8);
3480     __ add(s2, s2, s1);
3481     __ add(s1, s1, temp2);
3482     __ ubfx(temp2, temp0, 32, 8);
3483     __ add(s2, s2, s1);
3484     __ add(s1, s1, temp2);
3485     __ ubfx(temp2, temp0, 40, 8);
3486     __ add(s2, s2, s1);
3487     __ add(s1, s1, temp2);
3488     __ ubfx(temp2, temp0, 48, 8);
3489     __ add(s2, s2, s1);
3490     __ add(s1, s1, temp2);
3491     __ add(s2, s2, s1);
3492     __ add(s1, s1, temp0, Assembler::LSR, 56);
3493     __ add(s2, s2, s1);
3494 
3495     __ add(s1, s1, temp1, ext::uxtb);
3496     __ ubfx(temp2, temp1, 8, 8);
3497     __ add(s2, s2, s1);
3498     __ add(s1, s1, temp2);
3499     __ ubfx(temp2, temp1, 16, 8);
3500     __ add(s2, s2, s1);
3501     __ add(s1, s1, temp2);
3502     __ ubfx(temp2, temp1, 24, 8);
3503     __ add(s2, s2, s1);
3504     __ add(s1, s1, temp2);
3505     __ ubfx(temp2, temp1, 32, 8);
3506     __ add(s2, s2, s1);
3507     __ add(s1, s1, temp2);
3508     __ ubfx(temp2, temp1, 40, 8);
3509     __ add(s2, s2, s1);
3510     __ add(s1, s1, temp2);
3511     __ ubfx(temp2, temp1, 48, 8);
3512     __ add(s2, s2, s1);
3513     __ add(s1, s1, temp2);
3514     __ add(s2, s2, s1);
3515     __ add(s1, s1, temp1, Assembler::LSR, 56);
3516     __ add(s2, s2, s1);
3517 
3518     __ subs(len, len, 16);
3519     __ br(Assembler::HS, L_by16_loop);
3520 
3521     __ bind(L_by1);
3522     __ adds(len, len, 15);
3523     __ br(Assembler::LO, L_do_mod);
3524 
3525     __ bind(L_by1_loop);
3526     __ ldrb(temp0, Address(__ post(buff, 1)));
3527     __ add(s1, temp0, s1);
3528     __ add(s2, s2, s1);
3529     __ subs(len, len, 1);
3530     __ br(Assembler::HS, L_by1_loop);
3531 
3532     __ bind(L_do_mod);
3533     // s1 = s1 % BASE
3534     __ lsr(temp0, s1, 16);
3535     __ lsl(temp1, temp0, 4);
3536     __ sub(temp1, temp1, temp0);
3537     __ add(temp1, temp1, s1, ext::uxth);
3538 
3539     __ lsr(temp0, temp1, 16);
3540     __ lsl(s1, temp0, 4);
3541     __ sub(s1, s1, temp0);
3542     __ add(s1, s1, temp1, ext::uxth);
3543 
3544     __ subs(temp0, s1, base);
3545     __ csel(s1, temp0, s1, Assembler::HS);
3546 
3547     // s2 = s2 % BASE
3548     __ lsr(temp0, s2, 16);
3549     __ lsl(temp1, temp0, 4);
3550     __ sub(temp1, temp1, temp0);
3551     __ add(temp1, temp1, s2, ext::uxth);
3552 
3553     __ lsr(temp0, temp1, 16);
3554     __ lsl(s2, temp0, 4);
3555     __ sub(s2, s2, temp0);
3556     __ add(s2, s2, temp1, ext::uxth);
3557 
3558     __ subs(temp0, s2, base);
3559     __ csel(s2, temp0, s2, Assembler::HS);
3560 
3561     // Combine lower bits and higher bits
3562     __ bind(L_combine);
3563     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3564 
3565     __ ret(lr);
3566 
3567     return start;
3568   }
3569 
3570   /**
3571    *  Arguments:
3572    *
3573    *  Input:
3574    *    c_rarg0   - x address
3575    *    c_rarg1   - x length
3576    *    c_rarg2   - y address
3577    *    c_rarg3   - y length
3578    *    c_rarg4   - z address
3579    *    c_rarg5   - z length
3580    */
3581   address generate_multiplyToLen() {
3582     __ align(CodeEntryAlignment);
3583     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3584 
3585     address start = __ pc();
3586     const Register x     = r0;
3587     const Register xlen  = r1;
3588     const Register y     = r2;
3589     const Register ylen  = r3;
3590     const Register z     = r4;
3591     const Register zlen  = r5;
3592 
3593     const Register tmp1  = r10;
3594     const Register tmp2  = r11;
3595     const Register tmp3  = r12;
3596     const Register tmp4  = r13;
3597     const Register tmp5  = r14;
3598     const Register tmp6  = r15;
3599     const Register tmp7  = r16;
3600 
3601     BLOCK_COMMENT("Entry:");
3602     __ enter(); // required for proper stackwalking of RuntimeStub frame
3603     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3604     __ leave(); // required for proper stackwalking of RuntimeStub frame
3605     __ ret(lr);
3606 
3607     return start;
3608   }
3609 
3610   address generate_squareToLen() {
3611     // The squareToLen algorithm for sizes 1..127 described in the Java code is
3612     // faster than multiply_to_len on some CPUs and slower on others, but
3613     // multiply_to_len gives slightly better results overall.
3614     __ align(CodeEntryAlignment);
3615     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3616     address start = __ pc();
3617 
3618     const Register x     = r0;
3619     const Register xlen  = r1;
3620     const Register z     = r2;
3621     const Register zlen  = r3;
3622     const Register y     = r4; // == x
3623     const Register ylen  = r5; // == xlen
3624 
3625     const Register tmp1  = r10;
3626     const Register tmp2  = r11;
3627     const Register tmp3  = r12;
3628     const Register tmp4  = r13;
3629     const Register tmp5  = r14;
3630     const Register tmp6  = r15;
3631     const Register tmp7  = r16;
3632 
3633     RegSet spilled_regs = RegSet::of(y, ylen);
3634     BLOCK_COMMENT("Entry:");
3635     __ enter();
3636     __ push(spilled_regs, sp);
3637     __ mov(y, x);
3638     __ mov(ylen, xlen);
3639     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3640     __ pop(spilled_regs, sp);
3641     __ leave();
3642     __ ret(lr);
3643     return start;
3644   }
3645 
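  /**
   *  Arguments:
   *
   *  Input:
   *    c_rarg0   - int[]  out
   *    c_rarg1   - int[]  in
   *    c_rarg2   - int    offset
   *    c_rarg3   - int    len
   *    c_rarg4   - int    k
   *
   * (Descriptive note, inferred from the register assignments below: the stub
   *  multiplies len words of in by k, accumulates the products into out at
   *  the given offset, and the carry is left in r0 as the int result.)
   */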
3646   address generate_mulAdd() {
3647     __ align(CodeEntryAlignment);
3648     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3649 
3650     address start = __ pc();
3651 
3652     const Register out     = r0;
3653     const Register in      = r1;
3654     const Register offset  = r2;
3655     const Register len     = r3;
3656     const Register k       = r4;
3657 
3658     BLOCK_COMMENT("Entry:");
3659     __ enter();
3660     __ mul_add(out, in, offset, len, k);
3661     __ leave();
3662     __ ret(lr);
3663 
3664     return start;
3665   }
3666 
3667   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3668                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3669                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3670     // Karatsuba multiplication performs a 128*128 -> 256-bit
3671     // multiplication in three 128-bit multiplications and a few
3672     // additions.
3673     //
3674     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3675     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3676     //
3677     // Inputs:
3678     //
3679     // A0 in a.d[0]     (subkey)
3680     // A1 in a.d[1]
3681     // (A1+A0) in a1_xor_a0.d[0]
3682     //
3683     // B0 in b.d[0]     (state)
3684     // B1 in b.d[1]
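    //
    // All of the '+' operations above are carry-less (GF(2)) additions,
    // i.e. XORs, so the usual Karatsuba middle-term subtractions become
    // XORs as well.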
3685 
3686     __ ext(tmp1, __ T16B, b, b, 0x08);
3687     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3688     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3689     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3690     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3691 
3692     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3693     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3694     __ eor(tmp2, __ T16B, tmp2, tmp4);
3695     __ eor(tmp2, __ T16B, tmp2, tmp3);
3696 
3697     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3698     __ ins(result_hi, __ D, tmp2, 0, 1);
3699     __ ins(result_lo, __ D, tmp2, 1, 0);
3700   }
3701 
3702   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3703                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3704     const FloatRegister t0 = result;
3705 
3706     // The GCM field polynomial f is z^128 + p(z), where p =
3707     // z^7+z^2+z+1.
3708     //
3709     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3710     //
3711     // so, given that the product we're reducing is
3712     //    a == lo + hi * z^128
3713     // substituting,
3714     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3715     //
3716     // we reduce by multiplying hi by p(z) and subtracting the result
3717     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3718     // bits we can do this with two 64-bit multiplications, lo*p and
3719     // hi*p.
3720 
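    // Descriptive note: the reduction is done in two folds.  The first pmull2
    // multiplies the top 64 bits of hi by p and XORs the 128-bit product into
    // the 128 bits immediately below them (the low half of hi and the high
    // half of lo); the second pmull folds the remaining 64 bits of hi into lo,
    // leaving the reduced 128-bit result.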
3721     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3722     __ ext(t1, __ T16B, t0, z, 8);
3723     __ eor(hi, __ T16B, hi, t1);
3724     __ ext(t1, __ T16B, z, t0, 8);
3725     __ eor(lo, __ T16B, lo, t1);
3726     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3727     __ eor(result, __ T16B, lo, t0);
3728   }
3729 
3730   address generate_has_negatives(address &has_negatives_long) {
3731     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3732     const int large_loop_size = 64;
3733     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3734     int dcache_line = VM_Version::dcache_line_size();
3735 
3736     Register ary1 = r1, len = r2, result = r0;
3737 
3738     __ align(CodeEntryAlignment);
3739     address entry = __ pc();
3740 
3741     __ enter();
3742 
3743   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3744         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3745 
3746   __ cmp(len, 15);
3747   __ br(Assembler::GT, LEN_OVER_15);
3748   // The only case in which execution falls through to this code is when the pointer
3749   // is near the end of a memory page and we have to avoid reading past it.
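  // Instead we load the bytes that end exactly at ary1 + len (which stays on
  // the current page) and shift the bytes preceding the requested range out of
  // the low end of the register before testing the per-byte sign bits.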
3750   __ add(ary1, ary1, len);
3751   __ subs(len, len, 8);
3752   __ br(Assembler::GT, LEN_OVER_8);
3753   __ ldr(rscratch2, Address(ary1, -8));
3754   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3755   __ lsrv(rscratch2, rscratch2, rscratch1);
3756   __ tst(rscratch2, UPPER_BIT_MASK);
3757   __ cset(result, Assembler::NE);
3758   __ leave();
3759   __ ret(lr);
3760   __ bind(LEN_OVER_8);
3761   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3762   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3763   __ tst(rscratch2, UPPER_BIT_MASK);
3764   __ br(Assembler::NE, RET_TRUE_NO_POP);
3765   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3766   __ lsrv(rscratch1, rscratch1, rscratch2);
3767   __ tst(rscratch1, UPPER_BIT_MASK);
3768   __ cset(result, Assembler::NE);
3769   __ leave();
3770   __ ret(lr);
3771 
3772   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3773   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3774 
3775   has_negatives_long = __ pc(); // 2nd entry point
3776 
3777   __ enter();
3778 
3779   __ bind(LEN_OVER_15);
3780     __ push(spilled_regs, sp);
3781     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3782     __ cbz(rscratch2, ALIGNED);
3783     __ ldp(tmp6, tmp1, Address(ary1));
3784     __ mov(tmp5, 16);
3785     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3786     __ add(ary1, ary1, rscratch1);
3787     __ sub(len, len, rscratch1);
3788     __ orr(tmp6, tmp6, tmp1);
3789     __ tst(tmp6, UPPER_BIT_MASK);
3790     __ br(Assembler::NE, RET_TRUE);
3791 
3792   __ bind(ALIGNED);
3793     __ cmp(len, large_loop_size);
3794     __ br(Assembler::LT, CHECK_16);
3795     // Perform a 16-byte load here in the pre-loop, with an early return, to handle
3796     // the case where an initially aligned large array has negative values in its
3797     // first bytes; otherwise LARGE_LOOP would do up to 4 reads instead of 1 in the
3798     // worst case, which is slower. Cases with negative bytes further ahead are barely
3799     // affected; in fact they get faster thanks to the early loads and the fewer
3800     // instructions and branches in LARGE_LOOP.
3801     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3802     __ sub(len, len, 16);
3803     __ orr(tmp6, tmp6, tmp1);
3804     __ tst(tmp6, UPPER_BIT_MASK);
3805     __ br(Assembler::NE, RET_TRUE);
3806     __ cmp(len, large_loop_size);
3807     __ br(Assembler::LT, CHECK_16);
3808 
3809     if (SoftwarePrefetchHintDistance >= 0
3810         && SoftwarePrefetchHintDistance >= dcache_line) {
3811       // initial prefetch
3812       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3813     }
3814   __ bind(LARGE_LOOP);
3815     if (SoftwarePrefetchHintDistance >= 0) {
3816       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3817     }
3818     // Issue the load instructions first, since this can save a few CPU/MEM cycles.
3819     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);" (one per ldp),
3820     // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves
3821     // 3 instructions per loop iteration and has fewer branches.  The downside is that
3822     // it disables the early return, so all 64 bytes are loaded and checked every time.
3823     __ ldp(tmp2, tmp3, Address(ary1));
3824     __ ldp(tmp4, tmp5, Address(ary1, 16));
3825     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3826     __ ldp(tmp6, tmp1, Address(ary1, 48));
3827     __ add(ary1, ary1, large_loop_size);
3828     __ sub(len, len, large_loop_size);
3829     __ orr(tmp2, tmp2, tmp3);
3830     __ orr(tmp4, tmp4, tmp5);
3831     __ orr(rscratch1, rscratch1, rscratch2);
3832     __ orr(tmp6, tmp6, tmp1);
3833     __ orr(tmp2, tmp2, tmp4);
3834     __ orr(rscratch1, rscratch1, tmp6);
3835     __ orr(tmp2, tmp2, rscratch1);
3836     __ tst(tmp2, UPPER_BIT_MASK);
3837     __ br(Assembler::NE, RET_TRUE);
3838     __ cmp(len, large_loop_size);
3839     __ br(Assembler::GE, LARGE_LOOP);
3840 
3841   __ bind(CHECK_16); // small 16-byte load pre-loop
3842     __ cmp(len, 16);
3843     __ br(Assembler::LT, POST_LOOP16);
3844 
3845   __ bind(LOOP16); // small 16-byte load loop
3846     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3847     __ sub(len, len, 16);
3848     __ orr(tmp2, tmp2, tmp3);
3849     __ tst(tmp2, UPPER_BIT_MASK);
3850     __ br(Assembler::NE, RET_TRUE);
3851     __ cmp(len, 16);
3852     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3853 
3854   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3855     __ cmp(len, 8);
3856     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3857     __ ldr(tmp3, Address(__ post(ary1, 8)));
3858     __ sub(len, len, 8);
3859     __ tst(tmp3, UPPER_BIT_MASK);
3860     __ br(Assembler::NE, RET_TRUE);
3861 
3862   __ bind(POST_LOOP16_LOAD_TAIL);
3863     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3864     __ ldr(tmp1, Address(ary1));
3865     __ mov(tmp2, 64);
3866     __ sub(tmp4, tmp2, len, __ LSL, 3);
3867     __ lslv(tmp1, tmp1, tmp4);
3868     __ tst(tmp1, UPPER_BIT_MASK);
3869     __ br(Assembler::NE, RET_TRUE);
3870     // Fallthrough
3871 
3872   __ bind(RET_FALSE);
3873     __ pop(spilled_regs, sp);
3874     __ leave();
3875     __ mov(result, zr);
3876     __ ret(lr);
3877 
3878   __ bind(RET_TRUE);
3879     __ pop(spilled_regs, sp);
3880   __ bind(RET_TRUE_NO_POP);
3881     __ leave();
3882     __ mov(result, 1);
3883     __ ret(lr);
3884 
3885   __ bind(DONE);
3886     __ pop(spilled_regs, sp);
3887     __ leave();
3888     __ ret(lr);
3889     return entry;
3890   }
3891 
3892   address generate_large_array_equals_byte() {
3893     return generate_large_array_equals(1);
3894   }
3895 
3896   address generate_large_array_equals_char() {
3897     return generate_large_array_equals(2);
3898   }
3899 
3900   // a1 = r1 - array1 address
3901   // a2 = r2 - array2 address
3902   // result = r0 - return value. Already contains "false"
3903   // cnt1 = r4 - number of elements left to check, reduced by elem_per_word
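  // Each LARGE_LOOP iteration compares 64 bytes from each array (four 16-byte
  // vectors in the SIMD variant, eight 64-bit words otherwise), optionally
  // prefetching SoftwarePrefetchHintDistance bytes ahead.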
3904   address generate_large_array_equals(int elem_size) {
3905     StubCodeMark mark(this, "StubRoutines", elem_size == 1
3906         ? "large_array_equals_byte"
3907         : "large_array_equals_char");
3908     Register a1 = r1, a2 = r2, result = r0, cnt1 = r4, tmp1 = rscratch1,
3909         tmp2 = rscratch2, tmp3 = r6, tmp4 = r7;
3910     Label LARGE_LOOP, NOT_EQUAL;
3911     int elem_per_word = wordSize/elem_size;
3912     int branchThreshold = MAX(80, SoftwarePrefetchHintDistance)/elem_size - elem_per_word;
3913     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
3914 
3915     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4);
3916 
3917     __ align(CodeEntryAlignment);
3918     address entry = __ pc();
3919     __ enter();
3920 
3921     if (!UseSIMDForArrayEquals) {
3922       // pre-loop
3923       __ push(spilled_regs, sp);
3924       __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3925       __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3926     }
3927     __ bind(LARGE_LOOP); // unrolled to 64 bytes loop with possible prefetching
3928     if (SoftwarePrefetchHintDistance >= 0) {
3929       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3930       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3931     }
3932     if (UseSIMDForArrayEquals) {
3933       __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3934       __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3935       __ eor(v0, __ T2D, v0, v4);
3936       __ eor(v1, __ T2D, v1, v5);
3937       __ eor(v2, __ T2D, v2, v6);
3938       __ eor(v3, __ T2D, v3, v7);
3939 
3940       __ orr(v0, __ T2D, v0, v1);
3941       __ orr(v1, __ T2D, v2, v3);
3942       __ orr(v0, __ T2D, v0, v1);
3943 
3944       __ umov(tmp1, v0, __ D, 0);
3945       __ cbnz(tmp1, NOT_EQUAL);
3946       __ umov(tmp1, v0, __ D, 1);
3947       __ cbnz(tmp1, NOT_EQUAL);
3948       __ sub(cnt1, cnt1, 64/elem_size);
3949       __ cmp(cnt1, branchThreshold);
3950       __ br(__ GT, LARGE_LOOP);
3951     } else {
3952       __ eor(tmp1, tmp1, tmp2);
3953       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3954       __ cbnz(tmp1, NOT_EQUAL);
3955       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3956       __ eor(tmp3, tmp3, tmp4);
3957       __ ldr(tmp4, Address(__ post(a2, wordSize)));
3958       __ cbnz(tmp3, NOT_EQUAL);
3959       __ ldr(tmp3, Address(__ post(a1, wordSize)));
3960 
3961       __ eor(tmp1, tmp1, tmp2);
3962       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3963       __ cbnz(tmp1, NOT_EQUAL);
3964       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3965       __ eor(tmp3, tmp3, tmp4);
3966       __ ldr(tmp4, Address(__ post(a2, wordSize)));
3967       __ cbnz(tmp3, NOT_EQUAL);
3968       __ ldr(tmp3, Address(__ post(a1, wordSize)));
3969 
3970       __ eor(tmp1, tmp1, tmp2);
3971       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3972       __ cbnz(tmp1, NOT_EQUAL);
3973       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3974       __ eor(tmp3, tmp3, tmp4);
3975       __ ldr(tmp4, Address(__ post(a2, wordSize)));
3976       __ cbnz(tmp3, NOT_EQUAL);
3977       __ ldr(tmp3, Address(__ post(a1, wordSize)));
3978 
3979       // loads below are for next loop iteration
3980       __ eor(tmp1, tmp1, tmp2);
3981       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3982       __ cbnz(tmp1, NOT_EQUAL);
3983       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3984       __ eor(tmp3, tmp3, tmp4);
3985       __ ldr(tmp4, Address(__ post(a2, wordSize)));
3986       __ cbnz(tmp3, NOT_EQUAL);
3987       __ ldr(tmp3, Address(__ post(a1, wordSize)));
3988 
3989       __ sub(cnt1, cnt1, 8 * elem_per_word);
3990       // run this loop while there is memory left to prefetch (but at least 64+16 bytes remain).
3991       __ cmp(cnt1, branchThreshold);
3992       __ br(Assembler::GT, LARGE_LOOP);
3993       // Both a1 and a2 have been advanced past the last word that was actually
3994       // checked, and tmp1-tmp4 still hold not-yet-checked values. Check them in
3995       // this post-loop and update cnt1 accordingly.
3996       __ eor(tmp1, tmp1, tmp2);
3997       __ cbnz(tmp1, NOT_EQUAL);
3998       __ eor(tmp3, tmp3, tmp4);
3999       __ cbnz(tmp3, NOT_EQUAL);
4000       __ sub(cnt1, cnt1, 2 * elem_per_word);
4001     }
4002 
4003     __ mov(result, true);
4004     __ bind(NOT_EQUAL);
4005     if (!UseSIMDForArrayEquals) {
4006       __ pop(spilled_regs, sp);
4007     }
4008     __ leave();
4009     __ ret(lr);
4010     return entry;
4011   }
4012 
4013   /**
4014    *  Arguments:
4015    *
4016    *  Input:
4017    *  c_rarg0   - current state address
4018    *  c_rarg1   - H key address
4019    *  c_rarg2   - data address
4020    *  c_rarg3   - number of blocks
4021    *
4022    *  Output:
4023    *  Updated state at c_rarg0
4024    */
4025   address generate_ghash_processBlocks() {
4026     // Bafflingly, GCM uses little-endian for the byte order, but
4027     // big-endian for the bit order.  For example, the polynomial 1 is
4028     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4029     //
4030     // So, we must either reverse the bytes in each word and do
4031     // everything big-endian or reverse the bits in each byte and do
4032     // it little-endian.  On AArch64 it's more idiomatic to reverse
4033     // the bits in each byte (we have an instruction, RBIT, to do
4034     // that) and keep the data in little-endian bit order throughout the
4035     // calculation, bit-reversing the inputs and outputs.
4036 
4037     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4038     __ align(wordSize * 2);
4039     address p = __ pc();
4040     __ emit_int64(0x87);  // The low-order bits of the field
4041                           // polynomial (i.e. p = z^7+z^2+z+1)
4042                           // repeated in the low and high parts of a
4043                           // 128-bit vector
4044     __ emit_int64(0x87);
4045 
4046     __ align(CodeEntryAlignment);
4047     address start = __ pc();
4048 
4049     Register state   = c_rarg0;
4050     Register subkeyH = c_rarg1;
4051     Register data    = c_rarg2;
4052     Register blocks  = c_rarg3;
4053 
4054     FloatRegister vzr = v30;
4055     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4056 
4057     __ ldrq(v0, Address(state));
4058     __ ldrq(v1, Address(subkeyH));
4059 
4060     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4061     __ rbit(v0, __ T16B, v0);
4062     __ rev64(v1, __ T16B, v1);
4063     __ rbit(v1, __ T16B, v1);
4064 
4065     __ ldrq(v26, p);
4066 
4067     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4068     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4069 
4070     {
4071       Label L_ghash_loop;
4072       __ bind(L_ghash_loop);
4073 
4074       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4075                                                  // reversing each byte
4076       __ rbit(v2, __ T16B, v2);
4077       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4078 
4079       // Multiply state in v2 by subkey in v1
4080       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4081                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4082                      /*temps*/v6, v20, v18, v21);
4083       // Reduce v7:v5 by the field polynomial
4084       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4085 
4086       __ sub(blocks, blocks, 1);
4087       __ cbnz(blocks, L_ghash_loop);
4088     }
4089 
4090     // The bit-reversed result is at this point in v0
4091     __ rev64(v1, __ T16B, v0);
4092     __ rbit(v1, __ T16B, v1);
4093 
4094     __ st1(v1, __ T16B, state);
4095     __ ret(lr);
4096 
4097     return start;
4098   }
4099 
4100   // Continuation point for throwing of implicit exceptions that are
4101   // not handled in the current activation. Fabricates an exception
4102   // oop and initiates normal exception dispatching in this
4103   // frame. Since we need to preserve callee-saved values (currently
4104   // only for C2, but done for C1 as well) we need a callee-saved oop
4105   // map and therefore have to make these stubs into RuntimeStubs
4106   // rather than BufferBlobs.  If the compiler needs all registers to
4107   // be preserved between the fault point and the exception handler
4108   // then it must assume responsibility for that in
4109   // AbstractCompiler::continuation_for_implicit_null_exception or
4110   // continuation_for_implicit_division_by_zero_exception. All other
4111   // implicit exceptions (e.g., NullPointerException or
4112   // AbstractMethodError on entry) are either at call sites or
4113   // otherwise assume that stack unwinding will be initiated, so
4114   // caller saved registers were assumed volatile in the compiler.
4115 
4116 #undef __
4117 #define __ masm->
4118 
4119   address generate_throw_exception(const char* name,
4120                                    address runtime_entry,
4121                                    Register arg1 = noreg,
4122                                    Register arg2 = noreg) {
4123     // Information about frame layout at time of blocking runtime call.
4124     // Note that we only have to preserve callee-saved registers since
4125     // the compilers are responsible for supplying a continuation point
4126     // if they expect all registers to be preserved.
4127     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4128     enum layout {
4129       rfp_off = 0,
4130       rfp_off2,
4131       return_off,
4132       return_off2,
4133       framesize // inclusive of return address
4134     };
4135 
4136     int insts_size = 512;
4137     int locs_size  = 64;
4138 
4139     CodeBuffer code(name, insts_size, locs_size);
4140     OopMapSet* oop_maps  = new OopMapSet();
4141     MacroAssembler* masm = new MacroAssembler(&code);
4142 
4143     address start = __ pc();
4144 
4145     // This is an inlined and slightly modified version of call_VM
4146     // which has the ability to fetch the return PC out of
4147     // thread-local storage and also sets up last_Java_sp slightly
4148     // differently than the real call_VM
4149 
4150     __ enter(); // Save FP and LR before call
4151 
4152     assert(is_even(framesize/2), "sp not 16-byte aligned");
4153 
4154     // lr and fp are already in place
4155     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4156 
4157     int frame_complete = __ pc() - start;
4158 
4159     // Set up last_Java_sp and last_Java_fp
4160     address the_pc = __ pc();
4161     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
4162 
4163     // Call runtime
4164     if (arg1 != noreg) {
4165       assert(arg2 != c_rarg1, "clobbered");
4166       __ mov(c_rarg1, arg1);
4167     }
4168     if (arg2 != noreg) {
4169       __ mov(c_rarg2, arg2);
4170     }
4171     __ mov(c_rarg0, rthread);
4172     BLOCK_COMMENT("call runtime_entry");
4173     __ mov(rscratch1, runtime_entry);
4174     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
4175 
4176     // Generate oop map
4177     OopMap* map = new OopMap(framesize, 0);
4178 
4179     oop_maps->add_gc_map(the_pc - start, map);
4180 
4181     __ reset_last_Java_frame(true);
4182     __ maybe_isb();
4183 
4184     __ leave();
4185 
4186     // check for pending exceptions
4187 #ifdef ASSERT
4188     Label L;
4189     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4190     __ cbnz(rscratch1, L);
4191     __ should_not_reach_here();
4192     __ bind(L);
4193 #endif // ASSERT
4194     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4195 
4196 
4197     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4198     RuntimeStub* stub =
4199       RuntimeStub::new_runtime_stub(name,
4200                                     &code,
4201                                     frame_complete,
4202                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4203                                     oop_maps, false);
4204     return stub->entry_point();
4205   }
4206 
4207   class MontgomeryMultiplyGenerator : public MacroAssembler {
4208 
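    // Emits code for Montgomery multiplication (or squaring) of multi-word
    // integers: a word-by-word product scan interleaved with the reduction
    // steps (inv is -n^-1 mod 2^64, see the assertion in post1()), with the
    // running sum kept in the triple-precision accumulator t0:t1:t2.
    // (Descriptive overview only; the helpers below carry the details.)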
4209     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4210       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4211 
4212     RegSet _toSave;
4213     bool _squaring;
4214 
4215   public:
4216     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4217       : MacroAssembler(as->code()), _squaring(squaring) {
4218 
4219       // Register allocation
4220 
4221       Register reg = c_rarg0;
4222       Pa_base = reg;       // Argument registers
4223       if (squaring)
4224         Pb_base = Pa_base;
4225       else
4226         Pb_base = ++reg;
4227       Pn_base = ++reg;
4228       Rlen= ++reg;
4229       inv = ++reg;
4230       Pm_base = ++reg;
4231 
4232                           // Working registers:
4233       Ra =  ++reg;        // The current digit of a, b, n, and m.
4234       Rb =  ++reg;
4235       Rm =  ++reg;
4236       Rn =  ++reg;
4237 
4238       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4239       Pb =  ++reg;
4240       Pm =  ++reg;
4241       Pn =  ++reg;
4242 
4243       t0 =  ++reg;        // Three registers which form a
4244       t1 =  ++reg;        // triple-precision accumulator.
4245       t2 =  ++reg;
4246 
4247       Ri =  ++reg;        // Inner and outer loop indexes.
4248       Rj =  ++reg;
4249 
4250       Rhi_ab = ++reg;     // Product registers: low and high parts
4251       Rlo_ab = ++reg;     // of a*b and m*n.
4252       Rhi_mn = ++reg;
4253       Rlo_mn = ++reg;
4254 
4255       // r19 and up are callee-saved.
4256       _toSave = RegSet::range(r19, reg) + Pm_base;
4257     }
4258 
4259   private:
4260     void save_regs() {
4261       push(_toSave, sp);
4262     }
4263 
4264     void restore_regs() {
4265       pop(_toSave, sp);
4266     }
4267 
4268     template <typename T>
4269     void unroll_2(Register count, T block) {
4270       Label loop, end, odd;
4271       tbnz(count, 0, odd);
4272       cbz(count, end);
4273       align(16);
4274       bind(loop);
4275       (this->*block)();
4276       bind(odd);
4277       (this->*block)();
4278       subs(count, count, 2);
4279       br(Assembler::GT, loop);
4280       bind(end);
4281     }
4282 
4283     template <typename T>
4284     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4285       Label loop, end, odd;
4286       tbnz(count, 0, odd);
4287       cbz(count, end);
4288       align(16);
4289       bind(loop);
4290       (this->*block)(d, s, tmp);
4291       bind(odd);
4292       (this->*block)(d, s, tmp);
4293       subs(count, count, 2);
4294       br(Assembler::GT, loop);
4295       bind(end);
4296     }
4297 
4298     void pre1(RegisterOrConstant i) {
4299       block_comment("pre1");
4300       // Pa = Pa_base;
4301       // Pb = Pb_base + i;
4302       // Pm = Pm_base;
4303       // Pn = Pn_base + i;
4304       // Ra = *Pa;
4305       // Rb = *Pb;
4306       // Rm = *Pm;
4307       // Rn = *Pn;
4308       ldr(Ra, Address(Pa_base));
4309       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4310       ldr(Rm, Address(Pm_base));
4311       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4312       lea(Pa, Address(Pa_base));
4313       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4314       lea(Pm, Address(Pm_base));
4315       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4316 
4317       // Zero the m*n result.
4318       mov(Rhi_mn, zr);
4319       mov(Rlo_mn, zr);
4320     }
4321 
4322     // The core multiply-accumulate step of a Montgomery
4323     // multiplication.  The idea is to schedule operations as a
4324     // pipeline so that instructions with long latencies (loads and
4325     // multiplies) have time to complete before their results are
4326     // used.  This most benefits in-order implementations of the
4327     // architecture but out-of-order ones also benefit.
4328     void step() {
4329       block_comment("step");
4330       // MACC(Ra, Rb, t0, t1, t2);
4331       // Ra = *++Pa;
4332       // Rb = *--Pb;
4333       umulh(Rhi_ab, Ra, Rb);
4334       mul(Rlo_ab, Ra, Rb);
4335       ldr(Ra, pre(Pa, wordSize));
4336       ldr(Rb, pre(Pb, -wordSize));
4337       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
4338                                        // previous iteration.
4339       // MACC(Rm, Rn, t0, t1, t2);
4340       // Rm = *++Pm;
4341       // Rn = *--Pn;
4342       umulh(Rhi_mn, Rm, Rn);
4343       mul(Rlo_mn, Rm, Rn);
4344       ldr(Rm, pre(Pm, wordSize));
4345       ldr(Rn, pre(Pn, -wordSize));
4346       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4347     }
4348 
4349     void post1() {
4350       block_comment("post1");
4351 
4352       // MACC(Ra, Rb, t0, t1, t2);
4353       // Ra = *++Pa;
4354       // Rb = *--Pb;
4355       umulh(Rhi_ab, Ra, Rb);
4356       mul(Rlo_ab, Ra, Rb);
4357       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4358       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4359 
4360       // *Pm = Rm = t0 * inv;
4361       mul(Rm, t0, inv);
4362       str(Rm, Address(Pm));
4363 
4364       // MACC(Rm, Rn, t0, t1, t2);
4365       // t0 = t1; t1 = t2; t2 = 0;
4366       umulh(Rhi_mn, Rm, Rn);
4367 
4368 #ifndef PRODUCT
4369       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4370       {
4371         mul(Rlo_mn, Rm, Rn);
4372         add(Rlo_mn, t0, Rlo_mn);
4373         Label ok;
4374         cbz(Rlo_mn, ok); {
4375           stop("broken Montgomery multiply");
4376         } bind(ok);
4377       }
4378 #endif
4379       // We have very carefully set things up so that
4380       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4381       // the lower half of Rm * Rn because we know the result already:
4382       // it must be -t0.  t0 + (-t0) must generate a carry iff
4383       // t0 != 0.  So, rather than do a mul and an adds we just set
4384       // the carry flag iff t0 is nonzero.
4385       //
4386       // mul(Rlo_mn, Rm, Rn);
4387       // adds(zr, t0, Rlo_mn);
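      //
      // (Worked example, for illustration: if t0 == 5 then Rlo_mn would
      // be 2^64 - 5, and 5 + (2^64 - 5) wraps to zero with a carry out;
      // if t0 == 0 then Rlo_mn is 0 and no carry is produced.  The
      // "subs zr, t0, #1" below sets the carry flag exactly when
      // t0 >= 1 unsigned, i.e. when t0 is nonzero, which is the same
      // condition.)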
4388       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4389       adcs(t0, t1, Rhi_mn);
4390       adc(t1, t2, zr);
4391       mov(t2, zr);
4392     }
4393 
4394     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
4395       block_comment("pre2");
4396       // Pa = Pa_base + i-len;
4397       // Pb = Pb_base + len;
4398       // Pm = Pm_base + i-len;
4399       // Pn = Pn_base + len;
4400 
4401       if (i.is_register()) {
4402         sub(Rj, i.as_register(), len);
4403       } else {
4404         mov(Rj, i.as_constant());
4405         sub(Rj, Rj, len);
4406       }
4407       // Rj == i-len
4408 
4409       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
4410       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
4411       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4412       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
4413 
4414       // Ra = *++Pa;
4415       // Rb = *--Pb;
4416       // Rm = *++Pm;
4417       // Rn = *--Pn;
4418       ldr(Ra, pre(Pa, wordSize));
4419       ldr(Rb, pre(Pb, -wordSize));
4420       ldr(Rm, pre(Pm, wordSize));
4421       ldr(Rn, pre(Pn, -wordSize));
4422 
4423       mov(Rhi_mn, zr);
4424       mov(Rlo_mn, zr);
4425     }
4426 
4427     void post2(RegisterOrConstant i, RegisterOrConstant len) {
4428       block_comment("post2");
4429       if (i.is_constant()) {
4430         mov(Rj, i.as_constant()-len.as_constant());
4431       } else {
4432         sub(Rj, i.as_register(), len);
4433       }
4434 
4435       adds(t0, t0, Rlo_mn); // The pending m*n, low part
4436 
4437       // As soon as we know the least significant digit of our result,
4438       // store it.
4439       // Pm_base[i-len] = t0;
4440       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4441 
4442       // t0 = t1; t1 = t2; t2 = 0;
4443       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
4444       adc(t1, t2, zr);
4445       mov(t2, zr);
4446     }
4447 
4448     // A carry in t0 after Montgomery multiplication means that we
4449     // should subtract multiples of n from our result in m.  We'll
4450     // keep doing that until there is no carry.
4451     void normalize(RegisterOrConstant len) {
4452       block_comment("normalize");
4453       // while (t0)
4454       //   t0 = sub(Pm_base, Pn_base, t0, len);
4455       Label loop, post, again;
4456       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
4457       cbz(t0, post); {
4458         bind(again); {
4459           mov(i, zr);
4460           mov(cnt, len);
4461           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4462           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4463           subs(zr, zr, zr); // set carry flag, i.e. no borrow
4464           align(16);
4465           bind(loop); {
4466             sbcs(Rm, Rm, Rn);
4467             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4468             add(i, i, 1);
4469             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4470             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4471             sub(cnt, cnt, 1);
4472           } cbnz(cnt, loop);
4473           sbc(t0, t0, zr);
4474         } cbnz(t0, again);
4475       } bind(post);
4476     }
4477 
4478     // Move memory at s to d, reversing words.
4479     //    Increments d to end of copied memory
4480     //    Destroys tmp1, tmp2
4481     //    Preserves len
4482     //    Leaves s pointing to the address which was in d at start
4483     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
4484       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
4485 
4486       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
4487       mov(tmp1, len);
4488       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
4489       sub(s, d, len, ext::uxtw, LogBytesPerWord);
4490     }
4491     // where
4492     void reverse1(Register d, Register s, Register tmp) {
4493       ldr(tmp, pre(s, -wordSize));
4494       ror(tmp, tmp, 32);
4495       str(tmp, post(d, wordSize));
4496     }
4497 
4498     void step_squaring() {
4499       // An extra ACC
4500       step();
4501       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4502     }
4503 
4504     void last_squaring(RegisterOrConstant i) {
4505       Label dont;
4506       // if ((i & 1) == 0) {
4507       tbnz(i.as_register(), 0, dont); {
4508         // MACC(Ra, Rb, t0, t1, t2);
4509         // Ra = *++Pa;
4510         // Rb = *--Pb;
4511         umulh(Rhi_ab, Ra, Rb);
4512         mul(Rlo_ab, Ra, Rb);
4513         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4514       } bind(dont);
4515     }
4516 
4517     void extra_step_squaring() {
4518       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4519 
4520       // MACC(Rm, Rn, t0, t1, t2);
4521       // Rm = *++Pm;
4522       // Rn = *--Pn;
4523       umulh(Rhi_mn, Rm, Rn);
4524       mul(Rlo_mn, Rm, Rn);
4525       ldr(Rm, pre(Pm, wordSize));
4526       ldr(Rn, pre(Pn, -wordSize));
4527     }
4528 
4529     void post1_squaring() {
4530       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4531 
4532       // *Pm = Rm = t0 * inv;
4533       mul(Rm, t0, inv);
4534       str(Rm, Address(Pm));
4535 
4536       // MACC(Rm, Rn, t0, t1, t2);
4537       // t0 = t1; t1 = t2; t2 = 0;
4538       umulh(Rhi_mn, Rm, Rn);
4539 
4540 #ifndef PRODUCT
4541       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4542       {
4543         mul(Rlo_mn, Rm, Rn);
4544         add(Rlo_mn, t0, Rlo_mn);
4545         Label ok;
4546         cbz(Rlo_mn, ok); {
4547           stop("broken Montgomery multiply");
4548         } bind(ok);
4549       }
4550 #endif
4551       // We have very carefully set things up so that
4552       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4553       // the lower half of Rm * Rn because we know the result already:
4554       // it must be -t0.  t0 + (-t0) must generate a carry iff
4555       // t0 != 0.  So, rather than do a mul and an adds we just set
4556       // the carry flag iff t0 is nonzero.
4557       //
4558       // mul(Rlo_mn, Rm, Rn);
4559       // adds(zr, t0, Rlo_mn);
4560       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4561       adcs(t0, t1, Rhi_mn);
4562       adc(t1, t2, zr);
4563       mov(t2, zr);
4564     }
4565 
4566     void acc(Register Rhi, Register Rlo,
4567              Register t0, Register t1, Register t2) {
4568       adds(t0, t0, Rlo);
4569       adcs(t1, t1, Rhi);
4570       adc(t2, t2, zr);
4571     }
4572 
4573   public:
4574     /**
4575      * Fast Montgomery multiplication.  The derivation of the
4576      * algorithm is in A Cryptographic Library for the Motorola
4577      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
4578      *
4579      * Arguments:
4580      *
4581      * Inputs for multiplication:
4582      *   c_rarg0   - int array elements a
4583      *   c_rarg1   - int array elements b
4584      *   c_rarg2   - int array elements n (the modulus)
4585      *   c_rarg3   - int length
4586      *   c_rarg4   - int inv
4587      *   c_rarg5   - int array elements m (the result)
4588      *
4589      * Inputs for squaring:
4590      *   c_rarg0   - int array elements a
4591      *   c_rarg1   - int array elements n (the modulus)
4592      *   c_rarg2   - int length
4593      *   c_rarg3   - int inv
4594      *   c_rarg4   - int array elements m (the result)
4595      *
4596      */
4597     address generate_multiply() {
4598       Label argh, nothing;
4599       bind(argh);
4600       stop("MontgomeryMultiply total_allocation must be <= 8192");
4601 
4602       align(CodeEntryAlignment);
4603       address entry = pc();
4604 
4605       cbzw(Rlen, nothing);
4606 
4607       enter();
4608 
4609       // Make room.
4610       cmpw(Rlen, 512);
4611       br(Assembler::HI, argh);
4612       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4613       andr(sp, Ra, -2 * wordSize);
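      // The scratch area is Rlen * 4 * sizeof(jint) == Rlen * 16 bytes
      // (Rlen is still the length in ints here), aligned down to 16
      // bytes; with Rlen capped at 512 above, the total allocation is
      // at most 8192 bytes, matching the stop() message at "argh".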
4614 
4615       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4616 
4617       {
4618         // Copy input args, reversing as we go.  We use Ra as a
4619         // temporary variable.
4620         reverse(Ra, Pa_base, Rlen, t0, t1);
4621         if (!_squaring)
4622           reverse(Ra, Pb_base, Rlen, t0, t1);
4623         reverse(Ra, Pn_base, Rlen, t0, t1);
4624       }
4625 
      // Push all callee-saved registers, and also Pm_base, which we'll
      // need at the end.
4628       save_regs();
4629 
4630 #ifndef PRODUCT
4631       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
4632       {
4633         ldr(Rn, Address(Pn_base, 0));
4634         mul(Rlo_mn, Rn, inv);
4635         cmp(Rlo_mn, -1);
4636         Label ok;
4637         br(EQ, ok); {
4638           stop("broken inverse in Montgomery multiply");
4639         } bind(ok);
4640       }
4641 #endif
4642 
4643       mov(Pm_base, Ra);
4644 
4645       mov(t0, zr);
4646       mov(t1, zr);
4647       mov(t2, zr);
4648 
4649       block_comment("for (int i = 0; i < len; i++) {");
4650       mov(Ri, zr); {
4651         Label loop, end;
4652         cmpw(Ri, Rlen);
4653         br(Assembler::GE, end);
4654 
4655         bind(loop);
4656         pre1(Ri);
4657 
4658         block_comment("  for (j = i; j; j--) {"); {
4659           movw(Rj, Ri);
4660           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4661         } block_comment("  } // j");
4662 
4663         post1();
4664         addw(Ri, Ri, 1);
4665         cmpw(Ri, Rlen);
4666         br(Assembler::LT, loop);
4667         bind(end);
4668         block_comment("} // i");
4669       }
4670 
4671       block_comment("for (int i = len; i < 2*len; i++) {");
4672       mov(Ri, Rlen); {
4673         Label loop, end;
4674         cmpw(Ri, Rlen, Assembler::LSL, 1);
4675         br(Assembler::GE, end);
4676 
4677         bind(loop);
4678         pre2(Ri, Rlen);
4679 
4680         block_comment("  for (j = len*2-i-1; j; j--) {"); {
4681           lslw(Rj, Rlen, 1);
4682           subw(Rj, Rj, Ri);
4683           subw(Rj, Rj, 1);
4684           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4685         } block_comment("  } // j");
4686 
4687         post2(Ri, Rlen);
4688         addw(Ri, Ri, 1);
4689         cmpw(Ri, Rlen, Assembler::LSL, 1);
4690         br(Assembler::LT, loop);
4691         bind(end);
4692       }
4693       block_comment("} // i");
4694 
4695       normalize(Rlen);
4696 
4697       mov(Ra, Pm_base);  // Save Pm_base in Ra
4698       restore_regs();  // Restore caller's Pm_base
4699 
4700       // Copy our result into caller's Pm_base
4701       reverse(Pm_base, Ra, Rlen, t0, t1);
4702 
4703       leave();
4704       bind(nothing);
4705       ret(lr);
4706 
4707       return entry;
4708     }
4709     // In C, approximately:
4710 
4711     // void
4712     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
4713     //                     unsigned long Pn_base[], unsigned long Pm_base[],
4714     //                     unsigned long inv, int len) {
4715     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4716     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4717     //   unsigned long Ra, Rb, Rn, Rm;
4718 
4719     //   int i;
4720 
4721     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4722 
4723     //   for (i = 0; i < len; i++) {
4724     //     int j;
4725 
4726     //     Pa = Pa_base;
4727     //     Pb = Pb_base + i;
4728     //     Pm = Pm_base;
4729     //     Pn = Pn_base + i;
4730 
4731     //     Ra = *Pa;
4732     //     Rb = *Pb;
4733     //     Rm = *Pm;
4734     //     Rn = *Pn;
4735 
4736     //     int iters = i;
4737     //     for (j = 0; iters--; j++) {
4738     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4739     //       MACC(Ra, Rb, t0, t1, t2);
4740     //       Ra = *++Pa;
4741     //       Rb = *--Pb;
4742     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4743     //       MACC(Rm, Rn, t0, t1, t2);
4744     //       Rm = *++Pm;
4745     //       Rn = *--Pn;
4746     //     }
4747 
4748     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
4749     //     MACC(Ra, Rb, t0, t1, t2);
4750     //     *Pm = Rm = t0 * inv;
4751     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4752     //     MACC(Rm, Rn, t0, t1, t2);
4753 
4754     //     assert(t0 == 0, "broken Montgomery multiply");
4755 
4756     //     t0 = t1; t1 = t2; t2 = 0;
4757     //   }
4758 
4759     //   for (i = len; i < 2*len; i++) {
4760     //     int j;
4761 
4762     //     Pa = Pa_base + i-len;
4763     //     Pb = Pb_base + len;
4764     //     Pm = Pm_base + i-len;
4765     //     Pn = Pn_base + len;
4766 
4767     //     Ra = *++Pa;
4768     //     Rb = *--Pb;
4769     //     Rm = *++Pm;
4770     //     Rn = *--Pn;
4771 
4772     //     int iters = len*2-i-1;
4773     //     for (j = i-len+1; iters--; j++) {
4774     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4775     //       MACC(Ra, Rb, t0, t1, t2);
4776     //       Ra = *++Pa;
4777     //       Rb = *--Pb;
4778     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4779     //       MACC(Rm, Rn, t0, t1, t2);
4780     //       Rm = *++Pm;
4781     //       Rn = *--Pn;
4782     //     }
4783 
4784     //     Pm_base[i-len] = t0;
4785     //     t0 = t1; t1 = t2; t2 = 0;
4786     //   }
4787 
4788     //   while (t0)
4789     //     t0 = sub(Pm_base, Pn_base, t0, len);
4790     // }
4791 
4792     /**
4793      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
4794      * multiplies than Montgomery multiplication so it should be up to
4795      * 25% faster.  However, its loop control is more complex and it
4796      * may actually run slower on some machines.
4797      *
4798      * Arguments:
4799      *
4800      * Inputs:
4801      *   c_rarg0   - int array elements a
4802      *   c_rarg1   - int array elements n (the modulus)
4803      *   c_rarg2   - int length
4804      *   c_rarg3   - int inv
4805      *   c_rarg4   - int array elements m (the result)
4806      *
4807      */
4808     address generate_square() {
4809       Label argh;
4810       bind(argh);
4811       stop("MontgomeryMultiply total_allocation must be <= 8192");
4812 
4813       align(CodeEntryAlignment);
4814       address entry = pc();
4815 
4816       enter();
4817 
4818       // Make room.
4819       cmpw(Rlen, 512);
4820       br(Assembler::HI, argh);
4821       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4822       andr(sp, Ra, -2 * wordSize);
4823 
4824       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4825 
4826       {
4827         // Copy input args, reversing as we go.  We use Ra as a
4828         // temporary variable.
4829         reverse(Ra, Pa_base, Rlen, t0, t1);
4830         reverse(Ra, Pn_base, Rlen, t0, t1);
4831       }
4832 
      // Push all callee-saved registers, and also Pm_base, which we'll
      // need at the end.
4835       save_regs();
4836 
4837       mov(Pm_base, Ra);
4838 
4839       mov(t0, zr);
4840       mov(t1, zr);
4841       mov(t2, zr);
4842 
4843       block_comment("for (int i = 0; i < len; i++) {");
4844       mov(Ri, zr); {
4845         Label loop, end;
4846         bind(loop);
4847         cmp(Ri, Rlen);
4848         br(Assembler::GE, end);
4849 
4850         pre1(Ri);
4851 
4852         block_comment("for (j = (i+1)/2; j; j--) {"); {
4853           add(Rj, Ri, 1);
4854           lsr(Rj, Rj, 1);
4855           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4856         } block_comment("  } // j");
4857 
4858         last_squaring(Ri);
4859 
4860         block_comment("  for (j = i/2; j; j--) {"); {
4861           lsr(Rj, Ri, 1);
4862           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4863         } block_comment("  } // j");
4864 
4865         post1_squaring();
4866         add(Ri, Ri, 1);
4867         cmp(Ri, Rlen);
4868         br(Assembler::LT, loop);
4869 
4870         bind(end);
4871         block_comment("} // i");
4872       }
4873 
4874       block_comment("for (int i = len; i < 2*len; i++) {");
4875       mov(Ri, Rlen); {
4876         Label loop, end;
4877         bind(loop);
4878         cmp(Ri, Rlen, Assembler::LSL, 1);
4879         br(Assembler::GE, end);
4880 
4881         pre2(Ri, Rlen);
4882 
4883         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4884           lsl(Rj, Rlen, 1);
4885           sub(Rj, Rj, Ri);
4886           sub(Rj, Rj, 1);
4887           lsr(Rj, Rj, 1);
4888           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4889         } block_comment("  } // j");
4890 
4891         last_squaring(Ri);
4892 
4893         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4894           lsl(Rj, Rlen, 1);
4895           sub(Rj, Rj, Ri);
4896           lsr(Rj, Rj, 1);
4897           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4898         } block_comment("  } // j");
4899 
4900         post2(Ri, Rlen);
4901         add(Ri, Ri, 1);
4902         cmp(Ri, Rlen, Assembler::LSL, 1);
4903 
4904         br(Assembler::LT, loop);
4905         bind(end);
4906         block_comment("} // i");
4907       }
4908 
4909       normalize(Rlen);
4910 
4911       mov(Ra, Pm_base);  // Save Pm_base in Ra
4912       restore_regs();  // Restore caller's Pm_base
4913 
4914       // Copy our result into caller's Pm_base
4915       reverse(Pm_base, Ra, Rlen, t0, t1);
4916 
4917       leave();
4918       ret(lr);
4919 
4920       return entry;
4921     }
4922     // In C, approximately:
4923 
4924     // void
4925     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4926     //                   unsigned long Pm_base[], unsigned long inv, int len) {
4927     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4928     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4929     //   unsigned long Ra, Rb, Rn, Rm;
4930 
4931     //   int i;
4932 
4933     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4934 
4935     //   for (i = 0; i < len; i++) {
4936     //     int j;
4937 
4938     //     Pa = Pa_base;
4939     //     Pb = Pa_base + i;
4940     //     Pm = Pm_base;
4941     //     Pn = Pn_base + i;
4942 
4943     //     Ra = *Pa;
4944     //     Rb = *Pb;
4945     //     Rm = *Pm;
4946     //     Rn = *Pn;
4947 
4948     //     int iters = (i+1)/2;
4949     //     for (j = 0; iters--; j++) {
4950     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4951     //       MACC2(Ra, Rb, t0, t1, t2);
4952     //       Ra = *++Pa;
4953     //       Rb = *--Pb;
4954     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4955     //       MACC(Rm, Rn, t0, t1, t2);
4956     //       Rm = *++Pm;
4957     //       Rn = *--Pn;
4958     //     }
4959     //     if ((i & 1) == 0) {
4960     //       assert(Ra == Pa_base[j], "must be");
4961     //       MACC(Ra, Ra, t0, t1, t2);
4962     //     }
4963     //     iters = i/2;
4964     //     assert(iters == i-j, "must be");
4965     //     for (; iters--; j++) {
4966     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4967     //       MACC(Rm, Rn, t0, t1, t2);
4968     //       Rm = *++Pm;
4969     //       Rn = *--Pn;
4970     //     }
4971 
4972     //     *Pm = Rm = t0 * inv;
4973     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4974     //     MACC(Rm, Rn, t0, t1, t2);
4975 
4976     //     assert(t0 == 0, "broken Montgomery multiply");
4977 
4978     //     t0 = t1; t1 = t2; t2 = 0;
4979     //   }
4980 
4981     //   for (i = len; i < 2*len; i++) {
4982     //     int start = i-len+1;
4983     //     int end = start + (len - start)/2;
4984     //     int j;
4985 
4986     //     Pa = Pa_base + i-len;
4987     //     Pb = Pa_base + len;
4988     //     Pm = Pm_base + i-len;
4989     //     Pn = Pn_base + len;
4990 
4991     //     Ra = *++Pa;
4992     //     Rb = *--Pb;
4993     //     Rm = *++Pm;
4994     //     Rn = *--Pn;
4995 
4996     //     int iters = (2*len-i-1)/2;
4997     //     assert(iters == end-start, "must be");
4998     //     for (j = start; iters--; j++) {
4999     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5000     //       MACC2(Ra, Rb, t0, t1, t2);
5001     //       Ra = *++Pa;
5002     //       Rb = *--Pb;
5003     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5004     //       MACC(Rm, Rn, t0, t1, t2);
5005     //       Rm = *++Pm;
5006     //       Rn = *--Pn;
5007     //     }
5008     //     if ((i & 1) == 0) {
5009     //       assert(Ra == Pa_base[j], "must be");
5010     //       MACC(Ra, Ra, t0, t1, t2);
5011     //     }
5012     //     iters =  (2*len-i)/2;
5013     //     assert(iters == len-j, "must be");
5014     //     for (; iters--; j++) {
5015     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5016     //       MACC(Rm, Rn, t0, t1, t2);
5017     //       Rm = *++Pm;
5018     //       Rn = *--Pn;
5019     //     }
5020     //     Pm_base[i-len] = t0;
5021     //     t0 = t1; t1 = t2; t2 = 0;
5022     //   }
5023 
5024     //   while (t0)
5025     //     t0 = sub(Pm_base, Pn_base, t0, len);
5026     // }
5027   };
5028 
5029 
5030   // Initialization
5031   void generate_initial() {
    // Generates the initial stubs and initializes their entry points.

    // Entry points that exist on all platforms.  Note: this is code
    // that could be shared among different platforms; however, the
    // benefit seems smaller than the disadvantage of having a much
    // more complicated generator structure.  See also the comment in
    // stubRoutines.hpp.
5039 
5040     StubRoutines::_forward_exception_entry = generate_forward_exception();
5041 
5042     StubRoutines::_call_stub_entry =
5043       generate_call_stub(StubRoutines::_call_stub_return_address);
5044 
    // Referenced by megamorphic calls.
5046     StubRoutines::_catch_exception_entry = generate_catch_exception();
5047 
5048     // Build this early so it's available for the interpreter.
5049     StubRoutines::_throw_StackOverflowError_entry =
5050       generate_throw_exception("StackOverflowError throw_exception",
5051                                CAST_FROM_FN_PTR(address,
5052                                                 SharedRuntime::throw_StackOverflowError));
5053     StubRoutines::_throw_delayed_StackOverflowError_entry =
5054       generate_throw_exception("delayed StackOverflowError throw_exception",
5055                                CAST_FROM_FN_PTR(address,
5056                                                 SharedRuntime::throw_delayed_StackOverflowError));
5057     if (UseCRC32Intrinsics) {
      // Set the table address before generating the stubs that use it.
5059       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5060       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5061     }
5062 
5063     if (UseCRC32CIntrinsics) {
5064       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5065     }
5066   }
5067 
5068   void generate_all() {
5069     // support for verify_oop (must happen after universe_init)
5070     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
5071     StubRoutines::_throw_AbstractMethodError_entry =
5072       generate_throw_exception("AbstractMethodError throw_exception",
5073                                CAST_FROM_FN_PTR(address,
5074                                                 SharedRuntime::
5075                                                 throw_AbstractMethodError));
5076 
5077     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5078       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5079                                CAST_FROM_FN_PTR(address,
5080                                                 SharedRuntime::
5081                                                 throw_IncompatibleClassChangeError));
5082 
5083     StubRoutines::_throw_NullPointerException_at_call_entry =
5084       generate_throw_exception("NullPointerException at call throw_exception",
5085                                CAST_FROM_FN_PTR(address,
5086                                                 SharedRuntime::
5087                                                 throw_NullPointerException_at_call));
5088 
5089     // arraycopy stubs used by compilers
5090     generate_arraycopy_stubs();
5091 
    // has_negatives stub for large arrays.
5093     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5094 
5095     // array equals stub for large arrays.
5096     StubRoutines::aarch64::_large_array_equals_byte = generate_large_array_equals_byte();
5097     StubRoutines::aarch64::_large_array_equals_char = generate_large_array_equals_char();
5098 
5099     if (UseMultiplyToLenIntrinsic) {
5100       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5101     }
5102 
5103     if (UseSquareToLenIntrinsic) {
5104       StubRoutines::_squareToLen = generate_squareToLen();
5105     }
5106 
5107     if (UseMulAddIntrinsic) {
5108       StubRoutines::_mulAdd = generate_mulAdd();
5109     }
5110 
5111     if (UseMontgomeryMultiplyIntrinsic) {
5112       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5113       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5114       StubRoutines::_montgomeryMultiply = g.generate_multiply();
5115     }
5116 
5117     if (UseMontgomerySquareIntrinsic) {
5118       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5119       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5120       // We use generate_multiply() rather than generate_square()
5121       // because it's faster for the sizes of modulus we care about.
5122       StubRoutines::_montgomerySquare = g.generate_multiply();
5123     }
5124 
5125 #ifndef BUILTIN_SIM
5126     // generate GHASH intrinsics code
5127     if (UseGHASHIntrinsics) {
5128       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5129     }
5130 
5131     if (UseAESIntrinsics) {
5132       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5133       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5134       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5135       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
5136     }
5137 
5138     if (UseSHA1Intrinsics) {
5139       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5140       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5141     }
5142     if (UseSHA256Intrinsics) {
5143       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5144       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5145     }
5146 
5147     // generate Adler32 intrinsics code
5148     if (UseAdler32Intrinsics) {
5149       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
5150     }
5151 
5152     // Safefetch stubs.
5153     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5154                                                        &StubRoutines::_safefetch32_fault_pc,
5155                                                        &StubRoutines::_safefetch32_continuation_pc);
5156     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5157                                                        &StubRoutines::_safefetchN_fault_pc,
5158                                                        &StubRoutines::_safefetchN_continuation_pc);
5159 #endif
5160     StubRoutines::aarch64::set_completed();
5161   }
5162 
5163  public:
5164   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5165     if (all) {
5166       generate_all();
5167     } else {
5168       generate_initial();
5169     }
5170   }
5171 }; // end class declaration
5172 
5173 void StubGenerator_generate(CodeBuffer* code, bool all) {
5174   StubGenerator g(code, all);
5175 }