1 /*
   2  * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "memory/universe.hpp"
  33 #include "nativeInst_aarch64.hpp"
  34 #include "oops/instanceOop.hpp"
  35 #include "oops/method.hpp"
  36 #include "oops/objArrayKlass.hpp"
  37 #include "oops/oop.inline.hpp"
  38 #include "prims/methodHandles.hpp"
  39 #include "runtime/frame.inline.hpp"
  40 #include "runtime/handles.inline.hpp"
  41 #include "runtime/sharedRuntime.hpp"
  42 #include "runtime/stubCodeGenerator.hpp"
  43 #include "runtime/stubRoutines.hpp"
  44 #include "runtime/thread.inline.hpp"
  45 #include "utilities/align.hpp"
  46 #ifdef COMPILER2
  47 #include "opto/runtime.hpp"
  48 #endif
  49 #if INCLUDE_ZGC
  50 #include "gc/z/zThreadLocalData.hpp"
  51 #endif
  52 
  53 #ifdef BUILTIN_SIM
  54 #include "../../../../../../simulator/simulator.hpp"
  55 #endif
  56 
  57 // Declaration and definition of StubGenerator (no .hpp file).
  58 // For a more detailed description of the stub routine structure
  59 // see the comment in stubRoutines.hpp
  60 
  61 #undef __
  62 #define __ _masm->
  63 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
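// TIMES_OOP builds the scaled-index addressing mode used for oop arrays:
// the 32-bit index is sign-extended (sxtw) and scaled by the heap oop
// size, i.e. 4 bytes with compressed oops and 8 bytes otherwise.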
  64 
  65 #ifdef PRODUCT
  66 #define BLOCK_COMMENT(str) /* nothing */
  67 #else
  68 #define BLOCK_COMMENT(str) __ block_comment(str)
  69 #endif
  70 
  71 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  72 
  73 // Stub Code definitions
  74 
  75 class StubGenerator: public StubCodeGenerator {
  76  private:
  77 
  78 #ifdef PRODUCT
  79 #define inc_counter_np(counter) ((void)0)
  80 #else
  81   void inc_counter_np_(int& counter) {
  82     __ lea(rscratch2, ExternalAddress((address)&counter));
  83     __ ldrw(rscratch1, Address(rscratch2));
  84     __ addw(rscratch1, rscratch1, 1);
  85     __ strw(rscratch1, Address(rscratch2));
  86   }
  87 #define inc_counter_np(counter) \
  88   BLOCK_COMMENT("inc_counter " #counter); \
  89   inc_counter_np_(counter);
  90 #endif
  91 
  92   // Call stubs are used to call Java from C
  93   //
  94   // Arguments:
  95   //    c_rarg0:   call wrapper address                   address
  96   //    c_rarg1:   result                                 address
  97   //    c_rarg2:   result type                            BasicType
  98   //    c_rarg3:   method                                 Method*
  99   //    c_rarg4:   (interpreter) entry point              address
 100   //    c_rarg5:   parameters                             intptr_t*
 101   //    c_rarg6:   parameter size (in words)              int
 102   //    c_rarg7:   thread                                 Thread*
 103   //
 104   // There is no return from the stub itself as any Java result
 105   // is written to result
 106   //
 107   // we save r30 (lr) as the return PC at the base of the frame and
 108   // link r29 (fp) below it as the frame pointer installing sp (r31)
 109   // into fp.
 110   //
 111   // we save r0-r7, which accounts for all the c arguments.
 112   //
 113   // TODO: strictly do we need to save them all? they are treated as
 114   // volatile by C so could we omit saving the ones we are going to
 115   // place in global registers (thread? method?) or those we only use
 116   // during setup of the Java call?
 117   //
 118   // we don't need to save r8 which C uses as an indirect result location
 119   // return register.
 120   //
 121   // we don't need to save r9-r15 which both C and Java treat as
 122   // volatile
 123   //
 124   // we don't need to save r16-18 because Java does not use them
 125   //
 126   // we save r19-r28 which Java uses as scratch registers and C
 127   // expects to be callee-save
 128   //
 129   // we save the bottom 64 bits of each value stored in v8-v15; it is
 130   // the responsibility of the caller to preserve larger values.
 131   //
 132   // so the stub frame looks like this when we enter Java code
 133   //
 134   //     [ return_from_Java     ] <--- sp
 135   //     [ argument word n      ]
 136   //      ...
 137   // -27 [ argument word 1      ]
 138   // -26 [ saved v15            ] <--- sp_after_call
 139   // -25 [ saved v14            ]
 140   // -24 [ saved v13            ]
 141   // -23 [ saved v12            ]
 142   // -22 [ saved v11            ]
 143   // -21 [ saved v10            ]
 144   // -20 [ saved v9             ]
 145   // -19 [ saved v8             ]
 146   // -18 [ saved r28            ]
 147   // -17 [ saved r27            ]
 148   // -16 [ saved r26            ]
 149   // -15 [ saved r25            ]
 150   // -14 [ saved r24            ]
 151   // -13 [ saved r23            ]
 152   // -12 [ saved r22            ]
 153   // -11 [ saved r21            ]
 154   // -10 [ saved r20            ]
 155   //  -9 [ saved r19            ]
 156   //  -8 [ call wrapper    (r0) ]
 157   //  -7 [ result          (r1) ]
 158   //  -6 [ result type     (r2) ]
 159   //  -5 [ method          (r3) ]
 160   //  -4 [ entry point     (r4) ]
 161   //  -3 [ parameters      (r5) ]
 162   //  -2 [ parameter size  (r6) ]
 163   //  -1 [ thread (r7)          ]
 164   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 165   //   1 [ saved lr       (r30) ]
 166 
 167   // Call stub stack layout word offsets from fp
 168   enum call_stub_layout {
 169     sp_after_call_off = -26,
 170 
 171     d15_off            = -26,
 172     d13_off            = -24,
 173     d11_off            = -22,
 174     d9_off             = -20,
 175 
 176     r28_off            = -18,
 177     r26_off            = -16,
 178     r24_off            = -14,
 179     r22_off            = -12,
 180     r20_off            = -10,
 181     call_wrapper_off   =  -8,
 182     result_off         =  -7,
 183     result_type_off    =  -6,
 184     method_off         =  -5,
 185     entry_point_off    =  -4,
 186     parameter_size_off =  -2,
 187     thread_off         =  -1,
 188     fp_f               =   0,
 189     retaddr_off        =   1,
 190   };
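  // n.b. only every other register slot is named above: the registers are
  // saved and restored in pairs with stp/ldp, so the second register of
  // each pair lands in the unnamed slot one word above its partner (e.g.
  // r19 sits at r20_off + 1 and d8 at d9_off + 1 in the layout diagram).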
 191 
 192   address generate_call_stub(address& return_address) {
 193     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 194            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 195            "adjust this code");
 196 
 197     StubCodeMark mark(this, "StubRoutines", "call_stub");
 198     address start = __ pc();
 199 
 200     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 201 
 202     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 203     const Address result        (rfp, result_off         * wordSize);
 204     const Address result_type   (rfp, result_type_off    * wordSize);
 205     const Address method        (rfp, method_off         * wordSize);
 206     const Address entry_point   (rfp, entry_point_off    * wordSize);
 207     const Address parameter_size(rfp, parameter_size_off * wordSize);
 208 
 209     const Address thread        (rfp, thread_off         * wordSize);
 210 
 211     const Address d15_save      (rfp, d15_off * wordSize);
 212     const Address d13_save      (rfp, d13_off * wordSize);
 213     const Address d11_save      (rfp, d11_off * wordSize);
 214     const Address d9_save       (rfp, d9_off * wordSize);
 215 
 216     const Address r28_save      (rfp, r28_off * wordSize);
 217     const Address r26_save      (rfp, r26_off * wordSize);
 218     const Address r24_save      (rfp, r24_off * wordSize);
 219     const Address r22_save      (rfp, r22_off * wordSize);
 220     const Address r20_save      (rfp, r20_off * wordSize);
 221 
 222     // stub code
 223 
 224     // we need a C prolog to bootstrap the x86 caller into the sim
 225     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 226 
 227     address aarch64_entry = __ pc();
 228 
 229 #ifdef BUILTIN_SIM
 230     // Save sender's SP for stack traces.
 231     __ mov(rscratch1, sp);
 232     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 233 #endif
 234     // set up frame and move sp to end of save area
 235     __ enter();
 236     __ sub(sp, rfp, -sp_after_call_off * wordSize);
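    // sp now points at the sp_after_call slot, the lowest word of the
    // register save area; the stores below fill in the save area between
    // here and the saved fp.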
 237 
 238     // save register parameters and Java scratch/global registers
 239     // n.b. we save thread even though it gets installed in
 240     // rthread because we want to sanity check rthread later
 241     __ str(c_rarg7,  thread);
 242     __ strw(c_rarg6, parameter_size);
 243     __ stp(c_rarg4, c_rarg5,  entry_point);
 244     __ stp(c_rarg2, c_rarg3,  result_type);
 245     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 246 
 247     __ stp(r20, r19,   r20_save);
 248     __ stp(r22, r21,   r22_save);
 249     __ stp(r24, r23,   r24_save);
 250     __ stp(r26, r25,   r26_save);
 251     __ stp(r28, r27,   r28_save);
 252 
 253     __ stpd(v9,  v8,   d9_save);
 254     __ stpd(v11, v10,  d11_save);
 255     __ stpd(v13, v12,  d13_save);
 256     __ stpd(v15, v14,  d15_save);
 257 
 258     // install Java thread in global register now we have saved
 259     // whatever value it held
 260     __ mov(rthread, c_rarg7);
 261     // And method
 262     __ mov(rmethod, c_rarg3);
 263 
 264     // set up the heapbase register
 265     __ reinit_heapbase();
 266 
 267 #ifdef ASSERT
 268     // make sure we have no pending exceptions
 269     {
 270       Label L;
 271       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 272       __ cmp(rscratch1, (u1)NULL_WORD);
 273       __ br(Assembler::EQ, L);
 274       __ stop("StubRoutines::call_stub: entered with pending exception");
 275       __ BIND(L);
 276     }
 277 #endif
 278     // pass parameters if any
 279     __ mov(esp, sp);
 280     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 281     __ andr(sp, rscratch1, -2 * wordSize);
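    // n.b. the andr with -2 * wordSize rounds sp down to a 16-byte
    // boundary, as required for sp-relative accesses by the AArch64 ABI.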
 282 
 283     BLOCK_COMMENT("pass parameters if any");
 284     Label parameters_done;
 285     // parameter count is still in c_rarg6
 286     // and parameter pointer identifying param 1 is in c_rarg5
 287     __ cbzw(c_rarg6, parameters_done);
 288 
 289     address loop = __ pc();
 290     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 291     __ subsw(c_rarg6, c_rarg6, 1);
 292     __ push(rscratch1);
 293     __ br(Assembler::GT, loop);
 294 
 295     __ BIND(parameters_done);
 296 
    // call Java entry -- passing Method* and current sp
 298     //      rmethod: Method*
 299     //      r13: sender sp
 300     BLOCK_COMMENT("call Java function");
 301     __ mov(r13, sp);
 302     __ blr(c_rarg4);
 303 
 304     // tell the simulator we have returned to the stub
 305 
 306     // we do this here because the notify will already have been done
 307     // if we get to the next instruction via an exception
 308     //
 309     // n.b. adding this instruction here affects the calculation of
 310     // whether or not a routine returns to the call stub (used when
 311     // doing stack walks) since the normal test is to check the return
 312     // pc against the address saved below. so we may need to allow for
 313     // this extra instruction in the check.
 314 
 315     if (NotifySimulator) {
 316       __ notify(Assembler::method_reentry);
 317     }
 318     // save current address for use by exception handling code
 319 
 320     return_address = __ pc();
 321 
 322     // store result depending on type (everything that is not
 323     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 324     // n.b. this assumes Java returns an integral result in r0
 325     // and a floating result in j_farg0
 326     __ ldr(j_rarg2, result);
 327     Label is_long, is_float, is_double, exit;
 328     __ ldr(j_rarg1, result_type);
 329     __ cmp(j_rarg1, (u1)T_OBJECT);
 330     __ br(Assembler::EQ, is_long);
 331     __ cmp(j_rarg1, (u1)T_LONG);
 332     __ br(Assembler::EQ, is_long);
 333     __ cmp(j_rarg1, (u1)T_FLOAT);
 334     __ br(Assembler::EQ, is_float);
 335     __ cmp(j_rarg1, (u1)T_DOUBLE);
 336     __ br(Assembler::EQ, is_double);
 337 
 338     // handle T_INT case
 339     __ strw(r0, Address(j_rarg2));
 340 
 341     __ BIND(exit);
 342 
 343     // pop parameters
 344     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 345 
 346 #ifdef ASSERT
 347     // verify that threads correspond
 348     {
 349       Label L, S;
 350       __ ldr(rscratch1, thread);
 351       __ cmp(rthread, rscratch1);
 352       __ br(Assembler::NE, S);
 353       __ get_thread(rscratch1);
 354       __ cmp(rthread, rscratch1);
 355       __ br(Assembler::EQ, L);
 356       __ BIND(S);
 357       __ stop("StubRoutines::call_stub: threads must correspond");
 358       __ BIND(L);
 359     }
 360 #endif
 361 
 362     // restore callee-save registers
 363     __ ldpd(v15, v14,  d15_save);
 364     __ ldpd(v13, v12,  d13_save);
 365     __ ldpd(v11, v10,  d11_save);
 366     __ ldpd(v9,  v8,   d9_save);
 367 
 368     __ ldp(r28, r27,   r28_save);
 369     __ ldp(r26, r25,   r26_save);
 370     __ ldp(r24, r23,   r24_save);
 371     __ ldp(r22, r21,   r22_save);
 372     __ ldp(r20, r19,   r20_save);
 373 
 374     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 375     __ ldrw(c_rarg2, result_type);
 376     __ ldr(c_rarg3,  method);
 377     __ ldp(c_rarg4, c_rarg5,  entry_point);
 378     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 379 
 380 #ifndef PRODUCT
 381     // tell the simulator we are about to end Java execution
 382     if (NotifySimulator) {
 383       __ notify(Assembler::method_exit);
 384     }
 385 #endif
 386     // leave frame and return to caller
 387     __ leave();
 388     __ ret(lr);
 389 
 390     // handle return types different from T_INT
 391 
 392     __ BIND(is_long);
 393     __ str(r0, Address(j_rarg2, 0));
 394     __ br(Assembler::AL, exit);
 395 
 396     __ BIND(is_float);
 397     __ strs(j_farg0, Address(j_rarg2, 0));
 398     __ br(Assembler::AL, exit);
 399 
 400     __ BIND(is_double);
 401     __ strd(j_farg0, Address(j_rarg2, 0));
 402     __ br(Assembler::AL, exit);
 403 
 404     return start;
 405   }
 406 
 407   // Return point for a Java call if there's an exception thrown in
 408   // Java code.  The exception is caught and transformed into a
 409   // pending exception stored in JavaThread that can be tested from
 410   // within the VM.
 411   //
 412   // Note: Usually the parameters are removed by the callee. In case
 413   // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // stack pointer (sp).
 416   //
 417   // r0: exception oop
 418 
 419   // NOTE: this is used as a target from the signal handler so it
 420   // needs an x86 prolog which returns into the current simulator
 421   // executing the generated catch_exception code. so the prolog
 422   // needs to install rax in a sim register and adjust the sim's
 423   // restart pc to enter the generated code at the start position
 424   // then return from native to simulated execution.
 425 
 426   address generate_catch_exception() {
 427     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 428     address start = __ pc();
 429 
 430     // same as in generate_call_stub():
 431     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 432     const Address thread        (rfp, thread_off         * wordSize);
 433 
 434 #ifdef ASSERT
 435     // verify that threads correspond
 436     {
 437       Label L, S;
 438       __ ldr(rscratch1, thread);
 439       __ cmp(rthread, rscratch1);
 440       __ br(Assembler::NE, S);
 441       __ get_thread(rscratch1);
 442       __ cmp(rthread, rscratch1);
 443       __ br(Assembler::EQ, L);
 444       __ bind(S);
 445       __ stop("StubRoutines::catch_exception: threads must correspond");
 446       __ bind(L);
 447     }
 448 #endif
 449 
 450     // set pending exception
 451     __ verify_oop(r0);
 452 
 453     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 454     __ mov(rscratch1, (address)__FILE__);
 455     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 456     __ movw(rscratch1, (int)__LINE__);
 457     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 458 
 459     // complete return to VM
 460     assert(StubRoutines::_call_stub_return_address != NULL,
 461            "_call_stub_return_address must have been generated before");
 462     __ b(StubRoutines::_call_stub_return_address);
 463 
 464     return start;
 465   }
 466 
 467   // Continuation point for runtime calls returning with a pending
 468   // exception.  The pending exception check happened in the runtime
 469   // or native call stub.  The pending exception in Thread is
 470   // converted into a Java-level exception.
 471   //
 472   // Contract with Java-level exception handlers:
 473   // r0: exception
 474   // r3: throwing pc
 475   //
 476   // NOTE: At entry of this stub, exception-pc must be in LR !!
 477 
 478   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog
 480 
 481   address generate_forward_exception() {
 482     StubCodeMark mark(this, "StubRoutines", "forward exception");
 483     address start = __ pc();
 484 
 485     // Upon entry, LR points to the return address returning into
 486     // Java (interpreted or compiled) code; i.e., the return address
 487     // becomes the throwing pc.
 488     //
 489     // Arguments pushed before the runtime call are still on the stack
 490     // but the exception handler will reset the stack pointer ->
 491     // ignore them.  A potential result in registers can be ignored as
 492     // well.
 493 
 494 #ifdef ASSERT
 495     // make sure this code is only executed if there is a pending exception
 496     {
 497       Label L;
 498       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 499       __ cbnz(rscratch1, L);
 500       __ stop("StubRoutines::forward exception: no pending exception (1)");
 501       __ bind(L);
 502     }
 503 #endif
 504 
 505     // compute exception handler into r19
 506 
 507     // call the VM to find the handler address associated with the
 508     // caller address. pass thread in r0 and caller pc (ret address)
 509     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 510     // the stack.
 511     __ mov(c_rarg1, lr);
 512     // lr will be trashed by the VM call so we move it to R19
 513     // (callee-saved) because we also need to pass it to the handler
 514     // returned by this call.
 515     __ mov(r19, lr);
 516     BLOCK_COMMENT("call exception_handler_for_return_address");
 517     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 518                          SharedRuntime::exception_handler_for_return_address),
 519                     rthread, c_rarg1);
 520     // we should not really care that lr is no longer the callee
 521     // address. we saved the value the handler needs in r19 so we can
 522     // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
 524     // the PC for the frame above the handler belongs to a compiled
 525     // Java method. So, we restore lr here to satisfy that assert.
 526     __ mov(lr, r19);
 527     // setup r0 & r3 & clear pending exception
 528     __ mov(r3, r19);
 529     __ mov(r19, r0);
 530     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 531     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 532 
 533 #ifdef ASSERT
 534     // make sure exception is set
 535     {
 536       Label L;
 537       __ cbnz(r0, L);
 538       __ stop("StubRoutines::forward exception: no pending exception (2)");
 539       __ bind(L);
 540     }
 541 #endif
 542 
 543     // continue at exception handler
 544     // r0: exception
 545     // r3: throwing pc
 546     // r19: exception handler
 547     __ verify_oop(r0);
 548     __ br(r19);
 549 
 550     return start;
 551   }
 552 
 553   // Non-destructive plausibility checks for oops
 554   //
 555   // Arguments:
 556   //    r0: oop to verify
 557   //    rscratch1: error message
 558   //
 559   // Stack after saving c_rarg3:
 560   //    [tos + 0]: saved c_rarg3
 561   //    [tos + 1]: saved c_rarg2
 562   //    [tos + 2]: saved lr
 563   //    [tos + 3]: saved rscratch2
 564   //    [tos + 4]: saved r0
 565   //    [tos + 5]: saved rscratch1
 566   address generate_verify_oop() {
 567 
 568     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 569     address start = __ pc();
 570 
 571     Label exit, error;
 572 
 573     // save c_rarg2 and c_rarg3
 574     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 575 
 576     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 577     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 578     __ ldr(c_rarg3, Address(c_rarg2));
 579     __ add(c_rarg3, c_rarg3, 1);
 580     __ str(c_rarg3, Address(c_rarg2));
 581 
 582     // object is in r0
 583     // make sure object is 'reasonable'
 584     __ cbz(r0, exit); // if obj is NULL it is OK
 585 
 586 #if INCLUDE_ZGC
 587     if (UseZGC) {
 588       // Check if mask is good.
 589       // verifies that ZAddressBadMask & r0 == 0
 590       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 591       __ andr(c_rarg2, r0, c_rarg3);
 592       __ cbnz(c_rarg2, error);
 593     }
 594 #endif
 595 
 596     // Check if the oop is in the right area of memory
 597     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 598     __ andr(c_rarg2, r0, c_rarg3);
 599     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 600 
 601     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 602     // instruction here because the flags register is live.
 603     __ eor(c_rarg2, c_rarg2, c_rarg3);
 604     __ cbnz(c_rarg2, error);
 605 
 606     // make sure klass is 'reasonable', which is not zero.
 607     __ load_klass(r0, r0);  // get klass
 608     __ cbz(r0, error);      // if klass is NULL it is broken
 609 
 610     // return if everything seems ok
 611     __ bind(exit);
 612 
 613     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 614     __ ret(lr);
 615 
 616     // handle errors
 617     __ bind(error);
 618     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 619 
 620     __ push(RegSet::range(r0, r29), sp);
 621     // debug(char* msg, int64_t pc, int64_t regs[])
 622     __ mov(c_rarg0, rscratch1);      // pass address of error message
 623     __ mov(c_rarg1, lr);             // pass return address
 624     __ mov(c_rarg2, sp);             // pass address of regs on stack
 625 #ifndef PRODUCT
 626     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 627 #endif
 628     BLOCK_COMMENT("call MacroAssembler::debug");
 629     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 630     __ blrt(rscratch1, 3, 0, 1);
 631 
 632     return start;
 633   }
 634 
 635   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 636 
 637   // The inner part of zero_words().  This is the bulk operation,
 638   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 639   // caller is responsible for zeroing the last few words.
 640   //
 641   // Inputs:
 642   // r10: the HeapWord-aligned base address of an array to zero.
 643   // r11: the count in HeapWords, r11 > 0.
 644   //
 645   // Returns r10 and r11, adjusted for the caller to clear.
 646   // r10: the base address of the tail of words left to clear.
 647   // r11: the number of words in the tail.
 648   //      r11 < MacroAssembler::zero_words_block_size.
 649 
 650   address generate_zero_blocks() {
 651     Label done;
 652     Label base_aligned;
 653 
 654     Register base = r10, cnt = r11;
 655 
 656     __ align(CodeEntryAlignment);
 657     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 658     address start = __ pc();
 659 
 660     if (UseBlockZeroing) {
 661       int zva_length = VM_Version::zva_length();
 662 
 663       // Ensure ZVA length can be divided by 16. This is required by
 664       // the subsequent operations.
 665       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 666 
 667       __ tbz(base, 3, base_aligned);
 668       __ str(zr, Address(__ post(base, 8)));
 669       __ sub(cnt, cnt, 1);
 670       __ bind(base_aligned);
 671 
 672       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 673       // alignment.
 674       Label small;
 675       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
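      // low_limit is in bytes while cnt is in (8-byte) words, hence
      // low_limit is shifted right by 3 in the comparison below.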
 676       __ subs(rscratch1, cnt, low_limit >> 3);
 677       __ br(Assembler::LT, small);
 678       __ zero_dcache_blocks(base, cnt);
 679       __ bind(small);
 680     }
 681 
 682     {
 683       // Number of stp instructions we'll unroll
 684       const int unroll =
 685         MacroAssembler::zero_words_block_size / 2;
 686       // Clear the remaining blocks.
 687       Label loop;
 688       __ subs(cnt, cnt, unroll * 2);
 689       __ br(Assembler::LT, done);
 690       __ bind(loop);
 691       for (int i = 0; i < unroll; i++)
 692         __ stp(zr, zr, __ post(base, 16));
 693       __ subs(cnt, cnt, unroll * 2);
 694       __ br(Assembler::GE, loop);
 695       __ bind(done);
 696       __ add(cnt, cnt, unroll * 2);
 697     }
 698 
 699     __ ret(lr);
 700 
 701     return start;
 702   }
 703 
 704 
 705   typedef enum {
 706     copy_forwards = 1,
 707     copy_backwards = -1
 708   } copy_direction;
 709 
 710   // Bulk copy of blocks of 8 words.
 711   //
 712   // count is a count of words.
 713   //
 714   // Precondition: count >= 8
 715   //
 716   // Postconditions:
 717   //
 718   // The least significant bit of count contains the remaining count
 719   // of words to copy.  The rest of count is trash.
 720   //
 721   // s and d are adjusted to point to the remaining words to copy
 722   //
 723   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 724                            copy_direction direction) {
 725     int unit = wordSize * direction;
 726     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
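    // bias is the width of one ldp/stp group: 2 words, or 4 words when
    // using SIMD pairs.  for a forwards copy s and d are biased down by
    // this amount (below) so that the positive group offsets used in the
    // loop address the data starting at the original s and d.  the
    // backwards copy needs no bias because its negative unit already
    // addresses the data below s and d.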
 727 
 728     int offset;
 729     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 730       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 731     const Register stride = r13;
 732 
 733     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 734     assert_different_registers(s, d, count, rscratch1);
 735 
 736     Label again, drain;
 737     const char *stub_name;
 738     if (direction == copy_forwards)
 739       stub_name = "forward_copy_longs";
 740     else
 741       stub_name = "backward_copy_longs";
 742 
 743     __ align(CodeEntryAlignment);
 744 
 745     StubCodeMark mark(this, "StubRoutines", stub_name);
 746 
 747     __ bind(start);
 748 
 749     Label unaligned_copy_long;
 750     if (AvoidUnalignedAccesses) {
 751       __ tbnz(d, 3, unaligned_copy_long);
 752     }
 753 
 754     if (direction == copy_forwards) {
 755       __ sub(s, s, bias);
 756       __ sub(d, d, bias);
 757     }
 758 
 759 #ifdef ASSERT
 760     // Make sure we are never given < 8 words
 761     {
 762       Label L;
 763       __ cmp(count, (u1)8);
 764       __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
 766       __ bind(L);
 767     }
 768 #endif
 769 
 770     // Fill 8 registers
 771     if (UseSIMDForMemoryOps) {
 772       __ ldpq(v0, v1, Address(s, 4 * unit));
 773       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 774     } else {
 775       __ ldp(t0, t1, Address(s, 2 * unit));
 776       __ ldp(t2, t3, Address(s, 4 * unit));
 777       __ ldp(t4, t5, Address(s, 6 * unit));
 778       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 779     }
 780 
 781     __ subs(count, count, 16);
 782     __ br(Assembler::LO, drain);
 783 
 784     int prefetch = PrefetchCopyIntervalInBytes;
 785     bool use_stride = false;
 786     if (direction == copy_backwards) {
 787        use_stride = prefetch > 256;
 788        prefetch = -prefetch;
 789        if (use_stride) __ mov(stride, prefetch);
 790     }
 791 
 792     __ bind(again);
 793 
 794     if (PrefetchCopyIntervalInBytes > 0)
 795       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 796 
 797     if (UseSIMDForMemoryOps) {
 798       __ stpq(v0, v1, Address(d, 4 * unit));
 799       __ ldpq(v0, v1, Address(s, 4 * unit));
 800       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 801       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 802     } else {
 803       __ stp(t0, t1, Address(d, 2 * unit));
 804       __ ldp(t0, t1, Address(s, 2 * unit));
 805       __ stp(t2, t3, Address(d, 4 * unit));
 806       __ ldp(t2, t3, Address(s, 4 * unit));
 807       __ stp(t4, t5, Address(d, 6 * unit));
 808       __ ldp(t4, t5, Address(s, 6 * unit));
 809       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 810       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 811     }
 812 
 813     __ subs(count, count, 8);
 814     __ br(Assembler::HS, again);
 815 
 816     // Drain
 817     __ bind(drain);
 818     if (UseSIMDForMemoryOps) {
 819       __ stpq(v0, v1, Address(d, 4 * unit));
 820       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 821     } else {
 822       __ stp(t0, t1, Address(d, 2 * unit));
 823       __ stp(t2, t3, Address(d, 4 * unit));
 824       __ stp(t4, t5, Address(d, 6 * unit));
 825       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 826     }
 827 
 828     {
 829       Label L1, L2;
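      // bits 2 and 1 of count say whether a 4 word and/or a 2 word
      // subblock remains to be copied; any final odd word (bit 0) is
      // left for the caller, as described in the header comment.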
 830       __ tbz(count, exact_log2(4), L1);
 831       if (UseSIMDForMemoryOps) {
 832         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 833         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 834       } else {
 835         __ ldp(t0, t1, Address(s, 2 * unit));
 836         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 837         __ stp(t0, t1, Address(d, 2 * unit));
 838         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 839       }
 840       __ bind(L1);
 841 
 842       if (direction == copy_forwards) {
 843         __ add(s, s, bias);
 844         __ add(d, d, bias);
 845       }
 846 
 847       __ tbz(count, 1, L2);
 848       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 849       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 850       __ bind(L2);
 851     }
 852 
 853     __ ret(lr);
 854 
 855     if (AvoidUnalignedAccesses) {
 856       Label drain, again;
 857       // Register order for storing. Order is different for backward copy.
 858 
 859       __ bind(unaligned_copy_long);
 860 
 861       // source address is even aligned, target odd aligned
 862       //
 863       // when forward copying word pairs we read long pairs at offsets
 864       // {0, 2, 4, 6} (in long words). when backwards copying we read
 865       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 866       // address by -2 in the forwards case so we can compute the
 867       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 868       // or -1.
 869       //
 870       // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
 873       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 874       //
      // When backwards copying we need to store 1 word, 3 pairs and
 876       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 877       // offsets {1, 3, 5, 7, 8} * unit.
 878 
 879       if (direction == copy_forwards) {
 880         __ sub(s, s, 16);
 881         __ sub(d, d, 8);
 882       }
 883 
 884       // Fill 8 registers
 885       //
 886       // for forwards copy s was offset by -16 from the original input
 887       // value of s so the register contents are at these offsets
      // relative to the 64 byte block addressed by that original input
 889       // and so on for each successive 64 byte block when s is updated
 890       //
 891       // t0 at offset 0,  t1 at offset 8
 892       // t2 at offset 16, t3 at offset 24
 893       // t4 at offset 32, t5 at offset 40
 894       // t6 at offset 48, t7 at offset 56
 895 
 896       // for backwards copy s was not offset so the register contents
 897       // are at these offsets into the preceding 64 byte block
 898       // relative to that original input and so on for each successive
 899       // preceding 64 byte block when s is updated. this explains the
 900       // slightly counter-intuitive looking pattern of register usage
 901       // in the stp instructions for backwards copy.
 902       //
 903       // t0 at offset -16, t1 at offset -8
 904       // t2 at offset -32, t3 at offset -24
 905       // t4 at offset -48, t5 at offset -40
 906       // t6 at offset -64, t7 at offset -56
 907 
 908       __ ldp(t0, t1, Address(s, 2 * unit));
 909       __ ldp(t2, t3, Address(s, 4 * unit));
 910       __ ldp(t4, t5, Address(s, 6 * unit));
 911       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 912 
 913       __ subs(count, count, 16);
 914       __ br(Assembler::LO, drain);
 915 
 916       int prefetch = PrefetchCopyIntervalInBytes;
 917       bool use_stride = false;
 918       if (direction == copy_backwards) {
 919          use_stride = prefetch > 256;
 920          prefetch = -prefetch;
 921          if (use_stride) __ mov(stride, prefetch);
 922       }
 923 
 924       __ bind(again);
 925 
 926       if (PrefetchCopyIntervalInBytes > 0)
 927         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 928 
 929       if (direction == copy_forwards) {
       // allowing for the offset of -8, the store instructions place
       // registers into the target 64 byte block at the following
 932        // offsets
 933        //
 934        // t0 at offset 0
 935        // t1 at offset 8,  t2 at offset 16
 936        // t3 at offset 24, t4 at offset 32
 937        // t5 at offset 40, t6 at offset 48
 938        // t7 at offset 56
 939 
 940         __ str(t0, Address(d, 1 * unit));
 941         __ stp(t1, t2, Address(d, 2 * unit));
 942         __ ldp(t0, t1, Address(s, 2 * unit));
 943         __ stp(t3, t4, Address(d, 4 * unit));
 944         __ ldp(t2, t3, Address(s, 4 * unit));
 945         __ stp(t5, t6, Address(d, 6 * unit));
 946         __ ldp(t4, t5, Address(s, 6 * unit));
 947         __ str(t7, Address(__ pre(d, 8 * unit)));
 948         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 949       } else {
 950        // d was not offset when we started so the registers are
       // written into the 64 byte block preceding d with the following
 952        // offsets
 953        //
 954        // t1 at offset -8
 955        // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
 957        // t7 at offset -56, t4 at offset -48
 958        //                   t6 at offset -64
 959        //
 960        // note that this matches the offsets previously noted for the
 961        // loads
 962 
 963         __ str(t1, Address(d, 1 * unit));
 964         __ stp(t3, t0, Address(d, 3 * unit));
 965         __ ldp(t0, t1, Address(s, 2 * unit));
 966         __ stp(t5, t2, Address(d, 5 * unit));
 967         __ ldp(t2, t3, Address(s, 4 * unit));
 968         __ stp(t7, t4, Address(d, 7 * unit));
 969         __ ldp(t4, t5, Address(s, 6 * unit));
 970         __ str(t6, Address(__ pre(d, 8 * unit)));
 971         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 972       }
 973 
 974       __ subs(count, count, 8);
 975       __ br(Assembler::HS, again);
 976 
 977       // Drain
 978       //
 979       // this uses the same pattern of offsets and register arguments
 980       // as above
 981       __ bind(drain);
 982       if (direction == copy_forwards) {
 983         __ str(t0, Address(d, 1 * unit));
 984         __ stp(t1, t2, Address(d, 2 * unit));
 985         __ stp(t3, t4, Address(d, 4 * unit));
 986         __ stp(t5, t6, Address(d, 6 * unit));
 987         __ str(t7, Address(__ pre(d, 8 * unit)));
 988       } else {
 989         __ str(t1, Address(d, 1 * unit));
 990         __ stp(t3, t0, Address(d, 3 * unit));
 991         __ stp(t5, t2, Address(d, 5 * unit));
 992         __ stp(t7, t4, Address(d, 7 * unit));
 993         __ str(t6, Address(__ pre(d, 8 * unit)));
 994       }
 995       // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
 998       // have each such subblock
 999       {
1000         Label L1, L2;
1001         __ tbz(count, exact_log2(4), L1);
1002        // this is the same as above but copying only 4 longs hence
       // with only one intervening stp between the str instructions
1004        // but note that the offsets and registers still follow the
1005        // same pattern
1006         __ ldp(t0, t1, Address(s, 2 * unit));
1007         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1008         if (direction == copy_forwards) {
1009           __ str(t0, Address(d, 1 * unit));
1010           __ stp(t1, t2, Address(d, 2 * unit));
1011           __ str(t3, Address(__ pre(d, 4 * unit)));
1012         } else {
1013           __ str(t1, Address(d, 1 * unit));
1014           __ stp(t3, t0, Address(d, 3 * unit));
1015           __ str(t2, Address(__ pre(d, 4 * unit)));
1016         }
1017         __ bind(L1);
1018 
1019         __ tbz(count, 1, L2);
1020        // this is the same as above but copying only 2 longs hence
1021        // there is no intervening stp between the str instructions
1022        // but note that the offset and register patterns are still
1023        // the same
1024         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1025         if (direction == copy_forwards) {
1026           __ str(t0, Address(d, 1 * unit));
1027           __ str(t1, Address(__ pre(d, 2 * unit)));
1028         } else {
1029           __ str(t1, Address(d, 1 * unit));
1030           __ str(t0, Address(__ pre(d, 2 * unit)));
1031         }
1032         __ bind(L2);
1033 
1034        // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written
1036 
1037        if (direction == copy_forwards) {
1038          __ add(s, s, 16);
1039          __ add(d, d, 8);
1040        }
1041 
1042       }
1043 
1044       __ ret(lr);
1045       }
1046   }
1047 
1048   // Small copy: less than 16 bytes.
1049   //
1050   // NB: Ignores all of the bits of count which represent more than 15
1051   // bytes, so a caller doesn't have to mask them.
1052 
1053   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1054     bool is_backwards = step < 0;
1055     size_t granularity = uabs(step);
1056     int direction = is_backwards ? -1 : 1;
1057     int unit = wordSize * direction;
1058 
1059     Label Lword, Lint, Lshort, Lbyte;
1060 
1061     assert(granularity
1062            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1063 
1064     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1065 
1066     // ??? I don't know if this bit-test-and-branch is the right thing
1067     // to do.  It does a lot of jumping, resulting in several
1068     // mispredicted branches.  It might make more sense to do this
1069     // with something like Duff's device with a single computed branch.
1070 
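    // count is in units of granularity bytes, so bit (3 - log2(granularity))
    // of count is set exactly when a full 8-byte word remains to be copied.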
1071     __ tbz(count, 3 - exact_log2(granularity), Lword);
1072     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1073     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1074     __ bind(Lword);
1075 
1076     if (granularity <= sizeof (jint)) {
1077       __ tbz(count, 2 - exact_log2(granularity), Lint);
1078       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1079       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1080       __ bind(Lint);
1081     }
1082 
1083     if (granularity <= sizeof (jshort)) {
1084       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1085       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1086       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1087       __ bind(Lshort);
1088     }
1089 
1090     if (granularity <= sizeof (jbyte)) {
1091       __ tbz(count, 0, Lbyte);
1092       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1093       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1094       __ bind(Lbyte);
1095     }
1096   }
1097 
1098   Label copy_f, copy_b;
1099 
1100   // All-singing all-dancing memory copy.
1101   //
  // Copy count units of memory from s to d.  The size of a unit is
  // the absolute value of step; its sign gives the direction of the
  // copy.  If is_aligned is false, we align the source address.
1105   //
1106 
1107   void copy_memory(bool is_aligned, Register s, Register d,
1108                    Register count, Register tmp, int step) {
1109     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1110     bool is_backwards = step < 0;
1111     int granularity = uabs(step);
1112     const Register t0 = r3, t1 = r4;
1113 
    // <= 96 bytes: copy inline. Direction doesn't matter because we always
1115     // load all the data before writing anything
1116     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1117     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1118     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1119     const Register send = r17, dend = r18;
1120 
1121     if (PrefetchCopyIntervalInBytes > 0)
1122       __ prfm(Address(s, 0), PLDL1KEEP);
1123     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1124     __ br(Assembler::HI, copy_big);
1125 
1126     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1127     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1128 
1129     __ cmp(count, u1(16/granularity));
1130     __ br(Assembler::LS, copy16);
1131 
1132     __ cmp(count, u1(64/granularity));
1133     __ br(Assembler::HI, copy80);
1134 
1135     __ cmp(count, u1(32/granularity));
1136     __ br(Assembler::LS, copy32);
1137 
1138     // 33..64 bytes
1139     if (UseSIMDForMemoryOps) {
1140       __ ldpq(v0, v1, Address(s, 0));
1141       __ ldpq(v2, v3, Address(send, -32));
1142       __ stpq(v0, v1, Address(d, 0));
1143       __ stpq(v2, v3, Address(dend, -32));
1144     } else {
1145       __ ldp(t0, t1, Address(s, 0));
1146       __ ldp(t2, t3, Address(s, 16));
1147       __ ldp(t4, t5, Address(send, -32));
1148       __ ldp(t6, t7, Address(send, -16));
1149 
1150       __ stp(t0, t1, Address(d, 0));
1151       __ stp(t2, t3, Address(d, 16));
1152       __ stp(t4, t5, Address(dend, -32));
1153       __ stp(t6, t7, Address(dend, -16));
1154     }
1155     __ b(finish);
1156 
1157     // 17..32 bytes
1158     __ bind(copy32);
1159     __ ldp(t0, t1, Address(s, 0));
1160     __ ldp(t2, t3, Address(send, -16));
1161     __ stp(t0, t1, Address(d, 0));
1162     __ stp(t2, t3, Address(dend, -16));
1163     __ b(finish);
1164 
1165     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
1167     __ bind(copy80);
1168     if (UseSIMDForMemoryOps) {
1169       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1170       __ ldpq(v4, v5, Address(send, -32));
1171       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1172       __ stpq(v4, v5, Address(dend, -32));
1173     } else {
1174       __ ldp(t0, t1, Address(s, 0));
1175       __ ldp(t2, t3, Address(s, 16));
1176       __ ldp(t4, t5, Address(s, 32));
1177       __ ldp(t6, t7, Address(s, 48));
1178       __ ldp(t8, t9, Address(send, -16));
1179 
1180       __ stp(t0, t1, Address(d, 0));
1181       __ stp(t2, t3, Address(d, 16));
1182       __ stp(t4, t5, Address(d, 32));
1183       __ stp(t6, t7, Address(d, 48));
1184       __ stp(t8, t9, Address(dend, -16));
1185     }
1186     __ b(finish);
1187 
1188     // 0..16 bytes
1189     __ bind(copy16);
1190     __ cmp(count, u1(8/granularity));
1191     __ br(Assembler::LO, copy8);
1192 
1193     // 8..16 bytes
1194     __ ldr(t0, Address(s, 0));
1195     __ ldr(t1, Address(send, -8));
1196     __ str(t0, Address(d, 0));
1197     __ str(t1, Address(dend, -8));
1198     __ b(finish);
1199 
1200     if (granularity < 8) {
1201       // 4..7 bytes
1202       __ bind(copy8);
1203       __ tbz(count, 2 - exact_log2(granularity), copy4);
1204       __ ldrw(t0, Address(s, 0));
1205       __ ldrw(t1, Address(send, -4));
1206       __ strw(t0, Address(d, 0));
1207       __ strw(t1, Address(dend, -4));
1208       __ b(finish);
1209       if (granularity < 4) {
1210         // 0..3 bytes
1211         __ bind(copy4);
1212         __ cbz(count, finish); // get rid of 0 case
1213         if (granularity == 2) {
1214           __ ldrh(t0, Address(s, 0));
1215           __ strh(t0, Address(d, 0));
1216         } else { // granularity == 1
1217           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1218           // the first and last byte.
1219           // Handle the 3 byte case by loading and storing base + count/2
1220           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
1222           // byte 3 times.
1223           __ lsr(count, count, 1);
1224           __ ldrb(t0, Address(s, 0));
1225           __ ldrb(t1, Address(send, -1));
1226           __ ldrb(t2, Address(s, count));
1227           __ strb(t0, Address(d, 0));
1228           __ strb(t1, Address(dend, -1));
1229           __ strb(t2, Address(d, count));
1230         }
1231         __ b(finish);
1232       }
1233     }
1234 
1235     __ bind(copy_big);
1236     if (is_backwards) {
1237       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1238       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1239     }
1240 
    // Now we've got the small case out of the way, we can align the
1242     // source address on a 2-word boundary.
1243 
1244     Label aligned;
1245 
1246     if (is_aligned) {
1247       // We may have to adjust by 1 word to get s 2-word-aligned.
1248       __ tbz(s, exact_log2(wordSize), aligned);
1249       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1250       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1251       __ sub(count, count, wordSize/granularity);
1252     } else {
1253       if (is_backwards) {
1254         __ andr(rscratch2, s, 2 * wordSize - 1);
1255       } else {
1256         __ neg(rscratch2, s);
1257         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1258       }
1259       // rscratch2 is the byte adjustment needed to align s.
1260       __ cbz(rscratch2, aligned);
1261       int shift = exact_log2(granularity);
1262       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1263       __ sub(count, count, rscratch2);
1264 
1265 #if 0
1266       // ?? This code is only correct for a disjoint copy.  It may or
1267       // may not make sense to use it in that case.
1268 
1269       // Copy the first pair; s and d may not be aligned.
1270       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1271       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1272 
1273       // Align s and d, adjust count
1274       if (is_backwards) {
1275         __ sub(s, s, rscratch2);
1276         __ sub(d, d, rscratch2);
1277       } else {
1278         __ add(s, s, rscratch2);
1279         __ add(d, d, rscratch2);
1280       }
1281 #else
1282       copy_memory_small(s, d, rscratch2, rscratch1, step);
1283 #endif
1284     }
1285 
1286     __ bind(aligned);
1287 
1288     // s is now 2-word-aligned.
1289 
1290     // We have a count of units and some trailing bytes.  Adjust the
1291     // count and do a bulk copy of words.
1292     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
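    // n.b. rscratch2 now holds the count in whole words; the copy_f/copy_b
    // bulk stubs expect their word count in rscratch2 (the count register
    // they are generated with at their call sites), while count itself is
    // preserved so its low bits can drive the tail copy below.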
1293     if (direction == copy_forwards)
1294       __ bl(copy_f);
1295     else
1296       __ bl(copy_b);
1297 
1298     // And the tail.
1299     copy_memory_small(s, d, count, tmp, step);
1300 
1301     if (granularity >= 8) __ bind(copy8);
1302     if (granularity >= 4) __ bind(copy4);
1303     __ bind(finish);
1304   }
1305 
1306 
1307   void clobber_registers() {
1308 #ifdef ASSERT
1309     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1310     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1311     for (Register r = r3; r <= r18; r++)
1312       if (r != rscratch1) __ mov(r, rscratch1);
1313 #endif
1314   }
1315 
1316   // Scan over array at a for count oops, verifying each one.
1317   // Preserves a and count, clobbers rscratch1 and rscratch2.
1318   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1319     Label loop, end;
1320     __ mov(rscratch1, a);
1321     __ mov(rscratch2, zr);
1322     __ bind(loop);
1323     __ cmp(rscratch2, count);
1324     __ br(Assembler::HS, end);
1325     if (size == (size_t)wordSize) {
1326       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1327       __ verify_oop(temp);
1328     } else {
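      // n.b. at both call sites below temp is r16, so the narrow oop
      // loaded into r16 here is the value decoded (and verified) via temp.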
1329       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1330       __ decode_heap_oop(temp); // calls verify_oop
1331     }
1332     __ add(rscratch2, rscratch2, size);
1333     __ b(loop);
1334     __ bind(end);
1335   }
1336 
1337   // Arguments:
1338   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1339   //             ignored
1340   //   is_oop  - true => oop array, so generate store check code
1341   //   name    - stub name string
1342   //
1343   // Inputs:
1344   //   c_rarg0   - source array address
1345   //   c_rarg1   - destination array address
1346   //   c_rarg2   - element count, treated as ssize_t, can be zero
1347   //
1348   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1349   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1351   //
1352   // Side Effects:
1353   //   disjoint_int_copy_entry is set to the no-overlap entry point
1354   //   used by generate_conjoint_int_oop_copy().
1355   //
1356   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1357                                   const char *name, bool dest_uninitialized = false) {
1358     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1359     RegSet saved_reg = RegSet::of(s, d, count);
1360     __ align(CodeEntryAlignment);
1361     StubCodeMark mark(this, "StubRoutines", name);
1362     address start = __ pc();
1363     __ enter();
1364 
1365     if (entry != NULL) {
1366       *entry = __ pc();
1367       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1368       BLOCK_COMMENT("Entry:");
1369     }
1370 
1371     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1372     if (dest_uninitialized) {
1373       decorators |= IS_DEST_UNINITIALIZED;
1374     }
1375     if (aligned) {
1376       decorators |= ARRAYCOPY_ALIGNED;
1377     }
1378 
1379     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1380     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1381 
1382     if (is_oop) {
1383       // save regs before copy_memory
1384       __ push(RegSet::of(d, count), sp);
1385     }
1386     copy_memory(aligned, s, d, count, rscratch1, size);
1387 
1388     if (is_oop) {
1389       __ pop(RegSet::of(d, count), sp);
1390       if (VerifyOops)
1391         verify_oop_array(size, d, count, r16);
1392     }
1393 
1394     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1395 
1396     __ leave();
1397     __ mov(r0, zr); // return 0
1398     __ ret(lr);
1399 #ifdef BUILTIN_SIM
1400     {
1401       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1402       sim->notifyCompile(const_cast<char*>(name), start);
1403     }
1404 #endif
1405     return start;
1406   }
1407 
1408   // Arguments:
1409   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1410   //             ignored
1411   //   is_oop  - true => oop array, so generate store check code
1412   //   name    - stub name string
1413   //
1414   // Inputs:
1415   //   c_rarg0   - source array address
1416   //   c_rarg1   - destination array address
1417   //   c_rarg2   - element count, treated as ssize_t, can be zero
1418   //
1419   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1420   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1422   //
1423   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1424                                  address *entry, const char *name,
1425                                  bool dest_uninitialized = false) {
1426     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1427     RegSet saved_regs = RegSet::of(s, d, count);
1428     StubCodeMark mark(this, "StubRoutines", name);
1429     address start = __ pc();
1430     __ enter();
1431 
1432     if (entry != NULL) {
1433       *entry = __ pc();
1434       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1435       BLOCK_COMMENT("Entry:");
1436     }
1437 
1438     // use fwd copy when (d-s) above_equal (count*size)
1439     __ sub(rscratch1, d, s);
1440     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1441     __ br(Assembler::HS, nooverlap_target);
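    // n.b. the unsigned (HS) comparison also routes the d < s case to the
    // forward copy: d - s wraps to a large unsigned value, and a forward
    // copy is always safe when the destination starts below the source.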
1442 
1443     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1444     if (dest_uninitialized) {
1445       decorators |= IS_DEST_UNINITIALIZED;
1446     }
1447     if (aligned) {
1448       decorators |= ARRAYCOPY_ALIGNED;
1449     }
1450 
1451     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1452     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1453 
1454     if (is_oop) {
1455       // save regs before copy_memory
1456       __ push(RegSet::of(d, count), sp);
1457     }
1458     copy_memory(aligned, s, d, count, rscratch1, -size);
1459     if (is_oop) {
1460       __ pop(RegSet::of(d, count), sp);
1461       if (VerifyOops)
1462         verify_oop_array(size, d, count, r16);
1463     }
1464     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1465     __ leave();
1466     __ mov(r0, zr); // return 0
1467     __ ret(lr);
1468 #ifdef BUILTIN_SIM
1469     {
1470       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1471       sim->notifyCompile(const_cast<char*>(name), start);
1472     }
1473 #endif
1474     return start;
  }
1476 
1477   // Arguments:
1478   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1479   //             ignored
1480   //   name    - stub name string
1481   //
1482   // Inputs:
1483   //   c_rarg0   - source array address
1484   //   c_rarg1   - destination array address
1485   //   c_rarg2   - element count, treated as ssize_t, can be zero
1486   //
1487   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1488   // we let the hardware handle it.  The one to eight bytes within words,
1489   // dwords or qwords that span cache line boundaries will still be loaded
1490   // and stored atomically.
1491   //
1492   // Side Effects:
1500   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1501   //   used by generate_conjoint_byte_copy().
1502   //
1503   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1504     const bool not_oop = false;
1505     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1506   }
1507 
1508   // Arguments:
1509   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1510   //             ignored
1511   //   name    - stub name string
1512   //
1513   // Inputs:
1514   //   c_rarg0   - source array address
1515   //   c_rarg1   - destination array address
1516   //   c_rarg2   - element count, treated as ssize_t, can be zero
1517   //
1518   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1519   // we let the hardware handle it.  The one to eight bytes within words,
1520   // dwords or qwords that span cache line boundaries will still be loaded
1521   // and stored atomically.
1522   //
1523   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1524                                       address* entry, const char *name) {
1525     const bool not_oop = false;
1526     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1527   }
1528 
1529   // Arguments:
1530   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1531   //             ignored
1532   //   name    - stub name string
1533   //
1534   // Inputs:
1535   //   c_rarg0   - source array address
1536   //   c_rarg1   - destination array address
1537   //   c_rarg2   - element count, treated as ssize_t, can be zero
1538   //
1539   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1540   // let the hardware handle it.  The two or four words within dwords
1541   // or qwords that span cache line boundaries will still be loaded
1542   // and stored atomically.
1543   //
1544   // Side Effects:
1545   //   disjoint_short_copy_entry is set to the no-overlap entry point
1546   //   used by generate_conjoint_short_copy().
1547   //
1548   address generate_disjoint_short_copy(bool aligned,
1549                                        address* entry, const char *name) {
1550     const bool not_oop = false;
1551     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1552   }
1553 
1554   // Arguments:
1555   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1556   //             ignored
1557   //   name    - stub name string
1558   //
1559   // Inputs:
1560   //   c_rarg0   - source array address
1561   //   c_rarg1   - destination array address
1562   //   c_rarg2   - element count, treated as ssize_t, can be zero
1563   //
1564   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1565   // let the hardware handle it.  The two or four words within dwords
1566   // or qwords that span cache line boundaries will still be loaded
1567   // and stored atomically.
1568   //
1569   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1570                                        address *entry, const char *name) {
1571     const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
1576   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1577   //             ignored
1578   //   name    - stub name string
1579   //
1580   // Inputs:
1581   //   c_rarg0   - source array address
1582   //   c_rarg1   - destination array address
1583   //   c_rarg2   - element count, treated as ssize_t, can be zero
1584   //
1585   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1586   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1588   //
1589   // Side Effects:
1590   //   disjoint_int_copy_entry is set to the no-overlap entry point
1591   //   used by generate_conjoint_int_oop_copy().
1592   //
1593   address generate_disjoint_int_copy(bool aligned, address *entry,
1594                                          const char *name, bool dest_uninitialized = false) {
1595     const bool not_oop = false;
1596     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1597   }
1598 
1599   // Arguments:
1600   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1601   //             ignored
1602   //   name    - stub name string
1603   //
1604   // Inputs:
1605   //   c_rarg0   - source array address
1606   //   c_rarg1   - destination array address
1607   //   c_rarg2   - element count, treated as ssize_t, can be zero
1608   //
1609   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1610   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1612   //
1613   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1614                                      address *entry, const char *name,
1615                                      bool dest_uninitialized = false) {
1616     const bool not_oop = false;
1617     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1618   }
1619 
1620 
1621   // Arguments:
1622   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1623   //             ignored
1624   //   name    - stub name string
1625   //
1626   // Inputs:
1627   //   c_rarg0   - source array address
1628   //   c_rarg1   - destination array address
1629   //   c_rarg2   - element count, treated as size_t, can be zero
1630   //
1631   // Side Effects:
1632   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1633   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1634   //
1635   address generate_disjoint_long_copy(bool aligned, address *entry,
1636                                           const char *name, bool dest_uninitialized = false) {
1637     const bool not_oop = false;
1638     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1639   }
1640 
1641   // Arguments:
1642   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1643   //             ignored
1644   //   name    - stub name string
1645   //
1646   // Inputs:
1647   //   c_rarg0   - source array address
1648   //   c_rarg1   - destination array address
1649   //   c_rarg2   - element count, treated as size_t, can be zero
1650   //
1651   address generate_conjoint_long_copy(bool aligned,
1652                                       address nooverlap_target, address *entry,
1653                                       const char *name, bool dest_uninitialized = false) {
1654     const bool not_oop = false;
1655     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1656   }
1657 
1658   // Arguments:
1659   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1660   //             ignored
1661   //   name    - stub name string
1662   //
1663   // Inputs:
1664   //   c_rarg0   - source array address
1665   //   c_rarg1   - destination array address
1666   //   c_rarg2   - element count, treated as size_t, can be zero
1667   //
1668   // Side Effects:
1669   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1670   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1671   //
1672   address generate_disjoint_oop_copy(bool aligned, address *entry,
1673                                      const char *name, bool dest_uninitialized) {
1674     const bool is_oop = true;
1675     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1676     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1677   }
1678 
1679   // Arguments:
1680   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1681   //             ignored
1682   //   name    - stub name string
1683   //
1684   // Inputs:
1685   //   c_rarg0   - source array address
1686   //   c_rarg1   - destination array address
1687   //   c_rarg2   - element count, treated as size_t, can be zero
1688   //
1689   address generate_conjoint_oop_copy(bool aligned,
1690                                      address nooverlap_target, address *entry,
1691                                      const char *name, bool dest_uninitialized) {
1692     const bool is_oop = true;
1693     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1694     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1695                                   name, dest_uninitialized);
1696   }
1697 
1698 
1699   // Helper for generating a dynamic type check.
1700   // Smashes rscratch1, rscratch2.
1701   void generate_type_check(Register sub_klass,
1702                            Register super_check_offset,
1703                            Register super_klass,
1704                            Label& L_success) {
1705     assert_different_registers(sub_klass, super_check_offset, super_klass);
1706 
1707     BLOCK_COMMENT("type_check:");
1708 
1709     Label L_miss;
1710 
1711     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1712                                      super_check_offset);
1713     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1714 
1715     // Fall through on failure!
1716     __ BIND(L_miss);
1717   }
1718 
1719   //
1720   //  Generate checkcasting array copy stub
1721   //
1722   //  Input:
1723   //    c_rarg0   - source array address
1724   //    c_rarg1   - destination array address
1725   //    c_rarg2   - element count, treated as ssize_t, can be zero
1726   //    c_rarg3   - size_t ckoff (super_check_offset)
1727   //    c_rarg4   - oop ckval (super_klass)
1728   //
1729   //  Output:
1730   //    r0 ==  0  -  success
1731   //    r0 == -1^K - failure, where K is partial transfer count
1732   //
1733   address generate_checkcast_copy(const char *name, address *entry,
1734                                   bool dest_uninitialized = false) {
1735 
1736     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1737 
1738     // Input registers (after setup_arg_regs)
1739     const Register from        = c_rarg0;   // source array address
1740     const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
1742     const Register ckoff       = c_rarg3;   // super_check_offset
1743     const Register ckval       = c_rarg4;   // super_klass
1744 
1745     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1746     RegSet wb_post_saved_regs = RegSet::of(count);
1747 
1748     // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // orig elements count
1750     const Register start_to    = r20;       // destination array start address
1751     const Register copied_oop  = r18;       // actual oop copied
1752     const Register r19_klass   = r19;       // oop._klass
1753 
1754     //---------------------------------------------------------------
1755     // Assembler stub will be used for this call to arraycopy
1756     // if the two arrays are subtypes of Object[] but the
1757     // destination array type is not equal to or a supertype
1758     // of the source type.  Each element must be separately
1759     // checked.
1760 
1761     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1762                                copied_oop, r19_klass, count_save);
1763 
1764     __ align(CodeEntryAlignment);
1765     StubCodeMark mark(this, "StubRoutines", name);
1766     address start = __ pc();
1767 
1768     __ enter(); // required for proper stackwalking of RuntimeStub frame
1769 
1770 #ifdef ASSERT
1771     // caller guarantees that the arrays really are different
1772     // otherwise, we would have to make conjoint checks
1773     { Label L;
1774       array_overlap_test(L, TIMES_OOP);
1775       __ stop("checkcast_copy within a single array");
1776       __ bind(L);
1777     }
1778 #endif //ASSERT
1779 
1780     // Caller of this entry point must set up the argument registers.
1781     if (entry != NULL) {
1782       *entry = __ pc();
1783       BLOCK_COMMENT("Entry:");
1784     }
1785 
    // Empty array:  Nothing to do.
1787     __ cbz(count, L_done);
1788 
1789     __ push(RegSet::of(r18, r19, r20, r21), sp);
1790 
1791 #ifdef ASSERT
1792     BLOCK_COMMENT("assert consistent ckoff/ckval");
1793     // The ckoff and ckval must be mutually consistent,
1794     // even though caller generates both.
1795     { Label L;
1796       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1797       __ ldrw(start_to, Address(ckval, sco_offset));
1798       __ cmpw(ckoff, start_to);
1799       __ br(Assembler::EQ, L);
1800       __ stop("super_check_offset inconsistent");
1801       __ bind(L);
1802     }
1803 #endif //ASSERT
1804 
1805     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1806     bool is_oop = true;
1807     if (dest_uninitialized) {
1808       decorators |= IS_DEST_UNINITIALIZED;
1809     }
1810 
1811     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1812     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1813 
1814     // save the original count
1815     __ mov(count_save, count);
1816 
1817     // Copy from low to high addresses
1818     __ mov(start_to, to);              // Save destination array start address
1819     __ b(L_load_element);
1820 
1821     // ======== begin loop ========
1822     // (Loop is rotated; its entry is L_load_element.)
1823     // Loop control:
1824     //   for (; count != 0; count--) {
1825     //     copied_oop = load_heap_oop(from++);
1826     //     ... generate_type_check ...;
1827     //     store_heap_oop(to++, copied_oop);
1828     //   }
1829     __ align(OptoLoopAlignment);
1830 
1831     __ BIND(L_store_element);
1832     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1833     __ sub(count, count, 1);
1834     __ cbz(count, L_do_card_marks);
1835 
1836     // ======== loop entry is here ========
1837     __ BIND(L_load_element);
1838     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1839     __ cbz(copied_oop, L_store_element);
1840 
1841     __ load_klass(r19_klass, copied_oop);// query the object klass
1842     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1843     // ======== end loop ========
1844 
1845     // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_save = total oops.
1847     // Emit GC store barriers for the oops we have copied and report
1848     // their number to the caller.
1849 
1850     __ subs(count, count_save, count);     // K = partially copied oop count
1851     __ eon(count, count, zr);                   // report (-1^K) to caller
1852     __ br(Assembler::EQ, L_done_pop);
1853 
1854     __ BIND(L_do_card_marks);
1855     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1856 
1857     __ bind(L_done_pop);
1858     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1859     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1860 
1861     __ bind(L_done);
1862     __ mov(r0, count);
1863     __ leave();
1864     __ ret(lr);
1865 
1866     return start;
1867   }
1868 
1869   // Perform range checks on the proposed arraycopy.
1870   // Kills temp, but nothing else.
1871   // Also, clean the sign bits of src_pos and dst_pos.
1872   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1873                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
1875                               Register dst_pos, // destination position (c_rarg3)
1876                               Register length,
1877                               Register temp,
1878                               Label& L_failed) {
1879     BLOCK_COMMENT("arraycopy_range_checks:");
1880 
1881     assert_different_registers(rscratch1, temp);
1882 
1883     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1884     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1885     __ addw(temp, length, src_pos);
1886     __ cmpw(temp, rscratch1);
1887     __ br(Assembler::HI, L_failed);
1888 
1889     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1890     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1891     __ addw(temp, length, dst_pos);
1892     __ cmpw(temp, rscratch1);
1893     __ br(Assembler::HI, L_failed);
1894 
1895     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1896     __ movw(src_pos, src_pos);
1897     __ movw(dst_pos, dst_pos);
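    // (A 32-bit register write zero-extends into the upper 32 bits of the
    // X register, so a W-sized self-move is all that is needed here.)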
1898 
1899     BLOCK_COMMENT("arraycopy_range_checks done");
1900   }
1901 
1902   // These stubs get called from some dumb test routine.
1903   // I'll write them properly when they're called from
1904   // something that's actually doing something.
1905   static void fake_arraycopy_stub(address src, address dst, int count) {
1906     assert(count == 0, "huh?");
1907   }
1908 
1909 
1910   //
1911   //  Generate 'unsafe' array copy stub
1912   //  Though just as safe as the other stubs, it takes an unscaled
1913   //  size_t argument instead of an element count.
1914   //
1915   //  Input:
1916   //    c_rarg0   - source array address
1917   //    c_rarg1   - destination array address
1918   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1919   //
1920   // Examines the alignment of the operands and dispatches
1921   // to a long, int, short, or byte copy loop.
1922   //
1923   address generate_unsafe_copy(const char *name,
1924                                address byte_copy_entry,
1925                                address short_copy_entry,
1926                                address int_copy_entry,
1927                                address long_copy_entry) {
1928     Label L_long_aligned, L_int_aligned, L_short_aligned;
1929     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1930 
1931     __ align(CodeEntryAlignment);
1932     StubCodeMark mark(this, "StubRoutines", name);
1933     address start = __ pc();
1934     __ enter(); // required for proper stackwalking of RuntimeStub frame
1935 
1936     // bump this on entry, not on exit:
1937     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1938 
1939     __ orr(rscratch1, s, d);
1940     __ orr(rscratch1, rscratch1, count);
1941 
1942     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1943     __ cbz(rscratch1, L_long_aligned);
1944     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1945     __ cbz(rscratch1, L_int_aligned);
1946     __ tbz(rscratch1, 0, L_short_aligned);
1947     __ b(RuntimeAddress(byte_copy_entry));
1948 
1949     __ BIND(L_short_aligned);
1950     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1951     __ b(RuntimeAddress(short_copy_entry));
1952     __ BIND(L_int_aligned);
1953     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1954     __ b(RuntimeAddress(int_copy_entry));
1955     __ BIND(L_long_aligned);
1956     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1957     __ b(RuntimeAddress(long_copy_entry));
1958 
1959     return start;
1960   }
1961 
1962   //
1963   //  Generate generic array copy stubs
1964   //
1965   //  Input:
1966   //    c_rarg0    -  src oop
1967   //    c_rarg1    -  src_pos (32-bits)
1968   //    c_rarg2    -  dst oop
1969   //    c_rarg3    -  dst_pos (32-bits)
1970   //    c_rarg4    -  element count (32-bits)
1971   //
1972   //  Output:
1973   //    r0 ==  0  -  success
1974   //    r0 == -1^K - failure, where K is partial transfer count
1975   //
1976   address generate_generic_copy(const char *name,
1977                                 address byte_copy_entry, address short_copy_entry,
1978                                 address int_copy_entry, address oop_copy_entry,
1979                                 address long_copy_entry, address checkcast_copy_entry) {
1980 
1981     Label L_failed, L_objArray;
1982     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1983 
1984     // Input registers
1985     const Register src        = c_rarg0;  // source array oop
1986     const Register src_pos    = c_rarg1;  // source position
1987     const Register dst        = c_rarg2;  // destination array oop
1988     const Register dst_pos    = c_rarg3;  // destination position
1989     const Register length     = c_rarg4;
1990 
1991 
1992     // Registers used as temps
1993     const Register dst_klass  = c_rarg5;
1994 
1995     __ align(CodeEntryAlignment);
1996 
1997     StubCodeMark mark(this, "StubRoutines", name);
1998 
1999     address start = __ pc();
2000 
2001     __ enter(); // required for proper stackwalking of RuntimeStub frame
2002 
2003     // bump this on entry, not on exit:
2004     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2005 
2006     //-----------------------------------------------------------------------
2007     // Assembler stub will be used for this call to arraycopy
2008     // if the following conditions are met:
2009     //
2010     // (1) src and dst must not be null.
2011     // (2) src_pos must not be negative.
2012     // (3) dst_pos must not be negative.
2013     // (4) length  must not be negative.
2014     // (5) src klass and dst klass should be the same and not NULL.
2015     // (6) src and dst should be arrays.
2016     // (7) src_pos + length must not exceed length of src.
2017     // (8) dst_pos + length must not exceed length of dst.
2018     //
2019 
2020     //  if (src == NULL) return -1;
2021     __ cbz(src, L_failed);
2022 
2023     //  if (src_pos < 0) return -1;
2024     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2025 
2026     //  if (dst == NULL) return -1;
2027     __ cbz(dst, L_failed);
2028 
2029     //  if (dst_pos < 0) return -1;
2030     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2031 
2032     // registers used as temp
2033     const Register scratch_length    = r16; // elements count to copy
2034     const Register scratch_src_klass = r17; // array klass
2035     const Register lh                = r18; // layout helper
2036 
2037     //  if (length < 0) return -1;
2038     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2039     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2040 
2041     __ load_klass(scratch_src_klass, src);
2042 #ifdef ASSERT
2043     //  assert(src->klass() != NULL);
2044     {
2045       BLOCK_COMMENT("assert klasses not null {");
2046       Label L1, L2;
2047       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2048       __ bind(L1);
2049       __ stop("broken null klass");
2050       __ bind(L2);
2051       __ load_klass(rscratch1, dst);
2052       __ cbz(rscratch1, L1);     // this would be broken also
2053       BLOCK_COMMENT("} assert klasses not null done");
2054     }
2055 #endif
2056 
2057     // Load layout helper (32-bits)
2058     //
2059     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2060     // 32        30    24            16              8     2                 0
2061     //
2062     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2063     //
2064 
2065     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2066 
2067     // Handle objArrays completely differently...
2068     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2069     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2070     __ movw(rscratch1, objArray_lh);
2071     __ eorw(rscratch2, lh, rscratch1);
2072     __ cbzw(rscratch2, L_objArray);
2073 
2074     //  if (src->klass() != dst->klass()) return -1;
2075     __ load_klass(rscratch2, dst);
2076     __ eor(rscratch2, rscratch2, scratch_src_klass);
2077     __ cbnz(rscratch2, L_failed);
2078 
2079     //  if (!src->is_Array()) return -1;
2080     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2081 
2082     // At this point, it is known to be a typeArray (array_tag 0x3).
2083 #ifdef ASSERT
2084     {
2085       BLOCK_COMMENT("assert primitive array {");
2086       Label L;
2087       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2088       __ cmpw(lh, rscratch2);
2089       __ br(Assembler::GE, L);
2090       __ stop("must be a primitive array");
2091       __ bind(L);
2092       BLOCK_COMMENT("} assert primitive array done");
2093     }
2094 #endif
2095 
2096     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2097                            rscratch2, L_failed);
2098 
2099     // TypeArrayKlass
2100     //
2101     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2102     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2103     //
2104 
2105     const Register rscratch1_offset = rscratch1;    // array offset
2106     const Register r18_elsize = lh; // element size
2107 
2108     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2109            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2110     __ add(src, src, rscratch1_offset);           // src array offset
2111     __ add(dst, dst, rscratch1_offset);           // dst array offset
2112     BLOCK_COMMENT("choose copy loop based on element size");
2113 
2114     // next registers should be set before the jump to corresponding stub
2115     const Register from     = c_rarg0;  // source array address
2116     const Register to       = c_rarg1;  // destination array address
2117     const Register count    = c_rarg2;  // elements count
2118 
    // 'from', 'to' and 'count' must be set in this order, since they alias
    // 'src', 'src_pos' and 'dst' (c_rarg0..c_rarg2) and those inputs are
    // still needed while the earlier outputs are being computed.
2121 
2122     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2123 
2124     // The possible values of elsize are 0-3, i.e. exact_log2(element
2125     // size in bytes).  We do a simple bitwise binary search.
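    // The two tbnz tests decode it as follows (illustrative):
    //
    //   bit 1  bit 0   log2(esize)   copy stub
    //     0      0          0        byte_copy_entry
    //     0      1          1        short_copy_entry
    //     1      0          2        int_copy_entry
    //     1      1          3        long_copy_entry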
2126   __ BIND(L_copy_bytes);
2127     __ tbnz(r18_elsize, 1, L_copy_ints);
2128     __ tbnz(r18_elsize, 0, L_copy_shorts);
2129     __ lea(from, Address(src, src_pos));// src_addr
2130     __ lea(to,   Address(dst, dst_pos));// dst_addr
2131     __ movw(count, scratch_length); // length
2132     __ b(RuntimeAddress(byte_copy_entry));
2133 
2134   __ BIND(L_copy_shorts);
2135     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2136     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2137     __ movw(count, scratch_length); // length
2138     __ b(RuntimeAddress(short_copy_entry));
2139 
2140   __ BIND(L_copy_ints);
2141     __ tbnz(r18_elsize, 0, L_copy_longs);
2142     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2143     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2144     __ movw(count, scratch_length); // length
2145     __ b(RuntimeAddress(int_copy_entry));
2146 
2147   __ BIND(L_copy_longs);
2148 #ifdef ASSERT
2149     {
2150       BLOCK_COMMENT("assert long copy {");
2151       Label L;
2152       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2153       __ cmpw(r18_elsize, LogBytesPerLong);
2154       __ br(Assembler::EQ, L);
2155       __ stop("must be long copy, but elsize is wrong");
2156       __ bind(L);
2157       BLOCK_COMMENT("} assert long copy done");
2158     }
2159 #endif
2160     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2161     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2162     __ movw(count, scratch_length); // length
2163     __ b(RuntimeAddress(long_copy_entry));
2164 
2165     // ObjArrayKlass
2166   __ BIND(L_objArray);
2167     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2168 
2169     Label L_plain_copy, L_checkcast_copy;
2170     //  test array classes for subtyping
2171     __ load_klass(r18, dst);
2172     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2173     __ br(Assembler::NE, L_checkcast_copy);
2174 
2175     // Identically typed arrays can be copied without element-wise checks.
2176     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2177                            rscratch2, L_failed);
2178 
2179     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2180     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2181     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2182     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2183     __ movw(count, scratch_length); // length
2184   __ BIND(L_plain_copy);
2185     __ b(RuntimeAddress(oop_copy_entry));
2186 
2187   __ BIND(L_checkcast_copy);
2188     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2189     {
2190       // Before looking at dst.length, make sure dst is also an objArray.
2191       __ ldrw(rscratch1, Address(r18, lh_offset));
2192       __ movw(rscratch2, objArray_lh);
2193       __ eorw(rscratch1, rscratch1, rscratch2);
2194       __ cbnzw(rscratch1, L_failed);
2195 
2196       // It is safe to examine both src.length and dst.length.
2197       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2198                              r18, L_failed);
2199 
2200       __ load_klass(dst_klass, dst); // reload
2201 
2202       // Marshal the base address arguments now, freeing registers.
2203       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2204       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2205       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2206       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2207       __ movw(count, length);           // length (reloaded)
2208       Register sco_temp = c_rarg3;      // this register is free now
2209       assert_different_registers(from, to, count, sco_temp,
2210                                  dst_klass, scratch_src_klass);
2211       // assert_clean_int(count, sco_temp);
2212 
2213       // Generate the type check.
2214       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2215       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2216 
2217       // Smashes rscratch1, rscratch2
2218       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2219 
2220       // Fetch destination element klass from the ObjArrayKlass header.
2221       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2222       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2223       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2224 
2225       // the checkcast_copy loop needs two extra arguments:
2226       assert(c_rarg3 == sco_temp, "#3 already in place");
2227       // Set up arguments for checkcast_copy_entry.
2228       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2229       __ b(RuntimeAddress(checkcast_copy_entry));
2230     }
2231 
2232   __ BIND(L_failed);
2233     __ mov(r0, -1);
2234     __ leave();   // required for proper stackwalking of RuntimeStub frame
2235     __ ret(lr);
2236 
2237     return start;
2238   }
2239 
2240   //
2241   // Generate stub for array fill. If "aligned" is true, the
2242   // "to" address is assumed to be heapword aligned.
2243   //
2244   // Arguments for generated stub:
2245   //   to:    c_rarg0
2246   //   value: c_rarg1
2247   //   count: c_rarg2 treated as signed
2248   //
2249   address generate_fill(BasicType t, bool aligned, const char *name) {
2250     __ align(CodeEntryAlignment);
2251     StubCodeMark mark(this, "StubRoutines", name);
2252     address start = __ pc();
2253 
2254     BLOCK_COMMENT("Entry:");
2255 
    const Register to        = c_rarg0;  // destination array address
2257     const Register value     = c_rarg1;  // value
2258     const Register count     = c_rarg2;  // elements count
2259 
2260     const Register bz_base = r10;        // base for block_zero routine
2261     const Register cnt_words = r11;      // temp register
2262 
2263     __ enter();
2264 
2265     Label L_fill_elements, L_exit1;
2266 
2267     int shift = -1;
2268     switch (t) {
2269       case T_BYTE:
2270         shift = 0;
2271         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2272         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2273         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2274         __ br(Assembler::LO, L_fill_elements);
2275         break;
2276       case T_SHORT:
2277         shift = 1;
2278         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2279         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2280         __ br(Assembler::LO, L_fill_elements);
2281         break;
2282       case T_INT:
2283         shift = 2;
2284         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2285         __ br(Assembler::LO, L_fill_elements);
2286         break;
2287       default: ShouldNotReachHere();
2288     }
2289 
    // Align the destination address to an 8-byte boundary.
2291     Label L_skip_align1, L_skip_align2, L_skip_align4;
2292     if (!aligned) {
2293       switch (t) {
2294         case T_BYTE:
2295           // One byte misalignment happens only for byte arrays.
2296           __ tbz(to, 0, L_skip_align1);
2297           __ strb(value, Address(__ post(to, 1)));
2298           __ subw(count, count, 1);
2299           __ bind(L_skip_align1);
2300           // Fallthrough
2301         case T_SHORT:
2302           // Two bytes misalignment happens only for byte and short (char) arrays.
2303           __ tbz(to, 1, L_skip_align2);
2304           __ strh(value, Address(__ post(to, 2)));
2305           __ subw(count, count, 2 >> shift);
2306           __ bind(L_skip_align2);
2307           // Fallthrough
2308         case T_INT:
2309           // Align to 8 bytes, we know we are 4 byte aligned to start.
2310           __ tbz(to, 2, L_skip_align4);
2311           __ strw(value, Address(__ post(to, 4)));
2312           __ subw(count, count, 4 >> shift);
2313           __ bind(L_skip_align4);
2314           break;
2315         default: ShouldNotReachHere();
2316       }
2317     }
2318 
2319     //
2320     //  Fill large chunks
2321     //
2322     __ lsrw(cnt_words, count, 3 - shift); // number of words
2323     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2324     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
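    // Equivalent arithmetic (illustrative):
    //   cnt_words = count >> (3 - shift);       // number of full 8-byte words
    //   count    -= cnt_words << (3 - shift);   // elements left over (< 8 bytes)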
2325     if (UseBlockZeroing) {
2326       Label non_block_zeroing, rest;
2327       // If the fill value is zero we can use the fast zero_words().
2328       __ cbnz(value, non_block_zeroing);
2329       __ mov(bz_base, to);
2330       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2331       __ zero_words(bz_base, cnt_words);
2332       __ b(rest);
2333       __ bind(non_block_zeroing);
2334       __ fill_words(to, cnt_words, value);
2335       __ bind(rest);
2336     } else {
2337       __ fill_words(to, cnt_words, value);
2338     }
2339 
2340     // Remaining count is less than 8 bytes. Fill it by a single store.
2341     // Note that the total length is no less than 8 bytes.
2342     if (t == T_BYTE || t == T_SHORT) {
2343       Label L_exit1;
2344       __ cbzw(count, L_exit1);
2345       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2346       __ str(value, Address(to, -8));    // overwrite some elements
2347       __ bind(L_exit1);
2348       __ leave();
2349       __ ret(lr);
2350     }
2351 
    // Handle fills of less than 8 bytes.
2353     Label L_fill_2, L_fill_4, L_exit2;
2354     __ bind(L_fill_elements);
2355     switch (t) {
2356       case T_BYTE:
2357         __ tbz(count, 0, L_fill_2);
2358         __ strb(value, Address(__ post(to, 1)));
2359         __ bind(L_fill_2);
2360         __ tbz(count, 1, L_fill_4);
2361         __ strh(value, Address(__ post(to, 2)));
2362         __ bind(L_fill_4);
2363         __ tbz(count, 2, L_exit2);
2364         __ strw(value, Address(to));
2365         break;
2366       case T_SHORT:
2367         __ tbz(count, 0, L_fill_4);
2368         __ strh(value, Address(__ post(to, 2)));
2369         __ bind(L_fill_4);
2370         __ tbz(count, 1, L_exit2);
2371         __ strw(value, Address(to));
2372         break;
2373       case T_INT:
2374         __ cbzw(count, L_exit2);
2375         __ strw(value, Address(to));
2376         break;
2377       default: ShouldNotReachHere();
2378     }
2379     __ bind(L_exit2);
2380     __ leave();
2381     __ ret(lr);
2382     return start;
2383   }
2384 
2385   void generate_arraycopy_stubs() {
2386     address entry;
2387     address entry_jbyte_arraycopy;
2388     address entry_jshort_arraycopy;
2389     address entry_jint_arraycopy;
2390     address entry_oop_arraycopy;
2391     address entry_jlong_arraycopy;
2392     address entry_checkcast_arraycopy;
2393 
2394     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2395     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2396 
2397     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2398 
2399     //*** jbyte
2400     // Always need aligned and unaligned versions
2401     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2402                                                                                   "jbyte_disjoint_arraycopy");
2403     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2404                                                                                   &entry_jbyte_arraycopy,
2405                                                                                   "jbyte_arraycopy");
2406     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2407                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2408     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2409                                                                                   "arrayof_jbyte_arraycopy");
2410 
2411     //*** jshort
2412     // Always need aligned and unaligned versions
2413     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2414                                                                                     "jshort_disjoint_arraycopy");
2415     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2416                                                                                     &entry_jshort_arraycopy,
2417                                                                                     "jshort_arraycopy");
2418     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2419                                                                                     "arrayof_jshort_disjoint_arraycopy");
2420     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2421                                                                                     "arrayof_jshort_arraycopy");
2422 
2423     //*** jint
2424     // Aligned versions
2425     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2426                                                                                 "arrayof_jint_disjoint_arraycopy");
2427     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2428                                                                                 "arrayof_jint_arraycopy");
2429     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2430     // entry_jint_arraycopy always points to the unaligned version
2431     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2432                                                                                 "jint_disjoint_arraycopy");
2433     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2434                                                                                 &entry_jint_arraycopy,
2435                                                                                 "jint_arraycopy");
2436 
2437     //*** jlong
2438     // It is always aligned
2439     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2440                                                                                   "arrayof_jlong_disjoint_arraycopy");
2441     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2442                                                                                   "arrayof_jlong_arraycopy");
2443     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2444     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2445 
2446     //*** oops
2447     {
2448       // With compressed oops we need unaligned versions; notice that
2449       // we overwrite entry_oop_arraycopy.
2450       bool aligned = !UseCompressedOops;
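      // (Narrow oops are only 4 bytes wide, so 8-byte alignment of the
      // element addresses cannot be assumed in that case.)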
2451 
2452       StubRoutines::_arrayof_oop_disjoint_arraycopy
2453         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2454                                      /*dest_uninitialized*/false);
2455       StubRoutines::_arrayof_oop_arraycopy
2456         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2457                                      /*dest_uninitialized*/false);
2458       // Aligned versions without pre-barriers
2459       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2460         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2461                                      /*dest_uninitialized*/true);
2462       StubRoutines::_arrayof_oop_arraycopy_uninit
2463         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2464                                      /*dest_uninitialized*/true);
2465     }
2466 
2467     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2468     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2469     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2470     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2471 
2472     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2473     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2474                                                                         /*dest_uninitialized*/true);
2475 
2476     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2477                                                               entry_jbyte_arraycopy,
2478                                                               entry_jshort_arraycopy,
2479                                                               entry_jint_arraycopy,
2480                                                               entry_jlong_arraycopy);
2481 
2482     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2483                                                                entry_jbyte_arraycopy,
2484                                                                entry_jshort_arraycopy,
2485                                                                entry_jint_arraycopy,
2486                                                                entry_oop_arraycopy,
2487                                                                entry_jlong_arraycopy,
2488                                                                entry_checkcast_arraycopy);
2489 
2490     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2491     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2492     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2493     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2494     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2495     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2496   }
2497 
2498   void generate_math_stubs() { Unimplemented(); }
2499 
2500   // Arguments:
2501   //
2502   // Inputs:
2503   //   c_rarg0   - source byte array address
2504   //   c_rarg1   - destination byte array address
2505   //   c_rarg2   - K (key) in little endian int array
2506   //
2507   address generate_aescrypt_encryptBlock() {
2508     __ align(CodeEntryAlignment);
2509     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2510 
2511     Label L_doLast;
2512 
2513     const Register from        = c_rarg0;  // source array address
2514     const Register to          = c_rarg1;  // destination array address
2515     const Register key         = c_rarg2;  // key array address
2516     const Register keylen      = rscratch1;
2517 
2518     address start = __ pc();
2519     __ enter();
2520 
2521     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
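    // keylen is the expanded key length in ints: 44, 52 or 60 for
    // AES-128, AES-192 or AES-256 (10, 12 or 14 rounds).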
2522 
2523     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2524 
2525     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2526     __ rev32(v1, __ T16B, v1);
2527     __ rev32(v2, __ T16B, v2);
2528     __ rev32(v3, __ T16B, v3);
2529     __ rev32(v4, __ T16B, v4);
2530     __ aese(v0, v1);
2531     __ aesmc(v0, v0);
2532     __ aese(v0, v2);
2533     __ aesmc(v0, v0);
2534     __ aese(v0, v3);
2535     __ aesmc(v0, v0);
2536     __ aese(v0, v4);
2537     __ aesmc(v0, v0);
2538 
2539     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2540     __ rev32(v1, __ T16B, v1);
2541     __ rev32(v2, __ T16B, v2);
2542     __ rev32(v3, __ T16B, v3);
2543     __ rev32(v4, __ T16B, v4);
2544     __ aese(v0, v1);
2545     __ aesmc(v0, v0);
2546     __ aese(v0, v2);
2547     __ aesmc(v0, v0);
2548     __ aese(v0, v3);
2549     __ aesmc(v0, v0);
2550     __ aese(v0, v4);
2551     __ aesmc(v0, v0);
2552 
2553     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2554     __ rev32(v1, __ T16B, v1);
2555     __ rev32(v2, __ T16B, v2);
2556 
2557     __ cmpw(keylen, 44);
2558     __ br(Assembler::EQ, L_doLast);
2559 
2560     __ aese(v0, v1);
2561     __ aesmc(v0, v0);
2562     __ aese(v0, v2);
2563     __ aesmc(v0, v0);
2564 
2565     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2566     __ rev32(v1, __ T16B, v1);
2567     __ rev32(v2, __ T16B, v2);
2568 
2569     __ cmpw(keylen, 52);
2570     __ br(Assembler::EQ, L_doLast);
2571 
2572     __ aese(v0, v1);
2573     __ aesmc(v0, v0);
2574     __ aese(v0, v2);
2575     __ aesmc(v0, v0);
2576 
2577     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2578     __ rev32(v1, __ T16B, v1);
2579     __ rev32(v2, __ T16B, v2);
2580 
2581     __ BIND(L_doLast);
2582 
2583     __ aese(v0, v1);
2584     __ aesmc(v0, v0);
2585     __ aese(v0, v2);
2586 
2587     __ ld1(v1, __ T16B, key);
2588     __ rev32(v1, __ T16B, v1);
2589     __ eor(v0, __ T16B, v0, v1);
2590 
2591     __ st1(v0, __ T16B, to);
2592 
2593     __ mov(r0, 0);
2594 
2595     __ leave();
2596     __ ret(lr);
2597 
2598     return start;
2599   }
2600 
2601   // Arguments:
2602   //
2603   // Inputs:
2604   //   c_rarg0   - source byte array address
2605   //   c_rarg1   - destination byte array address
2606   //   c_rarg2   - K (key) in little endian int array
2607   //
2608   address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES instruction support");
2610     __ align(CodeEntryAlignment);
2611     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2612     Label L_doLast;
2613 
2614     const Register from        = c_rarg0;  // source array address
2615     const Register to          = c_rarg1;  // destination array address
2616     const Register key         = c_rarg2;  // key array address
2617     const Register keylen      = rscratch1;
2618 
2619     address start = __ pc();
2620     __ enter(); // required for proper stackwalking of RuntimeStub frame
2621 
2622     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2623 
2624     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2625 
2626     __ ld1(v5, __ T16B, __ post(key, 16));
2627     __ rev32(v5, __ T16B, v5);
2628 
2629     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2630     __ rev32(v1, __ T16B, v1);
2631     __ rev32(v2, __ T16B, v2);
2632     __ rev32(v3, __ T16B, v3);
2633     __ rev32(v4, __ T16B, v4);
2634     __ aesd(v0, v1);
2635     __ aesimc(v0, v0);
2636     __ aesd(v0, v2);
2637     __ aesimc(v0, v0);
2638     __ aesd(v0, v3);
2639     __ aesimc(v0, v0);
2640     __ aesd(v0, v4);
2641     __ aesimc(v0, v0);
2642 
2643     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2644     __ rev32(v1, __ T16B, v1);
2645     __ rev32(v2, __ T16B, v2);
2646     __ rev32(v3, __ T16B, v3);
2647     __ rev32(v4, __ T16B, v4);
2648     __ aesd(v0, v1);
2649     __ aesimc(v0, v0);
2650     __ aesd(v0, v2);
2651     __ aesimc(v0, v0);
2652     __ aesd(v0, v3);
2653     __ aesimc(v0, v0);
2654     __ aesd(v0, v4);
2655     __ aesimc(v0, v0);
2656 
2657     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2658     __ rev32(v1, __ T16B, v1);
2659     __ rev32(v2, __ T16B, v2);
2660 
2661     __ cmpw(keylen, 44);
2662     __ br(Assembler::EQ, L_doLast);
2663 
2664     __ aesd(v0, v1);
2665     __ aesimc(v0, v0);
2666     __ aesd(v0, v2);
2667     __ aesimc(v0, v0);
2668 
2669     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2670     __ rev32(v1, __ T16B, v1);
2671     __ rev32(v2, __ T16B, v2);
2672 
2673     __ cmpw(keylen, 52);
2674     __ br(Assembler::EQ, L_doLast);
2675 
2676     __ aesd(v0, v1);
2677     __ aesimc(v0, v0);
2678     __ aesd(v0, v2);
2679     __ aesimc(v0, v0);
2680 
2681     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2682     __ rev32(v1, __ T16B, v1);
2683     __ rev32(v2, __ T16B, v2);
2684 
2685     __ BIND(L_doLast);
2686 
2687     __ aesd(v0, v1);
2688     __ aesimc(v0, v0);
2689     __ aesd(v0, v2);
2690 
2691     __ eor(v0, __ T16B, v0, v5);
2692 
2693     __ st1(v0, __ T16B, to);
2694 
2695     __ mov(r0, 0);
2696 
2697     __ leave();
2698     __ ret(lr);
2699 
2700     return start;
2701   }
2702 
2703   // Arguments:
2704   //
2705   // Inputs:
2706   //   c_rarg0   - source byte array address
2707   //   c_rarg1   - destination byte array address
2708   //   c_rarg2   - K (key) in little endian int array
2709   //   c_rarg3   - r vector byte array address
2710   //   c_rarg4   - input length
2711   //
2712   // Output:
2713   //   x0        - input length
2714   //
2715   address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES instruction support");
2717     __ align(CodeEntryAlignment);
2718     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2719 
2720     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2721 
2722     const Register from        = c_rarg0;  // source array address
2723     const Register to          = c_rarg1;  // destination array address
2724     const Register key         = c_rarg2;  // key array address
2725     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2726                                            // and left with the results of the last encryption block
2727     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2728     const Register keylen      = rscratch1;
2729 
2730     address start = __ pc();
2731 
2732       __ enter();
2733 
2734       __ movw(rscratch2, len_reg);
2735 
2736       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2737 
2738       __ ld1(v0, __ T16B, rvec);
2739 
2740       __ cmpw(keylen, 52);
2741       __ br(Assembler::CC, L_loadkeys_44);
2742       __ br(Assembler::EQ, L_loadkeys_52);
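      // The flags from the cmpw(keylen, 52) above stay valid for the rest
      // of the stub: nothing in L_aes_loop sets the condition flags (subw,
      // unlike subsw, does not), so the CC/EQ branches inside the loop
      // re-use this same key-length comparison.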
2743 
2744       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2745       __ rev32(v17, __ T16B, v17);
2746       __ rev32(v18, __ T16B, v18);
2747     __ BIND(L_loadkeys_52);
2748       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2749       __ rev32(v19, __ T16B, v19);
2750       __ rev32(v20, __ T16B, v20);
2751     __ BIND(L_loadkeys_44);
2752       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2753       __ rev32(v21, __ T16B, v21);
2754       __ rev32(v22, __ T16B, v22);
2755       __ rev32(v23, __ T16B, v23);
2756       __ rev32(v24, __ T16B, v24);
2757       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2758       __ rev32(v25, __ T16B, v25);
2759       __ rev32(v26, __ T16B, v26);
2760       __ rev32(v27, __ T16B, v27);
2761       __ rev32(v28, __ T16B, v28);
2762       __ ld1(v29, v30, v31, __ T16B, key);
2763       __ rev32(v29, __ T16B, v29);
2764       __ rev32(v30, __ T16B, v30);
2765       __ rev32(v31, __ T16B, v31);
2766 
2767     __ BIND(L_aes_loop);
2768       __ ld1(v1, __ T16B, __ post(from, 16));
2769       __ eor(v0, __ T16B, v0, v1);
2770 
2771       __ br(Assembler::CC, L_rounds_44);
2772       __ br(Assembler::EQ, L_rounds_52);
2773 
2774       __ aese(v0, v17); __ aesmc(v0, v0);
2775       __ aese(v0, v18); __ aesmc(v0, v0);
2776     __ BIND(L_rounds_52);
2777       __ aese(v0, v19); __ aesmc(v0, v0);
2778       __ aese(v0, v20); __ aesmc(v0, v0);
2779     __ BIND(L_rounds_44);
2780       __ aese(v0, v21); __ aesmc(v0, v0);
2781       __ aese(v0, v22); __ aesmc(v0, v0);
2782       __ aese(v0, v23); __ aesmc(v0, v0);
2783       __ aese(v0, v24); __ aesmc(v0, v0);
2784       __ aese(v0, v25); __ aesmc(v0, v0);
2785       __ aese(v0, v26); __ aesmc(v0, v0);
2786       __ aese(v0, v27); __ aesmc(v0, v0);
2787       __ aese(v0, v28); __ aesmc(v0, v0);
2788       __ aese(v0, v29); __ aesmc(v0, v0);
2789       __ aese(v0, v30);
2790       __ eor(v0, __ T16B, v0, v31);
2791 
2792       __ st1(v0, __ T16B, __ post(to, 16));
2793 
2794       __ subw(len_reg, len_reg, 16);
2795       __ cbnzw(len_reg, L_aes_loop);
2796 
2797       __ st1(v0, __ T16B, rvec);
2798 
2799       __ mov(r0, rscratch2);
2800 
2801       __ leave();
2802       __ ret(lr);
2803 
2804       return start;
2805   }
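
       // Reference sketch of the chaining performed above (illustrative only, not
       // used by the stub; aes_encrypt_block is a hypothetical stand-in for the
       // aese/aesmc round sequence, and types/names are for illustration):
       //
       //   void cbc_encrypt_ref(const uint8_t* from, uint8_t* to, int len,
       //                        uint8_t rvec[16]) {    // IV in, last ciphertext block out
       //     uint8_t c[16];
       //     memcpy(c, rvec, 16);                      // c = C_{i-1}, initially the IV
       //     for (int off = 0; off < len; off += 16) {
       //       for (int i = 0; i < 16; i++) c[i] ^= from[off + i];   // P_i ^ C_{i-1}
       //       aes_encrypt_block(c);                   // C_i = E_K(P_i ^ C_{i-1})
       //       memcpy(to + off, c, 16);
       //     }
       //     memcpy(rvec, c, 16);                      // chain for the next call
       //   }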
2806 
2807   // Arguments:
2808   //
2809   // Inputs:
2810   //   c_rarg0   - source byte array address
2811   //   c_rarg1   - destination byte array address
2812   //   c_rarg2   - K (key) in little endian int array
2813   //   c_rarg3   - r vector byte array address
2814   //   c_rarg4   - input length
2815   //
2816   // Output:
2817   //   r0        - input length
2818   //
2819   address generate_cipherBlockChaining_decryptAESCrypt() {
2820     assert(UseAES, "need AES cryptographic extension support");
2821     __ align(CodeEntryAlignment);
2822     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2823 
2824     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2825 
2826     const Register from        = c_rarg0;  // source array address
2827     const Register to          = c_rarg1;  // destination array address
2828     const Register key         = c_rarg2;  // key array address
2829     const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector (IV) array address
2830                                            // and left holding the last input ciphertext block
2831     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2832     const Register keylen      = rscratch1;
2833 
2834     address start = __ pc();
2835 
2836       __ enter();
2837 
2838       __ movw(rscratch2, len_reg);
2839 
2840       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2841 
2842       __ ld1(v2, __ T16B, rvec);
2843 
2844       __ ld1(v31, __ T16B, __ post(key, 16));
2845       __ rev32(v31, __ T16B, v31);
2846 
2847       __ cmpw(keylen, 52);
2848       __ br(Assembler::CC, L_loadkeys_44);
2849       __ br(Assembler::EQ, L_loadkeys_52);
2850 
2851       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2852       __ rev32(v17, __ T16B, v17);
2853       __ rev32(v18, __ T16B, v18);
2854     __ BIND(L_loadkeys_52);
2855       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2856       __ rev32(v19, __ T16B, v19);
2857       __ rev32(v20, __ T16B, v20);
2858     __ BIND(L_loadkeys_44);
2859       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2860       __ rev32(v21, __ T16B, v21);
2861       __ rev32(v22, __ T16B, v22);
2862       __ rev32(v23, __ T16B, v23);
2863       __ rev32(v24, __ T16B, v24);
2864       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2865       __ rev32(v25, __ T16B, v25);
2866       __ rev32(v26, __ T16B, v26);
2867       __ rev32(v27, __ T16B, v27);
2868       __ rev32(v28, __ T16B, v28);
2869       __ ld1(v29, v30, __ T16B, key);
2870       __ rev32(v29, __ T16B, v29);
2871       __ rev32(v30, __ T16B, v30);
2872 
2873     __ BIND(L_aes_loop);
2874       __ ld1(v0, __ T16B, __ post(from, 16));
2875       __ orr(v1, __ T16B, v0, v0);
2876 
2877       __ br(Assembler::CC, L_rounds_44);
2878       __ br(Assembler::EQ, L_rounds_52);
2879 
2880       __ aesd(v0, v17); __ aesimc(v0, v0);
2881       __ aesd(v0, v18); __ aesimc(v0, v0);
2882     __ BIND(L_rounds_52);
2883       __ aesd(v0, v19); __ aesimc(v0, v0);
2884       __ aesd(v0, v20); __ aesimc(v0, v0);
2885     __ BIND(L_rounds_44);
2886       __ aesd(v0, v21); __ aesimc(v0, v0);
2887       __ aesd(v0, v22); __ aesimc(v0, v0);
2888       __ aesd(v0, v23); __ aesimc(v0, v0);
2889       __ aesd(v0, v24); __ aesimc(v0, v0);
2890       __ aesd(v0, v25); __ aesimc(v0, v0);
2891       __ aesd(v0, v26); __ aesimc(v0, v0);
2892       __ aesd(v0, v27); __ aesimc(v0, v0);
2893       __ aesd(v0, v28); __ aesimc(v0, v0);
2894       __ aesd(v0, v29); __ aesimc(v0, v0);
2895       __ aesd(v0, v30);
2896       __ eor(v0, __ T16B, v0, v31);
2897       __ eor(v0, __ T16B, v0, v2);
2898 
2899       __ st1(v0, __ T16B, __ post(to, 16));
2900       __ orr(v2, __ T16B, v1, v1);
2901 
2902       __ subw(len_reg, len_reg, 16);
2903       __ cbnzw(len_reg, L_aes_loop);
2904 
2905       __ st1(v2, __ T16B, rvec);
2906 
2907       __ mov(r0, rscratch2);
2908 
2909       __ leave();
2910       __ ret(lr);
2911 
2912     return start;
2913   }
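
       // Reference sketch of the decrypt loop above (illustrative only). Note that
       // the incoming ciphertext block has to be saved before it is decrypted in
       // place (the v1/v2 copies in the stub), because it becomes the chaining
       // value for the next block. aes_decrypt_block is a hypothetical stand-in
       // for the aesd/aesimc round sequence:
       //
       //   void cbc_decrypt_ref(const uint8_t* from, uint8_t* to, int len,
       //                        uint8_t rvec[16]) {    // IV in, last input ciphertext block out
       //     uint8_t prev[16], cur[16], p[16];
       //     memcpy(prev, rvec, 16);
       //     for (int off = 0; off < len; off += 16) {
       //       memcpy(cur, from + off, 16);            // save C_i (the orr copy above)
       //       memcpy(p, cur, 16);
       //       aes_decrypt_block(p);
       //       for (int i = 0; i < 16; i++) to[off + i] = p[i] ^ prev[i];  // P_i = D_K(C_i) ^ C_{i-1}
       //       memcpy(prev, cur, 16);
       //     }
       //     memcpy(rvec, prev, 16);
       //   }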
2914 
2915   // Arguments:
2916   //
2917   // Inputs:
2918   //   c_rarg0   - byte[]  source+offset
2919   //   c_rarg1   - int[]   SHA.state
2920   //   c_rarg2   - int     offset
2921   //   c_rarg3   - int     limit
2922   //
2923   address generate_sha1_implCompress(bool multi_block, const char *name) {
2924     __ align(CodeEntryAlignment);
2925     StubCodeMark mark(this, "StubRoutines", name);
2926     address start = __ pc();
2927 
2928     Register buf   = c_rarg0;
2929     Register state = c_rarg1;
2930     Register ofs   = c_rarg2;
2931     Register limit = c_rarg3;
2932 
2933     Label keys;
2934     Label sha1_loop;
2935 
2936     // load the keys into v0..v3
2937     __ adr(rscratch1, keys);
2938     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2939     // load 5 words state into v6, v7
2940     __ ldrq(v6, Address(state, 0));
2941     __ ldrs(v7, Address(state, 16));
2942 
2943 
2944     __ BIND(sha1_loop);
2945     // load 64 bytes of data into v16..v19
2946     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2947     __ rev32(v16, __ T16B, v16);
2948     __ rev32(v17, __ T16B, v17);
2949     __ rev32(v18, __ T16B, v18);
2950     __ rev32(v19, __ T16B, v19);
2951 
2952     // do the sha1
2953     __ addv(v4, __ T4S, v16, v0);
2954     __ orr(v20, __ T16B, v6, v6);
2955 
2956     FloatRegister d0 = v16;
2957     FloatRegister d1 = v17;
2958     FloatRegister d2 = v18;
2959     FloatRegister d3 = v19;
2960 
2961     for (int round = 0; round < 20; round++) {
2962       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2963       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2964       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2965       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2966       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2967 
2968       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2969       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2970       __ sha1h(tmp2, __ T4S, v20);
2971       if (round < 5)
2972         __ sha1c(v20, __ T4S, tmp3, tmp4);
2973       else if (round < 10 || round >= 15)
2974         __ sha1p(v20, __ T4S, tmp3, tmp4);
2975       else
2976         __ sha1m(v20, __ T4S, tmp3, tmp4);
2977       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2978 
2979       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2980     }
2981 
2982     __ addv(v7, __ T2S, v7, v21);
2983     __ addv(v6, __ T4S, v6, v20);
2984 
2985     if (multi_block) {
2986       __ add(ofs, ofs, 64);
2987       __ cmp(ofs, limit);
2988       __ br(Assembler::LE, sha1_loop);
2989       __ mov(c_rarg0, ofs); // return ofs
2990     }
2991 
2992     __ strq(v6, Address(state, 0));
2993     __ strs(v7, Address(state, 16));
2994 
2995     __ ret(lr);
2996 
2997     __ bind(keys);
2998     __ emit_int32(0x5a827999);
2999     __ emit_int32(0x6ed9eba1);
3000     __ emit_int32(0x8f1bbcdc);
3001     __ emit_int32(0xca62c1d6);
3002 
3003     return start;
3004   }
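
       // Scalar reference for the round-function selection above (illustrative
       // only). The 80 SHA-1 rounds use Ch / Parity / Maj / Parity per 20-round
       // group, with the four constants emitted at "keys"; since each loop
       // iteration handles 4 rounds, sha1c covers iterations 0..4, sha1p covers
       // 5..9 and 15..19, and sha1m covers 10..14:
       //
       //   static uint32_t sha1_f(int t, uint32_t b, uint32_t c, uint32_t d) {
       //     if (t < 20) return (b & c) | (~b & d);           // Ch     -> sha1c
       //     if (t < 40) return b ^ c ^ d;                    // Parity -> sha1p
       //     if (t < 60) return (b & c) | (b & d) | (c & d);  // Maj    -> sha1m
       //     return b ^ c ^ d;                                // Parity -> sha1p
       //   }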
3005 
3006 
3007   // Arguments:
3008   //
3009   // Inputs:
3010   //   c_rarg0   - byte[]  source+offset
3011   //   c_rarg1   - int[]   SHA.state
3012   //   c_rarg2   - int     offset
3013   //   c_rarg3   - int     limit
3014   //
3015   address generate_sha256_implCompress(bool multi_block, const char *name) {
3016     static const uint32_t round_consts[64] = {
3017       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3018       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3019       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3020       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3021       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3022       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3023       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3024       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3025       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3026       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3027       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3028       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3029       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3030       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3031       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3032       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3033     };
3034     __ align(CodeEntryAlignment);
3035     StubCodeMark mark(this, "StubRoutines", name);
3036     address start = __ pc();
3037 
3038     Register buf   = c_rarg0;
3039     Register state = c_rarg1;
3040     Register ofs   = c_rarg2;
3041     Register limit = c_rarg3;
3042 
3043     Label sha1_loop;
3044 
3045     __ stpd(v8, v9, __ pre(sp, -32));
3046     __ stpd(v10, v11, Address(sp, 16));
3047 
3048     // dga == v0
3049     // dgb == v1
3050     // dg0 == v2
3051     // dg1 == v3
3052     // dg2 == v4
3053     // t0  == v6
3054     // t1  == v7
3055 
3056     // load 16 keys to v16..v31
3057     __ lea(rscratch1, ExternalAddress((address)round_consts));
3058     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3059     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3060     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3061     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3062 
3063     // load 8 words (256 bits) state
3064     __ ldpq(v0, v1, state);
3065 
3066     __ BIND(sha1_loop);
3067     // load 64 bytes of data into v8..v11
3068     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3069     __ rev32(v8, __ T16B, v8);
3070     __ rev32(v9, __ T16B, v9);
3071     __ rev32(v10, __ T16B, v10);
3072     __ rev32(v11, __ T16B, v11);
3073 
3074     __ addv(v6, __ T4S, v8, v16);
3075     __ orr(v2, __ T16B, v0, v0);
3076     __ orr(v3, __ T16B, v1, v1);
3077 
3078     FloatRegister d0 = v8;
3079     FloatRegister d1 = v9;
3080     FloatRegister d2 = v10;
3081     FloatRegister d3 = v11;
3082 
3083 
3084     for (int round = 0; round < 16; round++) {
3085       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3086       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3087       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3088       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3089 
3090       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3091        __ orr(v4, __ T16B, v2, v2);
3092       if (round < 15)
3093         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3094       __ sha256h(v2, __ T4S, v3, tmp2);
3095       __ sha256h2(v3, __ T4S, v4, tmp2);
3096       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3097 
3098       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3099     }
3100 
3101     __ addv(v0, __ T4S, v0, v2);
3102     __ addv(v1, __ T4S, v1, v3);
3103 
3104     if (multi_block) {
3105       __ add(ofs, ofs, 64);
3106       __ cmp(ofs, limit);
3107       __ br(Assembler::LE, sha1_loop);
3108       __ mov(c_rarg0, ofs); // return ofs
3109     }
3110 
3111     __ ldpd(v10, v11, Address(sp, 16));
3112     __ ldpd(v8, v9, __ post(sp, 32));
3113 
3114     __ stpq(v0, v1, state);
3115 
3116     __ ret(lr);
3117 
3118     return start;
3119   }
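
       // One scalar SHA-256 round for reference (illustrative only); each
       // sha256h/sha256h2 pair above performs four such rounds on the packed
       // state, with K[t] + W[t] prepared by the addv into t0/t1 (v6/v7):
       //
       //   #define ROTR(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
       //   static void sha256_round(uint32_t s[8], uint32_t kw) {   // kw = K[t] + W[t]
       //     uint32_t a=s[0],b=s[1],c=s[2],d=s[3],e=s[4],f=s[5],g=s[6],h=s[7];
       //     uint32_t t1 = h + (ROTR(e,6)^ROTR(e,11)^ROTR(e,25)) + ((e&f)^(~e&g)) + kw;
       //     uint32_t t2 = (ROTR(a,2)^ROTR(a,13)^ROTR(a,22)) + ((a&b)^(a&c)^(b&c));
       //     s[7]=g; s[6]=f; s[5]=e; s[4]=d+t1; s[3]=c; s[2]=b; s[1]=a; s[0]=t1+t2;
       //   }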
3120 
3121 #ifndef BUILTIN_SIM
3122   // Safefetch stubs.
3123   void generate_safefetch(const char* name, int size, address* entry,
3124                           address* fault_pc, address* continuation_pc) {
3125     // safefetch signatures:
3126     //   int      SafeFetch32(int*      adr, int      errValue);
3127     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3128     //
3129     // arguments:
3130     //   c_rarg0 = adr
3131     //   c_rarg1 = errValue
3132     //
3133     // result:
3134     //   r0       = *adr or errValue
3135 
3136     StubCodeMark mark(this, "StubRoutines", name);
3137 
3138     // Entry point, pc or function descriptor.
3139     *entry = __ pc();
3140 
3141     // Load *adr into c_rarg1, may fault.
3142     *fault_pc = __ pc();
3143     switch (size) {
3144       case 4:
3145         // int32_t
3146         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3147         break;
3148       case 8:
3149         // int64_t
3150         __ ldr(c_rarg1, Address(c_rarg0, 0));
3151         break;
3152       default:
3153         ShouldNotReachHere();
3154     }
3155 
3156     // return errValue or *adr
3157     *continuation_pc = __ pc();
3158     __ mov(r0, c_rarg1);
3159     __ ret(lr);
3160   }
3161 #endif
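
       // Typical use of the generated stubs (sketch; the error value is
       // arbitrary): SafeFetch32/SafeFetchN return *adr when the address is
       // readable and errValue otherwise, because a fault at fault_pc is resumed
       // at continuation_pc with c_rarg1 still holding errValue.
       //
       //   int v = SafeFetch32(possibly_bad_ptr, -1);   // *possibly_bad_ptr, or -1 on a fault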
3162 
3163   /**
3164    *  Arguments:
3165    *
3166    * Inputs:
3167    *   c_rarg0   - int crc
3168    *   c_rarg1   - byte* buf
3169    *   c_rarg2   - int length
3170    *
3171    * Output:
3172    *       r0   - int crc result
3173    */
3174   address generate_updateBytesCRC32() {
3175     assert(UseCRC32Intrinsics, "what are we doing here?");
3176 
3177     __ align(CodeEntryAlignment);
3178     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3179 
3180     address start = __ pc();
3181 
3182     const Register crc   = c_rarg0;  // crc
3183     const Register buf   = c_rarg1;  // source java byte array address
3184     const Register len   = c_rarg2;  // length
3185     const Register table0 = c_rarg3; // crc_table address
3186     const Register table1 = c_rarg4;
3187     const Register table2 = c_rarg5;
3188     const Register table3 = c_rarg6;
3189     const Register tmp3 = c_rarg7;
3190 
3191     BLOCK_COMMENT("Entry:");
3192     __ enter(); // required for proper stackwalking of RuntimeStub frame
3193 
3194     __ kernel_crc32(crc, buf, len,
3195               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3196 
3197     __ leave(); // required for proper stackwalking of RuntimeStub frame
3198     __ ret(lr);
3199 
3200     return start;
3201   }
3202 
3203   /**
3204    *  Arguments:
3205    *
3206    * Inputs:
3207    *   c_rarg0   - int crc
3208    *   c_rarg1   - byte* buf
3209    *   c_rarg2   - int length
3210    *   c_rarg3   - int* table
3211    *
3212    * Output:
3213    *       r0   - int crc result
3214    */
3215   address generate_updateBytesCRC32C() {
3216     assert(UseCRC32CIntrinsics, "what are we doing here?");
3217 
3218     __ align(CodeEntryAlignment);
3219     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3220 
3221     address start = __ pc();
3222 
3223     const Register crc   = c_rarg0;  // crc
3224     const Register buf   = c_rarg1;  // source java byte array address
3225     const Register len   = c_rarg2;  // length
3226     const Register table0 = c_rarg3; // crc_table address
3227     const Register table1 = c_rarg4;
3228     const Register table2 = c_rarg5;
3229     const Register table3 = c_rarg6;
3230     const Register tmp3 = c_rarg7;
3231 
3232     BLOCK_COMMENT("Entry:");
3233     __ enter(); // required for proper stackwalking of RuntimeStub frame
3234 
3235     __ kernel_crc32c(crc, buf, len,
3236               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3237 
3238     __ leave(); // required for proper stackwalking of RuntimeStub frame
3239     __ ret(lr);
3240 
3241     return start;
3242   }
3243 
3244   /***
3245    *  Arguments:
3246    *
3247    *  Inputs:
3248    *   c_rarg0   - int   adler
3249    *   c_rarg1   - byte* buff
3250    *   c_rarg2   - int   len
3251    *
3252    * Output:
3253    *   c_rarg0   - int adler result
3254    */
3255   address generate_updateBytesAdler32() {
3256     __ align(CodeEntryAlignment);
3257     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3258     address start = __ pc();
3259 
3260     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3261 
3262     // Aliases
3263     Register adler  = c_rarg0;
3264     Register s1     = c_rarg0;
3265     Register s2     = c_rarg3;
3266     Register buff   = c_rarg1;
3267     Register len    = c_rarg2;
3268     Register nmax  = r4;
3269     Register base  = r5;
3270     Register count = r6;
3271     Register temp0 = rscratch1;
3272     Register temp1 = rscratch2;
3273     FloatRegister vbytes = v0;
3274     FloatRegister vs1acc = v1;
3275     FloatRegister vs2acc = v2;
3276     FloatRegister vtable = v3;
3277 
3278     // Max number of bytes we can process before having to take the mod
3279     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3280     unsigned long BASE = 0xfff1;
3281     unsigned long NMAX = 0x15B0;
3282 
3283     __ mov(base, BASE);
3284     __ mov(nmax, NMAX);
3285 
3286     // Load accumulation coefficients for the upper 16 bits
3287     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3288     __ ld1(vtable, __ T16B, Address(temp0));
3289 
3290     // s1 is initialized to the lower 16 bits of adler
3291     // s2 is initialized to the upper 16 bits of adler
3292     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3293     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3294 
3295     // The pipelined loop needs at least 16 elements for 1 iteration.
3296     // It would check this anyway, but it is more efficient to branch straight to the cleanup loop for short inputs.
3297     __ cmp(len, (u1)16);
3298     __ br(Assembler::HS, L_nmax);
3299     __ cbz(len, L_combine);
3300 
3301     __ bind(L_simple_by1_loop);
3302     __ ldrb(temp0, Address(__ post(buff, 1)));
3303     __ add(s1, s1, temp0);
3304     __ add(s2, s2, s1);
3305     __ subs(len, len, 1);
3306     __ br(Assembler::HI, L_simple_by1_loop);
3307 
3308     // s1 = s1 % BASE
3309     __ subs(temp0, s1, base);
3310     __ csel(s1, temp0, s1, Assembler::HS);
3311 
3312     // s2 = s2 % BASE
3313     __ lsr(temp0, s2, 16);
3314     __ lsl(temp1, temp0, 4);
3315     __ sub(temp1, temp1, temp0);
3316     __ add(s2, temp1, s2, ext::uxth);
3317 
3318     __ subs(temp0, s2, base);
3319     __ csel(s2, temp0, s2, Assembler::HS);
3320 
3321     __ b(L_combine);
3322 
3323     __ bind(L_nmax);
3324     __ subs(len, len, nmax);
3325     __ sub(count, nmax, 16);
3326     __ br(Assembler::LO, L_by16);
3327 
3328     __ bind(L_nmax_loop);
3329 
3330     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3331                                       vbytes, vs1acc, vs2acc, vtable);
3332 
3333     __ subs(count, count, 16);
3334     __ br(Assembler::HS, L_nmax_loop);
3335 
3336     // s1 = s1 % BASE
3337     __ lsr(temp0, s1, 16);
3338     __ lsl(temp1, temp0, 4);
3339     __ sub(temp1, temp1, temp0);
3340     __ add(temp1, temp1, s1, ext::uxth);
3341 
3342     __ lsr(temp0, temp1, 16);
3343     __ lsl(s1, temp0, 4);
3344     __ sub(s1, s1, temp0);
3345     __ add(s1, s1, temp1, ext::uxth);
3346 
3347     __ subs(temp0, s1, base);
3348     __ csel(s1, temp0, s1, Assembler::HS);
3349 
3350     // s2 = s2 % BASE
3351     __ lsr(temp0, s2, 16);
3352     __ lsl(temp1, temp0, 4);
3353     __ sub(temp1, temp1, temp0);
3354     __ add(temp1, temp1, s2, ext::uxth);
3355 
3356     __ lsr(temp0, temp1, 16);
3357     __ lsl(s2, temp0, 4);
3358     __ sub(s2, s2, temp0);
3359     __ add(s2, s2, temp1, ext::uxth);
3360 
3361     __ subs(temp0, s2, base);
3362     __ csel(s2, temp0, s2, Assembler::HS);
3363 
3364     __ subs(len, len, nmax);
3365     __ sub(count, nmax, 16);
3366     __ br(Assembler::HS, L_nmax_loop);
3367 
3368     __ bind(L_by16);
3369     __ adds(len, len, count);
3370     __ br(Assembler::LO, L_by1);
3371 
3372     __ bind(L_by16_loop);
3373 
3374     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3375                                       vbytes, vs1acc, vs2acc, vtable);
3376 
3377     __ subs(len, len, 16);
3378     __ br(Assembler::HS, L_by16_loop);
3379 
3380     __ bind(L_by1);
3381     __ adds(len, len, 15);
3382     __ br(Assembler::LO, L_do_mod);
3383 
3384     __ bind(L_by1_loop);
3385     __ ldrb(temp0, Address(__ post(buff, 1)));
3386     __ add(s1, temp0, s1);
3387     __ add(s2, s2, s1);
3388     __ subs(len, len, 1);
3389     __ br(Assembler::HS, L_by1_loop);
3390 
3391     __ bind(L_do_mod);
3392     // s1 = s1 % BASE
3393     __ lsr(temp0, s1, 16);
3394     __ lsl(temp1, temp0, 4);
3395     __ sub(temp1, temp1, temp0);
3396     __ add(temp1, temp1, s1, ext::uxth);
3397 
3398     __ lsr(temp0, temp1, 16);
3399     __ lsl(s1, temp0, 4);
3400     __ sub(s1, s1, temp0);
3401     __ add(s1, s1, temp1, ext::uxth);
3402 
3403     __ subs(temp0, s1, base);
3404     __ csel(s1, temp0, s1, Assembler::HS);
3405 
3406     // s2 = s2 % BASE
3407     __ lsr(temp0, s2, 16);
3408     __ lsl(temp1, temp0, 4);
3409     __ sub(temp1, temp1, temp0);
3410     __ add(temp1, temp1, s2, ext::uxth);
3411 
3412     __ lsr(temp0, temp1, 16);
3413     __ lsl(s2, temp0, 4);
3414     __ sub(s2, s2, temp0);
3415     __ add(s2, s2, temp1, ext::uxth);
3416 
3417     __ subs(temp0, s2, base);
3418     __ csel(s2, temp0, s2, Assembler::HS);
3419 
3420     // Combine lower bits and higher bits
3421     __ bind(L_combine);
3422     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3423 
3424     __ ret(lr);
3425 
3426     return start;
3427   }
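
       // The "% BASE" sequences above avoid a division: since 2^16 = 65521 + 15,
       // x mod 65521 == ((x >> 16) * 15 + (x & 0xffff)) mod 65521, and each
       // lsr/lsl/sub/add group computes exactly (x >> 16) * 15 + (x & 0xffff).
       // A scalar sketch of the full reduction (illustrative only):
       //
       //   static uint32_t fold_mod_base(uint32_t x) {   // any x < 2^32
       //     x = (x >> 16) * 15 + (x & 0xffff);          // now x < 2^20
       //     x = (x >> 16) * 15 + (x & 0xffff);          // now x < 65521 + 240
       //     if (x >= 65521) x -= 65521;                 // final conditional subtract (the csel above)
       //     return x;
       //   }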
3428 
3429   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
3430           Register temp0, Register temp1, FloatRegister vbytes,
3431           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
3432     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
3433     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
3434     // In non-vectorized code, we update s1 and s2 as:
3435     //   s1 <- s1 + b1
3436     //   s2 <- s2 + s1
3437     //   s1 <- s1 + b2
3438     //   s2 <- s2 + s1
3439     //   ...
3440     //   s1 <- s1 + b16
3441     //   s2 <- s2 + s1
3442     // Putting above assignments together, we have:
3443     //   s1_new = s1 + b1 + b2 + ... + b16
3444     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
3445     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
3446     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
3447     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
3448 
3449     // s2 = s2 + s1 * 16
3450     __ add(s2, s2, s1, Assembler::LSL, 4);
3451 
3452     // vs1acc = b1 + b2 + b3 + ... + b16
3453     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
3454     __ umullv(vs2acc, __ T8B, vtable, vbytes);
3455     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
3456     __ uaddlv(vs1acc, __ T16B, vbytes);
3457     __ uaddlv(vs2acc, __ T8H, vs2acc);
3458 
3459     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
3460     __ fmovd(temp0, vs1acc);
3461     __ fmovd(temp1, vs2acc);
3462     __ add(s1, s1, temp0);
3463     __ add(s2, s2, temp1);
3464   }
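
       // Scalar equivalent of one 16-byte accumulation step (illustrative only),
       // matching the identity derived in the comment above:
       //
       //   static void adler_accum16_ref(uint32_t* s1, uint32_t* s2, const uint8_t b[16]) {
       //     for (int i = 0; i < 16; i++) {
       //       *s1 += b[i];
       //       *s2 += *s1;
       //     }
       //   }
       //
       // which the vector code evaluates as s2 += 16 * s1_old + dot(b, {16, 15, ..., 1})
       // and s1 += sum(b).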
3465 
3466   /**
3467    *  Arguments:
3468    *
3469    *  Input:
3470    *    c_rarg0   - x address
3471    *    c_rarg1   - x length
3472    *    c_rarg2   - y address
3473    *    c_rarg3   - y length
3474    *    c_rarg4   - z address
3475    *    c_rarg5   - z length
3476    */
3477   address generate_multiplyToLen() {
3478     __ align(CodeEntryAlignment);
3479     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3480 
3481     address start = __ pc();
3482     const Register x     = r0;
3483     const Register xlen  = r1;
3484     const Register y     = r2;
3485     const Register ylen  = r3;
3486     const Register z     = r4;
3487     const Register zlen  = r5;
3488 
3489     const Register tmp1  = r10;
3490     const Register tmp2  = r11;
3491     const Register tmp3  = r12;
3492     const Register tmp4  = r13;
3493     const Register tmp5  = r14;
3494     const Register tmp6  = r15;
3495     const Register tmp7  = r16;
3496 
3497     BLOCK_COMMENT("Entry:");
3498     __ enter(); // required for proper stackwalking of RuntimeStub frame
3499     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3500     __ leave(); // required for proper stackwalking of RuntimeStub frame
3501     __ ret(lr);
3502 
3503     return start;
3504   }
3505 
3506   address generate_squareToLen() {
3507     // The squareToLen algorithm for sizes 1..127 described in the Java code is
3508     // faster than multiply_to_len on some CPUs and slower on others, but
3509     // multiply_to_len shows slightly better results overall.
3510     __ align(CodeEntryAlignment);
3511     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3512     address start = __ pc();
3513 
3514     const Register x     = r0;
3515     const Register xlen  = r1;
3516     const Register z     = r2;
3517     const Register zlen  = r3;
3518     const Register y     = r4; // == x
3519     const Register ylen  = r5; // == xlen
3520 
3521     const Register tmp1  = r10;
3522     const Register tmp2  = r11;
3523     const Register tmp3  = r12;
3524     const Register tmp4  = r13;
3525     const Register tmp5  = r14;
3526     const Register tmp6  = r15;
3527     const Register tmp7  = r16;
3528 
3529     RegSet spilled_regs = RegSet::of(y, ylen);
3530     BLOCK_COMMENT("Entry:");
3531     __ enter();
3532     __ push(spilled_regs, sp);
3533     __ mov(y, x);
3534     __ mov(ylen, xlen);
3535     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3536     __ pop(spilled_regs, sp);
3537     __ leave();
3538     __ ret(lr);
3539     return start;
3540   }
3541 
3542   address generate_mulAdd() {
3543     __ align(CodeEntryAlignment);
3544     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3545 
3546     address start = __ pc();
3547 
3548     const Register out     = r0;
3549     const Register in      = r1;
3550     const Register offset  = r2;
3551     const Register len     = r3;
3552     const Register k       = r4;
3553 
3554     BLOCK_COMMENT("Entry:");
3555     __ enter();
3556     __ mul_add(out, in, offset, len, k);
3557     __ leave();
3558     __ ret(lr);
3559 
3560     return start;
3561   }
3562 
3563   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3564                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3565                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3566     // Karatsuba multiplication performs a 128*128 -> 256-bit
3567     // multiplication in three 128-bit multiplications and a few
3568     // additions.
3569     //
3570     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3571     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3572     //
3573     // Inputs:
3574     //
3575     // A0 in a.d[0]     (subkey)
3576     // A1 in a.d[1]
3577     // (A1+A0) in a1_xor_a0.d[0]
3578     //
3579     // B0 in b.d[0]     (state)
3580     // B1 in b.d[1]
3581 
3582     __ ext(tmp1, __ T16B, b, b, 0x08);
3583     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3584     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3585     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3586     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3587 
3588     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3589     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3590     __ eor(tmp2, __ T16B, tmp2, tmp4);
3591     __ eor(tmp2, __ T16B, tmp2, tmp3);
3592 
3593     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3594     __ ins(result_hi, __ D, tmp2, 0, 1);
3595     __ ins(result_lo, __ D, tmp2, 1, 0);
3596   }
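
       // Ordinary-integer analogue of the three-multiplication Karatsuba identity
       // used above (illustrative only; in the GF(2) carry-less case the +/- become
       // XORs, which is why the stub recombines with eor only):
       //
       //   static uint64_t karatsuba_mul32_ref(uint32_t x, uint32_t y) {
       //     uint64_t x0 = x & 0xffff, x1 = x >> 16;
       //     uint64_t y0 = y & 0xffff, y1 = y >> 16;
       //     uint64_t d  = x0 * y0;                        // A0*B0
       //     uint64_t c  = x1 * y1;                        // A1*B1
       //     uint64_t e  = (x0 + x1) * (y0 + y1) - c - d;  // cross term from (A0+A1)(B0+B1)
       //     return d + (e << 16) + (c << 32);             // recombine the three products
       //   }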
3597 
3598   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3599                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3600     const FloatRegister t0 = result;
3601 
3602     // The GCM field polynomial f is z^128 + p(z), where p =
3603     // z^7+z^2+z+1.
3604     //
3605     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3606     //
3607     // so, given that the product we're reducing is
3608     //    a == lo + hi * z^128
3609     // substituting,
3610     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3611     //
3612     // we reduce by multiplying hi by p(z) and subtracting the result
3613     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3614     // bits we can do this with two 64-bit multiplications, lo*p and
3615     // hi*p.
3616 
3617     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3618     __ ext(t1, __ T16B, t0, z, 8);
3619     __ eor(hi, __ T16B, hi, t1);
3620     __ ext(t1, __ T16B, z, t0, 8);
3621     __ eor(lo, __ T16B, lo, t1);
3622     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3623     __ eor(result, __ T16B, lo, t0);
3624   }
3625 
3626   address generate_has_negatives(address &has_negatives_long) {
3627     const u1 large_loop_size = 64;
3628     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3629     int dcache_line = VM_Version::dcache_line_size();
3630 
3631     Register ary1 = r1, len = r2, result = r0;
3632 
3633     __ align(CodeEntryAlignment);
3634 
3635     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3636 
3637     address entry = __ pc();
3638 
3639     __ enter();
3640 
3641   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3642         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3643 
3644   __ cmp(len, (u1)15);
3645   __ br(Assembler::GT, LEN_OVER_15);
3646   // The only case when execution falls into this code is when the pointer is near
3647   // the end of a memory page and we have to avoid reading past the page boundary.
3648   __ add(ary1, ary1, len);
3649   __ subs(len, len, 8);
3650   __ br(Assembler::GT, LEN_OVER_8);
3651   __ ldr(rscratch2, Address(ary1, -8));
3652   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3653   __ lsrv(rscratch2, rscratch2, rscratch1);
3654   __ tst(rscratch2, UPPER_BIT_MASK);
3655   __ cset(result, Assembler::NE);
3656   __ leave();
3657   __ ret(lr);
3658   __ bind(LEN_OVER_8);
3659   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3660   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3661   __ tst(rscratch2, UPPER_BIT_MASK);
3662   __ br(Assembler::NE, RET_TRUE_NO_POP);
3663   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3664   __ lsrv(rscratch1, rscratch1, rscratch2);
3665   __ tst(rscratch1, UPPER_BIT_MASK);
3666   __ cset(result, Assembler::NE);
3667   __ leave();
3668   __ ret(lr);
3669 
3670   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3671   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3672 
3673   has_negatives_long = __ pc(); // 2nd entry point
3674 
3675   __ enter();
3676 
3677   __ bind(LEN_OVER_15);
3678     __ push(spilled_regs, sp);
3679     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3680     __ cbz(rscratch2, ALIGNED);
3681     __ ldp(tmp6, tmp1, Address(ary1));
3682     __ mov(tmp5, 16);
3683     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3684     __ add(ary1, ary1, rscratch1);
3685     __ sub(len, len, rscratch1);
3686     __ orr(tmp6, tmp6, tmp1);
3687     __ tst(tmp6, UPPER_BIT_MASK);
3688     __ br(Assembler::NE, RET_TRUE);
3689 
3690   __ bind(ALIGNED);
3691     __ cmp(len, large_loop_size);
3692     __ br(Assembler::LT, CHECK_16);
3693     // Perform a 16-byte load in the pre-loop so we can return early when an
3694     // initially aligned large array has negative values in its starting bytes;
3695     // otherwise LARGE_LOOP would do 4 reads instead of 1 in the worst case,
3696     // which is slower. Cases with negative bytes further ahead are not affected
3697     // much; in fact they are faster due to the early loads and the fewer
3698     // instructions and branches in LARGE_LOOP.
3699     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3700     __ sub(len, len, 16);
3701     __ orr(tmp6, tmp6, tmp1);
3702     __ tst(tmp6, UPPER_BIT_MASK);
3703     __ br(Assembler::NE, RET_TRUE);
3704     __ cmp(len, large_loop_size);
3705     __ br(Assembler::LT, CHECK_16);
3706 
3707     if (SoftwarePrefetchHintDistance >= 0
3708         && SoftwarePrefetchHintDistance >= dcache_line) {
3709       // initial prefetch
3710       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3711     }
3712   __ bind(LARGE_LOOP);
3713     if (SoftwarePrefetchHintDistance >= 0) {
3714       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3715     }
3716     // Issue the load instructions first, since this can save a few CPU/MEM cycles.
3717     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);" (one per ldp),
3718     // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves
3719     // 3 instructions per iteration and has fewer branches, but this approach disables
3720     // early return, so all 64 bytes are loaded and checked every time.
3721     __ ldp(tmp2, tmp3, Address(ary1));
3722     __ ldp(tmp4, tmp5, Address(ary1, 16));
3723     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3724     __ ldp(tmp6, tmp1, Address(ary1, 48));
3725     __ add(ary1, ary1, large_loop_size);
3726     __ sub(len, len, large_loop_size);
3727     __ orr(tmp2, tmp2, tmp3);
3728     __ orr(tmp4, tmp4, tmp5);
3729     __ orr(rscratch1, rscratch1, rscratch2);
3730     __ orr(tmp6, tmp6, tmp1);
3731     __ orr(tmp2, tmp2, tmp4);
3732     __ orr(rscratch1, rscratch1, tmp6);
3733     __ orr(tmp2, tmp2, rscratch1);
3734     __ tst(tmp2, UPPER_BIT_MASK);
3735     __ br(Assembler::NE, RET_TRUE);
3736     __ cmp(len, large_loop_size);
3737     __ br(Assembler::GE, LARGE_LOOP);
3738 
3739   __ bind(CHECK_16); // small 16-byte load pre-loop
3740     __ cmp(len, (u1)16);
3741     __ br(Assembler::LT, POST_LOOP16);
3742 
3743   __ bind(LOOP16); // small 16-byte load loop
3744     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3745     __ sub(len, len, 16);
3746     __ orr(tmp2, tmp2, tmp3);
3747     __ tst(tmp2, UPPER_BIT_MASK);
3748     __ br(Assembler::NE, RET_TRUE);
3749     __ cmp(len, (u1)16);
3750     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3751 
3752   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3753     __ cmp(len, (u1)8);
3754     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3755     __ ldr(tmp3, Address(__ post(ary1, 8)));
3756     __ sub(len, len, 8);
3757     __ tst(tmp3, UPPER_BIT_MASK);
3758     __ br(Assembler::NE, RET_TRUE);
3759 
3760   __ bind(POST_LOOP16_LOAD_TAIL);
3761     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3762     __ ldr(tmp1, Address(ary1));
3763     __ mov(tmp2, 64);
3764     __ sub(tmp4, tmp2, len, __ LSL, 3);
3765     __ lslv(tmp1, tmp1, tmp4);
3766     __ tst(tmp1, UPPER_BIT_MASK);
3767     __ br(Assembler::NE, RET_TRUE);
3768     // Fallthrough
3769 
3770   __ bind(RET_FALSE);
3771     __ pop(spilled_regs, sp);
3772     __ leave();
3773     __ mov(result, zr);
3774     __ ret(lr);
3775 
3776   __ bind(RET_TRUE);
3777     __ pop(spilled_regs, sp);
3778   __ bind(RET_TRUE_NO_POP);
3779     __ leave();
3780     __ mov(result, 1);
3781     __ ret(lr);
3782 
3783   __ bind(DONE);
3784     __ pop(spilled_regs, sp);
3785     __ leave();
3786     __ ret(lr);
3787     return entry;
3788   }
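
       // Scalar sketch of the SWAR test used above (illustrative only; types are
       // for illustration): a byte is "negative" exactly when its top bit is set,
       // so OR-ing 8-byte words together and testing UPPER_BIT_MASK checks a whole
       // word per iteration.
       //
       //   static bool has_negatives_ref(const int8_t* a, size_t len) {
       //     uint64_t acc = 0;
       //     size_t i = 0;
       //     for (; i + 8 <= len; i += 8) {
       //       uint64_t w;
       //       memcpy(&w, a + i, 8);           // one 8-byte load
       //       acc |= w;                       // accumulate sign bits
       //     }
       //     if (acc & 0x8080808080808080ULL) return true;
       //     for (; i < len; i++)              // byte tail (the stub shifts instead)
       //       if (a[i] < 0) return true;
       //     return false;
       //   }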
3789 
3790   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3791         bool usePrefetch, Label &NOT_EQUAL) {
3792     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3793         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3794         tmp7 = r12, tmp8 = r13;
3795     Label LOOP;
3796 
3797     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3798     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3799     __ bind(LOOP);
3800     if (usePrefetch) {
3801       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3802       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3803     }
3804     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3805     __ eor(tmp1, tmp1, tmp2);
3806     __ eor(tmp3, tmp3, tmp4);
3807     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3808     __ orr(tmp1, tmp1, tmp3);
3809     __ cbnz(tmp1, NOT_EQUAL);
3810     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3811     __ eor(tmp5, tmp5, tmp6);
3812     __ eor(tmp7, tmp7, tmp8);
3813     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3814     __ orr(tmp5, tmp5, tmp7);
3815     __ cbnz(tmp5, NOT_EQUAL);
3816     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3817     __ eor(tmp1, tmp1, tmp2);
3818     __ eor(tmp3, tmp3, tmp4);
3819     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3820     __ orr(tmp1, tmp1, tmp3);
3821     __ cbnz(tmp1, NOT_EQUAL);
3822     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3823     __ eor(tmp5, tmp5, tmp6);
3824     __ sub(cnt1, cnt1, 8 * wordSize);
3825     __ eor(tmp7, tmp7, tmp8);
3826     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3827     // tmp6 is not used. MacroAssembler::subs is used here (rather than
3828     // cmp) because subs allows an unlimited range of immediate operand.
3829     __ subs(tmp6, cnt1, loopThreshold);
3830     __ orr(tmp5, tmp5, tmp7);
3831     __ cbnz(tmp5, NOT_EQUAL);
3832     __ br(__ GE, LOOP);
3833     // post-loop
3834     __ eor(tmp1, tmp1, tmp2);
3835     __ eor(tmp3, tmp3, tmp4);
3836     __ orr(tmp1, tmp1, tmp3);
3837     __ sub(cnt1, cnt1, 2 * wordSize);
3838     __ cbnz(tmp1, NOT_EQUAL);
3839   }
3840 
3841   void generate_large_array_equals_loop_simd(int loopThreshold,
3842         bool usePrefetch, Label &NOT_EQUAL) {
3843     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3844         tmp2 = rscratch2;
3845     Label LOOP;
3846 
3847     __ bind(LOOP);
3848     if (usePrefetch) {
3849       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3850       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3851     }
3852     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3853     __ sub(cnt1, cnt1, 8 * wordSize);
3854     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3855     __ subs(tmp1, cnt1, loopThreshold);
3856     __ eor(v0, __ T16B, v0, v4);
3857     __ eor(v1, __ T16B, v1, v5);
3858     __ eor(v2, __ T16B, v2, v6);
3859     __ eor(v3, __ T16B, v3, v7);
3860     __ orr(v0, __ T16B, v0, v1);
3861     __ orr(v1, __ T16B, v2, v3);
3862     __ orr(v0, __ T16B, v0, v1);
3863     __ umov(tmp1, v0, __ D, 0);
3864     __ umov(tmp2, v0, __ D, 1);
3865     __ orr(tmp1, tmp1, tmp2);
3866     __ cbnz(tmp1, NOT_EQUAL);
3867     __ br(__ GE, LOOP);
3868   }
3869 
3870   // a1 = r1 - array1 address
3871   // a2 = r2 - array2 address
3872   // result = r0 - return value. Already contains "false"
3873   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
3874   // r3-r5 are reserved temporary registers
3875   address generate_large_array_equals() {
3876     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3877         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3878         tmp7 = r12, tmp8 = r13;
3879     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3880         SMALL_LOOP, POST_LOOP;
3881     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3882     // calculate if at least 32 prefetched bytes are used
3883     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3884     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3885     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3886     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3887         tmp5, tmp6, tmp7, tmp8);
3888 
3889     __ align(CodeEntryAlignment);
3890 
3891     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3892 
3893     address entry = __ pc();
3894     __ enter();
3895     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
3896     // also advance pointers to use post-increment instead of pre-increment
3897     __ add(a1, a1, wordSize);
3898     __ add(a2, a2, wordSize);
3899     if (AvoidUnalignedAccesses) {
3900       // Both implementations (SIMD and non-SIMD) use relatively large load
3901       // instructions (ld1/ldp), which carry a big penalty (up to 2x execution
3902       // time) on some CPUs when the address is not at least 16-byte aligned.
3903       // Arrays are currently only 8-byte aligned, so do an additional 8-byte
3904       // load if needed to make at least the first address 16-byte aligned.
3905       Label ALIGNED16;
3906       __ tbz(a1, 3, ALIGNED16);
3907       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3908       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3909       __ sub(cnt1, cnt1, wordSize);
3910       __ eor(tmp1, tmp1, tmp2);
3911       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3912       __ bind(ALIGNED16);
3913     }
3914     if (UseSIMDForArrayEquals) {
3915       if (SoftwarePrefetchHintDistance >= 0) {
3916         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3917         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3918         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3919             /* prfm = */ true, NOT_EQUAL);
3920         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3921         __ br(__ LT, TAIL);
3922       }
3923       __ bind(NO_PREFETCH_LARGE_LOOP);
3924       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3925           /* prfm = */ false, NOT_EQUAL);
3926     } else {
3927       __ push(spilled_regs, sp);
3928       if (SoftwarePrefetchHintDistance >= 0) {
3929         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3930         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3931         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3932             /* prfm = */ true, NOT_EQUAL);
3933         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3934         __ br(__ LT, TAIL);
3935       }
3936       __ bind(NO_PREFETCH_LARGE_LOOP);
3937       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3938           /* prfm = */ false, NOT_EQUAL);
3939     }
3940     __ bind(TAIL);
3941       __ cbz(cnt1, EQUAL);
3942       __ subs(cnt1, cnt1, wordSize);
3943       __ br(__ LE, POST_LOOP);
3944     __ bind(SMALL_LOOP);
3945       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3946       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3947       __ subs(cnt1, cnt1, wordSize);
3948       __ eor(tmp1, tmp1, tmp2);
3949       __ cbnz(tmp1, NOT_EQUAL);
3950       __ br(__ GT, SMALL_LOOP);
3951     __ bind(POST_LOOP);
3952       __ ldr(tmp1, Address(a1, cnt1));
3953       __ ldr(tmp2, Address(a2, cnt1));
3954       __ eor(tmp1, tmp1, tmp2);
3955       __ cbnz(tmp1, NOT_EQUAL);
3956     __ bind(EQUAL);
3957       __ mov(result, true);
3958     __ bind(NOT_EQUAL);
3959       if (!UseSIMDForArrayEquals) {
3960         __ pop(spilled_regs, sp);
3961       }
3962     __ bind(NOT_EQUAL_NO_POP);
3963     __ leave();
3964     __ ret(lr);
3965     return entry;
3966   }
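
       // Scalar sketch of the comparison scheme above (illustrative only): XOR
       // corresponding 8-byte words and OR the results together, so a single
       // non-zero check per group detects any difference (the stub additionally
       // checks each group as it goes so it can bail out early).
       //
       //   static bool words_equal_ref(const uint64_t* a, const uint64_t* b, size_t nwords) {
       //     uint64_t diff = 0;
       //     for (size_t i = 0; i < nwords; i++) {
       //       diff |= a[i] ^ b[i];            // eor + orr accumulation
       //     }
       //     return diff == 0;
       //   }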
3967 
3968   address generate_dsin_dcos(bool isCos) {
3969     __ align(CodeEntryAlignment);
3970     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
3971     address start = __ pc();
3972     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
3973         (address)StubRoutines::aarch64::_two_over_pi,
3974         (address)StubRoutines::aarch64::_pio2,
3975         (address)StubRoutines::aarch64::_dsin_coef,
3976         (address)StubRoutines::aarch64::_dcos_coef);
3977     return start;
3978   }
3979 
3980   address generate_dlog() {
3981     __ align(CodeEntryAlignment);
3982     StubCodeMark mark(this, "StubRoutines", "dlog");
3983     address entry = __ pc();
3984     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
3985         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
3986     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
3987     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
3988         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
3989     return entry;
3990   }
3991 
3992   // code for comparing 16 bytes of strings with same encoding
3993   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
3994     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
3995     __ ldr(rscratch1, Address(__ post(str1, 8)));
3996     __ eor(rscratch2, tmp1, tmp2);
3997     __ ldr(cnt1, Address(__ post(str2, 8)));
3998     __ cbnz(rscratch2, DIFF1);
3999     __ ldr(tmp1, Address(__ post(str1, 8)));
4000     __ eor(rscratch2, rscratch1, cnt1);
4001     __ ldr(tmp2, Address(__ post(str2, 8)));
4002     __ cbnz(rscratch2, DIFF2);
4003   }
4004 
4005   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
4006   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4007       Label &DIFF2) {
4008     Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12;
4009     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
4010 
4011     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4012     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4013     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4014     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4015 
4016     __ fmovd(tmpL, vtmp3);
4017     __ eor(rscratch2, tmp3, tmpL);
4018     __ cbnz(rscratch2, DIFF2);
4019 
4020     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4021     __ umov(tmpL, vtmp3, __ D, 1);
4022     __ eor(rscratch2, tmpU, tmpL);
4023     __ cbnz(rscratch2, DIFF1);
4024 
4025     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4026     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4027     __ fmovd(tmpL, vtmp);
4028     __ eor(rscratch2, tmp3, tmpL);
4029     __ cbnz(rscratch2, DIFF2);
4030 
4031     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4032     __ umov(tmpL, vtmp, __ D, 1);
4033     __ eor(rscratch2, tmpU, tmpL);
4034     __ cbnz(rscratch2, DIFF1);
4035   }
4036 
4037   // r0  = result
4038   // r1  = str1
4039   // r2  = cnt1
4040   // r3  = str2
4041   // r4  = cnt2
4042   // r10 = tmp1
4043   // r11 = tmp2
4044   address generate_compare_long_string_different_encoding(bool isLU) {
4045     __ align(CodeEntryAlignment);
4046     StubCodeMark mark(this, "StubRoutines", isLU
4047         ? "compare_long_string_different_encoding LU"
4048         : "compare_long_string_different_encoding UL");
4049     address entry = __ pc();
4050     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4051         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
4052         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4053     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4054         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4055     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4056     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4057 
4058     int prefetchLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance/2);
4059 
4060     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4061     // cnt2 == amount of characters left to compare
4062     // Check the already loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
4063     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4064     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4065     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4066     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4067     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4068     __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1);
4069     __ eor(rscratch2, tmp1, tmp2);
4070     __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0);
4071     __ mov(rscratch1, tmp2);
4072     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4073     Register strU = isLU ? str2 : str1,
4074              strL = isLU ? str1 : str2,
4075              tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4076              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4077     __ push(spilled_regs, sp);
4078     __ sub(tmp2, strL, cnt2); // strL pointer to load from
4079     __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from
4080 
4081     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4082 
4083     if (SoftwarePrefetchHintDistance >= 0) {
4084       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4085       __ br(__ LT, NO_PREFETCH);
4086       __ bind(LARGE_LOOP_PREFETCH);
4087         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4088         __ mov(tmp4, 2);
4089         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4090         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4091           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4092           __ subs(tmp4, tmp4, 1);
4093           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4094           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4095           __ mov(tmp4, 2);
4096         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4097           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4098           __ subs(tmp4, tmp4, 1);
4099           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4100           __ sub(cnt2, cnt2, 64);
4101           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4102           __ br(__ GE, LARGE_LOOP_PREFETCH);
4103     }
4104     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4105     __ bind(NO_PREFETCH);
4106     __ subs(cnt2, cnt2, 16);
4107     __ br(__ LT, TAIL);
4108     __ bind(SMALL_LOOP); // smaller loop
4109       __ subs(cnt2, cnt2, 16);
4110       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4111       __ br(__ GE, SMALL_LOOP);
4112       __ cmn(cnt2, (u1)16);
4113       __ br(__ EQ, LOAD_LAST);
4114     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
4115       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 8 bytes before last 4 characters in UTF-16 string
4116       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
4117       __ ldr(tmp3, Address(cnt1, -8));
4118       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
4119       __ b(LOAD_LAST);
4120     __ bind(DIFF2);
4121       __ mov(tmpU, tmp3);
4122     __ bind(DIFF1);
4123       __ pop(spilled_regs, sp);
4124       __ b(CALCULATE_DIFFERENCE);
4125     __ bind(LOAD_LAST);
4126       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU,
4127       // so there is no need to load them again.
4128       __ mov(tmpU, tmp3);
4129       __ pop(spilled_regs, sp);
4130 
4131       __ ldrs(vtmp, Address(strL));
4132       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4133       __ fmovd(tmpL, vtmp);
4134 
4135       __ eor(rscratch2, tmpU, tmpL);
4136       __ cbz(rscratch2, DONE);
4137 
4138     // Find the first different characters in the longwords and
4139     // compute their difference.
4140     __ bind(CALCULATE_DIFFERENCE);
4141       __ rev(rscratch2, rscratch2);
4142       __ clz(rscratch2, rscratch2);
4143       __ andr(rscratch2, rscratch2, -16);
4144       __ lsrv(tmp1, tmp1, rscratch2);
4145       __ uxthw(tmp1, tmp1);
4146       __ lsrv(rscratch1, rscratch1, rscratch2);
4147       __ uxthw(rscratch1, rscratch1);
4148       __ subw(result, tmp1, rscratch1);
4149     __ bind(DONE);
4150       __ ret(lr);
4151     return entry;
4152   }
4153 
4154   // r0  = result
4155   // r1  = str1
4156   // r2  = cnt1
4157   // r3  = str2
4158   // r4  = cnt2
4159   // r10 = tmp1
4160   // r11 = tmp2
4161   address generate_compare_long_string_same_encoding(bool isLL) {
4162     __ align(CodeEntryAlignment);
4163     StubCodeMark mark(this, "StubRoutines", isLL
4164         ? "compare_long_string_same_encoding LL"
4165         : "compare_long_string_same_encoding UU");
4166     address entry = __ pc();
4167     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4168         tmp1 = r10, tmp2 = r11;
4169     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4170         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4171         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4172     // Exit the large loop when fewer than 64 bytes are left to read or we're
4173     // about to prefetch memory beyond the array boundary.
4174     int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4175     // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used
4176     // update cnt2 counter with already loaded 8 bytes
4177     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4178     // update pointers, because of previous read
4179     __ add(str1, str1, wordSize);
4180     __ add(str2, str2, wordSize);
4181     if (SoftwarePrefetchHintDistance >= 0) {
4182       __ bind(LARGE_LOOP_PREFETCH);
4183         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4184         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4185         compare_string_16_bytes_same(DIFF, DIFF2);
4186         compare_string_16_bytes_same(DIFF, DIFF2);
4187         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4188         compare_string_16_bytes_same(DIFF, DIFF2);
4189         __ subs(rscratch2, cnt2, largeLoopExitCondition);
4190         compare_string_16_bytes_same(DIFF, DIFF2);
4191         __ br(__ GT, LARGE_LOOP_PREFETCH);
4192         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4193     }
4194     // less than 16 bytes left?
4195     __ subs(cnt2, cnt2, isLL ? 16 : 8);
4196     __ br(__ LT, TAIL);
4197     __ bind(SMALL_LOOP);
4198       compare_string_16_bytes_same(DIFF, DIFF2);
4199       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4200       __ br(__ GE, SMALL_LOOP);
4201     __ bind(TAIL);
4202       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4203       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4204       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4205       __ br(__ LE, CHECK_LAST);
4206       __ eor(rscratch2, tmp1, tmp2);
4207       __ cbnz(rscratch2, DIFF);
4208       __ ldr(tmp1, Address(__ post(str1, 8)));
4209       __ ldr(tmp2, Address(__ post(str2, 8)));
4210       __ sub(cnt2, cnt2, isLL ? 8 : 4);
4211     __ bind(CHECK_LAST);
4212       if (!isLL) {
4213         __ add(cnt2, cnt2, cnt2); // now in bytes
4214       }
4215       __ eor(rscratch2, tmp1, tmp2);
4216       __ cbnz(rscratch2, DIFF);
4217       __ ldr(rscratch1, Address(str1, cnt2));
4218       __ ldr(cnt1, Address(str2, cnt2));
4219       __ eor(rscratch2, rscratch1, cnt1);
4220       __ cbz(rscratch2, LENGTH_DIFF);
4221       // Find the first different characters in the longwords and
4222       // compute their difference.
4223     __ bind(DIFF2);
4224       __ rev(rscratch2, rscratch2);
4225       __ clz(rscratch2, rscratch2);
4226       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4227       __ lsrv(rscratch1, rscratch1, rscratch2);
4228       if (isLL) {
4229         __ lsrv(cnt1, cnt1, rscratch2);
4230         __ uxtbw(rscratch1, rscratch1);
4231         __ uxtbw(cnt1, cnt1);
4232       } else {
4233         __ lsrv(cnt1, cnt1, rscratch2);
4234         __ uxthw(rscratch1, rscratch1);
4235         __ uxthw(cnt1, cnt1);
4236       }
4237       __ subw(result, rscratch1, cnt1);
4238       __ b(LENGTH_DIFF);
4239     __ bind(DIFF);
4240       __ rev(rscratch2, rscratch2);
4241       __ clz(rscratch2, rscratch2);
4242       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4243       __ lsrv(tmp1, tmp1, rscratch2);
4244       if (isLL) {
4245         __ lsrv(tmp2, tmp2, rscratch2);
4246         __ uxtbw(tmp1, tmp1);
4247         __ uxtbw(tmp2, tmp2);
4248       } else {
4249         __ lsrv(tmp2, tmp2, rscratch2);
4250         __ uxthw(tmp1, tmp1);
4251         __ uxthw(tmp2, tmp2);
4252       }
4253       __ subw(result, tmp1, tmp2);
4254       __ b(LENGTH_DIFF);
4255     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4256       __ eor(rscratch2, tmp1, tmp2);
4257       __ cbnz(rscratch2, DIFF);
4258     __ bind(LENGTH_DIFF);
4259       __ ret(lr);
4260     return entry;
4261   }
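
       // Sketch of the DIFF/DIFF2 calculation above (illustrative only): given
       // w = x ^ y for two little-endian 8-byte chunks, the first differing
       // byte/char is the lowest non-zero byte of w. AArch64 has no
       // count-trailing-zeros instruction, so the stub uses rev + clz, rounds
       // down to an element boundary, and shifts both words to extract and
       // subtract the differing elements (Latin1 case shown; __builtin_ctzll is
       // a GCC/Clang builtin used only for illustration):
       //
       //   static int first_diff_latin1_ref(uint64_t x, uint64_t y) {
       //     uint64_t w = x ^ y;                   // caller guarantees w != 0
       //     int bit = __builtin_ctzll(w) & ~7;    // == (clz(rev(w)) & -8) in the stub
       //     return (int)((x >> bit) & 0xff) - (int)((y >> bit) & 0xff);
       //   }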
4262 
4263   void generate_compare_long_strings() {
4264       StubRoutines::aarch64::_compare_long_string_LL
4265           = generate_compare_long_string_same_encoding(true);
4266       StubRoutines::aarch64::_compare_long_string_UU
4267           = generate_compare_long_string_same_encoding(false);
4268       StubRoutines::aarch64::_compare_long_string_LU
4269           = generate_compare_long_string_different_encoding(true);
4270       StubRoutines::aarch64::_compare_long_string_UL
4271           = generate_compare_long_string_different_encoding(false);
4272   }
4273 
4274   // R0 = result
4275   // R1 = str2
4276   // R2 = cnt1
4277   // R3 = str1
4278   // R4 = cnt2
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the 1st register of the pattern loaded
  // (since length >= 8), skipping the initial load (helps on systems with a
  // single load pipeline)
  // 2) we can use a "fast" algorithm for finding the first character, with
  // fewer branches (one branch per loaded register instead of one per
  // character); this is where constants like 0x0101...01, 0x00010001...0001,
  // 0x7f7f...7f and 0x7fff7fff...7fff come from (see the C sketch below)
  // 3) after loading and analyzing the 1st register of the source string, it
  // can be re-used to search for every occurrence of the 1st character,
  // saving a few loads compared to a simpler-but-slower implementation
  // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
  // re-initializes and compresses register values, which makes the code
  // larger and a bit less readable; however, most of the extra operations are
  // issued during loads or branches, so the penalty is minimal
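  //
  // A C sketch of idea 2) for the Latin-1 case (illustrative only; the
  // generated code also handles the 16-bit variants with the
  // 0x0001000100010001 / 0x7fff7fff7fff7fff constants):
  //
  //   uint64_t first = pattern[0] * 0x0101010101010101UL; // splat 1st char
  //   uint64_t x     = chunk ^ first;    // zero byte <=> matching character
  //   uint64_t hits  = (x - 0x0101010101010101UL) & ~x & 0x8080808080808080UL;
  //   // hits != 0 iff some byte of chunk equals pattern[0]; the position of
  //   // its lowest set bit gives the candidate match offset within chunk.
  //   // (The stub computes ~x & 0x80... as ~(x | 0x7f...), which is the
  //   // same thing.)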
4293   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4294     const char* stubName = str1_isL
4295         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4296         : "indexof_linear_uu";
4297     __ align(CodeEntryAlignment);
4298     StubCodeMark mark(this, "StubRoutines", stubName);
4299     address entry = __ pc();
4300 
4301     int str1_chr_size = str1_isL ? 1 : 2;
4302     int str2_chr_size = str2_isL ? 1 : 2;
4303     int str1_chr_shift = str1_isL ? 0 : 1;
4304     int str2_chr_shift = str2_isL ? 0 : 1;
4305     bool isL = str1_isL && str2_isL;
    // parameters
4307     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4308     // temporary registers
4309     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4310     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4311     // redefinitions
4312     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4313 
4314     __ push(spilled_regs, sp);
4315     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
4316         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4317         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4318         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4319         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4320         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
    // Read a whole register from str1; this is safe because length >= 8 here
4322     __ ldr(ch1, Address(str1));
    // Read a whole register from str2; this is safe because length >= 8 here
4324     __ ldr(ch2, Address(str2));
4325     __ sub(cnt2, cnt2, cnt1);
4326     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4327     if (str1_isL != str2_isL) {
4328       __ eor(v0, __ T16B, v0, v0);
4329     }
4330     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4331     __ mul(first, first, tmp1);
4332     // check if we have less than 1 register to check
4333     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4334     if (str1_isL != str2_isL) {
4335       __ fmovd(v1, ch1);
4336     }
4337     __ br(__ LE, L_SMALL);
4338     __ eor(ch2, first, ch2);
4339     if (str1_isL != str2_isL) {
4340       __ zip1(v1, __ T16B, v1, v0);
4341     }
4342     __ sub(tmp2, ch2, tmp1);
4343     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4344     __ bics(tmp2, tmp2, ch2);
4345     if (str1_isL != str2_isL) {
4346       __ fmovd(ch1, v1);
4347     }
4348     __ br(__ NE, L_HAS_ZERO);
4349     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4350     __ add(result, result, wordSize/str2_chr_size);
4351     __ add(str2, str2, wordSize);
4352     __ br(__ LT, L_POST_LOOP);
4353     __ BIND(L_LOOP);
4354       __ ldr(ch2, Address(str2));
4355       __ eor(ch2, first, ch2);
4356       __ sub(tmp2, ch2, tmp1);
4357       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4358       __ bics(tmp2, tmp2, ch2);
4359       __ br(__ NE, L_HAS_ZERO);
4360     __ BIND(L_LOOP_PROCEED);
4361       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4362       __ add(str2, str2, wordSize);
4363       __ add(result, result, wordSize/str2_chr_size);
4364       __ br(__ GE, L_LOOP);
4365     __ BIND(L_POST_LOOP);
4366       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
4367       __ br(__ LE, NOMATCH);
4368       __ ldr(ch2, Address(str2));
4369       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4370       __ eor(ch2, first, ch2);
4371       __ sub(tmp2, ch2, tmp1);
4372       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4373       __ mov(tmp4, -1); // all bits set
4374       __ b(L_SMALL_PROCEED);
4375     __ align(OptoLoopAlignment);
4376     __ BIND(L_SMALL);
4377       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4378       __ eor(ch2, first, ch2);
4379       if (str1_isL != str2_isL) {
4380         __ zip1(v1, __ T16B, v1, v0);
4381       }
4382       __ sub(tmp2, ch2, tmp1);
4383       __ mov(tmp4, -1); // all bits set
4384       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4385       if (str1_isL != str2_isL) {
4386         __ fmovd(ch1, v1); // move converted 4 symbols
4387       }
4388     __ BIND(L_SMALL_PROCEED);
4389       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
4390       __ bic(tmp2, tmp2, ch2);
4391       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
4392       __ rbit(tmp2, tmp2);
4393       __ br(__ EQ, NOMATCH);
4394     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long; up to 4 cycles on some CPUs
4396       __ cmp(cnt1, u1(wordSize/str2_chr_size));
4397       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4398       if (str2_isL) { // LL
4399         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4400         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4401         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4402         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4403         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4404       } else {
4405         __ mov(ch2, 0xE); // all bits in byte set except last one
4406         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4407         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4408         __ lslv(tmp2, tmp2, tmp4);
4409         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4410         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4411         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4412         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4413       }
4414       __ cmp(ch1, ch2);
4415       __ mov(tmp4, wordSize/str2_chr_size);
4416       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4417     __ BIND(L_SMALL_CMP_LOOP);
4418       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4419                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4420       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4421                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4422       __ add(tmp4, tmp4, 1);
4423       __ cmp(tmp4, cnt1);
4424       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4425       __ cmp(first, ch2);
4426       __ br(__ EQ, L_SMALL_CMP_LOOP);
4427     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4428       __ cbz(tmp2, NOMATCH); // no more matches. exit
4429       __ clz(tmp4, tmp2);
4430       __ add(result, result, 1); // advance index
4431       __ add(str2, str2, str2_chr_size); // advance pointer
4432       __ b(L_SMALL_HAS_ZERO_LOOP);
4433     __ align(OptoLoopAlignment);
4434     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4435       __ cmp(first, ch2);
4436       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4437       __ b(DONE);
4438     __ align(OptoLoopAlignment);
4439     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4440       if (str2_isL) { // LL
4441         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4442         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4443         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4444         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4445         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4446       } else {
4447         __ mov(ch2, 0xE); // all bits in byte set except last one
4448         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4449         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4450         __ lslv(tmp2, tmp2, tmp4);
4451         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4452         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4453         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4454         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4455       }
4456       __ cmp(ch1, ch2);
4457       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4458       __ b(DONE);
4459     __ align(OptoLoopAlignment);
4460     __ BIND(L_HAS_ZERO);
4461       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long; up to 4 cycles on some CPUs
      // Now compress the two counters (cnt2 and cnt1) into one register. This
      // is fine because both counters are 32-bit and are not changed in this
      // loop; they are restored on exit. In effect cnt2 becomes
      // (cnt1 << 32) | cnt2, so cnt1 can be re-used inside this loop.
4466       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
4467       __ sub(result, result, 1);
4468     __ BIND(L_HAS_ZERO_LOOP);
4469       __ mov(cnt1, wordSize/str2_chr_size);
4470       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4471       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4472       if (str2_isL) {
4473         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4474         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4475         __ lslv(tmp2, tmp2, tmp4);
4476         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4477         __ add(tmp4, tmp4, 1);
4478         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4479         __ lsl(tmp2, tmp2, 1);
4480         __ mov(tmp4, wordSize/str2_chr_size);
4481       } else {
4482         __ mov(ch2, 0xE);
4483         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4484         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4485         __ lslv(tmp2, tmp2, tmp4);
4486         __ add(tmp4, tmp4, 1);
4487         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4488         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4489         __ lsl(tmp2, tmp2, 1);
4490         __ mov(tmp4, wordSize/str2_chr_size);
4491         __ sub(str2, str2, str2_chr_size);
4492       }
4493       __ cmp(ch1, ch2);
4494       __ mov(tmp4, wordSize/str2_chr_size);
4495       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4496     __ BIND(L_CMP_LOOP);
4497       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4498                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4499       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4500                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4501       __ add(tmp4, tmp4, 1);
4502       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4503       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4504       __ cmp(cnt1, ch2);
4505       __ br(__ EQ, L_CMP_LOOP);
4506     __ BIND(L_CMP_LOOP_NOMATCH);
      // no match at this position
4508       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4509       __ clz(tmp4, tmp2);
4510       __ add(str2, str2, str2_chr_size); // advance pointer
4511       __ b(L_HAS_ZERO_LOOP);
4512     __ align(OptoLoopAlignment);
4513     __ BIND(L_CMP_LOOP_LAST_CMP);
4514       __ cmp(cnt1, ch2);
4515       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4516       __ b(DONE);
4517     __ align(OptoLoopAlignment);
4518     __ BIND(L_CMP_LOOP_LAST_CMP2);
4519       if (str2_isL) {
4520         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4521         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4522         __ lslv(tmp2, tmp2, tmp4);
4523         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4524         __ add(tmp4, tmp4, 1);
4525         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4526         __ lsl(tmp2, tmp2, 1);
4527       } else {
4528         __ mov(ch2, 0xE);
4529         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4530         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4531         __ lslv(tmp2, tmp2, tmp4);
4532         __ add(tmp4, tmp4, 1);
4533         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4534         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4535         __ lsl(tmp2, tmp2, 1);
4536         __ sub(str2, str2, str2_chr_size);
4537       }
4538       __ cmp(ch1, ch2);
4539       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4540       __ b(DONE);
4541     __ align(OptoLoopAlignment);
4542     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. The index was a multiple of
      // wordSize/str2_chr_size until the L_HAS_ZERO block. One byte octet was
      // analyzed in L_HAS_ZERO_LOOP, so result was increased by at most
      // wordSize/str2_chr_size - 1 and the corresponding higher bits were not
      // changed. L_LOOP_PROCEED will increase result by the number of analyzed
      // characters, so we can simply reset the lower bits of result here
      // (clear the 2 lower bits for UU/UL and the 3 lower bits for LL).
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the
      // index of the last analyzed substring inside the current octet, so
      // str2 is rewound to the octet's start address; L_LOOP_PROCEED then
      // advances it to the next octet.
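      //
      // In C, the restore below is approximately (illustrative only):
      //   analyzed = result & (wordSize/str2_chr_size - 1); // symbols analyzed
      //   cnt1     = cnt2 >> 32;                            // original cnt1
      //   result  &= ~(uint64_t)(wordSize/str2_chr_size - 1);
      //   str2    -= analyzed << str2_chr_shift;            // back to octet start
      //   cnt2     = (uint32_t)cnt2;                        // original cnt2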
4553       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4554       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4555       __ bfm(result, zr, 0, 2 - str2_chr_shift);
4556       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4557       __ movw(cnt2, cnt2);
4558       __ b(L_LOOP_PROCEED);
4559     __ align(OptoLoopAlignment);
4560     __ BIND(NOMATCH);
4561       __ mov(result, -1);
4562     __ BIND(DONE);
4563       __ pop(spilled_regs, sp);
4564       __ ret(lr);
4565     return entry;
4566   }
4567 
4568   void generate_string_indexof_stubs() {
4569     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
4570     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
4571     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
4572   }
4573 
4574   void inflate_and_store_2_fp_registers(bool generatePrfm,
4575       FloatRegister src1, FloatRegister src2) {
4576     Register dst = r1;
4577     __ zip1(v1, __ T16B, src1, v0);
4578     __ zip2(v2, __ T16B, src1, v0);
4579     if (generatePrfm) {
4580       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4581     }
4582     __ zip1(v3, __ T16B, src2, v0);
4583     __ zip2(v4, __ T16B, src2, v0);
4584     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4585   }
4586 
4587   // R0 = src
4588   // R1 = dst
4589   // R2 = len
4590   // R3 = len >> 3
4591   // V0 = 0
4592   // v1 = loaded 8 bytes
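  //
  // In C, the whole stub is approximately (an illustrative sketch; the stub
  // itself processes 64-byte blocks with zip1/zip2 and software prefetching):
  //
  //   void inflate(const uint8_t *src, uint16_t *dst, size_t len) {
  //     for (size_t i = 0; i < len; i++) {
  //       dst[i] = src[i];              // zero-extend each byte to a char
  //     }
  //   }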
4593   address generate_large_byte_array_inflate() {
4594     __ align(CodeEntryAlignment);
4595     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4596     address entry = __ pc();
4597     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4598     Register src = r0, dst = r1, len = r2, octetCounter = r3;
4599     const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
4600 
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also lets us use a single store instruction.
4603     __ ldrd(v2, __ post(src, 8));
4604     __ sub(octetCounter, octetCounter, 2);
4605     __ zip1(v1, __ T16B, v1, v0);
4606     __ zip1(v2, __ T16B, v2, v0);
4607     __ st1(v1, v2, __ T16B, __ post(dst, 32));
4608     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4609     __ subs(rscratch1, octetCounter, large_loop_threshold);
4610     __ br(__ LE, LOOP_START);
4611     __ b(LOOP_PRFM_START);
4612     __ bind(LOOP_PRFM);
4613       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4614     __ bind(LOOP_PRFM_START);
4615       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4616       __ sub(octetCounter, octetCounter, 8);
4617       __ subs(rscratch1, octetCounter, large_loop_threshold);
4618       inflate_and_store_2_fp_registers(true, v3, v4);
4619       inflate_and_store_2_fp_registers(true, v5, v6);
4620       __ br(__ GT, LOOP_PRFM);
4621       __ cmp(octetCounter, (u1)8);
4622       __ br(__ LT, DONE);
4623     __ bind(LOOP);
4624       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4625       __ bind(LOOP_START);
4626       __ sub(octetCounter, octetCounter, 8);
4627       __ cmp(octetCounter, (u1)8);
4628       inflate_and_store_2_fp_registers(false, v3, v4);
4629       inflate_and_store_2_fp_registers(false, v5, v6);
4630       __ br(__ GE, LOOP);
4631     __ bind(DONE);
4632       __ ret(lr);
4633     return entry;
4634   }
4635 
4636   /**
4637    *  Arguments:
4638    *
4639    *  Input:
4640    *  c_rarg0   - current state address
4641    *  c_rarg1   - H key address
4642    *  c_rarg2   - data address
4643    *  c_rarg3   - number of blocks
4644    *
4645    *  Output:
4646    *  Updated state at c_rarg0
4647    */
4648   address generate_ghash_processBlocks() {
4649     // Bafflingly, GCM uses little-endian for the byte order, but
4650     // big-endian for the bit order.  For example, the polynomial 1 is
4651     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4652     //
4653     // So, we must either reverse the bytes in each word and do
4654     // everything big-endian or reverse the bits in each byte and do
4655     // it little-endian.  On AArch64 it's more idiomatic to reverse
4656     // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order throughout the
4658     // calculation, bit-reversing the inputs and outputs.
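    //
    // At a high level, each iteration of the loop below computes,
    // approximately,
    //
    //   state = gf128_mul(state ^ block, H);
    //
    // i.e. a carry-less multiply in GF(2^128) followed by reduction modulo
    // x^128 + x^7 + x^2 + x + 1 (the 0x87 constant emitted just below).
    // gf128_mul is merely shorthand for that operation, not a real routine.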
4659 
4660     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4661     __ align(wordSize * 2);
4662     address p = __ pc();
4663     __ emit_int64(0x87);  // The low-order bits of the field
4664                           // polynomial (i.e. p = z^7+z^2+z+1)
4665                           // repeated in the low and high parts of a
4666                           // 128-bit vector
4667     __ emit_int64(0x87);
4668 
4669     __ align(CodeEntryAlignment);
4670     address start = __ pc();
4671 
4672     Register state   = c_rarg0;
4673     Register subkeyH = c_rarg1;
4674     Register data    = c_rarg2;
4675     Register blocks  = c_rarg3;
4676 
4677     FloatRegister vzr = v30;
4678     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4679 
4680     __ ldrq(v0, Address(state));
4681     __ ldrq(v1, Address(subkeyH));
4682 
4683     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4684     __ rbit(v0, __ T16B, v0);
4685     __ rev64(v1, __ T16B, v1);
4686     __ rbit(v1, __ T16B, v1);
4687 
4688     __ ldrq(v26, p);
4689 
4690     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4691     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4692 
4693     {
4694       Label L_ghash_loop;
4695       __ bind(L_ghash_loop);
4696 
4697       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4698                                                  // reversing each byte
4699       __ rbit(v2, __ T16B, v2);
4700       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4701 
4702       // Multiply state in v2 by subkey in v1
4703       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4704                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4705                      /*temps*/v6, v20, v18, v21);
4706       // Reduce v7:v5 by the field polynomial
4707       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4708 
4709       __ sub(blocks, blocks, 1);
4710       __ cbnz(blocks, L_ghash_loop);
4711     }
4712 
4713     // The bit-reversed result is at this point in v0
4714     __ rev64(v1, __ T16B, v0);
4715     __ rbit(v1, __ T16B, v1);
4716 
4717     __ st1(v1, __ T16B, state);
4718     __ ret(lr);
4719 
4720     return start;
4721   }
4722 
4723   // Continuation point for throwing of implicit exceptions that are
4724   // not handled in the current activation. Fabricates an exception
4725   // oop and initiates normal exception dispatching in this
4726   // frame. Since we need to preserve callee-saved values (currently
4727   // only for C2, but done for C1 as well) we need a callee-saved oop
4728   // map and therefore have to make these stubs into RuntimeStubs
4729   // rather than BufferBlobs.  If the compiler needs all registers to
4730   // be preserved between the fault point and the exception handler
4731   // then it must assume responsibility for that in
4732   // AbstractCompiler::continuation_for_implicit_null_exception or
4733   // continuation_for_implicit_division_by_zero_exception. All other
4734   // implicit exceptions (e.g., NullPointerException or
4735   // AbstractMethodError on entry) are either at call sites or
4736   // otherwise assume that stack unwinding will be initiated, so
4737   // caller saved registers were assumed volatile in the compiler.
4738 
4739 #undef __
4740 #define __ masm->
4741 
4742   address generate_throw_exception(const char* name,
4743                                    address runtime_entry,
4744                                    Register arg1 = noreg,
4745                                    Register arg2 = noreg) {
4746     // Information about frame layout at time of blocking runtime call.
4747     // Note that we only have to preserve callee-saved registers since
4748     // the compilers are responsible for supplying a continuation point
4749     // if they expect all registers to be preserved.
4750     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4751     enum layout {
4752       rfp_off = 0,
4753       rfp_off2,
4754       return_off,
4755       return_off2,
4756       framesize // inclusive of return address
4757     };
4758 
4759     int insts_size = 512;
4760     int locs_size  = 64;
4761 
4762     CodeBuffer code(name, insts_size, locs_size);
4763     OopMapSet* oop_maps  = new OopMapSet();
4764     MacroAssembler* masm = new MacroAssembler(&code);
4765 
4766     address start = __ pc();
4767 
4768     // This is an inlined and slightly modified version of call_VM
4769     // which has the ability to fetch the return PC out of
4770     // thread-local storage and also sets up last_Java_sp slightly
4771     // differently than the real call_VM
4772 
4773     __ enter(); // Save FP and LR before call
4774 
4775     assert(is_even(framesize/2), "sp not 16-byte aligned");
4776 
4777     // lr and fp are already in place
4778     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4779 
4780     int frame_complete = __ pc() - start;
4781 
4782     // Set up last_Java_sp and last_Java_fp
4783     address the_pc = __ pc();
4784     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
4785 
4786     // Call runtime
4787     if (arg1 != noreg) {
4788       assert(arg2 != c_rarg1, "clobbered");
4789       __ mov(c_rarg1, arg1);
4790     }
4791     if (arg2 != noreg) {
4792       __ mov(c_rarg2, arg2);
4793     }
4794     __ mov(c_rarg0, rthread);
4795     BLOCK_COMMENT("call runtime_entry");
4796     __ mov(rscratch1, runtime_entry);
4797     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
4798 
4799     // Generate oop map
4800     OopMap* map = new OopMap(framesize, 0);
4801 
4802     oop_maps->add_gc_map(the_pc - start, map);
4803 
4804     __ reset_last_Java_frame(true);
4805     __ maybe_isb();
4806 
4807     __ leave();
4808 
4809     // check for pending exceptions
4810 #ifdef ASSERT
4811     Label L;
4812     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4813     __ cbnz(rscratch1, L);
4814     __ should_not_reach_here();
4815     __ bind(L);
4816 #endif // ASSERT
4817     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4818 
4819 
4820     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4821     RuntimeStub* stub =
4822       RuntimeStub::new_runtime_stub(name,
4823                                     &code,
4824                                     frame_complete,
4825                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4826                                     oop_maps, false);
4827     return stub->entry_point();
4828   }
4829 
4830   class MontgomeryMultiplyGenerator : public MacroAssembler {
4831 
4832     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4833       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4834 
4835     RegSet _toSave;
4836     bool _squaring;
4837 
4838   public:
4839     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4840       : MacroAssembler(as->code()), _squaring(squaring) {
4841 
4842       // Register allocation
4843 
4844       Register reg = c_rarg0;
4845       Pa_base = reg;       // Argument registers
4846       if (squaring)
4847         Pb_base = Pa_base;
4848       else
4849         Pb_base = ++reg;
4850       Pn_base = ++reg;
4851       Rlen= ++reg;
4852       inv = ++reg;
4853       Pm_base = ++reg;
4854 
4855                           // Working registers:
4856       Ra =  ++reg;        // The current digit of a, b, n, and m.
4857       Rb =  ++reg;
4858       Rm =  ++reg;
4859       Rn =  ++reg;
4860 
4861       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4862       Pb =  ++reg;
4863       Pm =  ++reg;
4864       Pn =  ++reg;
4865 
4866       t0 =  ++reg;        // Three registers which form a
      t1 =  ++reg;        // triple-precision accumulator.
4868       t2 =  ++reg;
4869 
4870       Ri =  ++reg;        // Inner and outer loop indexes.
4871       Rj =  ++reg;
4872 
4873       Rhi_ab = ++reg;     // Product registers: low and high parts
4874       Rlo_ab = ++reg;     // of a*b and m*n.
4875       Rhi_mn = ++reg;
4876       Rlo_mn = ++reg;
4877 
4878       // r19 and up are callee-saved.
4879       _toSave = RegSet::range(r19, reg) + Pm_base;
4880     }
4881 
4882   private:
4883     void save_regs() {
4884       push(_toSave, sp);
4885     }
4886 
4887     void restore_regs() {
4888       pop(_toSave, sp);
4889     }
4890 
4891     template <typename T>
4892     void unroll_2(Register count, T block) {
4893       Label loop, end, odd;
4894       tbnz(count, 0, odd);
4895       cbz(count, end);
4896       align(16);
4897       bind(loop);
4898       (this->*block)();
4899       bind(odd);
4900       (this->*block)();
4901       subs(count, count, 2);
4902       br(Assembler::GT, loop);
4903       bind(end);
4904     }
4905 
4906     template <typename T>
4907     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4908       Label loop, end, odd;
4909       tbnz(count, 0, odd);
4910       cbz(count, end);
4911       align(16);
4912       bind(loop);
4913       (this->*block)(d, s, tmp);
4914       bind(odd);
4915       (this->*block)(d, s, tmp);
4916       subs(count, count, 2);
4917       br(Assembler::GT, loop);
4918       bind(end);
4919     }
4920 
4921     void pre1(RegisterOrConstant i) {
4922       block_comment("pre1");
4923       // Pa = Pa_base;
4924       // Pb = Pb_base + i;
4925       // Pm = Pm_base;
4926       // Pn = Pn_base + i;
4927       // Ra = *Pa;
4928       // Rb = *Pb;
4929       // Rm = *Pm;
4930       // Rn = *Pn;
4931       ldr(Ra, Address(Pa_base));
4932       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4933       ldr(Rm, Address(Pm_base));
4934       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4935       lea(Pa, Address(Pa_base));
4936       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4937       lea(Pm, Address(Pm_base));
4938       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4939 
4940       // Zero the m*n result.
4941       mov(Rhi_mn, zr);
4942       mov(Rlo_mn, zr);
4943     }
4944 
4945     // The core multiply-accumulate step of a Montgomery
4946     // multiplication.  The idea is to schedule operations as a
4947     // pipeline so that instructions with long latencies (loads and
4948     // multiplies) have time to complete before their results are
4949     // used.  This most benefits in-order implementations of the
4950     // architecture but out-of-order ones also benefit.
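    //
    // In C, the MACC primitive referenced in the comments below is
    // approximately (illustrative; unsigned __int128 is a GCC/Clang
    // extension):
    //
    //   // (t2:t1:t0) += a * b, where a * b is a full 128-bit product
    //   static void MACC(uint64_t a, uint64_t b,
    //                    uint64_t *t0, uint64_t *t1, uint64_t *t2) {
    //     unsigned __int128 p = (unsigned __int128)a * b;
    //     unsigned __int128 s = (unsigned __int128)*t0 + (uint64_t)p;
    //     *t0 = (uint64_t)s;
    //     s = (unsigned __int128)*t1 + (uint64_t)(p >> 64) + (uint64_t)(s >> 64);
    //     *t1 = (uint64_t)s;
    //     *t2 += (uint64_t)(s >> 64);
    //   }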
4951     void step() {
4952       block_comment("step");
4953       // MACC(Ra, Rb, t0, t1, t2);
4954       // Ra = *++Pa;
4955       // Rb = *--Pb;
4956       umulh(Rhi_ab, Ra, Rb);
4957       mul(Rlo_ab, Ra, Rb);
4958       ldr(Ra, pre(Pa, wordSize));
4959       ldr(Rb, pre(Pb, -wordSize));
4960       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
4961                                        // previous iteration.
4962       // MACC(Rm, Rn, t0, t1, t2);
4963       // Rm = *++Pm;
4964       // Rn = *--Pn;
4965       umulh(Rhi_mn, Rm, Rn);
4966       mul(Rlo_mn, Rm, Rn);
4967       ldr(Rm, pre(Pm, wordSize));
4968       ldr(Rn, pre(Pn, -wordSize));
4969       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4970     }
4971 
4972     void post1() {
4973       block_comment("post1");
4974 
4975       // MACC(Ra, Rb, t0, t1, t2);
4976       // Ra = *++Pa;
4977       // Rb = *--Pb;
4978       umulh(Rhi_ab, Ra, Rb);
4979       mul(Rlo_ab, Ra, Rb);
4980       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4981       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4982 
4983       // *Pm = Rm = t0 * inv;
4984       mul(Rm, t0, inv);
4985       str(Rm, Address(Pm));
4986 
4987       // MACC(Rm, Rn, t0, t1, t2);
4988       // t0 = t1; t1 = t2; t2 = 0;
4989       umulh(Rhi_mn, Rm, Rn);
4990 
4991 #ifndef PRODUCT
4992       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4993       {
4994         mul(Rlo_mn, Rm, Rn);
4995         add(Rlo_mn, t0, Rlo_mn);
4996         Label ok;
4997         cbz(Rlo_mn, ok); {
4998           stop("broken Montgomery multiply");
4999         } bind(ok);
5000       }
5001 #endif
5002       // We have very carefully set things up so that
5003       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5004       // the lower half of Rm * Rn because we know the result already:
5005       // it must be -t0.  t0 + (-t0) must generate a carry iff
5006       // t0 != 0.  So, rather than do a mul and an adds we just set
5007       // the carry flag iff t0 is nonzero.
5008       //
5009       // mul(Rlo_mn, Rm, Rn);
5010       // adds(zr, t0, Rlo_mn);
5011       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5012       adcs(t0, t1, Rhi_mn);
5013       adc(t1, t2, zr);
5014       mov(t2, zr);
5015     }
5016 
5017     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
5018       block_comment("pre2");
5019       // Pa = Pa_base + i-len;
5020       // Pb = Pb_base + len;
5021       // Pm = Pm_base + i-len;
5022       // Pn = Pn_base + len;
5023 
5024       if (i.is_register()) {
5025         sub(Rj, i.as_register(), len);
5026       } else {
5027         mov(Rj, i.as_constant());
5028         sub(Rj, Rj, len);
5029       }
5030       // Rj == i-len
5031 
5032       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5033       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5034       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5035       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5036 
5037       // Ra = *++Pa;
5038       // Rb = *--Pb;
5039       // Rm = *++Pm;
5040       // Rn = *--Pn;
5041       ldr(Ra, pre(Pa, wordSize));
5042       ldr(Rb, pre(Pb, -wordSize));
5043       ldr(Rm, pre(Pm, wordSize));
5044       ldr(Rn, pre(Pn, -wordSize));
5045 
5046       mov(Rhi_mn, zr);
5047       mov(Rlo_mn, zr);
5048     }
5049 
5050     void post2(RegisterOrConstant i, RegisterOrConstant len) {
5051       block_comment("post2");
5052       if (i.is_constant()) {
5053         mov(Rj, i.as_constant()-len.as_constant());
5054       } else {
5055         sub(Rj, i.as_register(), len);
5056       }
5057 
5058       adds(t0, t0, Rlo_mn); // The pending m*n, low part
5059 
5060       // As soon as we know the least significant digit of our result,
5061       // store it.
5062       // Pm_base[i-len] = t0;
5063       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5064 
5065       // t0 = t1; t1 = t2; t2 = 0;
5066       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5067       adc(t1, t2, zr);
5068       mov(t2, zr);
5069     }
5070 
5071     // A carry in t0 after Montgomery multiplication means that we
5072     // should subtract multiples of n from our result in m.  We'll
5073     // keep doing that until there is no carry.
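    //
    // The sub() in the loop sketch below is, in C, approximately a multi-word
    // subtract-with-borrow (illustrative; unsigned __int128 is a GCC/Clang
    // extension):
    //
    //   static uint64_t sub(uint64_t m[], uint64_t n[], uint64_t t0, int len) {
    //     uint64_t borrow = 0;
    //     for (int i = 0; i < len; i++) {
    //       unsigned __int128 d = (unsigned __int128)m[i] - n[i] - borrow;
    //       m[i] = (uint64_t)d;
    //       borrow = (uint64_t)(d >> 64) & 1; // 1 if the subtraction borrowed
    //     }
    //     return t0 - borrow;
    //   }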
5074     void normalize(RegisterOrConstant len) {
5075       block_comment("normalize");
5076       // while (t0)
5077       //   t0 = sub(Pm_base, Pn_base, t0, len);
5078       Label loop, post, again;
5079       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5080       cbz(t0, post); {
5081         bind(again); {
5082           mov(i, zr);
5083           mov(cnt, len);
5084           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5085           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5086           subs(zr, zr, zr); // set carry flag, i.e. no borrow
5087           align(16);
5088           bind(loop); {
5089             sbcs(Rm, Rm, Rn);
5090             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5091             add(i, i, 1);
5092             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5093             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5094             sub(cnt, cnt, 1);
5095           } cbnz(cnt, loop);
5096           sbc(t0, t0, zr);
5097         } cbnz(t0, again);
5098       } bind(post);
5099     }
5100 
5101     // Move memory at s to d, reversing words.
5102     //    Increments d to end of copied memory
5103     //    Destroys tmp1, tmp2
5104     //    Preserves len
5105     //    Leaves s pointing to the address which was in d at start
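    //
    // In C, approximately (illustrative; the pointer-adjustment side effects
    // described above are omitted):
    //
    //   void reverse(uint64_t *d, const uint64_t *s, int len) {
    //     for (int i = 0; i < len; i++) {
    //       uint64_t w = s[len - 1 - i];
    //       d[i] = (w << 32) | (w >> 32); // swap the two 32-bit halves
    //     }
    //   }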
5106     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
5107       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5108 
5109       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
5110       mov(tmp1, len);
5111       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
5112       sub(s, d, len, ext::uxtw, LogBytesPerWord);
5113     }
5114     // where
5115     void reverse1(Register d, Register s, Register tmp) {
5116       ldr(tmp, pre(s, -wordSize));
5117       ror(tmp, tmp, 32);
5118       str(tmp, post(d, wordSize));
5119     }
5120 
5121     void step_squaring() {
5122       // An extra ACC
5123       step();
5124       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5125     }
5126 
5127     void last_squaring(RegisterOrConstant i) {
5128       Label dont;
5129       // if ((i & 1) == 0) {
5130       tbnz(i.as_register(), 0, dont); {
5131         // MACC(Ra, Rb, t0, t1, t2);
5132         // Ra = *++Pa;
5133         // Rb = *--Pb;
5134         umulh(Rhi_ab, Ra, Rb);
5135         mul(Rlo_ab, Ra, Rb);
5136         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5137       } bind(dont);
5138     }
5139 
5140     void extra_step_squaring() {
5141       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5142 
5143       // MACC(Rm, Rn, t0, t1, t2);
5144       // Rm = *++Pm;
5145       // Rn = *--Pn;
5146       umulh(Rhi_mn, Rm, Rn);
5147       mul(Rlo_mn, Rm, Rn);
5148       ldr(Rm, pre(Pm, wordSize));
5149       ldr(Rn, pre(Pn, -wordSize));
5150     }
5151 
5152     void post1_squaring() {
5153       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5154 
5155       // *Pm = Rm = t0 * inv;
5156       mul(Rm, t0, inv);
5157       str(Rm, Address(Pm));
5158 
5159       // MACC(Rm, Rn, t0, t1, t2);
5160       // t0 = t1; t1 = t2; t2 = 0;
5161       umulh(Rhi_mn, Rm, Rn);
5162 
5163 #ifndef PRODUCT
5164       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5165       {
5166         mul(Rlo_mn, Rm, Rn);
5167         add(Rlo_mn, t0, Rlo_mn);
5168         Label ok;
5169         cbz(Rlo_mn, ok); {
5170           stop("broken Montgomery multiply");
5171         } bind(ok);
5172       }
5173 #endif
5174       // We have very carefully set things up so that
5175       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5176       // the lower half of Rm * Rn because we know the result already:
5177       // it must be -t0.  t0 + (-t0) must generate a carry iff
5178       // t0 != 0.  So, rather than do a mul and an adds we just set
5179       // the carry flag iff t0 is nonzero.
5180       //
5181       // mul(Rlo_mn, Rm, Rn);
5182       // adds(zr, t0, Rlo_mn);
5183       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5184       adcs(t0, t1, Rhi_mn);
5185       adc(t1, t2, zr);
5186       mov(t2, zr);
5187     }
5188 
5189     void acc(Register Rhi, Register Rlo,
5190              Register t0, Register t1, Register t2) {
5191       adds(t0, t0, Rlo);
5192       adcs(t1, t1, Rhi);
5193       adc(t2, t2, zr);
5194     }
5195 
5196   public:
5197     /**
5198      * Fast Montgomery multiplication.  The derivation of the
5199      * algorithm is in A Cryptographic Library for the Motorola
5200      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5201      *
5202      * Arguments:
5203      *
5204      * Inputs for multiplication:
5205      *   c_rarg0   - int array elements a
5206      *   c_rarg1   - int array elements b
5207      *   c_rarg2   - int array elements n (the modulus)
5208      *   c_rarg3   - int length
5209      *   c_rarg4   - int inv
5210      *   c_rarg5   - int array elements m (the result)
5211      *
5212      * Inputs for squaring:
5213      *   c_rarg0   - int array elements a
5214      *   c_rarg1   - int array elements n (the modulus)
5215      *   c_rarg2   - int length
5216      *   c_rarg3   - int inv
5217      *   c_rarg4   - int array elements m (the result)
5218      *
5219      */
5220     address generate_multiply() {
5221       Label argh, nothing;
5222       bind(argh);
5223       stop("MontgomeryMultiply total_allocation must be <= 8192");
5224 
5225       align(CodeEntryAlignment);
5226       address entry = pc();
5227 
5228       cbzw(Rlen, nothing);
5229 
5230       enter();
5231 
5232       // Make room.
5233       cmpw(Rlen, 512);
5234       br(Assembler::HI, argh);
5235       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5236       andr(sp, Ra, -2 * wordSize);
5237 
5238       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5239 
5240       {
5241         // Copy input args, reversing as we go.  We use Ra as a
5242         // temporary variable.
5243         reverse(Ra, Pa_base, Rlen, t0, t1);
5244         if (!_squaring)
5245           reverse(Ra, Pb_base, Rlen, t0, t1);
5246         reverse(Ra, Pn_base, Rlen, t0, t1);
5247       }
5248 
      // Push all callee-saved registers and also Pm_base which we'll need
5250       // at the end.
5251       save_regs();
5252 
5253 #ifndef PRODUCT
5254       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
5255       {
5256         ldr(Rn, Address(Pn_base, 0));
5257         mul(Rlo_mn, Rn, inv);
5258         subs(zr, Rlo_mn, -1);
5259         Label ok;
5260         br(EQ, ok); {
5261           stop("broken inverse in Montgomery multiply");
5262         } bind(ok);
5263       }
5264 #endif
5265 
5266       mov(Pm_base, Ra);
5267 
5268       mov(t0, zr);
5269       mov(t1, zr);
5270       mov(t2, zr);
5271 
5272       block_comment("for (int i = 0; i < len; i++) {");
5273       mov(Ri, zr); {
5274         Label loop, end;
5275         cmpw(Ri, Rlen);
5276         br(Assembler::GE, end);
5277 
5278         bind(loop);
5279         pre1(Ri);
5280 
5281         block_comment("  for (j = i; j; j--) {"); {
5282           movw(Rj, Ri);
5283           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5284         } block_comment("  } // j");
5285 
5286         post1();
5287         addw(Ri, Ri, 1);
5288         cmpw(Ri, Rlen);
5289         br(Assembler::LT, loop);
5290         bind(end);
5291         block_comment("} // i");
5292       }
5293 
5294       block_comment("for (int i = len; i < 2*len; i++) {");
5295       mov(Ri, Rlen); {
5296         Label loop, end;
5297         cmpw(Ri, Rlen, Assembler::LSL, 1);
5298         br(Assembler::GE, end);
5299 
5300         bind(loop);
5301         pre2(Ri, Rlen);
5302 
5303         block_comment("  for (j = len*2-i-1; j; j--) {"); {
5304           lslw(Rj, Rlen, 1);
5305           subw(Rj, Rj, Ri);
5306           subw(Rj, Rj, 1);
5307           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5308         } block_comment("  } // j");
5309 
5310         post2(Ri, Rlen);
5311         addw(Ri, Ri, 1);
5312         cmpw(Ri, Rlen, Assembler::LSL, 1);
5313         br(Assembler::LT, loop);
5314         bind(end);
5315       }
5316       block_comment("} // i");
5317 
5318       normalize(Rlen);
5319 
5320       mov(Ra, Pm_base);  // Save Pm_base in Ra
5321       restore_regs();  // Restore caller's Pm_base
5322 
5323       // Copy our result into caller's Pm_base
5324       reverse(Pm_base, Ra, Rlen, t0, t1);
5325 
5326       leave();
5327       bind(nothing);
5328       ret(lr);
5329 
5330       return entry;
5331     }
5332     // In C, approximately:
5333 
5334     // void
5335     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
5336     //                     unsigned long Pn_base[], unsigned long Pm_base[],
5337     //                     unsigned long inv, int len) {
5338     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5339     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5340     //   unsigned long Ra, Rb, Rn, Rm;
5341 
5342     //   int i;
5343 
5344     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5345 
5346     //   for (i = 0; i < len; i++) {
5347     //     int j;
5348 
5349     //     Pa = Pa_base;
5350     //     Pb = Pb_base + i;
5351     //     Pm = Pm_base;
5352     //     Pn = Pn_base + i;
5353 
5354     //     Ra = *Pa;
5355     //     Rb = *Pb;
5356     //     Rm = *Pm;
5357     //     Rn = *Pn;
5358 
5359     //     int iters = i;
5360     //     for (j = 0; iters--; j++) {
5361     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5362     //       MACC(Ra, Rb, t0, t1, t2);
5363     //       Ra = *++Pa;
5364     //       Rb = *--Pb;
5365     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5366     //       MACC(Rm, Rn, t0, t1, t2);
5367     //       Rm = *++Pm;
5368     //       Rn = *--Pn;
5369     //     }
5370 
5371     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
5372     //     MACC(Ra, Rb, t0, t1, t2);
5373     //     *Pm = Rm = t0 * inv;
5374     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5375     //     MACC(Rm, Rn, t0, t1, t2);
5376 
5377     //     assert(t0 == 0, "broken Montgomery multiply");
5378 
5379     //     t0 = t1; t1 = t2; t2 = 0;
5380     //   }
5381 
5382     //   for (i = len; i < 2*len; i++) {
5383     //     int j;
5384 
5385     //     Pa = Pa_base + i-len;
5386     //     Pb = Pb_base + len;
5387     //     Pm = Pm_base + i-len;
5388     //     Pn = Pn_base + len;
5389 
5390     //     Ra = *++Pa;
5391     //     Rb = *--Pb;
5392     //     Rm = *++Pm;
5393     //     Rn = *--Pn;
5394 
5395     //     int iters = len*2-i-1;
5396     //     for (j = i-len+1; iters--; j++) {
5397     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5398     //       MACC(Ra, Rb, t0, t1, t2);
5399     //       Ra = *++Pa;
5400     //       Rb = *--Pb;
5401     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5402     //       MACC(Rm, Rn, t0, t1, t2);
5403     //       Rm = *++Pm;
5404     //       Rn = *--Pn;
5405     //     }
5406 
5407     //     Pm_base[i-len] = t0;
5408     //     t0 = t1; t1 = t2; t2 = 0;
5409     //   }
5410 
5411     //   while (t0)
5412     //     t0 = sub(Pm_base, Pn_base, t0, len);
5413     // }
5414 
5415     /**
5416      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
5417      * multiplies than Montgomery multiplication so it should be up to
5418      * 25% faster.  However, its loop control is more complex and it
5419      * may actually run slower on some machines.
5420      *
5421      * Arguments:
5422      *
5423      * Inputs:
5424      *   c_rarg0   - int array elements a
5425      *   c_rarg1   - int array elements n (the modulus)
5426      *   c_rarg2   - int length
5427      *   c_rarg3   - int inv
5428      *   c_rarg4   - int array elements m (the result)
5429      *
5430      */
5431     address generate_square() {
5432       Label argh;
5433       bind(argh);
5434       stop("MontgomeryMultiply total_allocation must be <= 8192");
5435 
5436       align(CodeEntryAlignment);
5437       address entry = pc();
5438 
5439       enter();
5440 
5441       // Make room.
5442       cmpw(Rlen, 512);
5443       br(Assembler::HI, argh);
5444       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5445       andr(sp, Ra, -2 * wordSize);
5446 
5447       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5448 
5449       {
5450         // Copy input args, reversing as we go.  We use Ra as a
5451         // temporary variable.
5452         reverse(Ra, Pa_base, Rlen, t0, t1);
5453         reverse(Ra, Pn_base, Rlen, t0, t1);
5454       }
5455 
      // Push all callee-saved registers and also Pm_base which we'll need
5457       // at the end.
5458       save_regs();
5459 
5460       mov(Pm_base, Ra);
5461 
5462       mov(t0, zr);
5463       mov(t1, zr);
5464       mov(t2, zr);
5465 
5466       block_comment("for (int i = 0; i < len; i++) {");
5467       mov(Ri, zr); {
5468         Label loop, end;
5469         bind(loop);
5470         cmp(Ri, Rlen);
5471         br(Assembler::GE, end);
5472 
5473         pre1(Ri);
5474 
5475         block_comment("for (j = (i+1)/2; j; j--) {"); {
5476           add(Rj, Ri, 1);
5477           lsr(Rj, Rj, 1);
5478           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5479         } block_comment("  } // j");
5480 
5481         last_squaring(Ri);
5482 
5483         block_comment("  for (j = i/2; j; j--) {"); {
5484           lsr(Rj, Ri, 1);
5485           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5486         } block_comment("  } // j");
5487 
5488         post1_squaring();
5489         add(Ri, Ri, 1);
5490         cmp(Ri, Rlen);
5491         br(Assembler::LT, loop);
5492 
5493         bind(end);
5494         block_comment("} // i");
5495       }
5496 
5497       block_comment("for (int i = len; i < 2*len; i++) {");
5498       mov(Ri, Rlen); {
5499         Label loop, end;
5500         bind(loop);
5501         cmp(Ri, Rlen, Assembler::LSL, 1);
5502         br(Assembler::GE, end);
5503 
5504         pre2(Ri, Rlen);
5505 
5506         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
5507           lsl(Rj, Rlen, 1);
5508           sub(Rj, Rj, Ri);
5509           sub(Rj, Rj, 1);
5510           lsr(Rj, Rj, 1);
5511           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5512         } block_comment("  } // j");
5513 
5514         last_squaring(Ri);
5515 
5516         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
5517           lsl(Rj, Rlen, 1);
5518           sub(Rj, Rj, Ri);
5519           lsr(Rj, Rj, 1);
5520           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5521         } block_comment("  } // j");
5522 
5523         post2(Ri, Rlen);
5524         add(Ri, Ri, 1);
5525         cmp(Ri, Rlen, Assembler::LSL, 1);
5526 
5527         br(Assembler::LT, loop);
5528         bind(end);
5529         block_comment("} // i");
5530       }
5531 
5532       normalize(Rlen);
5533 
5534       mov(Ra, Pm_base);  // Save Pm_base in Ra
5535       restore_regs();  // Restore caller's Pm_base
5536 
5537       // Copy our result into caller's Pm_base
5538       reverse(Pm_base, Ra, Rlen, t0, t1);
5539 
5540       leave();
5541       ret(lr);
5542 
5543       return entry;
5544     }
5545     // In C, approximately:
5546 
5547     // void
5548     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
5549     //                   unsigned long Pm_base[], unsigned long inv, int len) {
5550     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5551     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5552     //   unsigned long Ra, Rb, Rn, Rm;
5553 
5554     //   int i;
5555 
5556     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5557 
5558     //   for (i = 0; i < len; i++) {
5559     //     int j;
5560 
5561     //     Pa = Pa_base;
5562     //     Pb = Pa_base + i;
5563     //     Pm = Pm_base;
5564     //     Pn = Pn_base + i;
5565 
5566     //     Ra = *Pa;
5567     //     Rb = *Pb;
5568     //     Rm = *Pm;
5569     //     Rn = *Pn;
5570 
5571     //     int iters = (i+1)/2;
5572     //     for (j = 0; iters--; j++) {
5573     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5574     //       MACC2(Ra, Rb, t0, t1, t2);
5575     //       Ra = *++Pa;
5576     //       Rb = *--Pb;
5577     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5578     //       MACC(Rm, Rn, t0, t1, t2);
5579     //       Rm = *++Pm;
5580     //       Rn = *--Pn;
5581     //     }
5582     //     if ((i & 1) == 0) {
5583     //       assert(Ra == Pa_base[j], "must be");
5584     //       MACC(Ra, Ra, t0, t1, t2);
5585     //     }
5586     //     iters = i/2;
5587     //     assert(iters == i-j, "must be");
5588     //     for (; iters--; j++) {
5589     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5590     //       MACC(Rm, Rn, t0, t1, t2);
5591     //       Rm = *++Pm;
5592     //       Rn = *--Pn;
5593     //     }
5594 
5595     //     *Pm = Rm = t0 * inv;
5596     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5597     //     MACC(Rm, Rn, t0, t1, t2);
5598 
5599     //     assert(t0 == 0, "broken Montgomery multiply");
5600 
5601     //     t0 = t1; t1 = t2; t2 = 0;
5602     //   }
5603 
5604     //   for (i = len; i < 2*len; i++) {
5605     //     int start = i-len+1;
5606     //     int end = start + (len - start)/2;
5607     //     int j;
5608 
5609     //     Pa = Pa_base + i-len;
5610     //     Pb = Pa_base + len;
5611     //     Pm = Pm_base + i-len;
5612     //     Pn = Pn_base + len;
5613 
5614     //     Ra = *++Pa;
5615     //     Rb = *--Pb;
5616     //     Rm = *++Pm;
5617     //     Rn = *--Pn;
5618 
5619     //     int iters = (2*len-i-1)/2;
5620     //     assert(iters == end-start, "must be");
5621     //     for (j = start; iters--; j++) {
5622     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5623     //       MACC2(Ra, Rb, t0, t1, t2);
5624     //       Ra = *++Pa;
5625     //       Rb = *--Pb;
5626     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5627     //       MACC(Rm, Rn, t0, t1, t2);
5628     //       Rm = *++Pm;
5629     //       Rn = *--Pn;
5630     //     }
5631     //     if ((i & 1) == 0) {
5632     //       assert(Ra == Pa_base[j], "must be");
5633     //       MACC(Ra, Ra, t0, t1, t2);
5634     //     }
5635     //     iters =  (2*len-i)/2;
5636     //     assert(iters == len-j, "must be");
5637     //     for (; iters--; j++) {
5638     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5639     //       MACC(Rm, Rn, t0, t1, t2);
5640     //       Rm = *++Pm;
5641     //       Rn = *--Pn;
5642     //     }
5643     //     Pm_base[i-len] = t0;
5644     //     t0 = t1; t1 = t2; t2 = 0;
5645     //   }
5646 
5647     //   while (t0)
5648     //     t0 = sub(Pm_base, Pn_base, t0, len);
5649     // }
5650   };
5651 
5652 
5653   // Initialization
5654   void generate_initial() {
    // Generate initial stubs and initialize the entry points
5656 
    // entry points that exist on all platforms. Note: this is code
    // that could be shared among different platforms; however, the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.
5662 
5663     StubRoutines::_forward_exception_entry = generate_forward_exception();
5664 
5665     StubRoutines::_call_stub_entry =
5666       generate_call_stub(StubRoutines::_call_stub_return_address);
5667 
5668     // is referenced by megamorphic call
5669     StubRoutines::_catch_exception_entry = generate_catch_exception();
5670 
5671     // Build this early so it's available for the interpreter.
5672     StubRoutines::_throw_StackOverflowError_entry =
5673       generate_throw_exception("StackOverflowError throw_exception",
5674                                CAST_FROM_FN_PTR(address,
5675                                                 SharedRuntime::throw_StackOverflowError));
5676     StubRoutines::_throw_delayed_StackOverflowError_entry =
5677       generate_throw_exception("delayed StackOverflowError throw_exception",
5678                                CAST_FROM_FN_PTR(address,
5679                                                 SharedRuntime::throw_delayed_StackOverflowError));
5680     if (UseCRC32Intrinsics) {
      // set the table address before generating stubs that use it
5682       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5683       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5684     }
5685 
5686     if (UseCRC32CIntrinsics) {
5687       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5688     }
5689 
5690     // Disabled until JDK-8210858 is fixed
5691     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5692     //   StubRoutines::_dlog = generate_dlog();
5693     // }
5694 
5695     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5696       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5697     }
5698 
5699     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5700       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5701     }
5702   }
5703 
5704   void generate_all() {
5705     // support for verify_oop (must happen after universe_init)
5706     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
5707     StubRoutines::_throw_AbstractMethodError_entry =
5708       generate_throw_exception("AbstractMethodError throw_exception",
5709                                CAST_FROM_FN_PTR(address,
5710                                                 SharedRuntime::
5711                                                 throw_AbstractMethodError));
5712 
5713     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5714       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5715                                CAST_FROM_FN_PTR(address,
5716                                                 SharedRuntime::
5717                                                 throw_IncompatibleClassChangeError));
5718 
5719     StubRoutines::_throw_NullPointerException_at_call_entry =
5720       generate_throw_exception("NullPointerException at call throw_exception",
5721                                CAST_FROM_FN_PTR(address,
5722                                                 SharedRuntime::
5723                                                 throw_NullPointerException_at_call));
5724 
5725     // arraycopy stubs used by compilers
5726     generate_arraycopy_stubs();
5727 
5728     // has negatives stub for large arrays.
5729     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5730 
5731     // array equals stub for large arrays.
5732     if (!UseSimpleArrayEquals) {
5733       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5734     }
5735 
5736     generate_compare_long_strings();
5737 
5738     generate_string_indexof_stubs();
5739 
5740     // byte_array_inflate stub for large arrays.
5741     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
5742 
5743 #ifdef COMPILER2
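         // These stubs back the C2 BigInteger intrinsics (multiplyToLen,
         // squareToLen, mulAdd, montgomeryMultiply/montgomerySquare).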
5744     if (UseMultiplyToLenIntrinsic) {
5745       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5746     }
5747 
5748     if (UseSquareToLenIntrinsic) {
5749       StubRoutines::_squareToLen = generate_squareToLen();
5750     }
5751 
5752     if (UseMulAddIntrinsic) {
5753       StubRoutines::_mulAdd = generate_mulAdd();
5754     }
5755 
5756     if (UseMontgomeryMultiplyIntrinsic) {
5757       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5758       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5759       StubRoutines::_montgomeryMultiply = g.generate_multiply();
5760     }
5761 
5762     if (UseMontgomerySquareIntrinsic) {
5763       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5764       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5765       // We use generate_multiply() rather than generate_square()
5766       // because it's faster for the sizes of modulus we care about.
5767       StubRoutines::_montgomerySquare = g.generate_multiply();
5768     }
5769 #endif // COMPILER2
5770 
5771 #ifndef BUILTIN_SIM
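         // The GHASH, AES and SHA stubs below rely on the ARMv8 Cryptography
         // Extension instructions; the corresponding Use*Intrinsics flags are
         // normally cleared during CPU feature detection (vm_version_aarch64.cpp)
         // when the hardware does not provide them.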
5772     // generate GHASH intrinsics code
5773     if (UseGHASHIntrinsics) {
5774       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5775     }
5776 
5777     if (UseAESIntrinsics) {
5778       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5779       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5780       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5781       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
5782     }
5783 
5784     if (UseSHA1Intrinsics) {
5785       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5786       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5787     }
5788     if (UseSHA256Intrinsics) {
5789       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5790       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5791     }
5792 
5793     // generate Adler32 intrinsics code
5794     if (UseAdler32Intrinsics) {
5795       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
5796     }
5797 
5798     // Safefetch stubs.
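         // A SafeFetch stub loads a value from an address that may be invalid;
         // if the access faults, the signal handler resumes the stub at the
         // recorded continuation PC and the caller-supplied default value is
         // returned instead.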
5799     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5800                                                        &StubRoutines::_safefetch32_fault_pc,
5801                                                        &StubRoutines::_safefetch32_continuation_pc);
5802     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5803                                                        &StubRoutines::_safefetchN_fault_pc,
5804                                                        &StubRoutines::_safefetchN_continuation_pc);
5805 #endif
5806     StubRoutines::aarch64::set_completed();
5807   }
5808 
5809  public:
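       // 'all' selects the generation phase: the initial stubs are created
       // early in VM startup, before the interpreter is built; the remaining
       // stubs are created later, once universe initialization is complete.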
5810   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5811     if (all) {
5812       generate_all();
5813     } else {
5814       generate_initial();
5815     }
5816   }
5817 }; // end class declaration
5818 
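     // Entry point called from the shared StubRoutines initialization code:
     // constructing the generator emits the selected set of stubs into 'code'.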
5819 void StubGenerator_generate(CodeBuffer* code, bool all) {
5820   StubGenerator g(code, all);
5821 }