1 /*
   2  * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "nativeInst_aarch64.hpp"
  33 #include "oops/instanceOop.hpp"
  34 #include "oops/method.hpp"
  35 #include "oops/objArrayKlass.hpp"
  36 #include "oops/oop.inline.hpp"
  37 #include "prims/methodHandles.hpp"
  38 #include "runtime/frame.inline.hpp"
  39 #include "runtime/handles.inline.hpp"
  40 #include "runtime/sharedRuntime.hpp"
  41 #include "runtime/stubCodeGenerator.hpp"
  42 #include "runtime/stubRoutines.hpp"
  43 #include "runtime/thread.inline.hpp"
  44 #include "utilities/align.hpp"
  45 #ifdef COMPILER2
  46 #include "opto/runtime.hpp"
  47 #endif
  48 
  49 #ifdef BUILTIN_SIM
  50 #include "../../../../../../simulator/simulator.hpp"
  51 #endif
  52 
  53 // Declaration and definition of StubGenerator (no .hpp file).
  54 // For a more detailed description of the stub routine structure
  55 // see the comment in stubRoutines.hpp
  56 
  57 #undef __
  58 #define __ _masm->
  59 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  60 
  61 #ifdef PRODUCT
  62 #define BLOCK_COMMENT(str) /* nothing */
  63 #else
  64 #define BLOCK_COMMENT(str) __ block_comment(str)
  65 #endif
  66 
  67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  68 
  69 // Stub Code definitions
  70 
  71 class StubGenerator: public StubCodeGenerator {
  72  private:
  73 
  74 #ifdef PRODUCT
  75 #define inc_counter_np(counter) ((void)0)
  76 #else
  77   void inc_counter_np_(int& counter) {
  78     __ lea(rscratch2, ExternalAddress((address)&counter));
  79     __ ldrw(rscratch1, Address(rscratch2));
  80     __ addw(rscratch1, rscratch1, 1);
  81     __ strw(rscratch1, Address(rscratch2));
  82   }
  83 #define inc_counter_np(counter) \
  84   BLOCK_COMMENT("inc_counter " #counter); \
  85   inc_counter_np_(counter);
  86 #endif
  87 
  88   // Call stubs are used to call Java from C
  89   //
  90   // Arguments:
  91   //    c_rarg0:   call wrapper address                   address
  92   //    c_rarg1:   result                                 address
  93   //    c_rarg2:   result type                            BasicType
  94   //    c_rarg3:   method                                 Method*
  95   //    c_rarg4:   (interpreter) entry point              address
  96   //    c_rarg5:   parameters                             intptr_t*
  97   //    c_rarg6:   parameter size (in words)              int
  98   //    c_rarg7:   thread                                 Thread*
  99   //
 100   // There is no return from the stub itself as any Java result
 101   // is written to result
 102   //
 103   // we save r30 (lr) as the return PC at the base of the frame and
 104   // link r29 (fp) below it as the frame pointer, then install the
 105   // stack pointer (r31) into fp.
 106   //
 107   // we save r0-r7, which accounts for all the c arguments.
 108   //
 109   // TODO: strictly do we need to save them all? they are treated as
 110   // volatile by C so could we omit saving the ones we are going to
 111   // place in global registers (thread? method?) or those we only use
 112   // during setup of the Java call?
 113   //
 114   // we don't need to save r8 which C uses as an indirect result location
 115   // return register.
 116   //
 117   // we don't need to save r9-r15 which both C and Java treat as
 118   // volatile
 119   //
 120   // we don't need to save r16-18 because Java does not use them
 121   //
 122   // we save r19-r28 which Java uses as scratch registers and C
 123   // expects to be callee-save
 124   //
 125   // we save the bottom 64 bits of each value stored in v8-v15; it is
 126   // the responsibility of the caller to preserve larger values.
 127   //
 128   // so the stub frame looks like this when we enter Java code
 129   //
 130   //     [ return_from_Java     ] <--- sp
 131   //     [ argument word n      ]
 132   //      ...
 133   // -27 [ argument word 1      ]
 134   // -26 [ saved v15            ] <--- sp_after_call
 135   // -25 [ saved v14            ]
 136   // -24 [ saved v13            ]
 137   // -23 [ saved v12            ]
 138   // -22 [ saved v11            ]
 139   // -21 [ saved v10            ]
 140   // -20 [ saved v9             ]
 141   // -19 [ saved v8             ]
 142   // -18 [ saved r28            ]
 143   // -17 [ saved r27            ]
 144   // -16 [ saved r26            ]
 145   // -15 [ saved r25            ]
 146   // -14 [ saved r24            ]
 147   // -13 [ saved r23            ]
 148   // -12 [ saved r22            ]
 149   // -11 [ saved r21            ]
 150   // -10 [ saved r20            ]
 151   //  -9 [ saved r19            ]
 152   //  -8 [ call wrapper    (r0) ]
 153   //  -7 [ result          (r1) ]
 154   //  -6 [ result type     (r2) ]
 155   //  -5 [ method          (r3) ]
 156   //  -4 [ entry point     (r4) ]
 157   //  -3 [ parameters      (r5) ]
 158   //  -2 [ parameter size  (r6) ]
 159   //  -1 [ thread (r7)          ]
 160   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 161   //   1 [ saved lr       (r30) ]
 162 
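       // Illustrative sketch (not generated code): the VM reaches this stub
       // through the CallStub function pointer type declared in
       // stubRoutines.hpp, roughly as JavaCalls::call_helper does:
       //
       //   StubRoutines::call_stub()(
       //     (address)&link,             // c_rarg0: call wrapper
       //     result_val_address,         // c_rarg1: where to store the result
       //     result_type,                // c_rarg2: BasicType of the result
       //     method(),                   // c_rarg3: Method*
       //     entry_point,                // c_rarg4: interpreter entry point
       //     args->parameters(),         // c_rarg5: intptr_t* parameters
       //     args->size_of_parameters(), // c_rarg6: parameter count in words
       //     CHECK);                     // c_rarg7: current Thread* (via TRAPS)
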
 163   // Call stub stack layout word offsets from fp
 164   enum call_stub_layout {
 165     sp_after_call_off = -26,
 166 
 167     d15_off            = -26,
 168     d13_off            = -24,
 169     d11_off            = -22,
 170     d9_off             = -20,
 171 
 172     r28_off            = -18,
 173     r26_off            = -16,
 174     r24_off            = -14,
 175     r22_off            = -12,
 176     r20_off            = -10,
 177     call_wrapper_off   =  -8,
 178     result_off         =  -7,
 179     result_type_off    =  -6,
 180     method_off         =  -5,
 181     entry_point_off    =  -4,
 182     parameter_size_off =  -2,
 183     thread_off         =  -1,
 184     fp_f               =   0,
 185     retaddr_off        =   1,
 186   };
 187 
 188   address generate_call_stub(address& return_address) {
 189     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 190            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 191            "adjust this code");
 192 
 193     StubCodeMark mark(this, "StubRoutines", "call_stub");
 194     address start = __ pc();
 195 
 196     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 197 
 198     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 199     const Address result        (rfp, result_off         * wordSize);
 200     const Address result_type   (rfp, result_type_off    * wordSize);
 201     const Address method        (rfp, method_off         * wordSize);
 202     const Address entry_point   (rfp, entry_point_off    * wordSize);
 203     const Address parameter_size(rfp, parameter_size_off * wordSize);
 204 
 205     const Address thread        (rfp, thread_off         * wordSize);
 206 
 207     const Address d15_save      (rfp, d15_off * wordSize);
 208     const Address d13_save      (rfp, d13_off * wordSize);
 209     const Address d11_save      (rfp, d11_off * wordSize);
 210     const Address d9_save       (rfp, d9_off * wordSize);
 211 
 212     const Address r28_save      (rfp, r28_off * wordSize);
 213     const Address r26_save      (rfp, r26_off * wordSize);
 214     const Address r24_save      (rfp, r24_off * wordSize);
 215     const Address r22_save      (rfp, r22_off * wordSize);
 216     const Address r20_save      (rfp, r20_off * wordSize);
 217 
 218     // stub code
 219 
 220     // we need a C prolog to bootstrap the x86 caller into the sim
 221     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 222 
 223     address aarch64_entry = __ pc();
 224 
 225 #ifdef BUILTIN_SIM
 226     // Save sender's SP for stack traces.
 227     __ mov(rscratch1, sp);
 228     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 229 #endif
 230     // set up frame and move sp to end of save area
 231     __ enter();
 232     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 233 
 234     // save register parameters and Java scratch/global registers
 235     // n.b. we save thread even though it gets installed in
 236     // rthread because we want to sanity check rthread later
 237     __ str(c_rarg7,  thread);
 238     __ strw(c_rarg6, parameter_size);
 239     __ stp(c_rarg4, c_rarg5,  entry_point);
 240     __ stp(c_rarg2, c_rarg3,  result_type);
 241     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 242 
 243     __ stp(r20, r19,   r20_save);
 244     __ stp(r22, r21,   r22_save);
 245     __ stp(r24, r23,   r24_save);
 246     __ stp(r26, r25,   r26_save);
 247     __ stp(r28, r27,   r28_save);
 248 
 249     __ stpd(v9,  v8,   d9_save);
 250     __ stpd(v11, v10,  d11_save);
 251     __ stpd(v13, v12,  d13_save);
 252     __ stpd(v15, v14,  d15_save);
 253 
 254     // install Java thread in global register now that we have saved
 255     // whatever value it held
 256     __ mov(rthread, c_rarg7);
 257     // And method
 258     __ mov(rmethod, c_rarg3);
 259 
 260     // set up the heapbase register
 261     __ reinit_heapbase();
 262 
 263 #ifdef ASSERT
 264     // make sure we have no pending exceptions
 265     {
 266       Label L;
 267       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 268       __ cmp(rscratch1, (unsigned)NULL_WORD);
 269       __ br(Assembler::EQ, L);
 270       __ stop("StubRoutines::call_stub: entered with pending exception");
 271       __ BIND(L);
 272     }
 273 #endif
 274     // pass parameters if any
 275     __ mov(esp, sp);
 276     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 277     __ andr(sp, rscratch1, -2 * wordSize);
 278 
 279     BLOCK_COMMENT("pass parameters if any");
 280     Label parameters_done;
 281     // parameter count is still in c_rarg6
 282     // and parameter pointer identifying param 1 is in c_rarg5
 283     __ cbzw(c_rarg6, parameters_done);
 284 
 285     address loop = __ pc();
 286     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 287     __ subsw(c_rarg6, c_rarg6, 1);
 288     __ push(rscratch1);
 289     __ br(Assembler::GT, loop);
 290 
 291     __ BIND(parameters_done);
 292 
 293     // call Java entry -- passing Method* and current sp
 294     //      rmethod: Method*
 295     //      r13: sender sp
 296     BLOCK_COMMENT("call Java function");
 297     __ mov(r13, sp);
 298     __ blr(c_rarg4);
 299 
 300     // tell the simulator we have returned to the stub
 301 
 302     // we do this here because the notify will already have been done
 303     // if we get to the next instruction via an exception
 304     //
 305     // n.b. adding this instruction here affects the calculation of
 306     // whether or not a routine returns to the call stub (used when
 307     // doing stack walks) since the normal test is to check the return
 308     // pc against the address saved below. so we may need to allow for
 309     // this extra instruction in the check.
 310 
 311     if (NotifySimulator) {
 312       __ notify(Assembler::method_reentry);
 313     }
 314     // save current address for use by exception handling code
 315 
 316     return_address = __ pc();
 317 
 318     // store result depending on type (everything that is not
 319     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 320     // n.b. this assumes Java returns an integral result in r0
 321     // and a floating result in j_farg0
 322     __ ldr(j_rarg2, result);
 323     Label is_long, is_float, is_double, exit;
 324     __ ldr(j_rarg1, result_type);
 325     __ cmp(j_rarg1, T_OBJECT);
 326     __ br(Assembler::EQ, is_long);
 327     __ cmp(j_rarg1, T_LONG);
 328     __ br(Assembler::EQ, is_long);
 329     __ cmp(j_rarg1, T_FLOAT);
 330     __ br(Assembler::EQ, is_float);
 331     __ cmp(j_rarg1, T_DOUBLE);
 332     __ br(Assembler::EQ, is_double);
 333 
 334     // handle T_INT case
 335     __ strw(r0, Address(j_rarg2));
 336 
 337     __ BIND(exit);
 338 
 339     // pop parameters
 340     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 341 
 342 #ifdef ASSERT
 343     // verify that threads correspond
 344     {
 345       Label L, S;
 346       __ ldr(rscratch1, thread);
 347       __ cmp(rthread, rscratch1);
 348       __ br(Assembler::NE, S);
 349       __ get_thread(rscratch1);
 350       __ cmp(rthread, rscratch1);
 351       __ br(Assembler::EQ, L);
 352       __ BIND(S);
 353       __ stop("StubRoutines::call_stub: threads must correspond");
 354       __ BIND(L);
 355     }
 356 #endif
 357 
 358     // restore callee-save registers
 359     __ ldpd(v15, v14,  d15_save);
 360     __ ldpd(v13, v12,  d13_save);
 361     __ ldpd(v11, v10,  d11_save);
 362     __ ldpd(v9,  v8,   d9_save);
 363 
 364     __ ldp(r28, r27,   r28_save);
 365     __ ldp(r26, r25,   r26_save);
 366     __ ldp(r24, r23,   r24_save);
 367     __ ldp(r22, r21,   r22_save);
 368     __ ldp(r20, r19,   r20_save);
 369 
 370     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 371     __ ldrw(c_rarg2, result_type);
 372     __ ldr(c_rarg3,  method);
 373     __ ldp(c_rarg4, c_rarg5,  entry_point);
 374     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 375 
 376 #ifndef PRODUCT
 377     // tell the simulator we are about to end Java execution
 378     if (NotifySimulator) {
 379       __ notify(Assembler::method_exit);
 380     }
 381 #endif
 382     // leave frame and return to caller
 383     __ leave();
 384     __ ret(lr);
 385 
 386     // handle return types different from T_INT
 387 
 388     __ BIND(is_long);
 389     __ str(r0, Address(j_rarg2, 0));
 390     __ br(Assembler::AL, exit);
 391 
 392     __ BIND(is_float);
 393     __ strs(j_farg0, Address(j_rarg2, 0));
 394     __ br(Assembler::AL, exit);
 395 
 396     __ BIND(is_double);
 397     __ strd(j_farg0, Address(j_rarg2, 0));
 398     __ br(Assembler::AL, exit);
 399 
 400     return start;
 401   }
 402 
 403   // Return point for a Java call if there's an exception thrown in
 404   // Java code.  The exception is caught and transformed into a
 405   // pending exception stored in JavaThread that can be tested from
 406   // within the VM.
 407   //
 408   // Note: Usually the parameters are removed by the callee. In case
 409   // of an exception crossing an activation frame boundary, that is
 410   // not the case if the callee is compiled code => need to setup the
 411   // rsp.
 412   //
 413   // r0: exception oop
 414 
 415   // NOTE: this is used as a target from the signal handler so it
 416   // needs an x86 prolog which returns into the current simulator
 417   // executing the generated catch_exception code. so the prolog
 418   // needs to install rax in a sim register and adjust the sim's
 419   // restart pc to enter the generated code at the start position
 420   // then return from native to simulated execution.
 421 
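       // Illustrative sketch (assumption about the VM-side caller, not
       // generated code): once control returns through the call stub, the
       // exception installed below is observed in the usual way, e.g.
       //
       //   JavaCalls::call(&result, method, &args, THREAD);
       //   if (HAS_PENDING_EXCEPTION) {
       //     // inspect or clear PENDING_EXCEPTION here
       //   }
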
 422   address generate_catch_exception() {
 423     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 424     address start = __ pc();
 425 
 426     // same as in generate_call_stub():
 427     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 428     const Address thread        (rfp, thread_off         * wordSize);
 429 
 430 #ifdef ASSERT
 431     // verify that threads correspond
 432     {
 433       Label L, S;
 434       __ ldr(rscratch1, thread);
 435       __ cmp(rthread, rscratch1);
 436       __ br(Assembler::NE, S);
 437       __ get_thread(rscratch1);
 438       __ cmp(rthread, rscratch1);
 439       __ br(Assembler::EQ, L);
 440       __ bind(S);
 441       __ stop("StubRoutines::catch_exception: threads must correspond");
 442       __ bind(L);
 443     }
 444 #endif
 445 
 446     // set pending exception
 447     __ verify_oop(r0);
 448 
 449     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 450     __ mov(rscratch1, (address)__FILE__);
 451     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 452     __ movw(rscratch1, (int)__LINE__);
 453     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 454 
 455     // complete return to VM
 456     assert(StubRoutines::_call_stub_return_address != NULL,
 457            "_call_stub_return_address must have been generated before");
 458     __ b(StubRoutines::_call_stub_return_address);
 459 
 460     return start;
 461   }
 462 
 463   // Continuation point for runtime calls returning with a pending
 464   // exception.  The pending exception check happened in the runtime
 465   // or native call stub.  The pending exception in Thread is
 466   // converted into a Java-level exception.
 467   //
 468   // Contract with Java-level exception handlers:
 469   // r0: exception
 470   // r3: throwing pc
 471   //
 472   // NOTE: At entry of this stub, exception-pc must be in LR !!
 473 
 474   // NOTE: this is always used as a jump target within generated code
 475   // so it just needs to be generated code with no x86 prolog
 476 
 477   address generate_forward_exception() {
 478     StubCodeMark mark(this, "StubRoutines", "forward exception");
 479     address start = __ pc();
 480 
 481     // Upon entry, LR points to the return address returning into
 482     // Java (interpreted or compiled) code; i.e., the return address
 483     // becomes the throwing pc.
 484     //
 485     // Arguments pushed before the runtime call are still on the stack
 486     // but the exception handler will reset the stack pointer ->
 487     // ignore them.  A potential result in registers can be ignored as
 488     // well.
 489 
 490 #ifdef ASSERT
 491     // make sure this code is only executed if there is a pending exception
 492     {
 493       Label L;
 494       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 495       __ cbnz(rscratch1, L);
 496       __ stop("StubRoutines::forward exception: no pending exception (1)");
 497       __ bind(L);
 498     }
 499 #endif
 500 
 501     // compute exception handler into r19
 502 
 503     // call the VM to find the handler address associated with the
 504     // caller address. pass thread in r0 and caller pc (ret address)
 505     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 506     // the stack.
 507     __ mov(c_rarg1, lr);
 508     // lr will be trashed by the VM call so we move it to r19
 509     // (callee-saved) because we also need to pass it to the handler
 510     // returned by this call.
 511     __ mov(r19, lr);
 512     BLOCK_COMMENT("call exception_handler_for_return_address");
 513     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 514                          SharedRuntime::exception_handler_for_return_address),
 515                     rthread, c_rarg1);
 516     // we should not really care that lr is no longer the callee
 517     // address. we saved the value the handler needs in r19 so we can
 518     // just copy it to r3. however, the C2 handler will push its own
 519     // frame and then call into the VM, and the VM code asserts that
 520     // the PC for the frame above the handler belongs to a compiled
 521     // Java method. So, we restore lr here to satisfy that assert.
 522     __ mov(lr, r19);
 523     // setup r0 & r3 & clear pending exception
 524     __ mov(r3, r19);
 525     __ mov(r19, r0);
 526     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 527     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 528 
 529 #ifdef ASSERT
 530     // make sure exception is set
 531     {
 532       Label L;
 533       __ cbnz(r0, L);
 534       __ stop("StubRoutines::forward exception: no pending exception (2)");
 535       __ bind(L);
 536     }
 537 #endif
 538 
 539     // continue at exception handler
 540     // r0: exception
 541     // r3: throwing pc
 542     // r19: exception handler
 543     __ verify_oop(r0);
 544     __ br(r19);
 545 
 546     return start;
 547   }
 548 
 549   // Non-destructive plausibility checks for oops
 550   //
 551   // Arguments:
 552   //    r0: oop to verify
 553   //    rscratch1: error message
 554   //
 555   // Stack after saving c_rarg3:
 556   //    [tos + 0]: saved c_rarg3
 557   //    [tos + 1]: saved c_rarg2
 558   //    [tos + 2]: saved lr
 559   //    [tos + 3]: saved rscratch2
 560   //    [tos + 4]: saved r0
 561   //    [tos + 5]: saved rscratch1
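       //
       // Illustrative use (sketch; the set-up below is not generated here):
       // a caller is expected to provide
       //
       //   __ mov(r0, obj);                     // oop to verify
       //   __ mov(rscratch1, (address)msg);     // error message for debug()
       //   __ blr(<entry of this stub>);        // returns via ret(lr) if ok
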
 562   address generate_verify_oop() {
 563 
 564     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 565     address start = __ pc();
 566 
 567     Label exit, error;
 568 
 569     // save c_rarg2 and c_rarg3
 570     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 571 
 572     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 573     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 574     __ ldr(c_rarg3, Address(c_rarg2));
 575     __ add(c_rarg3, c_rarg3, 1);
 576     __ str(c_rarg3, Address(c_rarg2));
 577 
 578     // object is in r0
 579     // make sure object is 'reasonable'
 580     __ cbz(r0, exit); // if obj is NULL it is OK
 581 
 582     // Check if the oop is in the right area of memory
 583     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 584     __ andr(c_rarg2, r0, c_rarg3);
 585     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 586 
 587     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 588     // instruction here because the flags register is live.
 589     __ eor(c_rarg2, c_rarg2, c_rarg3);
 590     __ cbnz(c_rarg2, error);
 591 
 592     // make sure klass is 'reasonable', i.e. not zero.
 593     __ load_klass(r0, r0);  // get klass
 594     __ cbz(r0, error);      // if klass is NULL it is broken
 595 
 596     // return if everything seems ok
 597     __ bind(exit);
 598 
 599     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 600     __ ret(lr);
 601 
 602     // handle errors
 603     __ bind(error);
 604     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 605 
 606     __ push(RegSet::range(r0, r29), sp);
 607     // debug(char* msg, int64_t pc, int64_t regs[])
 608     __ mov(c_rarg0, rscratch1);      // pass address of error message
 609     __ mov(c_rarg1, lr);             // pass return address
 610     __ mov(c_rarg2, sp);             // pass address of regs on stack
 611 #ifndef PRODUCT
 612     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 613 #endif
 614     BLOCK_COMMENT("call MacroAssembler::debug");
 615     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 616     __ blrt(rscratch1, 3, 0, 1);
 617 
 618     return start;
 619   }
 620 
 621   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 622 
 623   // The inner part of zero_words().  This is the bulk operation,
 624   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 625   // caller is responsible for zeroing the last few words.
 626   //
 627   // Inputs:
 628   // r10: the HeapWord-aligned base address of an array to zero.
 629   // r11: the count in HeapWords, r11 > 0.
 630   //
 631   // Returns r10 and r11, adjusted for the caller to clear.
 632   // r10: the base address of the tail of words left to clear.
 633   // r11: the number of words in the tail.
 634   //      r11 < MacroAssembler::zero_words_block_size.
 635 
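       // Illustrative caller sketch (assumption): MacroAssembler::zero_words
       // is expected to use this stub roughly as
       //
       //   // r10 = base, r11 = count (count >= zero_words_block_size)
       //   call the zero_blocks stub;          // bulk zeroing, may use DC ZVA
       //   // on return r11 < zero_words_block_size
       //   store zr into the remaining r11 words starting at r10
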
 636   address generate_zero_blocks() {
 637     Label store_pair, loop_store_pair, done;
 638     Label base_aligned;
 639 
 640     Register base = r10, cnt = r11;
 641 
 642     __ align(CodeEntryAlignment);
 643     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 644     address start = __ pc();
 645 
 646     if (UseBlockZeroing) {
 647       int zva_length = VM_Version::zva_length();
 648 
 649       // Ensure ZVA length can be divided by 16. This is required by
 650       // the subsequent operations.
 651       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 652 
 653       __ tbz(base, 3, base_aligned);
 654       __ str(zr, Address(__ post(base, 8)));
 655       __ sub(cnt, cnt, 1);
 656       __ bind(base_aligned);
 657 
 658       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 659       // alignment.
 660       Label small;
 661       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 662       __ subs(rscratch1, cnt, low_limit >> 3);
 663       __ br(Assembler::LT, small);
 664       __ zero_dcache_blocks(base, cnt);
 665       __ bind(small);
 666     }
 667 
 668     {
 669       // Number of stp instructions we'll unroll
 670       const int unroll =
 671         MacroAssembler::zero_words_block_size / 2;
 672       // Clear the remaining blocks.
 673       Label loop;
 674       __ subs(cnt, cnt, unroll * 2);
 675       __ br(Assembler::LT, done);
 676       __ bind(loop);
 677       for (int i = 0; i < unroll; i++)
 678         __ stp(zr, zr, __ post(base, 16));
 679       __ subs(cnt, cnt, unroll * 2);
 680       __ br(Assembler::GE, loop);
 681       __ bind(done);
 682       __ add(cnt, cnt, unroll * 2);
 683     }
 684 
 685     __ ret(lr);
 686 
 687     return start;
 688   }
 689 
 690 
 691   typedef enum {
 692     copy_forwards = 1,
 693     copy_backwards = -1
 694   } copy_direction;
 695 
 696   // Bulk copy of blocks of 8 words.
 697   //
 698   // count is a count of words.
 699   //
 700   // Precondition: count >= 8
 701   //
 702   // Postconditions:
 703   //
 704   // The least significant bit of count contains the remaining count
 705   // of words to copy.  The rest of count is trash.
 706   //
 707   // s and d are adjusted to point to the remaining words to copy
 708   //
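       // Illustrative example: for count == 15 the block loop copies one
       // 8-word block, the 4-word and 2-word tail tests copy 6 more words,
       // and bit 0 of count is left set so the caller copies the final word.
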
 709   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 710                            copy_direction direction) {
 711     int unit = wordSize * direction;
 712     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 713 
 714     int offset;
 715     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 716       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 717     const Register stride = r13;
 718 
 719     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 720     assert_different_registers(s, d, count, rscratch1);
 721 
 722     Label again, drain;
 723     const char *stub_name;
 724     if (direction == copy_forwards)
 725       stub_name = "forward_copy_longs";
 726     else
 727       stub_name = "backward_copy_longs";
 728     StubCodeMark mark(this, "StubRoutines", stub_name);
 729     __ align(CodeEntryAlignment);
 730     __ bind(start);
 731 
 732     Label unaligned_copy_long;
 733     if (AvoidUnalignedAccesses) {
 734       __ tbnz(d, 3, unaligned_copy_long);
 735     }
 736 
 737     if (direction == copy_forwards) {
 738       __ sub(s, s, bias);
 739       __ sub(d, d, bias);
 740     }
 741 
 742 #ifdef ASSERT
 743     // Make sure we are never given < 8 words
 744     {
 745       Label L;
 746       __ cmp(count, 8);
 747       __ br(Assembler::GE, L);
 748       __ stop("generate_copy_longs called with < 8 words");
 749       __ bind(L);
 750     }
 751 #endif
 752 
 753     // Fill 8 registers
 754     if (UseSIMDForMemoryOps) {
 755       __ ldpq(v0, v1, Address(s, 4 * unit));
 756       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 757     } else {
 758       __ ldp(t0, t1, Address(s, 2 * unit));
 759       __ ldp(t2, t3, Address(s, 4 * unit));
 760       __ ldp(t4, t5, Address(s, 6 * unit));
 761       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 762     }
 763 
 764     __ subs(count, count, 16);
 765     __ br(Assembler::LO, drain);
 766 
 767     int prefetch = PrefetchCopyIntervalInBytes;
 768     bool use_stride = false;
 769     if (direction == copy_backwards) {
 770        use_stride = prefetch > 256;
 771        prefetch = -prefetch;
 772        if (use_stride) __ mov(stride, prefetch);
 773     }
 774 
 775     __ bind(again);
 776 
 777     if (PrefetchCopyIntervalInBytes > 0)
 778       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 779 
 780     if (UseSIMDForMemoryOps) {
 781       __ stpq(v0, v1, Address(d, 4 * unit));
 782       __ ldpq(v0, v1, Address(s, 4 * unit));
 783       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 784       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 785     } else {
 786       __ stp(t0, t1, Address(d, 2 * unit));
 787       __ ldp(t0, t1, Address(s, 2 * unit));
 788       __ stp(t2, t3, Address(d, 4 * unit));
 789       __ ldp(t2, t3, Address(s, 4 * unit));
 790       __ stp(t4, t5, Address(d, 6 * unit));
 791       __ ldp(t4, t5, Address(s, 6 * unit));
 792       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 793       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 794     }
 795 
 796     __ subs(count, count, 8);
 797     __ br(Assembler::HS, again);
 798 
 799     // Drain
 800     __ bind(drain);
 801     if (UseSIMDForMemoryOps) {
 802       __ stpq(v0, v1, Address(d, 4 * unit));
 803       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 804     } else {
 805       __ stp(t0, t1, Address(d, 2 * unit));
 806       __ stp(t2, t3, Address(d, 4 * unit));
 807       __ stp(t4, t5, Address(d, 6 * unit));
 808       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 809     }
 810 
 811     {
 812       Label L1, L2;
 813       __ tbz(count, exact_log2(4), L1);
 814       if (UseSIMDForMemoryOps) {
 815         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 816         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 817       } else {
 818         __ ldp(t0, t1, Address(s, 2 * unit));
 819         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 820         __ stp(t0, t1, Address(d, 2 * unit));
 821         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 822       }
 823       __ bind(L1);
 824 
 825       if (direction == copy_forwards) {
 826         __ add(s, s, bias);
 827         __ add(d, d, bias);
 828       }
 829 
 830       __ tbz(count, 1, L2);
 831       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 832       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 833       __ bind(L2);
 834     }
 835 
 836     __ ret(lr);
 837 
 838     if (AvoidUnalignedAccesses) {
 839       Label drain, again;
 840       // Register order for storing. Order is different for backward copy.
 841 
 842       __ bind(unaligned_copy_long);
 843 
 844       // source address is even aligned, target odd aligned
 845       //
 846       // when forward copying word pairs we read long pairs at offsets
 847       // {0, 2, 4, 6} (in long words). when backwards copying we read
 848       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 849       // address by -2 in the forwards case so we can compute the
 850       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 851       // or -1.
 852       //
 853       // when forward copying we need to store 1 word, 3 pairs and
 854   // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 855   // zero offset we adjust the destination by -1, which means we
 856   // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 857       //
 858       // When backwards copying we need to store 1 word, 3 pairs and
 859       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 860       // offsets {1, 3, 5, 7, 8} * unit.
 861 
 862       if (direction == copy_forwards) {
 863         __ sub(s, s, 16);
 864         __ sub(d, d, 8);
 865       }
 866 
 867       // Fill 8 registers
 868       //
 869       // for forwards copy s was offset by -16 from the original input
 870       // value of s so the register contents are at these offsets
 871       // relative to the 64 byte block addressed by that original input
 872       // and so on for each successive 64 byte block when s is updated
 873       //
 874       // t0 at offset 0,  t1 at offset 8
 875       // t2 at offset 16, t3 at offset 24
 876       // t4 at offset 32, t5 at offset 40
 877       // t6 at offset 48, t7 at offset 56
 878 
 879       // for backwards copy s was not offset so the register contents
 880       // are at these offsets into the preceding 64 byte block
 881       // relative to that original input and so on for each successive
 882       // preceding 64 byte block when s is updated. this explains the
 883       // slightly counter-intuitive looking pattern of register usage
 884       // in the stp instructions for backwards copy.
 885       //
 886       // t0 at offset -16, t1 at offset -8
 887       // t2 at offset -32, t3 at offset -24
 888       // t4 at offset -48, t5 at offset -40
 889       // t6 at offset -64, t7 at offset -56
 890 
 891       __ ldp(t0, t1, Address(s, 2 * unit));
 892       __ ldp(t2, t3, Address(s, 4 * unit));
 893       __ ldp(t4, t5, Address(s, 6 * unit));
 894       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 895 
 896       __ subs(count, count, 16);
 897       __ br(Assembler::LO, drain);
 898 
 899       int prefetch = PrefetchCopyIntervalInBytes;
 900       bool use_stride = false;
 901       if (direction == copy_backwards) {
 902          use_stride = prefetch > 256;
 903          prefetch = -prefetch;
 904          if (use_stride) __ mov(stride, prefetch);
 905       }
 906 
 907       __ bind(again);
 908 
 909       if (PrefetchCopyIntervalInBytes > 0)
 910         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 911 
 912       if (direction == copy_forwards) {
 913        // allowing for the offset of -8 the store instructions place
 914        // registers into the target 64 byte block at the following
 915        // offsets
 916        //
 917        // t0 at offset 0
 918        // t1 at offset 8,  t2 at offset 16
 919        // t3 at offset 24, t4 at offset 32
 920        // t5 at offset 40, t6 at offset 48
 921        // t7 at offset 56
 922 
 923         __ str(t0, Address(d, 1 * unit));
 924         __ stp(t1, t2, Address(d, 2 * unit));
 925         __ ldp(t0, t1, Address(s, 2 * unit));
 926         __ stp(t3, t4, Address(d, 4 * unit));
 927         __ ldp(t2, t3, Address(s, 4 * unit));
 928         __ stp(t5, t6, Address(d, 6 * unit));
 929         __ ldp(t4, t5, Address(s, 6 * unit));
 930         __ str(t7, Address(__ pre(d, 8 * unit)));
 931         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 932       } else {
 933        // d was not offset when we started so the registers are
 934        // written into the 64 byte block preceding d with the following
 935        // offsets
 936        //
 937        // t1 at offset -8
 938        // t3 at offset -24, t0 at offset -16
 939        // t5 at offset -40, t2 at offset -32
 940        // t7 at offset -56, t4 at offset -48
 941        //                   t6 at offset -64
 942        //
 943        // note that this matches the offsets previously noted for the
 944        // loads
 945 
 946         __ str(t1, Address(d, 1 * unit));
 947         __ stp(t3, t0, Address(d, 3 * unit));
 948         __ ldp(t0, t1, Address(s, 2 * unit));
 949         __ stp(t5, t2, Address(d, 5 * unit));
 950         __ ldp(t2, t3, Address(s, 4 * unit));
 951         __ stp(t7, t4, Address(d, 7 * unit));
 952         __ ldp(t4, t5, Address(s, 6 * unit));
 953         __ str(t6, Address(__ pre(d, 8 * unit)));
 954         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 955       }
 956 
 957       __ subs(count, count, 8);
 958       __ br(Assembler::HS, again);
 959 
 960       // Drain
 961       //
 962       // this uses the same pattern of offsets and register arguments
 963       // as above
 964       __ bind(drain);
 965       if (direction == copy_forwards) {
 966         __ str(t0, Address(d, 1 * unit));
 967         __ stp(t1, t2, Address(d, 2 * unit));
 968         __ stp(t3, t4, Address(d, 4 * unit));
 969         __ stp(t5, t6, Address(d, 6 * unit));
 970         __ str(t7, Address(__ pre(d, 8 * unit)));
 971       } else {
 972         __ str(t1, Address(d, 1 * unit));
 973         __ stp(t3, t0, Address(d, 3 * unit));
 974         __ stp(t5, t2, Address(d, 5 * unit));
 975         __ stp(t7, t4, Address(d, 7 * unit));
 976         __ str(t6, Address(__ pre(d, 8 * unit)));
 977       }
 978       // now we need to copy any remaining part block which may
 979       // include a 4 word subblock and/or a 2 word subblock.
 980       // bits 2 and 1 in the count are the tell-tale for whether we
 981       // have each such subblock
 982       {
 983         Label L1, L2;
 984         __ tbz(count, exact_log2(4), L1);
 985        // this is the same as above but copying only 4 longs hence
 986        // with only one intervening stp between the str instructions
 987        // but note that the offsets and registers still follow the
 988        // same pattern
 989         __ ldp(t0, t1, Address(s, 2 * unit));
 990         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 991         if (direction == copy_forwards) {
 992           __ str(t0, Address(d, 1 * unit));
 993           __ stp(t1, t2, Address(d, 2 * unit));
 994           __ str(t3, Address(__ pre(d, 4 * unit)));
 995         } else {
 996           __ str(t1, Address(d, 1 * unit));
 997           __ stp(t3, t0, Address(d, 3 * unit));
 998           __ str(t2, Address(__ pre(d, 4 * unit)));
 999         }
1000         __ bind(L1);
1001 
1002         __ tbz(count, 1, L2);
1003        // this is the same as above but copying only 2 longs hence
1004        // there is no intervening stp between the str instructions
1005        // but note that the offset and register patterns are still
1006        // the same
1007         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1008         if (direction == copy_forwards) {
1009           __ str(t0, Address(d, 1 * unit));
1010           __ str(t1, Address(__ pre(d, 2 * unit)));
1011         } else {
1012           __ str(t1, Address(d, 1 * unit));
1013           __ str(t0, Address(__ pre(d, 2 * unit)));
1014         }
1015         __ bind(L2);
1016 
1017        // for forwards copy we need to re-adjust the offsets we
1018        // applied so that s and d follow the last words written
1019 
1020        if (direction == copy_forwards) {
1021          __ add(s, s, 16);
1022          __ add(d, d, 8);
1023        }
1024 
1025       }
1026 
1027       __ ret(lr);
1028       }
1029   }
1030 
1031   // Small copy: less than 16 bytes.
1032   //
1033   // NB: Ignores all of the bits of count which represent more than 15
1034   // bytes, so a caller doesn't have to mask them.
1035 
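       // Illustrative decomposition (byte copy, step == 1): for count == 13
       // (0b1101) the tests below copy 8 bytes (bit 3), then 4 bytes (bit 2),
       // skip the 2-byte step (bit 1 is clear) and finish with 1 byte (bit 0).
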
1036   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1037     bool is_backwards = step < 0;
1038     size_t granularity = uabs(step);
1039     int direction = is_backwards ? -1 : 1;
1040     int unit = wordSize * direction;
1041 
1042     Label Lpair, Lword, Lint, Lshort, Lbyte;
1043 
1044     assert(granularity
1045            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1046 
1047     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1048 
1049     // ??? I don't know if this bit-test-and-branch is the right thing
1050     // to do.  It does a lot of jumping, resulting in several
1051     // mispredicted branches.  It might make more sense to do this
1052     // with something like Duff's device with a single computed branch.
1053 
1054     __ tbz(count, 3 - exact_log2(granularity), Lword);
1055     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1056     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1057     __ bind(Lword);
1058 
1059     if (granularity <= sizeof (jint)) {
1060       __ tbz(count, 2 - exact_log2(granularity), Lint);
1061       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1062       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1063       __ bind(Lint);
1064     }
1065 
1066     if (granularity <= sizeof (jshort)) {
1067       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1068       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1069       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1070       __ bind(Lshort);
1071     }
1072 
1073     if (granularity <= sizeof (jbyte)) {
1074       __ tbz(count, 0, Lbyte);
1075       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1076       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1077       __ bind(Lbyte);
1078     }
1079   }
1080 
1081   Label copy_f, copy_b;
1082 
1083   // All-singing all-dancing memory copy.
1084   //
1085   // Copy count units of memory from s to d.  The size of a unit is
1086   // step, which can be positive or negative depending on the direction
1087   // of copy.  If is_aligned is false, we align the source address.
1088   //
1089 
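       // For example, the element-size generators below pass
       //
       //   copy_memory(aligned, s, d, count, rscratch1, sizeof (jbyte));   // disjoint, forwards
       //   copy_memory(aligned, s, d, count, rscratch1, -sizeof (jbyte));  // conjoint, backwards
       //
       // (see generate_disjoint_copy / generate_conjoint_copy).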
1090   void copy_memory(bool is_aligned, Register s, Register d,
1091                    Register count, Register tmp, int step) {
1092     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1093     bool is_backwards = step < 0;
1094     int granularity = uabs(step);
1095     const Register t0 = r3, t1 = r4;
1096 
1097     // <= 96 bytes: do it inline. Direction doesn't matter because we always
1098     // load all the data before writing anything
1099     Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
1100     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1101     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1102     const Register send = r17, dend = r18;
1103 
1104     if (PrefetchCopyIntervalInBytes > 0)
1105       __ prfm(Address(s, 0), PLDL1KEEP);
1106     __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
1107     __ br(Assembler::HI, copy_big);
1108 
1109     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1110     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1111 
1112     __ cmp(count, 16/granularity);
1113     __ br(Assembler::LS, copy16);
1114 
1115     __ cmp(count, 64/granularity);
1116     __ br(Assembler::HI, copy80);
1117 
1118     __ cmp(count, 32/granularity);
1119     __ br(Assembler::LS, copy32);
1120 
1121     // 33..64 bytes
1122     if (UseSIMDForMemoryOps) {
1123       __ ldpq(v0, v1, Address(s, 0));
1124       __ ldpq(v2, v3, Address(send, -32));
1125       __ stpq(v0, v1, Address(d, 0));
1126       __ stpq(v2, v3, Address(dend, -32));
1127     } else {
1128       __ ldp(t0, t1, Address(s, 0));
1129       __ ldp(t2, t3, Address(s, 16));
1130       __ ldp(t4, t5, Address(send, -32));
1131       __ ldp(t6, t7, Address(send, -16));
1132 
1133       __ stp(t0, t1, Address(d, 0));
1134       __ stp(t2, t3, Address(d, 16));
1135       __ stp(t4, t5, Address(dend, -32));
1136       __ stp(t6, t7, Address(dend, -16));
1137     }
1138     __ b(finish);
1139 
1140     // 17..32 bytes
1141     __ bind(copy32);
1142     __ ldp(t0, t1, Address(s, 0));
1143     __ ldp(t2, t3, Address(send, -16));
1144     __ stp(t0, t1, Address(d, 0));
1145     __ stp(t2, t3, Address(dend, -16));
1146     __ b(finish);
1147 
1148     // 65..80/96 bytes
1149     // (96 bytes if SIMD because we do 32 bytes per instruction)
1150     __ bind(copy80);
1151     if (UseSIMDForMemoryOps) {
1152       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1153       __ ldpq(v4, v5, Address(send, -32));
1154       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1155       __ stpq(v4, v5, Address(dend, -32));
1156     } else {
1157       __ ldp(t0, t1, Address(s, 0));
1158       __ ldp(t2, t3, Address(s, 16));
1159       __ ldp(t4, t5, Address(s, 32));
1160       __ ldp(t6, t7, Address(s, 48));
1161       __ ldp(t8, t9, Address(send, -16));
1162 
1163       __ stp(t0, t1, Address(d, 0));
1164       __ stp(t2, t3, Address(d, 16));
1165       __ stp(t4, t5, Address(d, 32));
1166       __ stp(t6, t7, Address(d, 48));
1167       __ stp(t8, t9, Address(dend, -16));
1168     }
1169     __ b(finish);
1170 
1171     // 0..16 bytes
1172     __ bind(copy16);
1173     __ cmp(count, 8/granularity);
1174     __ br(Assembler::LO, copy8);
1175 
1176     // 8..16 bytes
1177     __ ldr(t0, Address(s, 0));
1178     __ ldr(t1, Address(send, -8));
1179     __ str(t0, Address(d, 0));
1180     __ str(t1, Address(dend, -8));
1181     __ b(finish);
1182 
1183     if (granularity < 8) {
1184       // 4..7 bytes
1185       __ bind(copy8);
1186       __ tbz(count, 2 - exact_log2(granularity), copy4);
1187       __ ldrw(t0, Address(s, 0));
1188       __ ldrw(t1, Address(send, -4));
1189       __ strw(t0, Address(d, 0));
1190       __ strw(t1, Address(dend, -4));
1191       __ b(finish);
1192       if (granularity < 4) {
1193         // 0..3 bytes
1194         __ bind(copy4);
1195         __ cbz(count, finish); // get rid of 0 case
1196         if (granularity == 2) {
1197           __ ldrh(t0, Address(s, 0));
1198           __ strh(t0, Address(d, 0));
1199         } else { // granularity == 1
1200           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1201           // the first and last byte.
1202           // Handle the 3 byte case by loading and storing base + count/2
1203           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1204           // This means in the 1 byte case we load/store the same
1205           // byte 3 times.
1206           __ lsr(count, count, 1);
1207           __ ldrb(t0, Address(s, 0));
1208           __ ldrb(t1, Address(send, -1));
1209           __ ldrb(t2, Address(s, count));
1210           __ strb(t0, Address(d, 0));
1211           __ strb(t1, Address(dend, -1));
1212           __ strb(t2, Address(d, count));
1213         }
1214         __ b(finish);
1215       }
1216     }
1217 
1218     __ bind(copy_big);
1219     if (is_backwards) {
1220       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1221       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1222     }
1223 
1224     // Now that we've got the small case out of the way, we can align the
1225     // source address on a 2-word boundary.
1226 
1227     Label aligned;
1228 
1229     if (is_aligned) {
1230       // We may have to adjust by 1 word to get s 2-word-aligned.
1231       __ tbz(s, exact_log2(wordSize), aligned);
1232       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1233       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1234       __ sub(count, count, wordSize/granularity);
1235     } else {
1236       if (is_backwards) {
1237         __ andr(rscratch2, s, 2 * wordSize - 1);
1238       } else {
1239         __ neg(rscratch2, s);
1240         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1241       }
1242       // rscratch2 is the byte adjustment needed to align s.
1243       __ cbz(rscratch2, aligned);
1244       int shift = exact_log2(granularity);
1245       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1246       __ sub(count, count, rscratch2);
1247 
1248 #if 0
1249       // ?? This code is only correct for a disjoint copy.  It may or
1250       // may not make sense to use it in that case.
1251 
1252       // Copy the first pair; s and d may not be aligned.
1253       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1254       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1255 
1256       // Align s and d, adjust count
1257       if (is_backwards) {
1258         __ sub(s, s, rscratch2);
1259         __ sub(d, d, rscratch2);
1260       } else {
1261         __ add(s, s, rscratch2);
1262         __ add(d, d, rscratch2);
1263       }
1264 #else
1265       copy_memory_small(s, d, rscratch2, rscratch1, step);
1266 #endif
1267     }
1268 
1269     __ bind(aligned);
1270 
1271     // s is now 2-word-aligned.
1272 
1273     // We have a count of units and some trailing bytes.  Adjust the
1274     // count and do a bulk copy of words.
1275     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1276     if (direction == copy_forwards)
1277       __ bl(copy_f);
1278     else
1279       __ bl(copy_b);
1280 
1281     // And the tail.
1282     copy_memory_small(s, d, count, tmp, step);
1283 
1284     if (granularity >= 8) __ bind(copy8);
1285     if (granularity >= 4) __ bind(copy4);
1286     __ bind(finish);
1287   }
1288 
1289 
1290   void clobber_registers() {
1291 #ifdef ASSERT
1292     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1293     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1294     for (Register r = r3; r <= r18; r++)
1295       if (r != rscratch1) __ mov(r, rscratch1);
1296 #endif
1297   }
1298 
1299   // Scan over array at a for count oops, verifying each one.
1300   // Preserves a and count, clobbers rscratch1 and rscratch2.
1301   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1302     Label loop, end;
1303     __ mov(rscratch1, a);
1304     __ mov(rscratch2, zr);
1305     __ bind(loop);
1306     __ cmp(rscratch2, count);
1307     __ br(Assembler::HS, end);
1308     if (size == (size_t)wordSize) {
1309       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1310       __ verify_oop(temp);
1311     } else {
1312       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1313       __ decode_heap_oop(temp); // calls verify_oop
1314     }
1315     __ add(rscratch2, rscratch2, size);
1316     __ b(loop);
1317     __ bind(end);
1318   }
1319 
1320   // Arguments:
1321   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1322   //             ignored
1323   //   is_oop  - true => oop array, so generate store check code
1324   //   name    - stub name string
1325   //
1326   // Inputs:
1327   //   c_rarg0   - source array address
1328   //   c_rarg1   - destination array address
1329   //   c_rarg2   - element count, treated as ssize_t, can be zero
1330   //
1331   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1332   // the hardware handle it.  The two dwords within qwords that span
1333   // cache line boundaries will still be loaded and stored atomically.
1334   //
1335   // Side Effects:
1336   //   disjoint_int_copy_entry is set to the no-overlap entry point
1337   //   used by generate_conjoint_int_oop_copy().
1338   //
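       // The entry points produced here are wired up later in this file
       // (see generate_arraycopy_stubs()), roughly like
       //
       //   StubRoutines::_jbyte_disjoint_arraycopy =
       //     generate_disjoint_byte_copy(false, &entry, "jbyte_disjoint_arraycopy");
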
1339   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1340                                   const char *name, bool dest_uninitialized = false) {
1341     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1342     RegSet saved_reg = RegSet::of(s, d, count);
1343     __ align(CodeEntryAlignment);
1344     StubCodeMark mark(this, "StubRoutines", name);
1345     address start = __ pc();
1346     __ enter();
1347 
1348     if (entry != NULL) {
1349       *entry = __ pc();
1350       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1351       BLOCK_COMMENT("Entry:");
1352     }
1353 
1354     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1355     if (dest_uninitialized) {
1356       decorators |= IS_DEST_UNINITIALIZED;
1357     }
1358     if (aligned) {
1359       decorators |= ARRAYCOPY_ALIGNED;
1360     }
1361 
1362     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1363     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg);
1364 
1365     if (is_oop) {
1366       // save regs before copy_memory
1367       __ push(RegSet::of(d, count), sp);
1368     }
1369     copy_memory(aligned, s, d, count, rscratch1, size);
1370 
1371     if (is_oop) {
1372       __ pop(RegSet::of(d, count), sp);
1373       if (VerifyOops)
1374         verify_oop_array(size, d, count, r16);
1375       __ sub(count, count, 1); // make an inclusive end pointer
1376       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1377     }
1378 
1379     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1380 
1381     __ leave();
1382     __ mov(r0, zr); // return 0
1383     __ ret(lr);
1384 #ifdef BUILTIN_SIM
1385     {
1386       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1387       sim->notifyCompile(const_cast<char*>(name), start);
1388     }
1389 #endif
1390     return start;
1391   }
1392 
1393   // Arguments:
1394   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1395   //             ignored
1396   //   is_oop  - true => oop array, so generate store check code
1397   //   name    - stub name string
1398   //
1399   // Inputs:
1400   //   c_rarg0   - source array address
1401   //   c_rarg1   - destination array address
1402   //   c_rarg2   - element count, treated as ssize_t, can be zero
1403   //
1404   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1405   // the hardware handle it.  The two dwords within qwords that span
1406   // cache line boundaries will still be loaded and stored atomically.
1407   //
1408   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1409                                  address *entry, const char *name,
1410                                  bool dest_uninitialized = false) {
1411     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1412     RegSet saved_regs = RegSet::of(s, d, count);
1413     StubCodeMark mark(this, "StubRoutines", name);
1414     address start = __ pc();
1415     __ enter();
1416 
1417     if (entry != NULL) {
1418       *entry = __ pc();
1419       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1420       BLOCK_COMMENT("Entry:");
1421     }
1422 
1423     // use fwd copy when (d-s) above_equal (count*size)
1424     __ sub(rscratch1, d, s);
1425     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1426     __ br(Assembler::HS, nooverlap_target);
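         // e.g. for a byte copy with s = 0x1000, d = 0x1008 and count = 16:
         // d - s = 8 < 16, so the regions overlap and we fall through to the
         // backward copy below; when d is below s the unsigned difference is
         // huge, the compare is higher-or-same, and we take the forward copy.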
1427 
1428     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1429     if (dest_uninitialized) {
1430       decorators |= IS_DEST_UNINITIALIZED;
1431     }
1432     if (aligned) {
1433       decorators |= ARRAYCOPY_ALIGNED;
1434     }
1435 
1436     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1437     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs);
1438 
1439     if (is_oop) {
1440       // save regs before copy_memory
1441       __ push(RegSet::of(d, count), sp);
1442     }
1443     copy_memory(aligned, s, d, count, rscratch1, -size);
1444     if (is_oop) {
1445       __ pop(RegSet::of(d, count), sp);
1446       if (VerifyOops)
1447         verify_oop_array(size, d, count, r16);
1448       __ sub(count, count, 1); // make an inclusive end pointer
1449       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1450     }
1451     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1452     __ leave();
1453     __ mov(r0, zr); // return 0
1454     __ ret(lr);
1455 #ifdef BUILTIN_SIM
1456     {
1457       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1458       sim->notifyCompile(const_cast<char*>(name), start);
1459     }
1460 #endif
1461     return start;
1462   }
1463 
1464   // Arguments:
1465   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1466   //             ignored
1467   //   name    - stub name string
1468   //
1469   // Inputs:
1470   //   c_rarg0   - source array address
1471   //   c_rarg1   - destination array address
1472   //   c_rarg2   - element count, treated as ssize_t, can be zero
1473   //
1474   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1475   // we let the hardware handle it.  The one to eight bytes within words,
1476   // dwords or qwords that span cache line boundaries will still be loaded
1477   // and stored atomically.
1478   //
1486   // Side Effects:
1487   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1488   //   used by generate_conjoint_byte_copy().
1489   //
1490   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1491     const bool not_oop = false;
1492     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1493   }
1494 
1495   // Arguments:
1496   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1497   //             ignored
1498   //   name    - stub name string
1499   //
1500   // Inputs:
1501   //   c_rarg0   - source array address
1502   //   c_rarg1   - destination array address
1503   //   c_rarg2   - element count, treated as ssize_t, can be zero
1504   //
1505   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1506   // we let the hardware handle it.  The one to eight bytes within words,
1507   // dwords or qwords that span cache line boundaries will still be loaded
1508   // and stored atomically.
1509   //
1510   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1511                                       address* entry, const char *name) {
1512     const bool not_oop = false;
1513     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1514   }
1515 
1516   // Arguments:
1517   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1518   //             ignored
1519   //   name    - stub name string
1520   //
1521   // Inputs:
1522   //   c_rarg0   - source array address
1523   //   c_rarg1   - destination array address
1524   //   c_rarg2   - element count, treated as ssize_t, can be zero
1525   //
1526   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1527   // let the hardware handle it.  The two or four words within dwords
1528   // or qwords that span cache line boundaries will still be loaded
1529   // and stored atomically.
1530   //
1531   // Side Effects:
1532   //   disjoint_short_copy_entry is set to the no-overlap entry point
1533   //   used by generate_conjoint_short_copy().
1534   //
1535   address generate_disjoint_short_copy(bool aligned,
1536                                        address* entry, const char *name) {
1537     const bool not_oop = false;
1538     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1539   }
1540 
1541   // Arguments:
1542   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1543   //             ignored
1544   //   name    - stub name string
1545   //
1546   // Inputs:
1547   //   c_rarg0   - source array address
1548   //   c_rarg1   - destination array address
1549   //   c_rarg2   - element count, treated as ssize_t, can be zero
1550   //
1551   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1552   // let the hardware handle it.  The two or four words within dwords
1553   // or qwords that span cache line boundaries will still be loaded
1554   // and stored atomically.
1555   //
1556   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1557                                        address *entry, const char *name) {
1558     const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
1563   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1564   //             ignored
1565   //   name    - stub name string
1566   //
1567   // Inputs:
1568   //   c_rarg0   - source array address
1569   //   c_rarg1   - destination array address
1570   //   c_rarg2   - element count, treated as ssize_t, can be zero
1571   //
1572   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1573   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1575   //
1576   // Side Effects:
1577   //   disjoint_int_copy_entry is set to the no-overlap entry point
1578   //   used by generate_conjoint_int_oop_copy().
1579   //
1580   address generate_disjoint_int_copy(bool aligned, address *entry,
1581                                          const char *name, bool dest_uninitialized = false) {
1582     const bool not_oop = false;
1583     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1584   }
1585 
1586   // Arguments:
1587   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1588   //             ignored
1589   //   name    - stub name string
1590   //
1591   // Inputs:
1592   //   c_rarg0   - source array address
1593   //   c_rarg1   - destination array address
1594   //   c_rarg2   - element count, treated as ssize_t, can be zero
1595   //
1596   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1597   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1599   //
1600   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1601                                      address *entry, const char *name,
1602                                      bool dest_uninitialized = false) {
1603     const bool not_oop = false;
1604     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1605   }
1606 
1607 
1608   // Arguments:
1609   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1610   //             ignored
1611   //   name    - stub name string
1612   //
1613   // Inputs:
1614   //   c_rarg0   - source array address
1615   //   c_rarg1   - destination array address
1616   //   c_rarg2   - element count, treated as size_t, can be zero
1617   //
1618   // Side Effects:
1619   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1620   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1621   //
1622   address generate_disjoint_long_copy(bool aligned, address *entry,
1623                                           const char *name, bool dest_uninitialized = false) {
1624     const bool not_oop = false;
1625     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1626   }
1627 
1628   // Arguments:
1629   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1630   //             ignored
1631   //   name    - stub name string
1632   //
1633   // Inputs:
1634   //   c_rarg0   - source array address
1635   //   c_rarg1   - destination array address
1636   //   c_rarg2   - element count, treated as size_t, can be zero
1637   //
1638   address generate_conjoint_long_copy(bool aligned,
1639                                       address nooverlap_target, address *entry,
1640                                       const char *name, bool dest_uninitialized = false) {
1641     const bool not_oop = false;
1642     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1643   }
1644 
1645   // Arguments:
1646   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1647   //             ignored
1648   //   name    - stub name string
1649   //
1650   // Inputs:
1651   //   c_rarg0   - source array address
1652   //   c_rarg1   - destination array address
1653   //   c_rarg2   - element count, treated as size_t, can be zero
1654   //
1655   // Side Effects:
1656   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1657   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1658   //
1659   address generate_disjoint_oop_copy(bool aligned, address *entry,
1660                                      const char *name, bool dest_uninitialized) {
1661     const bool is_oop = true;
1662     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1663     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1664   }
1665 
1666   // Arguments:
1667   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1668   //             ignored
1669   //   name    - stub name string
1670   //
1671   // Inputs:
1672   //   c_rarg0   - source array address
1673   //   c_rarg1   - destination array address
1674   //   c_rarg2   - element count, treated as size_t, can be zero
1675   //
1676   address generate_conjoint_oop_copy(bool aligned,
1677                                      address nooverlap_target, address *entry,
1678                                      const char *name, bool dest_uninitialized) {
1679     const bool is_oop = true;
1680     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1681     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1682                                   name, dest_uninitialized);
1683   }
1684 
1685 
1686   // Helper for generating a dynamic type check.
1687   // Smashes rscratch1.
1688   void generate_type_check(Register sub_klass,
1689                            Register super_check_offset,
1690                            Register super_klass,
1691                            Label& L_success) {
1692     assert_different_registers(sub_klass, super_check_offset, super_klass);
1693 
1694     BLOCK_COMMENT("type_check:");
1695 
1696     Label L_miss;
1697 
1698     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1699                                      super_check_offset);
1700     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1701 
1702     // Fall through on failure!
1703     __ BIND(L_miss);
1704   }
1705 
1706   //
1707   //  Generate checkcasting array copy stub
1708   //
1709   //  Input:
1710   //    c_rarg0   - source array address
1711   //    c_rarg1   - destination array address
1712   //    c_rarg2   - element count, treated as ssize_t, can be zero
1713   //    c_rarg3   - size_t ckoff (super_check_offset)
1714   //    c_rarg4   - oop ckval (super_klass)
1715   //
1716   //  Output:
1717   //    r0 ==  0  -  success
1718   //    r0 == -1^K - failure, where K is partial transfer count
1719   //
1720   address generate_checkcast_copy(const char *name, address *entry,
1721                                   bool dest_uninitialized = false) {
1722 
1723     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1724 
1725     // Input registers (after setup_arg_regs)
1726     const Register from        = c_rarg0;   // source array address
1727     const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
1729     const Register ckoff       = c_rarg3;   // super_check_offset
1730     const Register ckval       = c_rarg4;   // super_klass
1731 
1732     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1733     RegSet wb_post_saved_regs = RegSet::of(count);
1734 
1735     // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // orig elements count
1737     const Register start_to    = r20;       // destination array start address
1738     const Register copied_oop  = r18;       // actual oop copied
1739     const Register r19_klass   = r19;       // oop._klass
1740 
1741     //---------------------------------------------------------------
1742     // Assembler stub will be used for this call to arraycopy
1743     // if the two arrays are subtypes of Object[] but the
1744     // destination array type is not equal to or a supertype
1745     // of the source type.  Each element must be separately
1746     // checked.
1747 
1748     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1749                                copied_oop, r19_klass, count_save);
1750 
1751     __ align(CodeEntryAlignment);
1752     StubCodeMark mark(this, "StubRoutines", name);
1753     address start = __ pc();
1754 
1755     __ enter(); // required for proper stackwalking of RuntimeStub frame
1756 
1757 #ifdef ASSERT
1758     // caller guarantees that the arrays really are different
1759     // otherwise, we would have to make conjoint checks
1760     { Label L;
1761       array_overlap_test(L, TIMES_OOP);
1762       __ stop("checkcast_copy within a single array");
1763       __ bind(L);
1764     }
1765 #endif //ASSERT
1766 
1767     // Caller of this entry point must set up the argument registers.
1768     if (entry != NULL) {
1769       *entry = __ pc();
1770       BLOCK_COMMENT("Entry:");
1771     }
1772 
    // Empty array:  Nothing to do.
1774     __ cbz(count, L_done);
1775 
1776     __ push(RegSet::of(r18, r19, r20, r21), sp);
1777 
1778 #ifdef ASSERT
1779     BLOCK_COMMENT("assert consistent ckoff/ckval");
1780     // The ckoff and ckval must be mutually consistent,
1781     // even though caller generates both.
1782     { Label L;
1783       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1784       __ ldrw(start_to, Address(ckval, sco_offset));
1785       __ cmpw(ckoff, start_to);
1786       __ br(Assembler::EQ, L);
1787       __ stop("super_check_offset inconsistent");
1788       __ bind(L);
1789     }
1790 #endif //ASSERT
1791 
1792     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
1793     bool is_oop = true;
1794     if (dest_uninitialized) {
1795       decorators |= IS_DEST_UNINITIALIZED;
1796     }
1797 
1798     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1799     bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs);
1800 
1801     // save the original count
1802     __ mov(count_save, count);
1803 
1804     // Copy from low to high addresses
1805     __ mov(start_to, to);              // Save destination array start address
1806     __ b(L_load_element);
1807 
1808     // ======== begin loop ========
1809     // (Loop is rotated; its entry is L_load_element.)
1810     // Loop control:
1811     //   for (; count != 0; count--) {
1812     //     copied_oop = load_heap_oop(from++);
1813     //     ... generate_type_check ...;
1814     //     store_heap_oop(to++, copied_oop);
1815     //   }
1816     __ align(OptoLoopAlignment);
1817 
1818     __ BIND(L_store_element);
1819     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1820     __ sub(count, count, 1);
1821     __ cbz(count, L_do_card_marks);
1822 
1823     // ======== loop entry is here ========
1824     __ BIND(L_load_element);
1825     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1826     __ cbz(copied_oop, L_store_element);
1827 
1828     __ load_klass(r19_klass, copied_oop);// query the object klass
1829     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1830     // ======== end loop ========
1831 
1832     // It was a real error; we must depend on the caller to finish the job.
1833     // Register count = remaining oops, count_orig = total oops.
1834     // Emit GC store barriers for the oops we have copied and report
1835     // their number to the caller.
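    // Note: the eon with zr below is a bitwise NOT, so the value returned
    // in r0 is -1^K == ~K; the caller recovers K as ~r0.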
1836 
1837     __ subs(count, count_save, count);     // K = partially copied oop count
1838     __ eon(count, count, zr);                   // report (-1^K) to caller
1839     __ br(Assembler::EQ, L_done_pop);
1840 
1841     __ BIND(L_do_card_marks);
1842     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1843     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1, wb_post_saved_regs);
1844 
1845     __ bind(L_done_pop);
1846     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1847     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1848 
1849     __ bind(L_done);
1850     __ mov(r0, count);
1851     __ leave();
1852     __ ret(lr);
1853 
1854     return start;
1855   }
1856 
1857   // Perform range checks on the proposed arraycopy.
1858   // Kills temp, but nothing else.
1859   // Also, clean the sign bits of src_pos and dst_pos.
1860   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1861                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
1863                               Register dst_pos, // destination position (c_rarg3)
1864                               Register length,
1865                               Register temp,
1866                               Label& L_failed) {
1867     BLOCK_COMMENT("arraycopy_range_checks:");
1868 
1869     assert_different_registers(rscratch1, temp);
1870 
1871     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1872     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1873     __ addw(temp, length, src_pos);
1874     __ cmpw(temp, rscratch1);
1875     __ br(Assembler::HI, L_failed);
1876 
1877     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1878     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1879     __ addw(temp, length, dst_pos);
1880     __ cmpw(temp, rscratch1);
1881     __ br(Assembler::HI, L_failed);
1882 
1883     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1884     __ movw(src_pos, src_pos);
1885     __ movw(dst_pos, dst_pos);
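    // (A write to a 32-bit register zero-extends into the upper half, so the
    //  two moves above are effectively src_pos &= 0xFFFFFFFF, dst_pos &= 0xFFFFFFFF.)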
1886 
1887     BLOCK_COMMENT("arraycopy_range_checks done");
1888   }
1889 
1890   // These stubs get called from some dumb test routine.
1891   // I'll write them properly when they're called from
1892   // something that's actually doing something.
1893   static void fake_arraycopy_stub(address src, address dst, int count) {
1894     assert(count == 0, "huh?");
1895   }
1896 
1897 
1898   //
1899   //  Generate 'unsafe' array copy stub
1900   //  Though just as safe as the other stubs, it takes an unscaled
1901   //  size_t argument instead of an element count.
1902   //
1903   //  Input:
1904   //    c_rarg0   - source array address
1905   //    c_rarg1   - destination array address
1906   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1907   //
1908   // Examines the alignment of the operands and dispatches
1909   // to a long, int, short, or byte copy loop.
1910   //
1911   address generate_unsafe_copy(const char *name,
1912                                address byte_copy_entry,
1913                                address short_copy_entry,
1914                                address int_copy_entry,
1915                                address long_copy_entry) {
1916     Label L_long_aligned, L_int_aligned, L_short_aligned;
1917     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1918 
1919     __ align(CodeEntryAlignment);
1920     StubCodeMark mark(this, "StubRoutines", name);
1921     address start = __ pc();
1922     __ enter(); // required for proper stackwalking of RuntimeStub frame
1923 
1924     // bump this on entry, not on exit:
1925     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1926 
1927     __ orr(rscratch1, s, d);
1928     __ orr(rscratch1, rscratch1, count);
1929 
1930     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1931     __ cbz(rscratch1, L_long_aligned);
1932     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1933     __ cbz(rscratch1, L_int_aligned);
1934     __ tbz(rscratch1, 0, L_short_aligned);
1935     __ b(RuntimeAddress(byte_copy_entry));
1936 
1937     __ BIND(L_short_aligned);
1938     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1939     __ b(RuntimeAddress(short_copy_entry));
1940     __ BIND(L_int_aligned);
1941     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1942     __ b(RuntimeAddress(int_copy_entry));
1943     __ BIND(L_long_aligned);
1944     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1945     __ b(RuntimeAddress(long_copy_entry));
1946 
1947     return start;
1948   }
1949 
1950   //
1951   //  Generate generic array copy stubs
1952   //
1953   //  Input:
1954   //    c_rarg0    -  src oop
1955   //    c_rarg1    -  src_pos (32-bits)
1956   //    c_rarg2    -  dst oop
1957   //    c_rarg3    -  dst_pos (32-bits)
1958   //    c_rarg4    -  element count (32-bits)
1959   //
1960   //  Output:
1961   //    r0 ==  0  -  success
1962   //    r0 == -1^K - failure, where K is partial transfer count
1963   //
1964   address generate_generic_copy(const char *name,
1965                                 address byte_copy_entry, address short_copy_entry,
1966                                 address int_copy_entry, address oop_copy_entry,
1967                                 address long_copy_entry, address checkcast_copy_entry) {
1968 
1969     Label L_failed, L_failed_0, L_objArray;
1970     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1971 
1972     // Input registers
1973     const Register src        = c_rarg0;  // source array oop
1974     const Register src_pos    = c_rarg1;  // source position
1975     const Register dst        = c_rarg2;  // destination array oop
1976     const Register dst_pos    = c_rarg3;  // destination position
1977     const Register length     = c_rarg4;
1978 
1979     StubCodeMark mark(this, "StubRoutines", name);
1980 
1981     __ align(CodeEntryAlignment);
1982     address start = __ pc();
1983 
1984     __ enter(); // required for proper stackwalking of RuntimeStub frame
1985 
1986     // bump this on entry, not on exit:
1987     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1988 
1989     //-----------------------------------------------------------------------
1990     // Assembler stub will be used for this call to arraycopy
1991     // if the following conditions are met:
1992     //
1993     // (1) src and dst must not be null.
1994     // (2) src_pos must not be negative.
1995     // (3) dst_pos must not be negative.
1996     // (4) length  must not be negative.
1997     // (5) src klass and dst klass should be the same and not NULL.
1998     // (6) src and dst should be arrays.
1999     // (7) src_pos + length must not exceed length of src.
2000     // (8) dst_pos + length must not exceed length of dst.
2001     //
2002 
2003     //  if (src == NULL) return -1;
2004     __ cbz(src, L_failed);
2005 
2006     //  if (src_pos < 0) return -1;
2007     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2008 
2009     //  if (dst == NULL) return -1;
2010     __ cbz(dst, L_failed);
2011 
2012     //  if (dst_pos < 0) return -1;
2013     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2014 
2015     // registers used as temp
2016     const Register scratch_length    = r16; // elements count to copy
2017     const Register scratch_src_klass = r17; // array klass
2018     const Register lh                = r18; // layout helper
2019 
2020     //  if (length < 0) return -1;
2021     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2022     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2023 
2024     __ load_klass(scratch_src_klass, src);
2025 #ifdef ASSERT
2026     //  assert(src->klass() != NULL);
2027     {
2028       BLOCK_COMMENT("assert klasses not null {");
2029       Label L1, L2;
2030       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2031       __ bind(L1);
2032       __ stop("broken null klass");
2033       __ bind(L2);
2034       __ load_klass(rscratch1, dst);
2035       __ cbz(rscratch1, L1);     // this would be broken also
2036       BLOCK_COMMENT("} assert klasses not null done");
2037     }
2038 #endif
2039 
2040     // Load layout helper (32-bits)
2041     //
2042     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2043     // 32        30    24            16              8     2                 0
2044     //
2045     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2046     //
2047 
2048     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2049 
2050     // Handle objArrays completely differently...
2051     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2052     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2053     __ movw(rscratch1, objArray_lh);
2054     __ eorw(rscratch2, lh, rscratch1);
2055     __ cbzw(rscratch2, L_objArray);
2056 
2057     //  if (src->klass() != dst->klass()) return -1;
2058     __ load_klass(rscratch2, dst);
2059     __ eor(rscratch2, rscratch2, scratch_src_klass);
2060     __ cbnz(rscratch2, L_failed);
2061 
2062     //  if (!src->is_Array()) return -1;
2063     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2064 
2065     // At this point, it is known to be a typeArray (array_tag 0x3).
2066 #ifdef ASSERT
2067     {
2068       BLOCK_COMMENT("assert primitive array {");
2069       Label L;
2070       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2071       __ cmpw(lh, rscratch2);
2072       __ br(Assembler::GE, L);
2073       __ stop("must be a primitive array");
2074       __ bind(L);
2075       BLOCK_COMMENT("} assert primitive array done");
2076     }
2077 #endif
2078 
2079     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2080                            rscratch2, L_failed);
2081 
2082     // TypeArrayKlass
2083     //
2084     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2085     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2086     //
2087 
2088     const Register rscratch1_offset = rscratch1;    // array offset
2089     const Register r18_elsize = lh; // element size
2090 
2091     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2092            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2093     __ add(src, src, rscratch1_offset);           // src array offset
2094     __ add(dst, dst, rscratch1_offset);           // dst array offset
2095     BLOCK_COMMENT("choose copy loop based on element size");
2096 
2097     // next registers should be set before the jump to corresponding stub
2098     const Register from     = c_rarg0;  // source array address
2099     const Register to       = c_rarg1;  // destination array address
2100     const Register count    = c_rarg2;  // elements count
2101 
2102     // 'from', 'to', 'count' registers should be set in such order
2103     // since they are the same as 'src', 'src_pos', 'dst'.
2104 
2105     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2106 
2107     // The possible values of elsize are 0-3, i.e. exact_log2(element
2108     // size in bytes).  We do a simple bitwise binary search.
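    //
    //   bit 1  bit 0   elsize   copy stub taken
    //     0      0       0      byte_copy_entry
    //     0      1       1      short_copy_entry
    //     1      0       2      int_copy_entry
    //     1      1       3      long_copy_entry
    //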
2109   __ BIND(L_copy_bytes);
2110     __ tbnz(r18_elsize, 1, L_copy_ints);
2111     __ tbnz(r18_elsize, 0, L_copy_shorts);
2112     __ lea(from, Address(src, src_pos));// src_addr
2113     __ lea(to,   Address(dst, dst_pos));// dst_addr
2114     __ movw(count, scratch_length); // length
2115     __ b(RuntimeAddress(byte_copy_entry));
2116 
2117   __ BIND(L_copy_shorts);
2118     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2119     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2120     __ movw(count, scratch_length); // length
2121     __ b(RuntimeAddress(short_copy_entry));
2122 
2123   __ BIND(L_copy_ints);
2124     __ tbnz(r18_elsize, 0, L_copy_longs);
2125     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2126     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2127     __ movw(count, scratch_length); // length
2128     __ b(RuntimeAddress(int_copy_entry));
2129 
2130   __ BIND(L_copy_longs);
2131 #ifdef ASSERT
2132     {
2133       BLOCK_COMMENT("assert long copy {");
2134       Label L;
2135       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2136       __ cmpw(r18_elsize, LogBytesPerLong);
2137       __ br(Assembler::EQ, L);
2138       __ stop("must be long copy, but elsize is wrong");
2139       __ bind(L);
2140       BLOCK_COMMENT("} assert long copy done");
2141     }
2142 #endif
2143     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2144     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2145     __ movw(count, scratch_length); // length
2146     __ b(RuntimeAddress(long_copy_entry));
2147 
2148     // ObjArrayKlass
2149   __ BIND(L_objArray);
2150     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2151 
2152     Label L_plain_copy, L_checkcast_copy;
2153     //  test array classes for subtyping
2154     __ load_klass(r18, dst);
2155     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2156     __ br(Assembler::NE, L_checkcast_copy);
2157 
2158     // Identically typed arrays can be copied without element-wise checks.
2159     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2160                            rscratch2, L_failed);
2161 
2162     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2163     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2164     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2165     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2166     __ movw(count, scratch_length); // length
2167   __ BIND(L_plain_copy);
2168     __ b(RuntimeAddress(oop_copy_entry));
2169 
2170   __ BIND(L_checkcast_copy);
2171     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2172     {
2173       // Before looking at dst.length, make sure dst is also an objArray.
2174       __ ldrw(rscratch1, Address(r18, lh_offset));
2175       __ movw(rscratch2, objArray_lh);
2176       __ eorw(rscratch1, rscratch1, rscratch2);
2177       __ cbnzw(rscratch1, L_failed);
2178 
2179       // It is safe to examine both src.length and dst.length.
2180       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2181                              r18, L_failed);
2182 
2183       const Register rscratch2_dst_klass = rscratch2;
2184       __ load_klass(rscratch2_dst_klass, dst); // reload
2185 
2186       // Marshal the base address arguments now, freeing registers.
2187       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2188       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2189       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2190       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2191       __ movw(count, length);           // length (reloaded)
2192       Register sco_temp = c_rarg3;      // this register is free now
2193       assert_different_registers(from, to, count, sco_temp,
2194                                  rscratch2_dst_klass, scratch_src_klass);
2195       // assert_clean_int(count, sco_temp);
2196 
2197       // Generate the type check.
2198       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2199       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2200       // assert_clean_int(sco_temp, r18);
2201       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2202 
2203       // Fetch destination element klass from the ObjArrayKlass header.
2204       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2205       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2206       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2207 
2208       // the checkcast_copy loop needs two extra arguments:
2209       assert(c_rarg3 == sco_temp, "#3 already in place");
2210       // Set up arguments for checkcast_copy_entry.
2211       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2212       __ b(RuntimeAddress(checkcast_copy_entry));
2213     }
2214 
2215   __ BIND(L_failed);
2216     __ mov(r0, -1);
2217     __ leave();   // required for proper stackwalking of RuntimeStub frame
2218     __ ret(lr);
2219 
2220     return start;
2221   }
2222 
2223   //
2224   // Generate stub for array fill. If "aligned" is true, the
2225   // "to" address is assumed to be heapword aligned.
2226   //
2227   // Arguments for generated stub:
2228   //   to:    c_rarg0
2229   //   value: c_rarg1
2230   //   count: c_rarg2 treated as signed
2231   //
2232   address generate_fill(BasicType t, bool aligned, const char *name) {
2233     __ align(CodeEntryAlignment);
2234     StubCodeMark mark(this, "StubRoutines", name);
2235     address start = __ pc();
2236 
2237     BLOCK_COMMENT("Entry:");
2238 
2239     const Register to        = c_rarg0;  // source array address
2240     const Register value     = c_rarg1;  // value
2241     const Register count     = c_rarg2;  // elements count
2242 
2243     const Register bz_base = r10;        // base for block_zero routine
2244     const Register cnt_words = r11;      // temp register
2245 
2246     __ enter();
2247 
2248     Label L_fill_elements, L_exit1;
2249 
2250     int shift = -1;
2251     switch (t) {
2252       case T_BYTE:
2253         shift = 0;
2254         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2255         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2256         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2257         __ br(Assembler::LO, L_fill_elements);
2258         break;
2259       case T_SHORT:
2260         shift = 1;
2261         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2262         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2263         __ br(Assembler::LO, L_fill_elements);
2264         break;
2265       case T_INT:
2266         shift = 2;
2267         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2268         __ br(Assembler::LO, L_fill_elements);
2269         break;
2270       default: ShouldNotReachHere();
2271     }
2272 
2273     // Align source address at 8 bytes address boundary.
2274     Label L_skip_align1, L_skip_align2, L_skip_align4;
2275     if (!aligned) {
2276       switch (t) {
2277         case T_BYTE:
2278           // One byte misalignment happens only for byte arrays.
2279           __ tbz(to, 0, L_skip_align1);
2280           __ strb(value, Address(__ post(to, 1)));
2281           __ subw(count, count, 1);
2282           __ bind(L_skip_align1);
2283           // Fallthrough
2284         case T_SHORT:
2285           // Two bytes misalignment happens only for byte and short (char) arrays.
2286           __ tbz(to, 1, L_skip_align2);
2287           __ strh(value, Address(__ post(to, 2)));
2288           __ subw(count, count, 2 >> shift);
2289           __ bind(L_skip_align2);
2290           // Fallthrough
2291         case T_INT:
2292           // Align to 8 bytes, we know we are 4 byte aligned to start.
2293           __ tbz(to, 2, L_skip_align4);
2294           __ strw(value, Address(__ post(to, 4)));
2295           __ subw(count, count, 4 >> shift);
2296           __ bind(L_skip_align4);
2297           break;
2298         default: ShouldNotReachHere();
2299       }
2300     }
2301 
2302     //
2303     //  Fill large chunks
2304     //
2305     __ lsrw(cnt_words, count, 3 - shift); // number of words
2306     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2307     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2308     if (UseBlockZeroing) {
2309       Label non_block_zeroing, rest;
2310       // If the fill value is zero we can use the fast zero_words().
2311       __ cbnz(value, non_block_zeroing);
2312       __ mov(bz_base, to);
2313       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2314       __ zero_words(bz_base, cnt_words);
2315       __ b(rest);
2316       __ bind(non_block_zeroing);
2317       __ fill_words(to, cnt_words, value);
2318       __ bind(rest);
2319     } else {
2320       __ fill_words(to, cnt_words, value);
2321     }
2322 
2323     // Remaining count is less than 8 bytes. Fill it by a single store.
2324     // Note that the total length is no less than 8 bytes.
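    // (Illustration: with, say, 3 trailing bytes left, 'to' is advanced to the
    //  end of the array and the single 8-byte store rewrites the last 8 bytes;
    //  the bytes that were already filled just receive the same value again.)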
2325     if (t == T_BYTE || t == T_SHORT) {
2326       Label L_exit1;
2327       __ cbzw(count, L_exit1);
2328       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2329       __ str(value, Address(to, -8));    // overwrite some elements
2330       __ bind(L_exit1);
2331       __ leave();
2332       __ ret(lr);
2333     }
2334 
2335     // Handle copies less than 8 bytes.
2336     Label L_fill_2, L_fill_4, L_exit2;
2337     __ bind(L_fill_elements);
2338     switch (t) {
2339       case T_BYTE:
2340         __ tbz(count, 0, L_fill_2);
2341         __ strb(value, Address(__ post(to, 1)));
2342         __ bind(L_fill_2);
2343         __ tbz(count, 1, L_fill_4);
2344         __ strh(value, Address(__ post(to, 2)));
2345         __ bind(L_fill_4);
2346         __ tbz(count, 2, L_exit2);
2347         __ strw(value, Address(to));
2348         break;
2349       case T_SHORT:
2350         __ tbz(count, 0, L_fill_4);
2351         __ strh(value, Address(__ post(to, 2)));
2352         __ bind(L_fill_4);
2353         __ tbz(count, 1, L_exit2);
2354         __ strw(value, Address(to));
2355         break;
2356       case T_INT:
2357         __ cbzw(count, L_exit2);
2358         __ strw(value, Address(to));
2359         break;
2360       default: ShouldNotReachHere();
2361     }
2362     __ bind(L_exit2);
2363     __ leave();
2364     __ ret(lr);
2365     return start;
2366   }
2367 
2368   address generate_data_cache_writeback() {
2369     const Register line        = c_rarg0;  // address of line to write back
2370 
2371     __ align(CodeEntryAlignment);
2372 
2373     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2374 
2375     address start = __ pc();
2376     __ enter();
2377     __ cache_wb(Address(line, 0));
2378     __ leave();
2379     __ ret(lr);
2380 
2381     return start;
2382   }
2383 
2384   address generate_data_cache_writeback_sync() {
2385     const Register kind       = c_rarg0;  // pre or post sync (unused)
2386 
2387     __ align(CodeEntryAlignment);
2388 
2389     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2390 
2391     address start = __ pc();
2392     __ enter();
2393     __ cache_wbsync();
2394     __ leave();
2395     __ ret(lr);
2396 
2397     return start;
2398   }
2399 
2400   void generate_arraycopy_stubs() {
2401     address entry;
2402     address entry_jbyte_arraycopy;
2403     address entry_jshort_arraycopy;
2404     address entry_jint_arraycopy;
2405     address entry_oop_arraycopy;
2406     address entry_jlong_arraycopy;
2407     address entry_checkcast_arraycopy;
2408 
2409     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2410     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2411 
2412     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2413 
2414     //*** jbyte
2415     // Always need aligned and unaligned versions
2416     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2417                                                                                   "jbyte_disjoint_arraycopy");
2418     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2419                                                                                   &entry_jbyte_arraycopy,
2420                                                                                   "jbyte_arraycopy");
2421     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2422                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2423     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2424                                                                                   "arrayof_jbyte_arraycopy");
2425 
2426     //*** jshort
2427     // Always need aligned and unaligned versions
2428     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2429                                                                                     "jshort_disjoint_arraycopy");
2430     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2431                                                                                     &entry_jshort_arraycopy,
2432                                                                                     "jshort_arraycopy");
2433     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2434                                                                                     "arrayof_jshort_disjoint_arraycopy");
2435     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2436                                                                                     "arrayof_jshort_arraycopy");
2437 
2438     //*** jint
2439     // Aligned versions
2440     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2441                                                                                 "arrayof_jint_disjoint_arraycopy");
2442     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2443                                                                                 "arrayof_jint_arraycopy");
2444     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2445     // entry_jint_arraycopy always points to the unaligned version
2446     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2447                                                                                 "jint_disjoint_arraycopy");
2448     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2449                                                                                 &entry_jint_arraycopy,
2450                                                                                 "jint_arraycopy");
2451 
2452     //*** jlong
2453     // It is always aligned
2454     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2455                                                                                   "arrayof_jlong_disjoint_arraycopy");
2456     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2457                                                                                   "arrayof_jlong_arraycopy");
2458     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2459     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2460 
2461     //*** oops
2462     {
2463       // With compressed oops we need unaligned versions; notice that
2464       // we overwrite entry_oop_arraycopy.
2465       bool aligned = !UseCompressedOops;
2466 
2467       StubRoutines::_arrayof_oop_disjoint_arraycopy
2468         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2469                                      /*dest_uninitialized*/false);
2470       StubRoutines::_arrayof_oop_arraycopy
2471         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2472                                      /*dest_uninitialized*/false);
2473       // Aligned versions without pre-barriers
2474       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2475         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2476                                      /*dest_uninitialized*/true);
2477       StubRoutines::_arrayof_oop_arraycopy_uninit
2478         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2479                                      /*dest_uninitialized*/true);
2480     }
2481 
2482     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2483     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2484     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2485     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2486 
2487     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2488     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2489                                                                         /*dest_uninitialized*/true);
2490 
2491     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2492                                                               entry_jbyte_arraycopy,
2493                                                               entry_jshort_arraycopy,
2494                                                               entry_jint_arraycopy,
2495                                                               entry_jlong_arraycopy);
2496 
2497     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2498                                                                entry_jbyte_arraycopy,
2499                                                                entry_jshort_arraycopy,
2500                                                                entry_jint_arraycopy,
2501                                                                entry_oop_arraycopy,
2502                                                                entry_jlong_arraycopy,
2503                                                                entry_checkcast_arraycopy);
2504 
2505     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2506     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2507     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2508     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2509     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2510     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2511   }
2512 
2513   void generate_math_stubs() { Unimplemented(); }
2514 
2515   // Arguments:
2516   //
2517   // Inputs:
2518   //   c_rarg0   - source byte array address
2519   //   c_rarg1   - destination byte array address
2520   //   c_rarg2   - K (key) in little endian int array
2521   //
2522   address generate_aescrypt_encryptBlock() {
2523     __ align(CodeEntryAlignment);
2524     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2525 
2526     Label L_doLast;
2527 
2528     const Register from        = c_rarg0;  // source array address
2529     const Register to          = c_rarg1;  // destination array address
2530     const Register key         = c_rarg2;  // key array address
2531     const Register keylen      = rscratch1;
2532 
2533     address start = __ pc();
2534     __ enter();
2535 
2536     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
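    // keylen is the length of the expanded key in 32-bit words:
    // 44, 52 or 60 words for AES-128, AES-192 or AES-256 (10, 12 or 14 rounds).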
2537 
2538     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2539 
2540     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2541     __ rev32(v1, __ T16B, v1);
2542     __ rev32(v2, __ T16B, v2);
2543     __ rev32(v3, __ T16B, v3);
2544     __ rev32(v4, __ T16B, v4);
2545     __ aese(v0, v1);
2546     __ aesmc(v0, v0);
2547     __ aese(v0, v2);
2548     __ aesmc(v0, v0);
2549     __ aese(v0, v3);
2550     __ aesmc(v0, v0);
2551     __ aese(v0, v4);
2552     __ aesmc(v0, v0);
2553 
2554     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2555     __ rev32(v1, __ T16B, v1);
2556     __ rev32(v2, __ T16B, v2);
2557     __ rev32(v3, __ T16B, v3);
2558     __ rev32(v4, __ T16B, v4);
2559     __ aese(v0, v1);
2560     __ aesmc(v0, v0);
2561     __ aese(v0, v2);
2562     __ aesmc(v0, v0);
2563     __ aese(v0, v3);
2564     __ aesmc(v0, v0);
2565     __ aese(v0, v4);
2566     __ aesmc(v0, v0);
2567 
2568     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2569     __ rev32(v1, __ T16B, v1);
2570     __ rev32(v2, __ T16B, v2);
2571 
2572     __ cmpw(keylen, 44);
2573     __ br(Assembler::EQ, L_doLast);
2574 
2575     __ aese(v0, v1);
2576     __ aesmc(v0, v0);
2577     __ aese(v0, v2);
2578     __ aesmc(v0, v0);
2579 
2580     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2581     __ rev32(v1, __ T16B, v1);
2582     __ rev32(v2, __ T16B, v2);
2583 
2584     __ cmpw(keylen, 52);
2585     __ br(Assembler::EQ, L_doLast);
2586 
2587     __ aese(v0, v1);
2588     __ aesmc(v0, v0);
2589     __ aese(v0, v2);
2590     __ aesmc(v0, v0);
2591 
2592     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2593     __ rev32(v1, __ T16B, v1);
2594     __ rev32(v2, __ T16B, v2);
2595 
2596     __ BIND(L_doLast);
2597 
2598     __ aese(v0, v1);
2599     __ aesmc(v0, v0);
2600     __ aese(v0, v2);
2601 
2602     __ ld1(v1, __ T16B, key);
2603     __ rev32(v1, __ T16B, v1);
2604     __ eor(v0, __ T16B, v0, v1);
2605 
2606     __ st1(v0, __ T16B, to);
2607 
2608     __ mov(r0, 0);
2609 
2610     __ leave();
2611     __ ret(lr);
2612 
2613     return start;
2614   }
2615 
2616   // Arguments:
2617   //
2618   // Inputs:
2619   //   c_rarg0   - source byte array address
2620   //   c_rarg1   - destination byte array address
2621   //   c_rarg2   - K (key) in little endian int array
2622   //
2623   address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES cryptographic extension support");
2625     __ align(CodeEntryAlignment);
2626     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2627     Label L_doLast;
2628 
2629     const Register from        = c_rarg0;  // source array address
2630     const Register to          = c_rarg1;  // destination array address
2631     const Register key         = c_rarg2;  // key array address
2632     const Register keylen      = rscratch1;
2633 
2634     address start = __ pc();
2635     __ enter(); // required for proper stackwalking of RuntimeStub frame
2636 
2637     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2638 
2639     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2640 
2641     __ ld1(v5, __ T16B, __ post(key, 16));
2642     __ rev32(v5, __ T16B, v5);
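    // The first 16 bytes of the key schedule are kept aside in v5; they are
    // applied as the final AddRoundKey (the eor with v5 just before the store).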
2643 
2644     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2645     __ rev32(v1, __ T16B, v1);
2646     __ rev32(v2, __ T16B, v2);
2647     __ rev32(v3, __ T16B, v3);
2648     __ rev32(v4, __ T16B, v4);
2649     __ aesd(v0, v1);
2650     __ aesimc(v0, v0);
2651     __ aesd(v0, v2);
2652     __ aesimc(v0, v0);
2653     __ aesd(v0, v3);
2654     __ aesimc(v0, v0);
2655     __ aesd(v0, v4);
2656     __ aesimc(v0, v0);
2657 
2658     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2659     __ rev32(v1, __ T16B, v1);
2660     __ rev32(v2, __ T16B, v2);
2661     __ rev32(v3, __ T16B, v3);
2662     __ rev32(v4, __ T16B, v4);
2663     __ aesd(v0, v1);
2664     __ aesimc(v0, v0);
2665     __ aesd(v0, v2);
2666     __ aesimc(v0, v0);
2667     __ aesd(v0, v3);
2668     __ aesimc(v0, v0);
2669     __ aesd(v0, v4);
2670     __ aesimc(v0, v0);
2671 
2672     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2673     __ rev32(v1, __ T16B, v1);
2674     __ rev32(v2, __ T16B, v2);
2675 
2676     __ cmpw(keylen, 44);
2677     __ br(Assembler::EQ, L_doLast);
2678 
2679     __ aesd(v0, v1);
2680     __ aesimc(v0, v0);
2681     __ aesd(v0, v2);
2682     __ aesimc(v0, v0);
2683 
2684     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2685     __ rev32(v1, __ T16B, v1);
2686     __ rev32(v2, __ T16B, v2);
2687 
2688     __ cmpw(keylen, 52);
2689     __ br(Assembler::EQ, L_doLast);
2690 
2691     __ aesd(v0, v1);
2692     __ aesimc(v0, v0);
2693     __ aesd(v0, v2);
2694     __ aesimc(v0, v0);
2695 
2696     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2697     __ rev32(v1, __ T16B, v1);
2698     __ rev32(v2, __ T16B, v2);
2699 
2700     __ BIND(L_doLast);
2701 
2702     __ aesd(v0, v1);
2703     __ aesimc(v0, v0);
2704     __ aesd(v0, v2);
2705 
2706     __ eor(v0, __ T16B, v0, v5);
2707 
2708     __ st1(v0, __ T16B, to);
2709 
2710     __ mov(r0, 0);
2711 
2712     __ leave();
2713     __ ret(lr);
2714 
2715     return start;
2716   }
2717 
2718   // Arguments:
2719   //
2720   // Inputs:
2721   //   c_rarg0   - source byte array address
2722   //   c_rarg1   - destination byte array address
2723   //   c_rarg2   - K (key) in little endian int array
2724   //   c_rarg3   - r vector byte array address
2725   //   c_rarg4   - input length
2726   //
2727   // Output:
  //   r0        - input length
2729   //
2730   address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES cryptographic extension support");
2732     __ align(CodeEntryAlignment);
2733     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2734 
2735     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2736 
2737     const Register from        = c_rarg0;  // source array address
2738     const Register to          = c_rarg1;  // destination array address
2739     const Register key         = c_rarg2;  // key array address
2740     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2741                                            // and left with the results of the last encryption block
2742     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2743     const Register keylen      = rscratch1;
2744 
2745     address start = __ pc();
2746 
2747       __ enter();
2748 
2749       __ movw(rscratch2, len_reg);
2750 
2751       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2752 
2753       __ ld1(v0, __ T16B, rvec);
2754 
2755       __ cmpw(keylen, 52);
2756       __ br(Assembler::CC, L_loadkeys_44);
2757       __ br(Assembler::EQ, L_loadkeys_52);
2758 
2759       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2760       __ rev32(v17, __ T16B, v17);
2761       __ rev32(v18, __ T16B, v18);
2762     __ BIND(L_loadkeys_52);
2763       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2764       __ rev32(v19, __ T16B, v19);
2765       __ rev32(v20, __ T16B, v20);
2766     __ BIND(L_loadkeys_44);
2767       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2768       __ rev32(v21, __ T16B, v21);
2769       __ rev32(v22, __ T16B, v22);
2770       __ rev32(v23, __ T16B, v23);
2771       __ rev32(v24, __ T16B, v24);
2772       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2773       __ rev32(v25, __ T16B, v25);
2774       __ rev32(v26, __ T16B, v26);
2775       __ rev32(v27, __ T16B, v27);
2776       __ rev32(v28, __ T16B, v28);
2777       __ ld1(v29, v30, v31, __ T16B, key);
2778       __ rev32(v29, __ T16B, v29);
2779       __ rev32(v30, __ T16B, v30);
2780       __ rev32(v31, __ T16B, v31);
2781 
2782     __ BIND(L_aes_loop);
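      // The condition flags set by cmpw(keylen, 52) above are still valid here;
      // nothing in the loop body modifies them, so each iteration branches
      // straight to the round sequence for this key length.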
2783       __ ld1(v1, __ T16B, __ post(from, 16));
2784       __ eor(v0, __ T16B, v0, v1);
2785 
2786       __ br(Assembler::CC, L_rounds_44);
2787       __ br(Assembler::EQ, L_rounds_52);
2788 
2789       __ aese(v0, v17); __ aesmc(v0, v0);
2790       __ aese(v0, v18); __ aesmc(v0, v0);
2791     __ BIND(L_rounds_52);
2792       __ aese(v0, v19); __ aesmc(v0, v0);
2793       __ aese(v0, v20); __ aesmc(v0, v0);
2794     __ BIND(L_rounds_44);
2795       __ aese(v0, v21); __ aesmc(v0, v0);
2796       __ aese(v0, v22); __ aesmc(v0, v0);
2797       __ aese(v0, v23); __ aesmc(v0, v0);
2798       __ aese(v0, v24); __ aesmc(v0, v0);
2799       __ aese(v0, v25); __ aesmc(v0, v0);
2800       __ aese(v0, v26); __ aesmc(v0, v0);
2801       __ aese(v0, v27); __ aesmc(v0, v0);
2802       __ aese(v0, v28); __ aesmc(v0, v0);
2803       __ aese(v0, v29); __ aesmc(v0, v0);
2804       __ aese(v0, v30);
2805       __ eor(v0, __ T16B, v0, v31);
2806 
2807       __ st1(v0, __ T16B, __ post(to, 16));
2808 
2809       __ subw(len_reg, len_reg, 16);
2810       __ cbnzw(len_reg, L_aes_loop);
2811 
2812       __ st1(v0, __ T16B, rvec);
2813 
2814       __ mov(r0, rscratch2);
2815 
2816       __ leave();
2817       __ ret(lr);
2818 
2819       return start;
2820   }
2821 
2822   // Arguments:
2823   //
2824   // Inputs:
2825   //   c_rarg0   - source byte array address
2826   //   c_rarg1   - destination byte array address
2827   //   c_rarg2   - K (key) in little endian int array
2828   //   c_rarg3   - r vector byte array address
2829   //   c_rarg4   - input length
2830   //
2831   // Output:
2832   //   r0        - input length
2833   //
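       // Correspondingly, a minimal sketch of CBC decryption (AESDecrypt() is an assumed
       // one-block helper): each plaintext block is Decrypt(ciphertext) XOR the previous
       // ciphertext, and rvec ends up holding the last input (cipher) block, which is why
       // the loop below saves each input block before decrypting it in place:
       //   void cbc_decrypt(const uint8_t* from, uint8_t* to, uint8_t* rvec, int len) {
       //     for (int off = 0; off < len; off += 16) {
       //       uint8_t saved[16];
       //       memcpy(saved, from + off, 16);            // keep ciphertext for chaining
       //       AESDecrypt(from + off, to + off);
       //       for (int i = 0; i < 16; i++) to[off + i] ^= rvec[i];
       //       memcpy(rvec, saved, 16);
       //     }
       //   }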
2834   address generate_cipherBlockChaining_decryptAESCrypt() {
2835     assert(UseAES, "need AES cryptographic instructions");
2836     __ align(CodeEntryAlignment);
2837     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2838 
2839     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2840 
2841     const Register from        = c_rarg0;  // source array address
2842     const Register to          = c_rarg1;  // destination array address
2843     const Register key         = c_rarg2;  // key array address
2844     const Register rvec        = c_rarg3;  // r byte array, initialized from the init-vector address
2845                                            // and left holding the last input (cipher) block
2846     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2847     const Register keylen      = rscratch1;
2848 
2849     address start = __ pc();
2850 
2851       __ enter();
2852 
2853       __ movw(rscratch2, len_reg);
2854 
2855       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2856 
2857       __ ld1(v2, __ T16B, rvec);
2858 
2859       __ ld1(v31, __ T16B, __ post(key, 16));
2860       __ rev32(v31, __ T16B, v31);
2861 
2862       __ cmpw(keylen, 52);
2863       __ br(Assembler::CC, L_loadkeys_44);
2864       __ br(Assembler::EQ, L_loadkeys_52);
2865 
2866       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2867       __ rev32(v17, __ T16B, v17);
2868       __ rev32(v18, __ T16B, v18);
2869     __ BIND(L_loadkeys_52);
2870       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2871       __ rev32(v19, __ T16B, v19);
2872       __ rev32(v20, __ T16B, v20);
2873     __ BIND(L_loadkeys_44);
2874       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2875       __ rev32(v21, __ T16B, v21);
2876       __ rev32(v22, __ T16B, v22);
2877       __ rev32(v23, __ T16B, v23);
2878       __ rev32(v24, __ T16B, v24);
2879       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2880       __ rev32(v25, __ T16B, v25);
2881       __ rev32(v26, __ T16B, v26);
2882       __ rev32(v27, __ T16B, v27);
2883       __ rev32(v28, __ T16B, v28);
2884       __ ld1(v29, v30, __ T16B, key);
2885       __ rev32(v29, __ T16B, v29);
2886       __ rev32(v30, __ T16B, v30);
2887 
2888     __ BIND(L_aes_loop);
2889       __ ld1(v0, __ T16B, __ post(from, 16));
2890       __ orr(v1, __ T16B, v0, v0);
2891 
2892       __ br(Assembler::CC, L_rounds_44);
2893       __ br(Assembler::EQ, L_rounds_52);
2894 
2895       __ aesd(v0, v17); __ aesimc(v0, v0);
2896       __ aesd(v0, v18); __ aesimc(v0, v0);
2897     __ BIND(L_rounds_52);
2898       __ aesd(v0, v19); __ aesimc(v0, v0);
2899       __ aesd(v0, v20); __ aesimc(v0, v0);
2900     __ BIND(L_rounds_44);
2901       __ aesd(v0, v21); __ aesimc(v0, v0);
2902       __ aesd(v0, v22); __ aesimc(v0, v0);
2903       __ aesd(v0, v23); __ aesimc(v0, v0);
2904       __ aesd(v0, v24); __ aesimc(v0, v0);
2905       __ aesd(v0, v25); __ aesimc(v0, v0);
2906       __ aesd(v0, v26); __ aesimc(v0, v0);
2907       __ aesd(v0, v27); __ aesimc(v0, v0);
2908       __ aesd(v0, v28); __ aesimc(v0, v0);
2909       __ aesd(v0, v29); __ aesimc(v0, v0);
2910       __ aesd(v0, v30);
2911       __ eor(v0, __ T16B, v0, v31);
2912       __ eor(v0, __ T16B, v0, v2);
2913 
2914       __ st1(v0, __ T16B, __ post(to, 16));
2915       __ orr(v2, __ T16B, v1, v1);
2916 
2917       __ subw(len_reg, len_reg, 16);
2918       __ cbnzw(len_reg, L_aes_loop);
2919 
2920       __ st1(v2, __ T16B, rvec);
2921 
2922       __ mov(r0, rscratch2);
2923 
2924       __ leave();
2925       __ ret(lr);
2926 
2927     return start;
2928   }
2929 
2930   // Arguments:
2931   //
2932   // Inputs:
2933   //   c_rarg0   - byte[]  source+offset
2934   //   c_rarg1   - int[]   SHA.state
2935   //   c_rarg2   - int     offset
2936   //   c_rarg3   - int     limit
2937   //
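       // The multi_block variant keeps compressing 64-byte blocks while ofs <= limit and
       // returns the updated offset; the single-block variant compresses exactly one
       // block. A rough sketch of that driver contract (inferred from the code below,
       // not a specification):
       //   // state[5] holds the SHA-1 chaining values; buf points at source + offset.
       //   int sha1_impl_compress_mb(const uint8_t* buf, uint32_t state[5], int ofs, int limit) {
       //     do {
       //       sha1_compress_block(state, buf);          // assumed one-block helper
       //       buf += 64;
       //       ofs += 64;
       //     } while (ofs <= limit);
       //     return ofs;
       //   }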
2938   address generate_sha1_implCompress(bool multi_block, const char *name) {
2939     __ align(CodeEntryAlignment);
2940     StubCodeMark mark(this, "StubRoutines", name);
2941     address start = __ pc();
2942 
2943     Register buf   = c_rarg0;
2944     Register state = c_rarg1;
2945     Register ofs   = c_rarg2;
2946     Register limit = c_rarg3;
2947 
2948     Label keys;
2949     Label sha1_loop;
2950 
2951     // load the keys into v0..v3
2952     __ adr(rscratch1, keys);
2953     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2954     // load 5 words state into v6, v7
2955     __ ldrq(v6, Address(state, 0));
2956     __ ldrs(v7, Address(state, 16));
2957 
2958 
2959     __ BIND(sha1_loop);
2960     // load 64 bytes of data into v16..v19
2961     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2962     __ rev32(v16, __ T16B, v16);
2963     __ rev32(v17, __ T16B, v17);
2964     __ rev32(v18, __ T16B, v18);
2965     __ rev32(v19, __ T16B, v19);
2966 
2967     // do the sha1
2968     __ addv(v4, __ T4S, v16, v0);
2969     __ orr(v20, __ T16B, v6, v6);
2970 
2971     FloatRegister d0 = v16;
2972     FloatRegister d1 = v17;
2973     FloatRegister d2 = v18;
2974     FloatRegister d3 = v19;
2975 
2976     for (int round = 0; round < 20; round++) {
2977       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2978       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2979       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2980       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2981       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2982 
2983       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2984       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2985       __ sha1h(tmp2, __ T4S, v20);
2986       if (round < 5)
2987         __ sha1c(v20, __ T4S, tmp3, tmp4);
2988       else if (round < 10 || round >= 15)
2989         __ sha1p(v20, __ T4S, tmp3, tmp4);
2990       else
2991         __ sha1m(v20, __ T4S, tmp3, tmp4);
2992       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2993 
2994       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2995     }
2996 
2997     __ addv(v7, __ T2S, v7, v21);
2998     __ addv(v6, __ T4S, v6, v20);
2999 
3000     if (multi_block) {
3001       __ add(ofs, ofs, 64);
3002       __ cmp(ofs, limit);
3003       __ br(Assembler::LE, sha1_loop);
3004       __ mov(c_rarg0, ofs); // return ofs
3005     }
3006 
3007     __ strq(v6, Address(state, 0));
3008     __ strs(v7, Address(state, 16));
3009 
3010     __ ret(lr);
3011 
3012     __ bind(keys);
3013     __ emit_int32(0x5a827999);
3014     __ emit_int32(0x6ed9eba1);
3015     __ emit_int32(0x8f1bbcdc);
3016     __ emit_int32(0xca62c1d6);
3017 
3018     return start;
3019   }
3020 
3021 
3022   // Arguments:
3023   //
3024   // Inputs:
3025   //   c_rarg0   - byte[]  source+offset
3026   //   c_rarg1   - int[]   SHA.state
3027   //   c_rarg2   - int     offset
3028   //   c_rarg3   - int     limit
3029   //
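       // The buf/ofs/limit contract is the same as for the SHA-1 stub above. The table
       // below holds the 64 standard SHA-256 round constants K[0..63] (the first 32 bits
       // of the fractional parts of the cube roots of the first 64 primes).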
3030   address generate_sha256_implCompress(bool multi_block, const char *name) {
3031     static const uint32_t round_consts[64] = {
3032       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3033       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3034       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3035       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3036       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3037       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3038       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3039       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3040       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3041       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3042       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3043       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3044       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3045       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3046       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3047       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3048     };
3049     __ align(CodeEntryAlignment);
3050     StubCodeMark mark(this, "StubRoutines", name);
3051     address start = __ pc();
3052 
3053     Register buf   = c_rarg0;
3054     Register state = c_rarg1;
3055     Register ofs   = c_rarg2;
3056     Register limit = c_rarg3;
3057 
3058     Label sha1_loop;
3059 
3060     __ stpd(v8, v9, __ pre(sp, -32));
3061     __ stpd(v10, v11, Address(sp, 16));
3062 
3063 // dga == v0
3064 // dgb == v1
3065 // dg0 == v2
3066 // dg1 == v3
3067 // dg2 == v4
3068 // t0 == v6
3069 // t1 == v7
3070 
3071     // load 16 keys to v16..v31
3072     __ lea(rscratch1, ExternalAddress((address)round_consts));
3073     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3074     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3075     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3076     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3077 
3078     // load 8 words (256 bits) state
3079     __ ldpq(v0, v1, state);
3080 
3081     __ BIND(sha1_loop);
3082     // load 64 bytes of data into v8..v11
3083     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3084     __ rev32(v8, __ T16B, v8);
3085     __ rev32(v9, __ T16B, v9);
3086     __ rev32(v10, __ T16B, v10);
3087     __ rev32(v11, __ T16B, v11);
3088 
3089     __ addv(v6, __ T4S, v8, v16);
3090     __ orr(v2, __ T16B, v0, v0);
3091     __ orr(v3, __ T16B, v1, v1);
3092 
3093     FloatRegister d0 = v8;
3094     FloatRegister d1 = v9;
3095     FloatRegister d2 = v10;
3096     FloatRegister d3 = v11;
3097 
3098 
3099     for (int round = 0; round < 16; round++) {
3100       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3101       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3102       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3103       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3104 
3105       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3106        __ orr(v4, __ T16B, v2, v2);
3107       if (round < 15)
3108         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3109       __ sha256h(v2, __ T4S, v3, tmp2);
3110       __ sha256h2(v3, __ T4S, v4, tmp2);
3111       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3112 
3113       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3114     }
3115 
3116     __ addv(v0, __ T4S, v0, v2);
3117     __ addv(v1, __ T4S, v1, v3);
3118 
3119     if (multi_block) {
3120       __ add(ofs, ofs, 64);
3121       __ cmp(ofs, limit);
3122       __ br(Assembler::LE, sha1_loop);
3123       __ mov(c_rarg0, ofs); // return ofs
3124     }
3125 
3126     __ ldpd(v10, v11, Address(sp, 16));
3127     __ ldpd(v8, v9, __ post(sp, 32));
3128 
3129     __ stpq(v0, v1, state);
3130 
3131     __ ret(lr);
3132 
3133     return start;
3134   }
3135 
3136 #ifndef BUILTIN_SIM
3137   // Safefetch stubs.
3138   void generate_safefetch(const char* name, int size, address* entry,
3139                           address* fault_pc, address* continuation_pc) {
3140     // safefetch signatures:
3141     //   int      SafeFetch32(int*      adr, int      errValue);
3142     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3143     //
3144     // arguments:
3145     //   c_rarg0 = adr
3146     //   c_rarg1 = errValue
3147     //
3148     // result:
3149     //   r0       = *adr or errValue
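         // How this is used (sketch): the load at *fault_pc may legitimately fault; the
         // VM's signal handler recognizes SafeFetch faults and resumes execution at
         // *continuation_pc with c_rarg1 still holding errValue, so the caller gets
         // errValue back instead of crashing.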
3150 
3151     StubCodeMark mark(this, "StubRoutines", name);
3152 
3153     // Entry point, pc or function descriptor.
3154     *entry = __ pc();
3155 
3156     // Load *adr into c_rarg1, may fault.
3157     *fault_pc = __ pc();
3158     switch (size) {
3159       case 4:
3160         // int32_t
3161         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3162         break;
3163       case 8:
3164         // int64_t
3165         __ ldr(c_rarg1, Address(c_rarg0, 0));
3166         break;
3167       default:
3168         ShouldNotReachHere();
3169     }
3170 
3171     // return errValue or *adr
3172     *continuation_pc = __ pc();
3173     __ mov(r0, c_rarg1);
3174     __ ret(lr);
3175   }
3176 #endif
3177 
3178   /**
3179    *  Arguments:
3180    *
3181    * Inputs:
3182    *   c_rarg0   - int crc
3183    *   c_rarg1   - byte* buf
3184    *   c_rarg2   - int length
3185    *
3186    * Output:
3187    *       r0    - int crc result
3188    */
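       // For reference, the bytewise (reflected, zlib-style) CRC-32 that kernel_crc32
       // accelerates looks like this (illustrative sketch only; the stub uses tables
       // and/or the CRC32 instructions instead):
       //   uint32_t crc32_update(uint32_t crc, const uint8_t* buf, int len) {
       //     crc = ~crc;
       //     for (int i = 0; i < len; i++) {
       //       crc ^= buf[i];
       //       for (int k = 0; k < 8; k++)
       //         crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1)));  // reflected poly
       //     }
       //     return ~crc;
       //   }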
3189   address generate_updateBytesCRC32() {
3190     assert(UseCRC32Intrinsics, "what are we doing here?");
3191 
3192     __ align(CodeEntryAlignment);
3193     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3194 
3195     address start = __ pc();
3196 
3197     const Register crc   = c_rarg0;  // crc
3198     const Register buf   = c_rarg1;  // source java byte array address
3199     const Register len   = c_rarg2;  // length
3200     const Register table0 = c_rarg3; // crc_table address
3201     const Register table1 = c_rarg4;
3202     const Register table2 = c_rarg5;
3203     const Register table3 = c_rarg6;
3204     const Register tmp3 = c_rarg7;
3205 
3206     BLOCK_COMMENT("Entry:");
3207     __ enter(); // required for proper stackwalking of RuntimeStub frame
3208 
3209     __ kernel_crc32(crc, buf, len,
3210               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3211 
3212     __ leave(); // required for proper stackwalking of RuntimeStub frame
3213     __ ret(lr);
3214 
3215     return start;
3216   }
3217 
3218   /**
3219    *  Arguments:
3220    *
3221    * Inputs:
3222    *   c_rarg0   - int crc
3223    *   c_rarg1   - byte* buf
3224    *   c_rarg2   - int length
3225    *   c_rarg3   - int* table
3226    *
3227    * Output:
3228    *       r0   - int crc result
3229    */
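       // CRC-32C differs from the CRC-32 sketch above only in the polynomial: the
       // reflected Castagnoli constant 0x82F63B78 takes the place of 0xEDB88320;
       // kernel_crc32c and the Java wrapper decide where the inversions happen.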
3230   address generate_updateBytesCRC32C() {
3231     assert(UseCRC32CIntrinsics, "what are we doing here?");
3232 
3233     __ align(CodeEntryAlignment);
3234     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3235 
3236     address start = __ pc();
3237 
3238     const Register crc   = c_rarg0;  // crc
3239     const Register buf   = c_rarg1;  // source java byte array address
3240     const Register len   = c_rarg2;  // length
3241     const Register table0 = c_rarg3; // crc_table address
3242     const Register table1 = c_rarg4;
3243     const Register table2 = c_rarg5;
3244     const Register table3 = c_rarg6;
3245     const Register tmp3 = c_rarg7;
3246 
3247     BLOCK_COMMENT("Entry:");
3248     __ enter(); // required for proper stackwalking of RuntimeStub frame
3249 
3250     __ kernel_crc32c(crc, buf, len,
3251               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3252 
3253     __ leave(); // required for proper stackwalking of RuntimeStub frame
3254     __ ret(lr);
3255 
3256     return start;
3257   }
3258 
3259   /***
3260    *  Arguments:
3261    *
3262    *  Inputs:
3263    *   c_rarg0   - int   adler
3264    *   c_rarg1   - byte* buff
3265    *   c_rarg2   - int   len
3266    *
3267    * Output:
3268    *   c_rarg0   - int adler result
3269    */
3270   address generate_updateBytesAdler32() {
3271     __ align(CodeEntryAlignment);
3272     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3273     address start = __ pc();
3274 
3275     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3276 
3277     // Aliases
3278     Register adler  = c_rarg0;
3279     Register s1     = c_rarg0;
3280     Register s2     = c_rarg3;
3281     Register buff   = c_rarg1;
3282     Register len    = c_rarg2;
3283     Register nmax  = r4;
3284     Register base = r5;
3285     Register count = r6;
3286     Register temp0 = rscratch1;
3287     Register temp1 = rscratch2;
3288     Register temp2 = r7;
3289 
3290     // Max number of bytes we can process before having to take the mod
3291     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3292     unsigned long BASE = 0xfff1;
3293     unsigned long NMAX = 0x15B0;
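         // Scalar reference and the reduction trick used below: since
         // 2^16 mod 65521 == 15, "s % BASE" can be computed as
         // (s >> 16) * 15 + (s & 0xffff) (applied once or twice, then one conditional
         // subtract), which is exactly what the lsr/lsl/sub/add sequences below do.
         // Illustrative sketch of the whole update:
         //   uint32_t adler32_update(uint32_t adler, const uint8_t* buff, int len) {
         //     uint32_t s1 = adler & 0xffff, s2 = adler >> 16;
         //     while (len > 0) {
         //       int n = len < 5552 ? len : 5552;        // NMAX: no 32-bit overflow per chunk
         //       len -= n;
         //       while (n-- > 0) { s1 += *buff++; s2 += s1; }
         //       s1 %= 65521; s2 %= 65521;               // BASE
         //     }
         //     return (s2 << 16) | s1;
         //   }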
3294 
3295     __ mov(base, BASE);
3296     __ mov(nmax, NMAX);
3297 
3298     // s1 is initialized to the lower 16 bits of adler
3299     // s2 is initialized to the upper 16 bits of adler
3300     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3301     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3302 
3303     // The pipelined loop needs at least 16 elements for one iteration;
3304     // it checks this itself, but it is cheaper to branch straight to the cleanup loop here.
3305     __ cmp(len, 16);
3306     __ br(Assembler::HS, L_nmax);
3307     __ cbz(len, L_combine);
3308 
3309     __ bind(L_simple_by1_loop);
3310     __ ldrb(temp0, Address(__ post(buff, 1)));
3311     __ add(s1, s1, temp0);
3312     __ add(s2, s2, s1);
3313     __ subs(len, len, 1);
3314     __ br(Assembler::HI, L_simple_by1_loop);
3315 
3316     // s1 = s1 % BASE
3317     __ subs(temp0, s1, base);
3318     __ csel(s1, temp0, s1, Assembler::HS);
3319 
3320     // s2 = s2 % BASE
3321     __ lsr(temp0, s2, 16);
3322     __ lsl(temp1, temp0, 4);
3323     __ sub(temp1, temp1, temp0);
3324     __ add(s2, temp1, s2, ext::uxth);
3325 
3326     __ subs(temp0, s2, base);
3327     __ csel(s2, temp0, s2, Assembler::HS);
3328 
3329     __ b(L_combine);
3330 
3331     __ bind(L_nmax);
3332     __ subs(len, len, nmax);
3333     __ sub(count, nmax, 16);
3334     __ br(Assembler::LO, L_by16);
3335 
3336     __ bind(L_nmax_loop);
3337 
3338     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3339 
3340     __ add(s1, s1, temp0, ext::uxtb);
3341     __ ubfx(temp2, temp0, 8, 8);
3342     __ add(s2, s2, s1);
3343     __ add(s1, s1, temp2);
3344     __ ubfx(temp2, temp0, 16, 8);
3345     __ add(s2, s2, s1);
3346     __ add(s1, s1, temp2);
3347     __ ubfx(temp2, temp0, 24, 8);
3348     __ add(s2, s2, s1);
3349     __ add(s1, s1, temp2);
3350     __ ubfx(temp2, temp0, 32, 8);
3351     __ add(s2, s2, s1);
3352     __ add(s1, s1, temp2);
3353     __ ubfx(temp2, temp0, 40, 8);
3354     __ add(s2, s2, s1);
3355     __ add(s1, s1, temp2);
3356     __ ubfx(temp2, temp0, 48, 8);
3357     __ add(s2, s2, s1);
3358     __ add(s1, s1, temp2);
3359     __ add(s2, s2, s1);
3360     __ add(s1, s1, temp0, Assembler::LSR, 56);
3361     __ add(s2, s2, s1);
3362 
3363     __ add(s1, s1, temp1, ext::uxtb);
3364     __ ubfx(temp2, temp1, 8, 8);
3365     __ add(s2, s2, s1);
3366     __ add(s1, s1, temp2);
3367     __ ubfx(temp2, temp1, 16, 8);
3368     __ add(s2, s2, s1);
3369     __ add(s1, s1, temp2);
3370     __ ubfx(temp2, temp1, 24, 8);
3371     __ add(s2, s2, s1);
3372     __ add(s1, s1, temp2);
3373     __ ubfx(temp2, temp1, 32, 8);
3374     __ add(s2, s2, s1);
3375     __ add(s1, s1, temp2);
3376     __ ubfx(temp2, temp1, 40, 8);
3377     __ add(s2, s2, s1);
3378     __ add(s1, s1, temp2);
3379     __ ubfx(temp2, temp1, 48, 8);
3380     __ add(s2, s2, s1);
3381     __ add(s1, s1, temp2);
3382     __ add(s2, s2, s1);
3383     __ add(s1, s1, temp1, Assembler::LSR, 56);
3384     __ add(s2, s2, s1);
3385 
3386     __ subs(count, count, 16);
3387     __ br(Assembler::HS, L_nmax_loop);
3388 
3389     // s1 = s1 % BASE
3390     __ lsr(temp0, s1, 16);
3391     __ lsl(temp1, temp0, 4);
3392     __ sub(temp1, temp1, temp0);
3393     __ add(temp1, temp1, s1, ext::uxth);
3394 
3395     __ lsr(temp0, temp1, 16);
3396     __ lsl(s1, temp0, 4);
3397     __ sub(s1, s1, temp0);
3398     __ add(s1, s1, temp1, ext:: uxth);
3399 
3400     __ subs(temp0, s1, base);
3401     __ csel(s1, temp0, s1, Assembler::HS);
3402 
3403     // s2 = s2 % BASE
3404     __ lsr(temp0, s2, 16);
3405     __ lsl(temp1, temp0, 4);
3406     __ sub(temp1, temp1, temp0);
3407     __ add(temp1, temp1, s2, ext::uxth);
3408 
3409     __ lsr(temp0, temp1, 16);
3410     __ lsl(s2, temp0, 4);
3411     __ sub(s2, s2, temp0);
3412     __ add(s2, s2, temp1, ext:: uxth);
3413 
3414     __ subs(temp0, s2, base);
3415     __ csel(s2, temp0, s2, Assembler::HS);
3416 
3417     __ subs(len, len, nmax);
3418     __ sub(count, nmax, 16);
3419     __ br(Assembler::HS, L_nmax_loop);
3420 
3421     __ bind(L_by16);
3422     __ adds(len, len, count);
3423     __ br(Assembler::LO, L_by1);
3424 
3425     __ bind(L_by16_loop);
3426 
3427     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3428 
3429     __ add(s1, s1, temp0, ext::uxtb);
3430     __ ubfx(temp2, temp0, 8, 8);
3431     __ add(s2, s2, s1);
3432     __ add(s1, s1, temp2);
3433     __ ubfx(temp2, temp0, 16, 8);
3434     __ add(s2, s2, s1);
3435     __ add(s1, s1, temp2);
3436     __ ubfx(temp2, temp0, 24, 8);
3437     __ add(s2, s2, s1);
3438     __ add(s1, s1, temp2);
3439     __ ubfx(temp2, temp0, 32, 8);
3440     __ add(s2, s2, s1);
3441     __ add(s1, s1, temp2);
3442     __ ubfx(temp2, temp0, 40, 8);
3443     __ add(s2, s2, s1);
3444     __ add(s1, s1, temp2);
3445     __ ubfx(temp2, temp0, 48, 8);
3446     __ add(s2, s2, s1);
3447     __ add(s1, s1, temp2);
3448     __ add(s2, s2, s1);
3449     __ add(s1, s1, temp0, Assembler::LSR, 56);
3450     __ add(s2, s2, s1);
3451 
3452     __ add(s1, s1, temp1, ext::uxtb);
3453     __ ubfx(temp2, temp1, 8, 8);
3454     __ add(s2, s2, s1);
3455     __ add(s1, s1, temp2);
3456     __ ubfx(temp2, temp1, 16, 8);
3457     __ add(s2, s2, s1);
3458     __ add(s1, s1, temp2);
3459     __ ubfx(temp2, temp1, 24, 8);
3460     __ add(s2, s2, s1);
3461     __ add(s1, s1, temp2);
3462     __ ubfx(temp2, temp1, 32, 8);
3463     __ add(s2, s2, s1);
3464     __ add(s1, s1, temp2);
3465     __ ubfx(temp2, temp1, 40, 8);
3466     __ add(s2, s2, s1);
3467     __ add(s1, s1, temp2);
3468     __ ubfx(temp2, temp1, 48, 8);
3469     __ add(s2, s2, s1);
3470     __ add(s1, s1, temp2);
3471     __ add(s2, s2, s1);
3472     __ add(s1, s1, temp1, Assembler::LSR, 56);
3473     __ add(s2, s2, s1);
3474 
3475     __ subs(len, len, 16);
3476     __ br(Assembler::HS, L_by16_loop);
3477 
3478     __ bind(L_by1);
3479     __ adds(len, len, 15);
3480     __ br(Assembler::LO, L_do_mod);
3481 
3482     __ bind(L_by1_loop);
3483     __ ldrb(temp0, Address(__ post(buff, 1)));
3484     __ add(s1, temp0, s1);
3485     __ add(s2, s2, s1);
3486     __ subs(len, len, 1);
3487     __ br(Assembler::HS, L_by1_loop);
3488 
3489     __ bind(L_do_mod);
3490     // s1 = s1 % BASE
3491     __ lsr(temp0, s1, 16);
3492     __ lsl(temp1, temp0, 4);
3493     __ sub(temp1, temp1, temp0);
3494     __ add(temp1, temp1, s1, ext::uxth);
3495 
3496     __ lsr(temp0, temp1, 16);
3497     __ lsl(s1, temp0, 4);
3498     __ sub(s1, s1, temp0);
3499     __ add(s1, s1, temp1, ext:: uxth);
3500 
3501     __ subs(temp0, s1, base);
3502     __ csel(s1, temp0, s1, Assembler::HS);
3503 
3504     // s2 = s2 % BASE
3505     __ lsr(temp0, s2, 16);
3506     __ lsl(temp1, temp0, 4);
3507     __ sub(temp1, temp1, temp0);
3508     __ add(temp1, temp1, s2, ext::uxth);
3509 
3510     __ lsr(temp0, temp1, 16);
3511     __ lsl(s2, temp0, 4);
3512     __ sub(s2, s2, temp0);
3513     __ add(s2, s2, temp1, ext:: uxth);
3514 
3515     __ subs(temp0, s2, base);
3516     __ csel(s2, temp0, s2, Assembler::HS);
3517 
3518     // Combine lower bits and higher bits
3519     __ bind(L_combine);
3520     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3521 
3522     __ ret(lr);
3523 
3524     return start;
3525   }
3526 
3527   /**
3528    *  Arguments:
3529    *
3530    *  Input:
3531    *    c_rarg0   - x address
3532    *    c_rarg1   - x length
3533    *    c_rarg2   - y address
3534    *    c_rarg3   - y length
3535    *    c_rarg4   - z address
3536    *    c_rarg5   - z length
3537    */
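       // In outline, multiply_to_len computes the schoolbook product of two multi-word
       // magnitudes into z (zlen == xlen + ylen). The Java arrays are big-endian int[];
       // the sketch below uses little-endian 32-bit limbs purely for readability and is
       // not the exact intrinsic contract:
       //   void multiply_to_len(const uint32_t* x, int xlen,
       //                        const uint32_t* y, int ylen, uint32_t* z) {
       //     for (int i = 0; i < xlen + ylen; i++) z[i] = 0;
       //     for (int i = 0; i < xlen; i++) {
       //       uint64_t carry = 0;
       //       for (int j = 0; j < ylen; j++) {
       //         uint64_t t = (uint64_t)x[i] * y[j] + z[i + j] + carry;
       //         z[i + j] = (uint32_t)t;
       //         carry = t >> 32;
       //       }
       //       z[i + ylen] = (uint32_t)carry;
       //     }
       //   }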
3538   address generate_multiplyToLen() {
3539     __ align(CodeEntryAlignment);
3540     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3541 
3542     address start = __ pc();
3543     const Register x     = r0;
3544     const Register xlen  = r1;
3545     const Register y     = r2;
3546     const Register ylen  = r3;
3547     const Register z     = r4;
3548     const Register zlen  = r5;
3549 
3550     const Register tmp1  = r10;
3551     const Register tmp2  = r11;
3552     const Register tmp3  = r12;
3553     const Register tmp4  = r13;
3554     const Register tmp5  = r14;
3555     const Register tmp6  = r15;
3556     const Register tmp7  = r16;
3557 
3558     BLOCK_COMMENT("Entry:");
3559     __ enter(); // required for proper stackwalking of RuntimeStub frame
3560     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3561     __ leave(); // required for proper stackwalking of RuntimeStub frame
3562     __ ret(lr);
3563 
3564     return start;
3565   }
3566 
3567   address generate_squareToLen() {
3568     // The Java squareToLen algorithm for sizes 1..127 is faster than
3569     // multiply_to_len on some CPUs and slower on others, but
3570     // multiply_to_len gives slightly better results overall.
3571     __ align(CodeEntryAlignment);
3572     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3573     address start = __ pc();
3574 
3575     const Register x     = r0;
3576     const Register xlen  = r1;
3577     const Register z     = r2;
3578     const Register zlen  = r3;
3579     const Register y     = r4; // == x
3580     const Register ylen  = r5; // == xlen
3581 
3582     const Register tmp1  = r10;
3583     const Register tmp2  = r11;
3584     const Register tmp3  = r12;
3585     const Register tmp4  = r13;
3586     const Register tmp5  = r14;
3587     const Register tmp6  = r15;
3588     const Register tmp7  = r16;
3589 
3590     RegSet spilled_regs = RegSet::of(y, ylen);
3591     BLOCK_COMMENT("Entry:");
3592     __ enter();
3593     __ push(spilled_regs, sp);
3594     __ mov(y, x);
3595     __ mov(ylen, xlen);
3596     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3597     __ pop(spilled_regs, sp);
3598     __ leave();
3599     __ ret(lr);
3600     return start;
3601   }
3602 
3603   address generate_mulAdd() {
3604     __ align(CodeEntryAlignment);
3605     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3606 
3607     address start = __ pc();
3608 
3609     const Register out     = r0;
3610     const Register in      = r1;
3611     const Register offset  = r2;
3612     const Register len     = r3;
3613     const Register k       = r4;
3614 
3615     BLOCK_COMMENT("Entry:");
3616     __ enter();
3617     __ mul_add(out, in, offset, len, k);
3618     __ leave();
3619     __ ret(lr);
3620 
3621     return start;
3622   }
3623 
3624   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3625                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3626                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3627     // Karatsuba multiplication performs a 128*128 -> 256-bit
3628     // multiplication in three 128-bit multiplications and a few
3629     // additions.
3630     //
3631     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3632     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3633     //
3634     // Inputs:
3635     //
3636     // A0 in a.d[0]     (subkey)
3637     // A1 in a.d[1]
3638     // (A1+A0) in a1_xor_a0.d[0]
3639     //
3640     // B0 in b.d[0]     (state)
3641     // B1 in b.d[1]
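         // pmull/pmull2 perform a carry-less (GF(2)[x]) 64x64 -> 128-bit multiply; a
         // scalar model of one such multiply (illustrative sketch only):
         //   void clmul64(uint64_t a, uint64_t b, uint64_t* lo, uint64_t* hi) {
         //     uint64_t l = 0, h = 0;
         //     for (int i = 0; i < 64; i++) {
         //       if ((b >> i) & 1) {
         //         l ^= a << i;
         //         if (i) h ^= a >> (64 - i);            // bits shifted out of the low half
         //       }
         //     }
         //     *lo = l; *hi = h;
         //   }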
3642 
3643     __ ext(tmp1, __ T16B, b, b, 0x08);
3644     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3645     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3646     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3647     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3648 
3649     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3650     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3651     __ eor(tmp2, __ T16B, tmp2, tmp4);
3652     __ eor(tmp2, __ T16B, tmp2, tmp3);
3653 
3654     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3655     __ ins(result_hi, __ D, tmp2, 0, 1);
3656     __ ins(result_lo, __ D, tmp2, 1, 0);
3657   }
3658 
3659   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3660                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3661     const FloatRegister t0 = result;
3662 
3663     // The GCM field polynomial f is z^128 + p(z), where p =
3664     // z^7+z^2+z+1.
3665     //
3666     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3667     //
3668     // so, given that the product we're reducing is
3669     //    a == lo + hi * z^128
3670     // substituting,
3671     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3672     //
3673     // we reduce by multiplying hi by p(z) and subtracting the result
3674     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3675     // bits we can do this with two 64-bit multiplications, lo*p and
3676     // hi*p.
3677 
3678     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3679     __ ext(t1, __ T16B, t0, z, 8);
3680     __ eor(hi, __ T16B, hi, t1);
3681     __ ext(t1, __ T16B, z, t0, 8);
3682     __ eor(lo, __ T16B, lo, t1);
3683     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3684     __ eor(result, __ T16B, lo, t0);
3685   }
3686 
3687   address generate_has_negatives(address &has_negatives_long) {
3688     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3689     const int large_loop_size = 64;
3690     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3691     int dcache_line = VM_Version::dcache_line_size();
3692 
3693     Register ary1 = r1, len = r2, result = r0;
3694 
3695     __ align(CodeEntryAlignment);
3696     address entry = __ pc();
3697 
3698     __ enter();
3699 
3700   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3701         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3702 
3703   __ cmp(len, 15);
3704   __ br(Assembler::GT, LEN_OVER_15);
3705   // Execution falls into this code only when the pointer is near the end of a
3706   // memory page and we have to avoid reading from the next page.
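       // Instead, load the 8 bytes that end at ary1 + len and shift out the bytes that
       // precede the range. With a little-endian load this is roughly
       //   value = *(uint64_t*)(ary1 + len - 8) >> (8 * (8 - len));  // for 1 <= len <= 8
       //   found_negative = (value & 0x8080808080808080) != 0;
       // (the 9..15 byte case below does the analogous thing with a 16-byte ldp).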
3707   __ add(ary1, ary1, len);
3708   __ subs(len, len, 8);
3709   __ br(Assembler::GT, LEN_OVER_8);
3710   __ ldr(rscratch2, Address(ary1, -8));
3711   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3712   __ lsrv(rscratch2, rscratch2, rscratch1);
3713   __ tst(rscratch2, UPPER_BIT_MASK);
3714   __ cset(result, Assembler::NE);
3715   __ leave();
3716   __ ret(lr);
3717   __ bind(LEN_OVER_8);
3718   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3719   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3720   __ tst(rscratch2, UPPER_BIT_MASK);
3721   __ br(Assembler::NE, RET_TRUE_NO_POP);
3722   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3723   __ lsrv(rscratch1, rscratch1, rscratch2);
3724   __ tst(rscratch1, UPPER_BIT_MASK);
3725   __ cset(result, Assembler::NE);
3726   __ leave();
3727   __ ret(lr);
3728 
3729   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3730   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3731 
3732   has_negatives_long = __ pc(); // 2nd entry point
3733 
3734   __ enter();
3735 
3736   __ bind(LEN_OVER_15);
3737     __ push(spilled_regs, sp);
3738     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3739     __ cbz(rscratch2, ALIGNED);
3740     __ ldp(tmp6, tmp1, Address(ary1));
3741     __ mov(tmp5, 16);
3742     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3743     __ add(ary1, ary1, rscratch1);
3744     __ sub(len, len, rscratch1);
3745     __ orr(tmp6, tmp6, tmp1);
3746     __ tst(tmp6, UPPER_BIT_MASK);
3747     __ br(Assembler::NE, RET_TRUE);
3748 
3749   __ bind(ALIGNED);
3750     __ cmp(len, large_loop_size);
3751     __ br(Assembler::LT, CHECK_16);
3752     // Perform a 16-byte load in the pre-loop as an early return, to handle the
3753     // case where an already-aligned large array has negative values in its first
3754     // bytes; otherwise LARGE_LOOP would do up to 4 reads instead of 1 in the worst
3755     // case, which is slower. Cases with negative bytes further ahead are hardly
3756     // affected; in fact they are faster due to the early loads and the fewer
3757     // instructions and branches in LARGE_LOOP.
3758     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3759     __ sub(len, len, 16);
3760     __ orr(tmp6, tmp6, tmp1);
3761     __ tst(tmp6, UPPER_BIT_MASK);
3762     __ br(Assembler::NE, RET_TRUE);
3763     __ cmp(len, large_loop_size);
3764     __ br(Assembler::LT, CHECK_16);
3765 
3766     if (SoftwarePrefetchHintDistance >= 0
3767         && SoftwarePrefetchHintDistance >= dcache_line) {
3768       // initial prefetch
3769       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3770     }
3771   __ bind(LARGE_LOOP);
3772     if (SoftwarePrefetchHintDistance >= 0) {
3773       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3774     }
3775     // Issue the load instructions first, since that can save a few CPU/memory
3776     // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one
3777     // per ldp), it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...),
3778     // which uses fewer instructions and branches. This approach disables the
3779     // early return, though, so all 64 bytes are loaded and checked every time.
3780     __ ldp(tmp2, tmp3, Address(ary1));
3781     __ ldp(tmp4, tmp5, Address(ary1, 16));
3782     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3783     __ ldp(tmp6, tmp1, Address(ary1, 48));
3784     __ add(ary1, ary1, large_loop_size);
3785     __ sub(len, len, large_loop_size);
3786     __ orr(tmp2, tmp2, tmp3);
3787     __ orr(tmp4, tmp4, tmp5);
3788     __ orr(rscratch1, rscratch1, rscratch2);
3789     __ orr(tmp6, tmp6, tmp1);
3790     __ orr(tmp2, tmp2, tmp4);
3791     __ orr(rscratch1, rscratch1, tmp6);
3792     __ orr(tmp2, tmp2, rscratch1);
3793     __ tst(tmp2, UPPER_BIT_MASK);
3794     __ br(Assembler::NE, RET_TRUE);
3795     __ cmp(len, large_loop_size);
3796     __ br(Assembler::GE, LARGE_LOOP);
3797 
3798   __ bind(CHECK_16); // small 16-byte load pre-loop
3799     __ cmp(len, 16);
3800     __ br(Assembler::LT, POST_LOOP16);
3801 
3802   __ bind(LOOP16); // small 16-byte load loop
3803     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3804     __ sub(len, len, 16);
3805     __ orr(tmp2, tmp2, tmp3);
3806     __ tst(tmp2, UPPER_BIT_MASK);
3807     __ br(Assembler::NE, RET_TRUE);
3808     __ cmp(len, 16);
3809     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3810 
3811   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3812     __ cmp(len, 8);
3813     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3814     __ ldr(tmp3, Address(__ post(ary1, 8)));
3815     __ sub(len, len, 8);
3816     __ tst(tmp3, UPPER_BIT_MASK);
3817     __ br(Assembler::NE, RET_TRUE);
3818 
3819   __ bind(POST_LOOP16_LOAD_TAIL);
3820     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3821     __ ldr(tmp1, Address(ary1));
3822     __ mov(tmp2, 64);
3823     __ sub(tmp4, tmp2, len, __ LSL, 3);
3824     __ lslv(tmp1, tmp1, tmp4);
3825     __ tst(tmp1, UPPER_BIT_MASK);
3826     __ br(Assembler::NE, RET_TRUE);
3827     // Fallthrough
3828 
3829   __ bind(RET_FALSE);
3830     __ pop(spilled_regs, sp);
3831     __ leave();
3832     __ mov(result, zr);
3833     __ ret(lr);
3834 
3835   __ bind(RET_TRUE);
3836     __ pop(spilled_regs, sp);
3837   __ bind(RET_TRUE_NO_POP);
3838     __ leave();
3839     __ mov(result, 1);
3840     __ ret(lr);
3841 
3842   __ bind(DONE);
3843     __ pop(spilled_regs, sp);
3844     __ leave();
3845     __ ret(lr);
3846     return entry;
3847   }
3848 
3849   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3850         bool usePrefetch, Label &NOT_EQUAL) {
3851     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3852         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3853         tmp7 = r12, tmp8 = r13;
3854     Label LOOP;
3855 
3856     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3857     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3858     __ bind(LOOP);
3859     if (usePrefetch) {
3860       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3861       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3862     }
3863     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3864     __ eor(tmp1, tmp1, tmp2);
3865     __ eor(tmp3, tmp3, tmp4);
3866     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3867     __ orr(tmp1, tmp1, tmp3);
3868     __ cbnz(tmp1, NOT_EQUAL);
3869     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3870     __ eor(tmp5, tmp5, tmp6);
3871     __ eor(tmp7, tmp7, tmp8);
3872     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3873     __ orr(tmp5, tmp5, tmp7);
3874     __ cbnz(tmp5, NOT_EQUAL);
3875     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3876     __ eor(tmp1, tmp1, tmp2);
3877     __ eor(tmp3, tmp3, tmp4);
3878     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3879     __ orr(tmp1, tmp1, tmp3);
3880     __ cbnz(tmp1, NOT_EQUAL);
3881     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3882     __ eor(tmp5, tmp5, tmp6);
3883     __ sub(cnt1, cnt1, 8 * wordSize);
3884     __ eor(tmp7, tmp7, tmp8);
3885     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3886     // tmp6 is not used. MacroAssembler::subs is used here (rather than
3887     // cmp) because subs allows an unrestricted range of immediate operands.
3888     __ subs(tmp6, cnt1, loopThreshold);
3889     __ orr(tmp5, tmp5, tmp7);
3890     __ cbnz(tmp5, NOT_EQUAL);
3891     __ br(__ GE, LOOP);
3892     // post-loop
3893     __ eor(tmp1, tmp1, tmp2);
3894     __ eor(tmp3, tmp3, tmp4);
3895     __ orr(tmp1, tmp1, tmp3);
3896     __ sub(cnt1, cnt1, 2 * wordSize);
3897     __ cbnz(tmp1, NOT_EQUAL);
3898   }
3899 
3900   void generate_large_array_equals_loop_simd(int loopThreshold,
3901         bool usePrefetch, Label &NOT_EQUAL) {
3902     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3903         tmp2 = rscratch2;
3904     Label LOOP;
3905 
3906     __ bind(LOOP);
3907     if (usePrefetch) {
3908       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3909       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3910     }
3911     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3912     __ sub(cnt1, cnt1, 8 * wordSize);
3913     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3914     __ subs(tmp1, cnt1, loopThreshold);
3915     __ eor(v0, __ T16B, v0, v4);
3916     __ eor(v1, __ T16B, v1, v5);
3917     __ eor(v2, __ T16B, v2, v6);
3918     __ eor(v3, __ T16B, v3, v7);
3919     __ orr(v0, __ T16B, v0, v1);
3920     __ orr(v1, __ T16B, v2, v3);
3921     __ orr(v0, __ T16B, v0, v1);
3922     __ umov(tmp1, v0, __ D, 0);
3923     __ umov(tmp2, v0, __ D, 1);
3924     __ orr(tmp1, tmp1, tmp2);
3925     __ cbnz(tmp1, NOT_EQUAL);
3926     __ br(__ GE, LOOP);
3927   }
3928 
3929   // a1 = r1 - array1 address
3930   // a2 = r2 - array2 address
3931   // result = r0 - return value. Already contains "false"
3932   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
3933   // r3-r5 are reserved temporary registers
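       // In outline the stub performs the following word-at-a-time comparison with an
       // early exit (simplified C sketch; the real code is unrolled, optionally
       // prefetches, has SIMD and non-SIMD flavours, and finishes with one possibly
       // overlapping load of the final word):
       //   bool large_array_equals(const uint8_t* a1, const uint8_t* a2, size_t bytes) {
       //     for (size_t i = 0; i + 8 <= bytes; i += 8) {
       //       uint64_t x, y;
       //       memcpy(&x, a1 + i, 8);                    // memcpy from <string.h>
       //       memcpy(&y, a2 + i, 8);
       //       if (x ^ y) return false;
       //     }
       //     return true;                                // tail handled by the final load
       //   }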
3934   address generate_large_array_equals() {
3935     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3936     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3937         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3938         tmp7 = r12, tmp8 = r13;
3939     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3940         SMALL_LOOP, POST_LOOP;
3941     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3942     // threshold chosen so that at least 32 prefetched bytes are actually used
3943     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3944     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3945     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3946     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3947         tmp5, tmp6, tmp7, tmp8);
3948 
3949     __ align(CodeEntryAlignment);
3950     address entry = __ pc();
3951     __ enter();
3952     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
3953     // also advance pointers to use post-increment instead of pre-increment
3954     __ add(a1, a1, wordSize);
3955     __ add(a2, a2, wordSize);
3956     if (AvoidUnalignedAccesses) {
3957       // Both implementations (SIMD and non-SIMD) use relatively wide load
3958       // instructions (ld1/ldp), which carry a large penalty (up to 2x execution
3959       // time) on some CPUs when the address is not at least 16-byte aligned.
3960       // Arrays are currently 8-byte aligned, so an extra 8-byte load is done if
3961       // needed to make at least the first address 16-byte aligned.
3962       Label ALIGNED16;
3963       __ tbz(a1, 3, ALIGNED16);
3964       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3965       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3966       __ sub(cnt1, cnt1, wordSize);
3967       __ eor(tmp1, tmp1, tmp2);
3968       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3969       __ bind(ALIGNED16);
3970     }
3971     if (UseSIMDForArrayEquals) {
3972       if (SoftwarePrefetchHintDistance >= 0) {
3973         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3974         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3975         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3976             /* prfm = */ true, NOT_EQUAL);
3977         __ cmp(cnt1, nonPrefetchLoopThreshold);
3978         __ br(__ LT, TAIL);
3979       }
3980       __ bind(NO_PREFETCH_LARGE_LOOP);
3981       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3982           /* prfm = */ false, NOT_EQUAL);
3983     } else {
3984       __ push(spilled_regs, sp);
3985       if (SoftwarePrefetchHintDistance >= 0) {
3986         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3987         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3988         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3989             /* prfm = */ true, NOT_EQUAL);
3990         __ cmp(cnt1, nonPrefetchLoopThreshold);
3991         __ br(__ LT, TAIL);
3992       }
3993       __ bind(NO_PREFETCH_LARGE_LOOP);
3994       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3995           /* prfm = */ false, NOT_EQUAL);
3996     }
3997     __ bind(TAIL);
3998       __ cbz(cnt1, EQUAL);
3999       __ subs(cnt1, cnt1, wordSize);
4000       __ br(__ LE, POST_LOOP);
4001     __ bind(SMALL_LOOP);
4002       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4003       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4004       __ subs(cnt1, cnt1, wordSize);
4005       __ eor(tmp1, tmp1, tmp2);
4006       __ cbnz(tmp1, NOT_EQUAL);
4007       __ br(__ GT, SMALL_LOOP);
4008     __ bind(POST_LOOP);
4009       __ ldr(tmp1, Address(a1, cnt1));
4010       __ ldr(tmp2, Address(a2, cnt1));
4011       __ eor(tmp1, tmp1, tmp2);
4012       __ cbnz(tmp1, NOT_EQUAL);
4013     __ bind(EQUAL);
4014       __ mov(result, true);
4015     __ bind(NOT_EQUAL);
4016       if (!UseSIMDForArrayEquals) {
4017         __ pop(spilled_regs, sp);
4018       }
4019     __ bind(NOT_EQUAL_NO_POP);
4020     __ leave();
4021     __ ret(lr);
4022     return entry;
4023   }
4024 
4025   address generate_dsin_dcos(bool isCos) {
4026     __ align(CodeEntryAlignment);
4027     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
4028     address start = __ pc();
4029     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
4030         (address)StubRoutines::aarch64::_two_over_pi,
4031         (address)StubRoutines::aarch64::_pio2,
4032         (address)StubRoutines::aarch64::_dsin_coef,
4033         (address)StubRoutines::aarch64::_dcos_coef);
4034     return start;
4035   }
4036 
4037   address generate_dlog() {
4038     __ align(CodeEntryAlignment);
4039     StubCodeMark mark(this, "StubRoutines", "dlog");
4040     address entry = __ pc();
4041     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
4042         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
4043     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
4044     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
4045         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
4046     return entry;
4047   }
4048 
4049   // code for comparing 16 bytes of strings with same encoding
4050   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
4051     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
4052     __ ldr(rscratch1, Address(__ post(str1, 8)));
4053     __ eor(rscratch2, tmp1, tmp2);
4054     __ ldr(cnt1, Address(__ post(str2, 8)));
4055     __ cbnz(rscratch2, DIFF1);
4056     __ ldr(tmp1, Address(__ post(str1, 8)));
4057     __ eor(rscratch2, rscratch1, cnt1);
4058     __ ldr(tmp2, Address(__ post(str2, 8)));
4059     __ cbnz(rscratch2, DIFF2);
4060   }
4061 
4062   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
4063   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4064       Label &DIFF2) {
4065     Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12;
4066     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
4067 
4068     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4069     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4070     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4071     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4072 
4073     __ fmovd(tmpL, vtmp3);
4074     __ eor(rscratch2, tmp3, tmpL);
4075     __ cbnz(rscratch2, DIFF2);
4076 
4077     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4078     __ umov(tmpL, vtmp3, __ D, 1);
4079     __ eor(rscratch2, tmpU, tmpL);
4080     __ cbnz(rscratch2, DIFF1);
4081 
4082     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4083     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4084     __ fmovd(tmpL, vtmp);
4085     __ eor(rscratch2, tmp3, tmpL);
4086     __ cbnz(rscratch2, DIFF2);
4087 
4088     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4089     __ umov(tmpL, vtmp, __ D, 1);
4090     __ eor(rscratch2, tmpU, tmpL);
4091     __ cbnz(rscratch2, DIFF1);
4092   }
4093 
4094   // r0  = result
4095   // r1  = str1
4096   // r2  = cnt1
4097   // r3  = str2
4098   // r4  = cnt2
4099   // r10 = tmp1
4100   // r11 = tmp2
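       // For the different-encoding case the Latin-1 operand is widened to UTF-16 on
       // the fly by interleaving its bytes with zero bytes (zip1/zip2 against the
       // zeroed vtmpZ) and the two strings are then compared 8 bytes (4 chars) at a
       // time. A scalar model of what is being computed (illustrative only; sign
       // conventions follow the LU/UL variant, and the equal-prefix result comes from
       // the caller-preset result register):
       //   int compare_LU(const uint8_t* latin1, const uint16_t* utf16, int n_chars) {
       //     for (int i = 0; i < n_chars; i++) {
       //       int a = latin1[i];                // zero-extension == interleave with 0x00
       //       int b = utf16[i];
       //       if (a != b) return a - b;         // difference of first mismatching chars
       //     }
       //     return 0;
       //   }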
4101   address generate_compare_long_string_different_encoding(bool isLU) {
4102     __ align(CodeEntryAlignment);
4103     StubCodeMark mark(this, "StubRoutines", isLU
4104         ? "compare_long_string_different_encoding LU"
4105         : "compare_long_string_different_encoding UL");
4106     address entry = __ pc();
4107     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4108         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, SMALL_LOOP_ENTER,
4109         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4110     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4111         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4112     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4113     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4114 
4115     int prefetchLoopExitCondition = MAX(32, SoftwarePrefetchHintDistance/2);
4116 
4117     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4118     // cnt2 == number of characters left to compare
4119     // Check the already-loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
4120     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4121     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4122     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4123     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4124     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4125     __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1);
4126     __ eor(rscratch2, tmp1, tmp2);
4127     __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0);
4128     __ mov(rscratch1, tmp2);
4129     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4130     Register strU = isLU ? str2 : str1,
4131              strL = isLU ? str1 : str2,
4132              tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4133              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4134     __ push(spilled_regs, sp);
4135     __ sub(tmp2, strL, cnt2); // strL pointer to load from
4136     __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from
4137 
4138     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4139 
4140     if (SoftwarePrefetchHintDistance >= 0) {
4141       __ cmp(cnt2, prefetchLoopExitCondition);
4142       __ br(__ LT, SMALL_LOOP);
4143       __ bind(LARGE_LOOP_PREFETCH);
4144         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4145         __ mov(tmp4, 2);
4146         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4147         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4148           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4149           __ subs(tmp4, tmp4, 1);
4150           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4151           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4152           __ mov(tmp4, 2);
4153         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4154           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4155           __ subs(tmp4, tmp4, 1);
4156           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4157           __ sub(cnt2, cnt2, 64);
4158           __ cmp(cnt2, prefetchLoopExitCondition);
4159           __ br(__ GE, LARGE_LOOP_PREFETCH);
4160     }
4161     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4162     __ subs(cnt2, cnt2, 16);
4163     __ br(__ LT, TAIL);
4164     __ b(SMALL_LOOP_ENTER);
4165     __ bind(SMALL_LOOP); // smaller loop
4166       __ subs(cnt2, cnt2, 16);
4167     __ bind(SMALL_LOOP_ENTER);
4168       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4169       __ br(__ GE, SMALL_LOOP);
4170       __ cbz(cnt2, LOAD_LAST);
4171     __ bind(TAIL); // 1..15 characters left
4172       __ cmp(cnt2, -8);
4173       __ br(__ GT, TAIL_LOAD_16);
4174       __ ldrd(vtmp, Address(tmp2));
4175       __ zip1(vtmp3, __ T8B, vtmp, vtmpZ);
4176 
4177       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4178       __ fmovd(tmpL, vtmp3);
4179       __ eor(rscratch2, tmp3, tmpL);
4180       __ cbnz(rscratch2, DIFF2);
4181       __ umov(tmpL, vtmp3, __ D, 1);
4182       __ eor(rscratch2, tmpU, tmpL);
4183       __ cbnz(rscratch2, DIFF1);
4184       __ b(LOAD_LAST);
4185     __ bind(TAIL_LOAD_16);
4186       __ ldrq(vtmp, Address(tmp2));
4187       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4188       __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4189       __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4190       __ fmovd(tmpL, vtmp3);
4191       __ eor(rscratch2, tmp3, tmpL);
4192       __ cbnz(rscratch2, DIFF2);
4193 
4194       __ ldr(tmp3, Address(__ post(cnt1, 8)));
4195       __ umov(tmpL, vtmp3, __ D, 1);
4196       __ eor(rscratch2, tmpU, tmpL);
4197       __ cbnz(rscratch2, DIFF1);
4198 
4199       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4200       __ fmovd(tmpL, vtmp);
4201       __ eor(rscratch2, tmp3, tmpL);
4202       __ cbnz(rscratch2, DIFF2);
4203 
4204       __ umov(tmpL, vtmp, __ D, 1);
4205       __ eor(rscratch2, tmpU, tmpL);
4206       __ cbnz(rscratch2, DIFF1);
4207       __ b(LOAD_LAST);
4208     __ bind(DIFF2);
4209       __ mov(tmpU, tmp3);
4210     __ bind(DIFF1);
4211       __ pop(spilled_regs, sp);
4212       __ b(CALCULATE_DIFFERENCE);
4213     __ bind(LOAD_LAST);
4214       __ pop(spilled_regs, sp);
4215 
4216       __ ldrs(vtmp, Address(strL));
4217       __ ldr(tmpU, Address(strU));
4218       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4219       __ fmovd(tmpL, vtmp);
4220 
4221       __ eor(rscratch2, tmpU, tmpL);
4222       __ cbz(rscratch2, DONE);
4223 
4224     // Find the first different characters in the longwords and
4225     // compute their difference.
4226     __ bind(CALCULATE_DIFFERENCE);
4227       __ rev(rscratch2, rscratch2);
4228       __ clz(rscratch2, rscratch2);
4229       __ andr(rscratch2, rscratch2, -16);
4230       __ lsrv(tmp1, tmp1, rscratch2);
4231       __ uxthw(tmp1, tmp1);
4232       __ lsrv(rscratch1, rscratch1, rscratch2);
4233       __ uxthw(rscratch1, rscratch1);
4234       __ subw(result, tmp1, rscratch1);
4235     __ bind(DONE);
4236       __ ret(lr);
4237     return entry;
4238   }
4239 
4240   // r0  = result
4241   // r1  = str1
4242   // r2  = cnt1
4243   // r3  = str2
4244   // r4  = cnt2
4245   // r10 = tmp1
4246   // r11 = tmp2
4247   address generate_compare_long_string_same_encoding(bool isLL) {
4248     __ align(CodeEntryAlignment);
4249     StubCodeMark mark(this, "StubRoutines", isLL
4250         ? "compare_long_string_same_encoding LL"
4251         : "compare_long_string_same_encoding UU");
4252     address entry = __ pc();
4253     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4254         tmp1 = r10, tmp2 = r11;
4255     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4256         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4257         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4258     // exit the large loop when fewer than 64 bytes are left to read, or when we
4259     // are about to prefetch memory past the end of the array
4260     int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4261     // cnt1/cnt2 contain the number of characters to compare; cnt1 can be re-used.
4262     // Update the cnt2 counter for the 8 bytes that were already loaded.
4263     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4264     // update pointers, because of previous read
4265     __ add(str1, str1, wordSize);
4266     __ add(str2, str2, wordSize);
4267     if (SoftwarePrefetchHintDistance >= 0) {
4268       __ bind(LARGE_LOOP_PREFETCH);
4269         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4270         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4271         compare_string_16_bytes_same(DIFF, DIFF2);
4272         compare_string_16_bytes_same(DIFF, DIFF2);
4273         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4274         compare_string_16_bytes_same(DIFF, DIFF2);
4275         __ cmp(cnt2, largeLoopExitCondition);
4276         compare_string_16_bytes_same(DIFF, DIFF2);
4277         __ br(__ GT, LARGE_LOOP_PREFETCH);
4278         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4279         // less than 16 bytes left?
4280         __ subs(cnt2, cnt2, isLL ? 16 : 8);
4281         __ br(__ LT, TAIL);
4282     }
4283     __ bind(SMALL_LOOP);
4284       compare_string_16_bytes_same(DIFF, DIFF2);
4285       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4286       __ br(__ GE, SMALL_LOOP);
4287     __ bind(TAIL);
4288       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4289       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4290       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4291       __ br(__ LE, CHECK_LAST);
4292       __ eor(rscratch2, tmp1, tmp2);
4293       __ cbnz(rscratch2, DIFF);
4294       __ ldr(tmp1, Address(__ post(str1, 8)));
4295       __ ldr(tmp2, Address(__ post(str2, 8)));
4296       __ sub(cnt2, cnt2, isLL ? 8 : 4);
4297     __ bind(CHECK_LAST);
4298       if (!isLL) {
4299         __ add(cnt2, cnt2, cnt2); // now in bytes
4300       }
4301       __ eor(rscratch2, tmp1, tmp2);
4302       __ cbnz(rscratch2, DIFF);
4303       __ ldr(rscratch1, Address(str1, cnt2));
4304       __ ldr(cnt1, Address(str2, cnt2));
4305       __ eor(rscratch2, rscratch1, cnt1);
4306       __ cbz(rscratch2, LENGTH_DIFF);
4307       // Find the first different characters in the longwords and
4308       // compute their difference.
4309     __ bind(DIFF2);
4310       __ rev(rscratch2, rscratch2);
4311       __ clz(rscratch2, rscratch2);
4312       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4313       __ lsrv(rscratch1, rscratch1, rscratch2);
4314       if (isLL) {
4315         __ lsrv(cnt1, cnt1, rscratch2);
4316         __ uxtbw(rscratch1, rscratch1);
4317         __ uxtbw(cnt1, cnt1);
4318       } else {
4319         __ lsrv(cnt1, cnt1, rscratch2);
4320         __ uxthw(rscratch1, rscratch1);
4321         __ uxthw(cnt1, cnt1);
4322       }
4323       __ subw(result, rscratch1, cnt1);
4324       __ b(LENGTH_DIFF);
4325     __ bind(DIFF);
4326       __ rev(rscratch2, rscratch2);
4327       __ clz(rscratch2, rscratch2);
4328       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4329       __ lsrv(tmp1, tmp1, rscratch2);
4330       if (isLL) {
4331         __ lsrv(tmp2, tmp2, rscratch2);
4332         __ uxtbw(tmp1, tmp1);
4333         __ uxtbw(tmp2, tmp2);
4334       } else {
4335         __ lsrv(tmp2, tmp2, rscratch2);
4336         __ uxthw(tmp1, tmp1);
4337         __ uxthw(tmp2, tmp2);
4338       }
4339       __ subw(result, tmp1, tmp2);
4340       __ b(LENGTH_DIFF);
4341     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4342       __ eor(rscratch2, tmp1, tmp2);
4343       __ cbnz(rscratch2, DIFF);
4344     __ bind(LENGTH_DIFF);
4345       __ ret(lr);
4346     return entry;
4347   }
4348 
4349   void generate_compare_long_strings() {
4350       StubRoutines::aarch64::_compare_long_string_LL
4351           = generate_compare_long_string_same_encoding(true);
4352       StubRoutines::aarch64::_compare_long_string_UU
4353           = generate_compare_long_string_same_encoding(false);
4354       StubRoutines::aarch64::_compare_long_string_LU
4355           = generate_compare_long_string_different_encoding(true);
4356       StubRoutines::aarch64::_compare_long_string_UL
4357           = generate_compare_long_string_different_encoding(false);
4358   }
4359 
4360   // R0 = result
4361   // R1 = str2
4362   // R2 = cnt1
4363   // R3 = str1
4364   // R4 = cnt2
  // This generic linear code uses a few additional ideas which make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since
  // length >= 8), which lets us skip its initial load (helps on systems with a
  // single load pipeline)
  // 2) we can use a "fast" algorithm to search for the first character: one
  // branch per loaded register instead of one branch per character, which is
  // where constants like 0x0101...01, 0x00010001...0001, 0x7f7f...7f and
  // 0x7fff7fff...7fff come from (a C sketch of this trick follows below)
  // 3) after loading and analyzing the 1st register of the source string, it
  // can be reused to search for every occurrence of the 1st character, saving
  // a few loads compared with a simpler-but-slower implementation
  // 4) in order to avoid lots of push/pop operations, the code below heavily
  // re-uses/re-initializes/compresses register values, which makes the code
  // larger and a bit less readable; however, most of the extra operations are
  // issued during loads or branches, so the penalty is minimal
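  //
  // A rough scalar C sketch of the zero-byte trick behind idea 2) (Latin-1
  // flavour; the UTF-16 flavour uses the 0x0001... / 0x7fff... constants and
  // 16-bit lanes). The helper name is illustrative only and does not exist
  // elsewhere in HotSpot:
  //
  //   // Returns a nonzero mask when the byte 'c' occurs in the 8 source bytes
  //   // packed into 'chunk'. Bits can also be set at positions above the first
  //   // true match; the verification loops in the stub filter those out.
  //   uint64_t first_char_candidates(uint64_t chunk, uint8_t c) {
  //     uint64_t pattern = c * 0x0101010101010101ULL;  // broadcast c to every byte
  //     uint64_t x = chunk ^ pattern;                  // zero byte <=> candidate
  //     return (x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
  //   }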
4379   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4380     const char* stubName = str1_isL
4381         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4382         : "indexof_linear_uu";
4383     __ align(CodeEntryAlignment);
4384     StubCodeMark mark(this, "StubRoutines", stubName);
4385     address entry = __ pc();
4386 
4387     int str1_chr_size = str1_isL ? 1 : 2;
4388     int str2_chr_size = str2_isL ? 1 : 2;
4389     int str1_chr_shift = str1_isL ? 0 : 1;
4390     int str2_chr_shift = str2_isL ? 0 : 1;
4391     bool isL = str1_isL && str2_isL;
4392    // parameters
4393     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4394     // temporary registers
4395     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4396     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4397     // redefinitions
4398     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4399 
4400     __ push(spilled_regs, sp);
4401     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, L_SMALL_MATCH_LOOP,
4402         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4403         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4404         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4405         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4406         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
    // Read a whole register from str1; this is safe because length >= 8 here
4408     __ ldr(ch1, Address(str1));
    // Read a whole register from str2; this is safe because length >= 8 here
4410     __ ldr(ch2, Address(str2));
4411     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4412     if (str1_isL != str2_isL) {
4413       __ eor(v0, __ T16B, v0, v0);
4414     }
4415     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4416     __ mul(first, first, tmp1);
4417     // check if we have less than 1 register to check
4418     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4419     if (str1_isL != str2_isL) {
4420       __ fmovd(v1, ch1);
4421     }
4422     __ br(__ LE, L_SMALL);
4423     __ eor(ch2, first, ch2);
4424     if (str1_isL != str2_isL) {
4425       __ zip1(v1, __ T16B, v1, v0);
4426     }
4427     __ sub(tmp2, ch2, tmp1);
4428     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4429     __ bics(tmp2, tmp2, ch2);
4430     if (str1_isL != str2_isL) {
4431       __ fmovd(ch1, v1);
4432     }
4433     __ br(__ NE, L_HAS_ZERO);
4434     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4435     __ add(result, result, wordSize/str2_chr_size);
4436     __ add(str2, str2, wordSize);
4437     __ br(__ LT, L_POST_LOOP);
4438     __ BIND(L_LOOP);
4439       __ ldr(ch2, Address(str2));
4440       __ eor(ch2, first, ch2);
4441       __ sub(tmp2, ch2, tmp1);
4442       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4443       __ bics(tmp2, tmp2, ch2);
4444       __ br(__ NE, L_HAS_ZERO);
4445     __ BIND(L_LOOP_PROCEED);
4446       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4447       __ add(str2, str2, wordSize);
4448       __ add(result, result, wordSize/str2_chr_size);
4449       __ br(__ GE, L_LOOP);
4450     __ BIND(L_POST_LOOP);
4451       __ cmp(cnt2, -wordSize/str2_chr_size); // no extra characters to check
4452       __ br(__ LE, NOMATCH);
4453       __ ldr(ch2, Address(str2));
4454       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4455       __ eor(ch2, first, ch2);
4456       __ sub(tmp2, ch2, tmp1);
4457       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4458       __ mov(tmp4, -1); // all bits set
4459       __ b(L_SMALL_PROCEED);
4460     __ align(OptoLoopAlignment);
4461     __ BIND(L_SMALL);
4462       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4463       __ eor(ch2, first, ch2);
4464       if (str1_isL != str2_isL) {
4465         __ zip1(v1, __ T16B, v1, v0);
4466       }
4467       __ sub(tmp2, ch2, tmp1);
4468       __ mov(tmp4, -1); // all bits set
4469       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4470       if (str1_isL != str2_isL) {
4471         __ fmovd(ch1, v1); // move converted 4 symbols
4472       }
4473     __ BIND(L_SMALL_PROCEED);
4474       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
4475       __ bic(tmp2, tmp2, ch2);
4476       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
4477       __ rbit(tmp2, tmp2);
4478       __ br(__ EQ, NOMATCH);
4479     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
4481       __ cmp(cnt1, wordSize/str2_chr_size);
4482       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4483       if (str2_isL) { // LL
4484         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4485         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4486         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4487         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4488         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4489       } else {
4490         __ mov(ch2, 0xE); // all bits in byte set except last one
4491         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4492         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4493         __ lslv(tmp2, tmp2, tmp4);
4494         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4495         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4496         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4497         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4498       }
4499       __ cmp(ch1, ch2);
4500       __ mov(tmp4, wordSize/str2_chr_size);
4501       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4502     __ BIND(L_SMALL_CMP_LOOP);
4503       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4504                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4505       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4506                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4507       __ add(tmp4, tmp4, 1);
4508       __ cmp(tmp4, cnt1);
4509       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4510       __ cmp(first, ch2);
4511       __ br(__ EQ, L_SMALL_CMP_LOOP);
4512     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4513       __ cbz(tmp2, NOMATCH); // no more matches. exit
4514       __ clz(tmp4, tmp2);
4515       __ add(result, result, 1); // advance index
4516       __ add(str2, str2, str2_chr_size); // advance pointer
4517       __ b(L_SMALL_HAS_ZERO_LOOP);
4518     __ align(OptoLoopAlignment);
4519     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4520       __ cmp(first, ch2);
4521       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4522       __ b(DONE);
4523     __ align(OptoLoopAlignment);
4524     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4525       if (str2_isL) { // LL
4526         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4527         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4528         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4529         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4530         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4531       } else {
4532         __ mov(ch2, 0xE); // all bits in byte set except last one
4533         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4534         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4535         __ lslv(tmp2, tmp2, tmp4);
4536         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4537         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4538         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4539         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4540       }
4541       __ cmp(ch1, ch2);
4542       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4543       __ b(DONE);
4544     __ align(OptoLoopAlignment);
4545     __ BIND(L_HAS_ZERO);
4546       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
      // Now, compress the counters (cnt2 and cnt1) into one register. This is
      // fine because both counters are 32-bit and are not changed in this
      // loop; they are simply restored on exit, so cnt1 can be re-used here.
4551       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
4552       __ sub(result, result, 1);
4553     __ BIND(L_HAS_ZERO_LOOP);
4554       __ mov(cnt1, wordSize/str2_chr_size);
4555       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4556       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4557       if (str2_isL) {
4558         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4559         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4560         __ lslv(tmp2, tmp2, tmp4);
4561         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4562         __ add(tmp4, tmp4, 1);
4563         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4564         __ lsl(tmp2, tmp2, 1);
4565         __ mov(tmp4, wordSize/str2_chr_size);
4566       } else {
4567         __ mov(ch2, 0xE);
4568         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4569         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4570         __ lslv(tmp2, tmp2, tmp4);
4571         __ add(tmp4, tmp4, 1);
4572         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4573         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4574         __ lsl(tmp2, tmp2, 1);
4575         __ mov(tmp4, wordSize/str2_chr_size);
4576         __ sub(str2, str2, str2_chr_size);
4577       }
4578       __ cmp(ch1, ch2);
4579       __ mov(tmp4, wordSize/str2_chr_size);
4580       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4581     __ BIND(L_CMP_LOOP);
4582       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4583                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4584       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4585                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4586       __ add(tmp4, tmp4, 1);
4587       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4588       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4589       __ cmp(cnt1, ch2);
4590       __ br(__ EQ, L_CMP_LOOP);
4591     __ BIND(L_CMP_LOOP_NOMATCH);
      // no match at this candidate position
4593       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4594       __ clz(tmp4, tmp2);
4595       __ add(str2, str2, str2_chr_size); // advance pointer
4596       __ b(L_HAS_ZERO_LOOP);
4597     __ align(OptoLoopAlignment);
4598     __ BIND(L_CMP_LOOP_LAST_CMP);
4599       __ cmp(cnt1, ch2);
4600       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4601       __ b(DONE);
4602     __ align(OptoLoopAlignment);
4603     __ BIND(L_CMP_LOOP_LAST_CMP2);
4604       if (str2_isL) {
4605         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4606         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4607         __ lslv(tmp2, tmp2, tmp4);
4608         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4609         __ add(tmp4, tmp4, 1);
4610         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4611         __ lsl(tmp2, tmp2, 1);
4612       } else {
4613         __ mov(ch2, 0xE);
4614         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4615         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4616         __ lslv(tmp2, tmp2, tmp4);
4617         __ add(tmp4, tmp4, 1);
4618         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4619         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4620         __ lsl(tmp2, tmp2, 1);
4621         __ sub(str2, str2, str2_chr_size);
4622       }
4623       __ cmp(ch1, ch2);
4624       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4625       __ b(DONE);
4626     __ align(OptoLoopAlignment);
4627     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
      // until the L_HAS_ZERO block. The byte octet was analyzed in
      // L_HAS_ZERO_LOOP, so result was increased by at most
      // wordSize/str2_chr_size - 1 and the respective high bits were not
      // changed. L_LOOP_PROCEED will increase result by the number of analyzed
      // characters, so we can just reset the lower bits of result here:
      // clear the 2 lower bits for UU/UL and the 3 lower bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (LL) or result & 3
      // (UU/UL) is the index of the last analyzed substring inside the current
      // octet, so str2 is at the respective start address and needs to be
      // advanced to the next octet.
4638       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4639       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4640       __ bfm(result, zr, 0, 2 - str2_chr_shift);
4641       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4642       __ movw(cnt2, cnt2);
4643       __ b(L_LOOP_PROCEED);
4644     __ align(OptoLoopAlignment);
4645     __ BIND(NOMATCH);
4646       __ mov(result, -1);
4647     __ BIND(DONE);
4648       __ pop(spilled_regs, sp);
4649       __ ret(lr);
4650     return entry;
4651   }
4652 
4653   void generate_string_indexof_stubs() {
4654     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
4655     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
4656     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
4657   }
4658 
4659   void inflate_and_store_2_fp_registers(bool generatePrfm,
4660       FloatRegister src1, FloatRegister src2) {
4661     Register dst = r1;
4662     __ zip1(v1, __ T16B, src1, v0);
4663     __ zip2(v2, __ T16B, src1, v0);
4664     if (generatePrfm) {
4665       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4666     }
4667     __ zip1(v3, __ T16B, src2, v0);
4668     __ zip2(v4, __ T16B, src2, v0);
4669     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4670   }
4671 
4672   // R0 = src
4673   // R1 = dst
4674   // R2 = len
4675   // R3 = len >> 3
4676   // V0 = 0
4677   // v1 = loaded 8 bytes
4678   address generate_large_byte_array_inflate() {
4679     __ align(CodeEntryAlignment);
4680     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4681     address entry = __ pc();
4682     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4683     Register src = r0, dst = r1, len = r2, octetCounter = r3;
4684     const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
4685 
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also lets us use a single store instruction.
4688     __ ldrd(v2, __ post(src, 8));
4689     __ sub(octetCounter, octetCounter, 2);
4690     __ zip1(v1, __ T16B, v1, v0);
4691     __ zip1(v2, __ T16B, v2, v0);
4692     __ st1(v1, v2, __ T16B, __ post(dst, 32));
4693     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4694     __ cmp(octetCounter, large_loop_threshold);
4695     __ br(__ LE, LOOP_START);
4696     __ b(LOOP_PRFM_START);
4697     __ bind(LOOP_PRFM);
4698       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4699     __ bind(LOOP_PRFM_START);
4700       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4701       __ sub(octetCounter, octetCounter, 8);
4702       __ cmp(octetCounter, large_loop_threshold);
4703       inflate_and_store_2_fp_registers(true, v3, v4);
4704       inflate_and_store_2_fp_registers(true, v5, v6);
4705       __ br(__ GT, LOOP_PRFM);
4706       __ cmp(octetCounter, 8);
4707       __ br(__ LT, DONE);
4708     __ bind(LOOP);
4709       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4710       __ bind(LOOP_START);
4711       __ sub(octetCounter, octetCounter, 8);
4712       __ cmp(octetCounter, 8);
4713       inflate_and_store_2_fp_registers(false, v3, v4);
4714       inflate_and_store_2_fp_registers(false, v5, v6);
4715       __ br(__ GE, LOOP);
4716     __ bind(DONE);
4717       __ ret(lr);
4718     return entry;
4719   }
4720 
4721   /**
4722    *  Arguments:
4723    *
4724    *  Input:
4725    *  c_rarg0   - current state address
4726    *  c_rarg1   - H key address
4727    *  c_rarg2   - data address
4728    *  c_rarg3   - number of blocks
4729    *
4730    *  Output:
4731    *  Updated state at c_rarg0
4732    */
4733   address generate_ghash_processBlocks() {
4734     // Bafflingly, GCM uses little-endian for the byte order, but
4735     // big-endian for the bit order.  For example, the polynomial 1 is
4736     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4737     //
4738     // So, we must either reverse the bytes in each word and do
4739     // everything big-endian or reverse the bits in each byte and do
4740     // it little-endian.  On AArch64 it's more idiomatic to reverse
4741     // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order throughout the
4743     // calculation, bit-reversing the inputs and outputs.
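    //
    // As a scalar illustration of the per-byte bit reversal (the lane-wise
    // analogue of what RBIT does below), a hypothetical helper might look like:
    //
    //   uint8_t rbit8(uint8_t b) {
    //     b = (uint8_t)(((b & 0xF0) >> 4) | ((b & 0x0F) << 4));  // swap nibbles
    //     b = (uint8_t)(((b & 0xCC) >> 2) | ((b & 0x33) << 2));  // swap bit pairs
    //     b = (uint8_t)(((b & 0xAA) >> 1) | ((b & 0x55) << 1));  // swap adjacent bits
    //     return b;  // rbit8(0x80) == 0x01: GCM's "polynomial 1" becomes integer 1
    //   }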
4744 
4745     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4746     __ align(wordSize * 2);
4747     address p = __ pc();
4748     __ emit_int64(0x87);  // The low-order bits of the field
4749                           // polynomial (i.e. p = z^7+z^2+z+1)
4750                           // repeated in the low and high parts of a
4751                           // 128-bit vector
4752     __ emit_int64(0x87);
4753 
4754     __ align(CodeEntryAlignment);
4755     address start = __ pc();
4756 
4757     Register state   = c_rarg0;
4758     Register subkeyH = c_rarg1;
4759     Register data    = c_rarg2;
4760     Register blocks  = c_rarg3;
4761 
4762     FloatRegister vzr = v30;
4763     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4764 
4765     __ ldrq(v0, Address(state));
4766     __ ldrq(v1, Address(subkeyH));
4767 
4768     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4769     __ rbit(v0, __ T16B, v0);
4770     __ rev64(v1, __ T16B, v1);
4771     __ rbit(v1, __ T16B, v1);
4772 
4773     __ ldrq(v26, p);
4774 
4775     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4776     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4777 
4778     {
4779       Label L_ghash_loop;
4780       __ bind(L_ghash_loop);
4781 
4782       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4783                                                  // reversing each byte
4784       __ rbit(v2, __ T16B, v2);
4785       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4786 
4787       // Multiply state in v2 by subkey in v1
4788       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4789                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4790                      /*temps*/v6, v20, v18, v21);
4791       // Reduce v7:v5 by the field polynomial
4792       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4793 
4794       __ sub(blocks, blocks, 1);
4795       __ cbnz(blocks, L_ghash_loop);
4796     }
4797 
4798     // The bit-reversed result is at this point in v0
4799     __ rev64(v1, __ T16B, v0);
4800     __ rbit(v1, __ T16B, v1);
4801 
4802     __ st1(v1, __ T16B, state);
4803     __ ret(lr);
4804 
4805     return start;
4806   }
4807 
4808   // Continuation point for throwing of implicit exceptions that are
4809   // not handled in the current activation. Fabricates an exception
4810   // oop and initiates normal exception dispatching in this
4811   // frame. Since we need to preserve callee-saved values (currently
4812   // only for C2, but done for C1 as well) we need a callee-saved oop
4813   // map and therefore have to make these stubs into RuntimeStubs
4814   // rather than BufferBlobs.  If the compiler needs all registers to
4815   // be preserved between the fault point and the exception handler
4816   // then it must assume responsibility for that in
4817   // AbstractCompiler::continuation_for_implicit_null_exception or
4818   // continuation_for_implicit_division_by_zero_exception. All other
4819   // implicit exceptions (e.g., NullPointerException or
4820   // AbstractMethodError on entry) are either at call sites or
4821   // otherwise assume that stack unwinding will be initiated, so
4822   // caller saved registers were assumed volatile in the compiler.
4823 
4824 #undef __
4825 #define __ masm->
4826 
4827   address generate_throw_exception(const char* name,
4828                                    address runtime_entry,
4829                                    Register arg1 = noreg,
4830                                    Register arg2 = noreg) {
4831     // Information about frame layout at time of blocking runtime call.
4832     // Note that we only have to preserve callee-saved registers since
4833     // the compilers are responsible for supplying a continuation point
4834     // if they expect all registers to be preserved.
4835     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4836     enum layout {
4837       rfp_off = 0,
4838       rfp_off2,
4839       return_off,
4840       return_off2,
4841       framesize // inclusive of return address
4842     };
4843 
4844     int insts_size = 512;
4845     int locs_size  = 64;
4846 
4847     CodeBuffer code(name, insts_size, locs_size);
4848     OopMapSet* oop_maps  = new OopMapSet();
4849     MacroAssembler* masm = new MacroAssembler(&code);
4850 
4851     address start = __ pc();
4852 
4853     // This is an inlined and slightly modified version of call_VM
4854     // which has the ability to fetch the return PC out of
4855     // thread-local storage and also sets up last_Java_sp slightly
4856     // differently than the real call_VM
4857 
4858     __ enter(); // Save FP and LR before call
4859 
4860     assert(is_even(framesize/2), "sp not 16-byte aligned");
4861 
4862     // lr and fp are already in place
4863     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4864 
4865     int frame_complete = __ pc() - start;
4866 
4867     // Set up last_Java_sp and last_Java_fp
4868     address the_pc = __ pc();
4869     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
4870 
4871     // Call runtime
4872     if (arg1 != noreg) {
4873       assert(arg2 != c_rarg1, "clobbered");
4874       __ mov(c_rarg1, arg1);
4875     }
4876     if (arg2 != noreg) {
4877       __ mov(c_rarg2, arg2);
4878     }
4879     __ mov(c_rarg0, rthread);
4880     BLOCK_COMMENT("call runtime_entry");
4881     __ mov(rscratch1, runtime_entry);
4882     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
4883 
4884     // Generate oop map
4885     OopMap* map = new OopMap(framesize, 0);
4886 
4887     oop_maps->add_gc_map(the_pc - start, map);
4888 
4889     __ reset_last_Java_frame(true);
4890     __ maybe_isb();
4891 
4892     __ leave();
4893 
4894     // check for pending exceptions
4895 #ifdef ASSERT
4896     Label L;
4897     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4898     __ cbnz(rscratch1, L);
4899     __ should_not_reach_here();
4900     __ bind(L);
4901 #endif // ASSERT
4902     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4903 
4904 
4905     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4906     RuntimeStub* stub =
4907       RuntimeStub::new_runtime_stub(name,
4908                                     &code,
4909                                     frame_complete,
4910                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4911                                     oop_maps, false);
4912     return stub->entry_point();
4913   }
4914 
4915   class MontgomeryMultiplyGenerator : public MacroAssembler {
4916 
4917     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4918       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4919 
4920     RegSet _toSave;
4921     bool _squaring;
4922 
4923   public:
4924     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4925       : MacroAssembler(as->code()), _squaring(squaring) {
4926 
4927       // Register allocation
4928 
4929       Register reg = c_rarg0;
4930       Pa_base = reg;       // Argument registers
4931       if (squaring)
4932         Pb_base = Pa_base;
4933       else
4934         Pb_base = ++reg;
4935       Pn_base = ++reg;
      Rlen = ++reg;
4937       inv = ++reg;
4938       Pm_base = ++reg;
4939 
4940                           // Working registers:
4941       Ra =  ++reg;        // The current digit of a, b, n, and m.
4942       Rb =  ++reg;
4943       Rm =  ++reg;
4944       Rn =  ++reg;
4945 
4946       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4947       Pb =  ++reg;
4948       Pm =  ++reg;
4949       Pn =  ++reg;
4950 
4951       t0 =  ++reg;        // Three registers which form a
      t1 =  ++reg;        // triple-precision accumulator.
4953       t2 =  ++reg;
4954 
4955       Ri =  ++reg;        // Inner and outer loop indexes.
4956       Rj =  ++reg;
4957 
4958       Rhi_ab = ++reg;     // Product registers: low and high parts
4959       Rlo_ab = ++reg;     // of a*b and m*n.
4960       Rhi_mn = ++reg;
4961       Rlo_mn = ++reg;
4962 
4963       // r19 and up are callee-saved.
4964       _toSave = RegSet::range(r19, reg) + Pm_base;
4965     }
4966 
4967   private:
4968     void save_regs() {
4969       push(_toSave, sp);
4970     }
4971 
4972     void restore_regs() {
4973       pop(_toSave, sp);
4974     }
4975 
4976     template <typename T>
4977     void unroll_2(Register count, T block) {
4978       Label loop, end, odd;
4979       tbnz(count, 0, odd);
4980       cbz(count, end);
4981       align(16);
4982       bind(loop);
4983       (this->*block)();
4984       bind(odd);
4985       (this->*block)();
4986       subs(count, count, 2);
4987       br(Assembler::GT, loop);
4988       bind(end);
4989     }
4990 
4991     template <typename T>
4992     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4993       Label loop, end, odd;
4994       tbnz(count, 0, odd);
4995       cbz(count, end);
4996       align(16);
4997       bind(loop);
4998       (this->*block)(d, s, tmp);
4999       bind(odd);
5000       (this->*block)(d, s, tmp);
5001       subs(count, count, 2);
5002       br(Assembler::GT, loop);
5003       bind(end);
5004     }
5005 
5006     void pre1(RegisterOrConstant i) {
5007       block_comment("pre1");
5008       // Pa = Pa_base;
5009       // Pb = Pb_base + i;
5010       // Pm = Pm_base;
5011       // Pn = Pn_base + i;
5012       // Ra = *Pa;
5013       // Rb = *Pb;
5014       // Rm = *Pm;
5015       // Rn = *Pn;
5016       ldr(Ra, Address(Pa_base));
5017       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
5018       ldr(Rm, Address(Pm_base));
5019       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5020       lea(Pa, Address(Pa_base));
5021       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
5022       lea(Pm, Address(Pm_base));
5023       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5024 
5025       // Zero the m*n result.
5026       mov(Rhi_mn, zr);
5027       mov(Rlo_mn, zr);
5028     }
5029 
5030     // The core multiply-accumulate step of a Montgomery
5031     // multiplication.  The idea is to schedule operations as a
5032     // pipeline so that instructions with long latencies (loads and
5033     // multiplies) have time to complete before their results are
5034     // used.  This most benefits in-order implementations of the
5035     // architecture but out-of-order ones also benefit.
5036     void step() {
5037       block_comment("step");
5038       // MACC(Ra, Rb, t0, t1, t2);
5039       // Ra = *++Pa;
5040       // Rb = *--Pb;
5041       umulh(Rhi_ab, Ra, Rb);
5042       mul(Rlo_ab, Ra, Rb);
5043       ldr(Ra, pre(Pa, wordSize));
5044       ldr(Rb, pre(Pb, -wordSize));
5045       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
5046                                        // previous iteration.
5047       // MACC(Rm, Rn, t0, t1, t2);
5048       // Rm = *++Pm;
5049       // Rn = *--Pn;
5050       umulh(Rhi_mn, Rm, Rn);
5051       mul(Rlo_mn, Rm, Rn);
5052       ldr(Rm, pre(Pm, wordSize));
5053       ldr(Rn, pre(Pn, -wordSize));
5054       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5055     }
5056 
5057     void post1() {
5058       block_comment("post1");
5059 
5060       // MACC(Ra, Rb, t0, t1, t2);
5061       // Ra = *++Pa;
5062       // Rb = *--Pb;
5063       umulh(Rhi_ab, Ra, Rb);
5064       mul(Rlo_ab, Ra, Rb);
5065       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5066       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5067 
5068       // *Pm = Rm = t0 * inv;
5069       mul(Rm, t0, inv);
5070       str(Rm, Address(Pm));
5071 
5072       // MACC(Rm, Rn, t0, t1, t2);
5073       // t0 = t1; t1 = t2; t2 = 0;
5074       umulh(Rhi_mn, Rm, Rn);
5075 
5076 #ifndef PRODUCT
5077       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5078       {
5079         mul(Rlo_mn, Rm, Rn);
5080         add(Rlo_mn, t0, Rlo_mn);
5081         Label ok;
5082         cbz(Rlo_mn, ok); {
5083           stop("broken Montgomery multiply");
5084         } bind(ok);
5085       }
5086 #endif
5087       // We have very carefully set things up so that
5088       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5089       // the lower half of Rm * Rn because we know the result already:
5090       // it must be -t0.  t0 + (-t0) must generate a carry iff
5091       // t0 != 0.  So, rather than do a mul and an adds we just set
5092       // the carry flag iff t0 is nonzero.
5093       //
5094       // mul(Rlo_mn, Rm, Rn);
5095       // adds(zr, t0, Rlo_mn);
5096       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5097       adcs(t0, t1, Rhi_mn);
5098       adc(t1, t2, zr);
5099       mov(t2, zr);
5100     }
5101 
5102     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
5103       block_comment("pre2");
5104       // Pa = Pa_base + i-len;
5105       // Pb = Pb_base + len;
5106       // Pm = Pm_base + i-len;
5107       // Pn = Pn_base + len;
5108 
5109       if (i.is_register()) {
5110         sub(Rj, i.as_register(), len);
5111       } else {
5112         mov(Rj, i.as_constant());
5113         sub(Rj, Rj, len);
5114       }
5115       // Rj == i-len
5116 
5117       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5118       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5119       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5120       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5121 
5122       // Ra = *++Pa;
5123       // Rb = *--Pb;
5124       // Rm = *++Pm;
5125       // Rn = *--Pn;
5126       ldr(Ra, pre(Pa, wordSize));
5127       ldr(Rb, pre(Pb, -wordSize));
5128       ldr(Rm, pre(Pm, wordSize));
5129       ldr(Rn, pre(Pn, -wordSize));
5130 
5131       mov(Rhi_mn, zr);
5132       mov(Rlo_mn, zr);
5133     }
5134 
5135     void post2(RegisterOrConstant i, RegisterOrConstant len) {
5136       block_comment("post2");
5137       if (i.is_constant()) {
5138         mov(Rj, i.as_constant()-len.as_constant());
5139       } else {
5140         sub(Rj, i.as_register(), len);
5141       }
5142 
5143       adds(t0, t0, Rlo_mn); // The pending m*n, low part
5144 
5145       // As soon as we know the least significant digit of our result,
5146       // store it.
5147       // Pm_base[i-len] = t0;
5148       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5149 
5150       // t0 = t1; t1 = t2; t2 = 0;
5151       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5152       adc(t1, t2, zr);
5153       mov(t2, zr);
5154     }
5155 
5156     // A carry in t0 after Montgomery multiplication means that we
5157     // should subtract multiples of n from our result in m.  We'll
5158     // keep doing that until there is no carry.
5159     void normalize(RegisterOrConstant len) {
5160       block_comment("normalize");
5161       // while (t0)
5162       //   t0 = sub(Pm_base, Pn_base, t0, len);
5163       Label loop, post, again;
5164       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5165       cbz(t0, post); {
5166         bind(again); {
5167           mov(i, zr);
5168           mov(cnt, len);
5169           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5170           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5171           subs(zr, zr, zr); // set carry flag, i.e. no borrow
5172           align(16);
5173           bind(loop); {
5174             sbcs(Rm, Rm, Rn);
5175             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5176             add(i, i, 1);
5177             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5178             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5179             sub(cnt, cnt, 1);
5180           } cbnz(cnt, loop);
5181           sbc(t0, t0, zr);
5182         } cbnz(t0, again);
5183       } bind(post);
5184     }
5185 
5186     // Move memory at s to d, reversing words.
5187     //    Increments d to end of copied memory
5188     //    Destroys tmp1, tmp2
5189     //    Preserves len
5190     //    Leaves s pointing to the address which was in d at start
5191     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
5192       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5193 
5194       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
5195       mov(tmp1, len);
5196       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
5197       sub(s, d, len, ext::uxtw, LogBytesPerWord);
5198     }
    // where reverse1() reverses one word by swapping its 32-bit halves:
5200     void reverse1(Register d, Register s, Register tmp) {
5201       ldr(tmp, pre(s, -wordSize));
5202       ror(tmp, tmp, 32);
5203       str(tmp, post(d, wordSize));
5204     }
5205 
5206     void step_squaring() {
5207       // An extra ACC
5208       step();
5209       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5210     }
5211 
5212     void last_squaring(RegisterOrConstant i) {
5213       Label dont;
5214       // if ((i & 1) == 0) {
5215       tbnz(i.as_register(), 0, dont); {
5216         // MACC(Ra, Rb, t0, t1, t2);
5217         // Ra = *++Pa;
5218         // Rb = *--Pb;
5219         umulh(Rhi_ab, Ra, Rb);
5220         mul(Rlo_ab, Ra, Rb);
5221         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5222       } bind(dont);
5223     }
5224 
5225     void extra_step_squaring() {
5226       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5227 
5228       // MACC(Rm, Rn, t0, t1, t2);
5229       // Rm = *++Pm;
5230       // Rn = *--Pn;
5231       umulh(Rhi_mn, Rm, Rn);
5232       mul(Rlo_mn, Rm, Rn);
5233       ldr(Rm, pre(Pm, wordSize));
5234       ldr(Rn, pre(Pn, -wordSize));
5235     }
5236 
5237     void post1_squaring() {
5238       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5239 
5240       // *Pm = Rm = t0 * inv;
5241       mul(Rm, t0, inv);
5242       str(Rm, Address(Pm));
5243 
5244       // MACC(Rm, Rn, t0, t1, t2);
5245       // t0 = t1; t1 = t2; t2 = 0;
5246       umulh(Rhi_mn, Rm, Rn);
5247 
5248 #ifndef PRODUCT
5249       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5250       {
5251         mul(Rlo_mn, Rm, Rn);
5252         add(Rlo_mn, t0, Rlo_mn);
5253         Label ok;
5254         cbz(Rlo_mn, ok); {
5255           stop("broken Montgomery multiply");
5256         } bind(ok);
5257       }
5258 #endif
5259       // We have very carefully set things up so that
5260       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5261       // the lower half of Rm * Rn because we know the result already:
5262       // it must be -t0.  t0 + (-t0) must generate a carry iff
5263       // t0 != 0.  So, rather than do a mul and an adds we just set
5264       // the carry flag iff t0 is nonzero.
5265       //
5266       // mul(Rlo_mn, Rm, Rn);
5267       // adds(zr, t0, Rlo_mn);
5268       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5269       adcs(t0, t1, Rhi_mn);
5270       adc(t1, t2, zr);
5271       mov(t2, zr);
5272     }
5273 
5274     void acc(Register Rhi, Register Rlo,
5275              Register t0, Register t1, Register t2) {
5276       adds(t0, t0, Rlo);
5277       adcs(t1, t1, Rhi);
5278       adc(t2, t2, zr);
5279     }
5280 
5281   public:
5282     /**
5283      * Fast Montgomery multiplication.  The derivation of the
5284      * algorithm is in A Cryptographic Library for the Motorola
5285      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5286      *
5287      * Arguments:
5288      *
5289      * Inputs for multiplication:
5290      *   c_rarg0   - int array elements a
5291      *   c_rarg1   - int array elements b
5292      *   c_rarg2   - int array elements n (the modulus)
5293      *   c_rarg3   - int length
5294      *   c_rarg4   - int inv
5295      *   c_rarg5   - int array elements m (the result)
5296      *
5297      * Inputs for squaring:
5298      *   c_rarg0   - int array elements a
5299      *   c_rarg1   - int array elements n (the modulus)
5300      *   c_rarg2   - int length
5301      *   c_rarg3   - int inv
5302      *   c_rarg4   - int array elements m (the result)
5303      *
5304      */
5305     address generate_multiply() {
5306       Label argh, nothing;
5307       bind(argh);
5308       stop("MontgomeryMultiply total_allocation must be <= 8192");
5309 
5310       align(CodeEntryAlignment);
5311       address entry = pc();
5312 
5313       cbzw(Rlen, nothing);
5314 
5315       enter();
5316 
5317       // Make room.
5318       cmpw(Rlen, 512);
5319       br(Assembler::HI, argh);
5320       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5321       andr(sp, Ra, -2 * wordSize);
5322 
5323       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5324 
5325       {
5326         // Copy input args, reversing as we go.  We use Ra as a
5327         // temporary variable.
5328         reverse(Ra, Pa_base, Rlen, t0, t1);
5329         if (!_squaring)
5330           reverse(Ra, Pb_base, Rlen, t0, t1);
5331         reverse(Ra, Pn_base, Rlen, t0, t1);
5332       }
5333 
5334       // Push all call-saved registers and also Pm_base which we'll need
5335       // at the end.
5336       save_regs();
5337 
5338 #ifndef PRODUCT
5339       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
5340       {
5341         ldr(Rn, Address(Pn_base, 0));
5342         mul(Rlo_mn, Rn, inv);
5343         cmp(Rlo_mn, -1);
5344         Label ok;
5345         br(EQ, ok); {
5346           stop("broken inverse in Montgomery multiply");
5347         } bind(ok);
5348       }
5349 #endif
5350 
5351       mov(Pm_base, Ra);
5352 
5353       mov(t0, zr);
5354       mov(t1, zr);
5355       mov(t2, zr);
5356 
5357       block_comment("for (int i = 0; i < len; i++) {");
5358       mov(Ri, zr); {
5359         Label loop, end;
5360         cmpw(Ri, Rlen);
5361         br(Assembler::GE, end);
5362 
5363         bind(loop);
5364         pre1(Ri);
5365 
5366         block_comment("  for (j = i; j; j--) {"); {
5367           movw(Rj, Ri);
5368           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5369         } block_comment("  } // j");
5370 
5371         post1();
5372         addw(Ri, Ri, 1);
5373         cmpw(Ri, Rlen);
5374         br(Assembler::LT, loop);
5375         bind(end);
5376         block_comment("} // i");
5377       }
5378 
5379       block_comment("for (int i = len; i < 2*len; i++) {");
5380       mov(Ri, Rlen); {
5381         Label loop, end;
5382         cmpw(Ri, Rlen, Assembler::LSL, 1);
5383         br(Assembler::GE, end);
5384 
5385         bind(loop);
5386         pre2(Ri, Rlen);
5387 
5388         block_comment("  for (j = len*2-i-1; j; j--) {"); {
5389           lslw(Rj, Rlen, 1);
5390           subw(Rj, Rj, Ri);
5391           subw(Rj, Rj, 1);
5392           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5393         } block_comment("  } // j");
5394 
5395         post2(Ri, Rlen);
5396         addw(Ri, Ri, 1);
5397         cmpw(Ri, Rlen, Assembler::LSL, 1);
5398         br(Assembler::LT, loop);
5399         bind(end);
5400       }
5401       block_comment("} // i");
5402 
5403       normalize(Rlen);
5404 
5405       mov(Ra, Pm_base);  // Save Pm_base in Ra
5406       restore_regs();  // Restore caller's Pm_base
5407 
5408       // Copy our result into caller's Pm_base
5409       reverse(Pm_base, Ra, Rlen, t0, t1);
5410 
5411       leave();
5412       bind(nothing);
5413       ret(lr);
5414 
5415       return entry;
5416     }
5417     // In C, approximately:
5418 
5419     // void
5420     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
5421     //                     unsigned long Pn_base[], unsigned long Pm_base[],
5422     //                     unsigned long inv, int len) {
5423     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5424     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5425     //   unsigned long Ra, Rb, Rn, Rm;
5426 
5427     //   int i;
5428 
5429     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5430 
5431     //   for (i = 0; i < len; i++) {
5432     //     int j;
5433 
5434     //     Pa = Pa_base;
5435     //     Pb = Pb_base + i;
5436     //     Pm = Pm_base;
5437     //     Pn = Pn_base + i;
5438 
5439     //     Ra = *Pa;
5440     //     Rb = *Pb;
5441     //     Rm = *Pm;
5442     //     Rn = *Pn;
5443 
5444     //     int iters = i;
5445     //     for (j = 0; iters--; j++) {
5446     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5447     //       MACC(Ra, Rb, t0, t1, t2);
5448     //       Ra = *++Pa;
5449     //       Rb = *--Pb;
5450     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5451     //       MACC(Rm, Rn, t0, t1, t2);
5452     //       Rm = *++Pm;
5453     //       Rn = *--Pn;
5454     //     }
5455 
5456     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
5457     //     MACC(Ra, Rb, t0, t1, t2);
5458     //     *Pm = Rm = t0 * inv;
5459     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5460     //     MACC(Rm, Rn, t0, t1, t2);
5461 
5462     //     assert(t0 == 0, "broken Montgomery multiply");
5463 
5464     //     t0 = t1; t1 = t2; t2 = 0;
5465     //   }
5466 
5467     //   for (i = len; i < 2*len; i++) {
5468     //     int j;
5469 
5470     //     Pa = Pa_base + i-len;
5471     //     Pb = Pb_base + len;
5472     //     Pm = Pm_base + i-len;
5473     //     Pn = Pn_base + len;
5474 
5475     //     Ra = *++Pa;
5476     //     Rb = *--Pb;
5477     //     Rm = *++Pm;
5478     //     Rn = *--Pn;
5479 
5480     //     int iters = len*2-i-1;
5481     //     for (j = i-len+1; iters--; j++) {
5482     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5483     //       MACC(Ra, Rb, t0, t1, t2);
5484     //       Ra = *++Pa;
5485     //       Rb = *--Pb;
5486     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5487     //       MACC(Rm, Rn, t0, t1, t2);
5488     //       Rm = *++Pm;
5489     //       Rn = *--Pn;
5490     //     }
5491 
5492     //     Pm_base[i-len] = t0;
5493     //     t0 = t1; t1 = t2; t2 = 0;
5494     //   }
5495 
5496     //   while (t0)
5497     //     t0 = sub(Pm_base, Pn_base, t0, len);
5498     // }
5499 
5500     /**
     * Fast Montgomery squaring.  This uses asymptotically 25% fewer
     * multiplies than Montgomery multiplication so it should be up to
     * 25% faster.  (Squaring halves the a*b multiplies, but the matching
     * m*n multiplies cannot be reduced, hence roughly 25% overall.)
     * However, its loop control is more complex and it may actually run
     * slower on some machines.
5505      *
5506      * Arguments:
5507      *
5508      * Inputs:
5509      *   c_rarg0   - int array elements a
5510      *   c_rarg1   - int array elements n (the modulus)
5511      *   c_rarg2   - int length
5512      *   c_rarg3   - int inv
5513      *   c_rarg4   - int array elements m (the result)
5514      *
5515      */
5516     address generate_square() {
5517       Label argh;
5518       bind(argh);
5519       stop("MontgomeryMultiply total_allocation must be <= 8192");
5520 
5521       align(CodeEntryAlignment);
5522       address entry = pc();
5523 
5524       enter();
5525 
5526       // Make room.
5527       cmpw(Rlen, 512);
5528       br(Assembler::HI, argh);
5529       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5530       andr(sp, Ra, -2 * wordSize);
5531 
5532       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5533 
5534       {
5535         // Copy input args, reversing as we go.  We use Ra as a
5536         // temporary variable.
5537         reverse(Ra, Pa_base, Rlen, t0, t1);
5538         reverse(Ra, Pn_base, Rlen, t0, t1);
5539       }
5540 
5541       // Push all call-saved registers and also Pm_base which we'll need
5542       // at the end.
5543       save_regs();
5544 
5545       mov(Pm_base, Ra);
5546 
5547       mov(t0, zr);
5548       mov(t1, zr);
5549       mov(t2, zr);
5550 
5551       block_comment("for (int i = 0; i < len; i++) {");
5552       mov(Ri, zr); {
5553         Label loop, end;
5554         bind(loop);
5555         cmp(Ri, Rlen);
5556         br(Assembler::GE, end);
5557 
5558         pre1(Ri);
5559 
5560         block_comment("for (j = (i+1)/2; j; j--) {"); {
5561           add(Rj, Ri, 1);
5562           lsr(Rj, Rj, 1);
5563           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5564         } block_comment("  } // j");
5565 
5566         last_squaring(Ri);
5567 
5568         block_comment("  for (j = i/2; j; j--) {"); {
5569           lsr(Rj, Ri, 1);
5570           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5571         } block_comment("  } // j");
5572 
5573         post1_squaring();
5574         add(Ri, Ri, 1);
5575         cmp(Ri, Rlen);
5576         br(Assembler::LT, loop);
5577 
5578         bind(end);
5579         block_comment("} // i");
5580       }
5581 
5582       block_comment("for (int i = len; i < 2*len; i++) {");
5583       mov(Ri, Rlen); {
5584         Label loop, end;
5585         bind(loop);
5586         cmp(Ri, Rlen, Assembler::LSL, 1);
5587         br(Assembler::GE, end);
5588 
5589         pre2(Ri, Rlen);
5590 
5591         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
5592           lsl(Rj, Rlen, 1);
5593           sub(Rj, Rj, Ri);
5594           sub(Rj, Rj, 1);
5595           lsr(Rj, Rj, 1);
5596           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5597         } block_comment("  } // j");
5598 
5599         last_squaring(Ri);
5600 
5601         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
5602           lsl(Rj, Rlen, 1);
5603           sub(Rj, Rj, Ri);
5604           lsr(Rj, Rj, 1);
5605           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5606         } block_comment("  } // j");
5607 
5608         post2(Ri, Rlen);
5609         add(Ri, Ri, 1);
5610         cmp(Ri, Rlen, Assembler::LSL, 1);
5611 
5612         br(Assembler::LT, loop);
5613         bind(end);
5614         block_comment("} // i");
5615       }
5616 
5617       normalize(Rlen);
5618 
5619       mov(Ra, Pm_base);  // Save Pm_base in Ra
5620       restore_regs();  // Restore caller's Pm_base
5621 
5622       // Copy our result into caller's Pm_base
5623       reverse(Pm_base, Ra, Rlen, t0, t1);
5624 
5625       leave();
5626       ret(lr);
5627 
5628       return entry;
5629     }
5630     // In C, approximately:
5631 
5632     // void
5633     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
5634     //                   unsigned long Pm_base[], unsigned long inv, int len) {
5635     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5636     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5637     //   unsigned long Ra, Rb, Rn, Rm;
5638 
5639     //   int i;
5640 
5641     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5642 
5643     //   for (i = 0; i < len; i++) {
5644     //     int j;
5645 
5646     //     Pa = Pa_base;
5647     //     Pb = Pa_base + i;
5648     //     Pm = Pm_base;
5649     //     Pn = Pn_base + i;
5650 
5651     //     Ra = *Pa;
5652     //     Rb = *Pb;
5653     //     Rm = *Pm;
5654     //     Rn = *Pn;
5655 
5656     //     int iters = (i+1)/2;
5657     //     for (j = 0; iters--; j++) {
5658     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5659     //       MACC2(Ra, Rb, t0, t1, t2);
5660     //       Ra = *++Pa;
5661     //       Rb = *--Pb;
5662     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5663     //       MACC(Rm, Rn, t0, t1, t2);
5664     //       Rm = *++Pm;
5665     //       Rn = *--Pn;
5666     //     }
5667     //     if ((i & 1) == 0) {
5668     //       assert(Ra == Pa_base[j], "must be");
5669     //       MACC(Ra, Ra, t0, t1, t2);
5670     //     }
5671     //     iters = i/2;
5672     //     assert(iters == i-j, "must be");
5673     //     for (; iters--; j++) {
5674     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5675     //       MACC(Rm, Rn, t0, t1, t2);
5676     //       Rm = *++Pm;
5677     //       Rn = *--Pn;
5678     //     }
5679 
5680     //     *Pm = Rm = t0 * inv;
5681     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5682     //     MACC(Rm, Rn, t0, t1, t2);
5683 
5684     //     assert(t0 == 0, "broken Montgomery multiply");
5685 
5686     //     t0 = t1; t1 = t2; t2 = 0;
5687     //   }
5688 
5689     //   for (i = len; i < 2*len; i++) {
5690     //     int start = i-len+1;
5691     //     int end = start + (len - start)/2;
5692     //     int j;
5693 
5694     //     Pa = Pa_base + i-len;
5695     //     Pb = Pa_base + len;
5696     //     Pm = Pm_base + i-len;
5697     //     Pn = Pn_base + len;
5698 
5699     //     Ra = *++Pa;
5700     //     Rb = *--Pb;
5701     //     Rm = *++Pm;
5702     //     Rn = *--Pn;
5703 
5704     //     int iters = (2*len-i-1)/2;
5705     //     assert(iters == end-start, "must be");
5706     //     for (j = start; iters--; j++) {
5707     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5708     //       MACC2(Ra, Rb, t0, t1, t2);
5709     //       Ra = *++Pa;
5710     //       Rb = *--Pb;
5711     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5712     //       MACC(Rm, Rn, t0, t1, t2);
5713     //       Rm = *++Pm;
5714     //       Rn = *--Pn;
5715     //     }
5716     //     if ((i & 1) == 0) {
5717     //       assert(Ra == Pa_base[j], "must be");
5718     //       MACC(Ra, Ra, t0, t1, t2);
5719     //     }
5720     //     iters =  (2*len-i)/2;
5721     //     assert(iters == len-j, "must be");
5722     //     for (; iters--; j++) {
5723     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5724     //       MACC(Rm, Rn, t0, t1, t2);
5725     //       Rm = *++Pm;
5726     //       Rn = *--Pn;
5727     //     }
5728     //     Pm_base[i-len] = t0;
5729     //     t0 = t1; t1 = t2; t2 = 0;
5730     //   }
5731 
5732     //   while (t0)
5733     //     t0 = sub(Pm_base, Pn_base, t0, len);
5734     // }
5735   };
5736 
5737 
5738   // Initialization
5739   void generate_initial() {
    // Generates the initial stubs and initializes the entry points
5741 
    // Entry points that exist on all platforms. Note: this is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment in
    // stubRoutines.hpp.
5747 
5748     StubRoutines::_forward_exception_entry = generate_forward_exception();
5749 
5750     StubRoutines::_call_stub_entry =
5751       generate_call_stub(StubRoutines::_call_stub_return_address);
5752 
5753     // is referenced by megamorphic call
5754     StubRoutines::_catch_exception_entry = generate_catch_exception();
5755 
5756     // Build this early so it's available for the interpreter.
5757     StubRoutines::_throw_StackOverflowError_entry =
5758       generate_throw_exception("StackOverflowError throw_exception",
5759                                CAST_FROM_FN_PTR(address,
5760                                                 SharedRuntime::throw_StackOverflowError));
5761     StubRoutines::_throw_delayed_StackOverflowError_entry =
5762       generate_throw_exception("delayed StackOverflowError throw_exception",
5763                                CAST_FROM_FN_PTR(address,
5764                                                 SharedRuntime::throw_delayed_StackOverflowError));
5765     if (UseCRC32Intrinsics) {
      // set the table address before generating the stub that uses it
5767       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5768       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5769     }
5770 
5771     if (UseCRC32CIntrinsics) {
5772       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5773     }
5774 
5775     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5776       StubRoutines::_dlog = generate_dlog();
5777     }
5778 
5779     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5780       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5781     }
5782 
5783     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5784       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5785     }
5786   }
5787 
5788   void generate_all() {
5789     // support for verify_oop (must happen after universe_init)
5790     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
5791     StubRoutines::_throw_AbstractMethodError_entry =
5792       generate_throw_exception("AbstractMethodError throw_exception",
5793                                CAST_FROM_FN_PTR(address,
5794                                                 SharedRuntime::
5795                                                 throw_AbstractMethodError));
5796 
5797     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5798       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5799                                CAST_FROM_FN_PTR(address,
5800                                                 SharedRuntime::
5801                                                 throw_IncompatibleClassChangeError));
5802 
5803     StubRoutines::_throw_NullPointerException_at_call_entry =
5804       generate_throw_exception("NullPointerException at call throw_exception",
5805                                CAST_FROM_FN_PTR(address,
5806                                                 SharedRuntime::
5807                                                 throw_NullPointerException_at_call));
5808 
5809     // arraycopy stubs used by compilers
5810     generate_arraycopy_stubs();
5811 
5812     // has_negatives stub for large arrays.
5813     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5814 
5815     // array equals stub for large arrays.
5816     if (!UseSimpleArrayEquals) {
5817       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5818     }
5819 
5820     generate_compare_long_strings();
5821 
5822     generate_string_indexof_stubs();
5823 
5824     // byte_array_inflate stub for large arrays.
5825     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
5826 
5827     if (UseMultiplyToLenIntrinsic) {
5828       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5829     }
5830 
5831     if (UseSquareToLenIntrinsic) {
5832       StubRoutines::_squareToLen = generate_squareToLen();
5833     }
5834 
5835     if (UseMulAddIntrinsic) {
5836       StubRoutines::_mulAdd = generate_mulAdd();
5837     }
5838 
5839     if (UseMontgomeryMultiplyIntrinsic) {
5840       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5841       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5842       StubRoutines::_montgomeryMultiply = g.generate_multiply();
5843     }
5844 
5845     if (UseMontgomerySquareIntrinsic) {
5846       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5847       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5848       // We use generate_multiply() rather than generate_square()
5849       // because it's faster for the modulus sizes we care about.
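      // (Montgomery squaring of a is just Montgomery multiplication of a
      // by itself; the squaring flag only lets the generator fold the
      // symmetric partial products, as the MACC2 steps in the reference
      // code above illustrate.)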
5850       StubRoutines::_montgomerySquare = g.generate_multiply();
5851     }
5852 
5853 #ifndef BUILTIN_SIM
5854     // generate GHASH intrinsics code
5855     if (UseGHASHIntrinsics) {
5856       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5857     }
5858 
5859     // data cache line writeback
5860     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
5861     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
5862
5863     if (UseAESIntrinsics) {
5864       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5865       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5866       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5867       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
5868     }
5869 
5870     if (UseSHA1Intrinsics) {
5871       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5872       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5873     }
5874     if (UseSHA256Intrinsics) {
5875       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5876       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5877     }
5878 
5879     // generate Adler32 intrinsics code
5880     if (UseAdler32Intrinsics) {
5881       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
5882     }
5883 
5884     // Safefetch stubs.
5885     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5886                                                        &StubRoutines::_safefetch32_fault_pc,
5887                                                        &StubRoutines::_safefetch32_continuation_pc);
5888     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5889                                                        &StubRoutines::_safefetchN_fault_pc,
5890                                                        &StubRoutines::_safefetchN_continuation_pc);
5891 #endif
5892     StubRoutines::aarch64::set_completed();
5893   }
5894 
5895  public:
5896   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5897     if (all) {
5898       generate_all();
5899     } else {
5900       generate_initial();
5901     }
5902   }
5903 }; // end class declaration
5904 
5905 void StubGenerator_generate(CodeBuffer* code, bool all) {
5906   StubGenerator g(code, all);
5907 }
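
// A sketch of how the two-phase generation above is typically driven during
// VM startup (an assumption about the shared call sites in stubRoutines.cpp,
// shown only for orientation; _code1/_code2 are taken to be the stub blobs):
//
//   CodeBuffer buffer1(StubRoutines::_code1);   // early blob
//   StubGenerator_generate(&buffer1, false);    // runs generate_initial()
//   ...                                         // later, after universe_init
//   CodeBuffer buffer2(StubRoutines::_code2);
//   StubGenerator_generate(&buffer2, true);     // runs generate_all()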