1 /*
   2  * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "nativeInst_aarch64.hpp"
  33 #include "oops/instanceOop.hpp"
  34 #include "oops/method.hpp"
  35 #include "oops/objArrayKlass.hpp"
  36 #include "oops/oop.inline.hpp"
  37 #include "prims/methodHandles.hpp"
  38 #include "runtime/frame.inline.hpp"
  39 #include "runtime/handles.inline.hpp"
  40 #include "runtime/sharedRuntime.hpp"
  41 #include "runtime/stubCodeGenerator.hpp"
  42 #include "runtime/stubRoutines.hpp"
  43 #include "runtime/thread.inline.hpp"
  44 #include "utilities/align.hpp"
  45 #ifdef COMPILER2
  46 #include "opto/runtime.hpp"
  47 #endif
  48 
  49 #ifdef BUILTIN_SIM
  50 #include "../../../../../../simulator/simulator.hpp"
  51 #endif
  52 
  53 // Declaration and definition of StubGenerator (no .hpp file).
  54 // For a more detailed description of the stub routine structure
  55 // see the comment in stubRoutines.hpp
  56 
  57 #undef __
  58 #define __ _masm->
  59 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  60 
  61 #ifdef PRODUCT
  62 #define BLOCK_COMMENT(str) /* nothing */
  63 #else
  64 #define BLOCK_COMMENT(str) __ block_comment(str)
  65 #endif
  66 
  67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  68 
  69 // Stub Code definitions
  70 
  71 class StubGenerator: public StubCodeGenerator {
  72  private:
  73 
  74 #ifdef PRODUCT
  75 #define inc_counter_np(counter) ((void)0)
  76 #else
  77   void inc_counter_np_(int& counter) {
  78     __ lea(rscratch2, ExternalAddress((address)&counter));
  79     __ ldrw(rscratch1, Address(rscratch2));
  80     __ addw(rscratch1, rscratch1, 1);
  81     __ strw(rscratch1, Address(rscratch2));
  82   }
  83 #define inc_counter_np(counter) \
  84   BLOCK_COMMENT("inc_counter " #counter); \
  85   inc_counter_np_(counter);
  86 #endif
  87 
  88   // Call stubs are used to call Java from C
  89   //
  90   // Arguments:
  91   //    c_rarg0:   call wrapper address                   address
  92   //    c_rarg1:   result                                 address
  93   //    c_rarg2:   result type                            BasicType
  94   //    c_rarg3:   method                                 Method*
  95   //    c_rarg4:   (interpreter) entry point              address
  96   //    c_rarg5:   parameters                             intptr_t*
  97   //    c_rarg6:   parameter size (in words)              int
  98   //    c_rarg7:   thread                                 Thread*
  99   //
 100   // There is no return from the stub itself as any Java result
 101   // is written to result
 102   //
 103   // we save r30 (lr) as the return PC at the base of the frame and
 104   // link r29 (fp) below it as the frame pointer installing sp (r31)
 105   // into fp.
 106   //
 107   // we save r0-r7, which accounts for all the c arguments.
 108   //
 109   // TODO: strictly do we need to save them all? they are treated as
 110   // volatile by C so could we omit saving the ones we are going to
 111   // place in global registers (thread? method?) or those we only use
 112   // during setup of the Java call?
 113   //
 114   // we don't need to save r8 which C uses as an indirect result location
 115   // return register.
 116   //
 117   // we don't need to save r9-r15 which both C and Java treat as
 118   // volatile
 119   //
 120   // we don't need to save r16-18 because Java does not use them
 121   //
 122   // we save r19-r28 which Java uses as scratch registers and C
 123   // expects to be callee-save
 124   //
 125   // we save the bottom 64 bits of each value stored in v8-v15; it is
 126   // the responsibility of the caller to preserve larger values.
 127   //
 128   // so the stub frame looks like this when we enter Java code
 129   //
 130   //     [ return_from_Java     ] <--- sp
 131   //     [ argument word n      ]
 132   //      ...
 133   // -27 [ argument word 1      ]
 134   // -26 [ saved v15            ] <--- sp_after_call
 135   // -25 [ saved v14            ]
 136   // -24 [ saved v13            ]
 137   // -23 [ saved v12            ]
 138   // -22 [ saved v11            ]
 139   // -21 [ saved v10            ]
 140   // -20 [ saved v9             ]
 141   // -19 [ saved v8             ]
 142   // -18 [ saved r28            ]
 143   // -17 [ saved r27            ]
 144   // -16 [ saved r26            ]
 145   // -15 [ saved r25            ]
 146   // -14 [ saved r24            ]
 147   // -13 [ saved r23            ]
 148   // -12 [ saved r22            ]
 149   // -11 [ saved r21            ]
 150   // -10 [ saved r20            ]
 151   //  -9 [ saved r19            ]
 152   //  -8 [ call wrapper    (r0) ]
 153   //  -7 [ result          (r1) ]
 154   //  -6 [ result type     (r2) ]
 155   //  -5 [ method          (r3) ]
 156   //  -4 [ entry point     (r4) ]
 157   //  -3 [ parameters      (r5) ]
 158   //  -2 [ parameter size  (r6) ]
 159   //  -1 [ thread          (r7) ]
 160   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 161   //   1 [ saved lr       (r30) ]
 162 
 163   // Call stub stack layout word offsets from fp
 164   enum call_stub_layout {
 165     sp_after_call_off = -26,
 166 
 167     d15_off            = -26,
 168     d13_off            = -24,
 169     d11_off            = -22,
 170     d9_off             = -20,
 171 
 172     r28_off            = -18,
 173     r26_off            = -16,
 174     r24_off            = -14,
 175     r22_off            = -12,
 176     r20_off            = -10,
 177     call_wrapper_off   =  -8,
 178     result_off         =  -7,
 179     result_type_off    =  -6,
 180     method_off         =  -5,
 181     entry_point_off    =  -4,
 182     parameter_size_off =  -2,
 183     thread_off         =  -1,
 184     fp_f               =   0,
 185     retaddr_off        =   1,
 186   };
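
  // For orientation: C++ code reaches this stub through the function
  // pointer returned by StubRoutines::call_stub().  The following is an
  // illustrative sketch only (see the CallStub typedef in stubRoutines.hpp
  // and JavaCalls::call_helper() for the real declaration and call site);
  // the local names used here are placeholders, not HotSpot identifiers.
  //
#if 0
  StubRoutines::call_stub()(
      (address)&wrapper,          // c_rarg0: call wrapper address
      result_val_address,         // c_rarg1: where to write the Java result
      result_type,                // c_rarg2: BasicType of the result
      method,                     // c_rarg3: Method* to invoke
      entry_point,                // c_rarg4: interpreter entry point
      parameters,                 // c_rarg5: intptr_t* argument block
      size_of_parameters,         // c_rarg6: argument count in words
      thread);                    // c_rarg7: current Thread*
#endif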
 187 
 188   address generate_call_stub(address& return_address) {
 189     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 190            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 191            "adjust this code");
 192 
 193     StubCodeMark mark(this, "StubRoutines", "call_stub");
 194     address start = __ pc();
 195 
 196     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 197 
 198     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 199     const Address result        (rfp, result_off         * wordSize);
 200     const Address result_type   (rfp, result_type_off    * wordSize);
 201     const Address method        (rfp, method_off         * wordSize);
 202     const Address entry_point   (rfp, entry_point_off    * wordSize);
 203     const Address parameter_size(rfp, parameter_size_off * wordSize);
 204 
 205     const Address thread        (rfp, thread_off         * wordSize);
 206 
 207     const Address d15_save      (rfp, d15_off * wordSize);
 208     const Address d13_save      (rfp, d13_off * wordSize);
 209     const Address d11_save      (rfp, d11_off * wordSize);
 210     const Address d9_save       (rfp, d9_off * wordSize);
 211 
 212     const Address r28_save      (rfp, r28_off * wordSize);
 213     const Address r26_save      (rfp, r26_off * wordSize);
 214     const Address r24_save      (rfp, r24_off * wordSize);
 215     const Address r22_save      (rfp, r22_off * wordSize);
 216     const Address r20_save      (rfp, r20_off * wordSize);
 217 
 218     // stub code
 219 
 220     // we need a C prolog to bootstrap the x86 caller into the sim
 221     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 222 
 223     address aarch64_entry = __ pc();
 224 
 225 #ifdef BUILTIN_SIM
 226     // Save sender's SP for stack traces.
 227     __ mov(rscratch1, sp);
 228     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 229 #endif
 230     // set up frame and move sp to end of save area
 231     __ enter();
 232     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 233 
 234     // save register parameters and Java scratch/global registers
 235     // n.b. we save thread even though it gets installed in
 236     // rthread because we want to sanity check rthread later
 237     __ str(c_rarg7,  thread);
 238     __ strw(c_rarg6, parameter_size);
 239     __ stp(c_rarg4, c_rarg5,  entry_point);
 240     __ stp(c_rarg2, c_rarg3,  result_type);
 241     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 242 
 243     __ stp(r20, r19,   r20_save);
 244     __ stp(r22, r21,   r22_save);
 245     __ stp(r24, r23,   r24_save);
 246     __ stp(r26, r25,   r26_save);
 247     __ stp(r28, r27,   r28_save);
 248 
 249     __ stpd(v9,  v8,   d9_save);
 250     __ stpd(v11, v10,  d11_save);
 251     __ stpd(v13, v12,  d13_save);
 252     __ stpd(v15, v14,  d15_save);
 253 
 254     // install Java thread in global register now we have saved
 255     // whatever value it held
 256     __ mov(rthread, c_rarg7);
 257     // And method
 258     __ mov(rmethod, c_rarg3);
 259 
 260     // set up the heapbase register
 261     __ reinit_heapbase();
 262 
 263 #ifdef ASSERT
 264     // make sure we have no pending exceptions
 265     {
 266       Label L;
 267       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 268       __ cmp(rscratch1, (u1)NULL_WORD);
 269       __ br(Assembler::EQ, L);
 270       __ stop("StubRoutines::call_stub: entered with pending exception");
 271       __ BIND(L);
 272     }
 273 #endif
 274     // pass parameters if any
 275     __ mov(esp, sp);
 276     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 277     __ andr(sp, rscratch1, -2 * wordSize);
 278 
 279     BLOCK_COMMENT("pass parameters if any");
 280     Label parameters_done;
 281     // parameter count is still in c_rarg6
 282     // and parameter pointer identifying param 1 is in c_rarg5
 283     __ cbzw(c_rarg6, parameters_done);
 284 
 285     address loop = __ pc();
 286     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 287     __ subsw(c_rarg6, c_rarg6, 1);
 288     __ push(rscratch1);
 289     __ br(Assembler::GT, loop);
 290 
 291     __ BIND(parameters_done);
 292 
 293     // call Java entry -- passing Method* and current sp
 294     //      rmethod: Method*
 295     //      r13: sender sp
 296     BLOCK_COMMENT("call Java function");
 297     __ mov(r13, sp);
 298     __ blr(c_rarg4);
 299 
 300     // tell the simulator we have returned to the stub
 301 
 302     // we do this here because the notify will already have been done
 303     // if we get to the next instruction via an exception
 304     //
 305     // n.b. adding this instruction here affects the calculation of
 306     // whether or not a routine returns to the call stub (used when
 307     // doing stack walks) since the normal test is to check the return
 308     // pc against the address saved below. so we may need to allow for
 309     // this extra instruction in the check.
 310 
 311     if (NotifySimulator) {
 312       __ notify(Assembler::method_reentry);
 313     }
 314     // save current address for use by exception handling code
 315 
 316     return_address = __ pc();
 317 
 318     // store result depending on type (everything that is not
 319     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 320     // n.b. this assumes Java returns an integral result in r0
 321     // and a floating result in j_farg0
 322     __ ldr(j_rarg2, result);
 323     Label is_long, is_float, is_double, exit;
 324     __ ldr(j_rarg1, result_type);
 325     __ cmp(j_rarg1, (u1)T_OBJECT);
 326     __ br(Assembler::EQ, is_long);
 327     __ cmp(j_rarg1, (u1)T_LONG);
 328     __ br(Assembler::EQ, is_long);
 329     __ cmp(j_rarg1, (u1)T_FLOAT);
 330     __ br(Assembler::EQ, is_float);
 331     __ cmp(j_rarg1, (u1)T_DOUBLE);
 332     __ br(Assembler::EQ, is_double);
 333 
 334     // handle T_INT case
 335     __ strw(r0, Address(j_rarg2));
 336 
 337     __ BIND(exit);
 338 
 339     // pop parameters
 340     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 341 
 342 #ifdef ASSERT
 343     // verify that threads correspond
 344     {
 345       Label L, S;
 346       __ ldr(rscratch1, thread);
 347       __ cmp(rthread, rscratch1);
 348       __ br(Assembler::NE, S);
 349       __ get_thread(rscratch1);
 350       __ cmp(rthread, rscratch1);
 351       __ br(Assembler::EQ, L);
 352       __ BIND(S);
 353       __ stop("StubRoutines::call_stub: threads must correspond");
 354       __ BIND(L);
 355     }
 356 #endif
 357 
 358     // restore callee-save registers
 359     __ ldpd(v15, v14,  d15_save);
 360     __ ldpd(v13, v12,  d13_save);
 361     __ ldpd(v11, v10,  d11_save);
 362     __ ldpd(v9,  v8,   d9_save);
 363 
 364     __ ldp(r28, r27,   r28_save);
 365     __ ldp(r26, r25,   r26_save);
 366     __ ldp(r24, r23,   r24_save);
 367     __ ldp(r22, r21,   r22_save);
 368     __ ldp(r20, r19,   r20_save);
 369 
 370     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 371     __ ldrw(c_rarg2, result_type);
 372     __ ldr(c_rarg3,  method);
 373     __ ldp(c_rarg4, c_rarg5,  entry_point);
 374     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 375 
 376 #ifndef PRODUCT
 377     // tell the simulator we are about to end Java execution
 378     if (NotifySimulator) {
 379       __ notify(Assembler::method_exit);
 380     }
 381 #endif
 382     // leave frame and return to caller
 383     __ leave();
 384     __ ret(lr);
 385 
 386     // handle return types different from T_INT
 387 
 388     __ BIND(is_long);
 389     __ str(r0, Address(j_rarg2, 0));
 390     __ br(Assembler::AL, exit);
 391 
 392     __ BIND(is_float);
 393     __ strs(j_farg0, Address(j_rarg2, 0));
 394     __ br(Assembler::AL, exit);
 395 
 396     __ BIND(is_double);
 397     __ strd(j_farg0, Address(j_rarg2, 0));
 398     __ br(Assembler::AL, exit);
 399 
 400     return start;
 401   }
 402 
 403   // Return point for a Java call if there's an exception thrown in
 404   // Java code.  The exception is caught and transformed into a
 405   // pending exception stored in JavaThread that can be tested from
 406   // within the VM.
 407   //
 408   // Note: Usually the parameters are removed by the callee. In case
 409   // of an exception crossing an activation frame boundary, that is
 410   // not the case if the callee is compiled code => need to setup the
 411   // rsp.
 412   //
 413   // r0: exception oop
 414 
 415   // NOTE: this is used as a target from the signal handler so it
 416   // needs an x86 prolog which returns into the current simulator
 417   // executing the generated catch_exception code. so the prolog
 418   // needs to install rax in a sim register and adjust the sim's
 419   // restart pc to enter the generated code at the start position
 420   // then return from native to simulated execution.
 421 
 422   address generate_catch_exception() {
 423     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 424     address start = __ pc();
 425 
 426     // same as in generate_call_stub():
 427     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 428     const Address thread        (rfp, thread_off         * wordSize);
 429 
 430 #ifdef ASSERT
 431     // verify that threads correspond
 432     {
 433       Label L, S;
 434       __ ldr(rscratch1, thread);
 435       __ cmp(rthread, rscratch1);
 436       __ br(Assembler::NE, S);
 437       __ get_thread(rscratch1);
 438       __ cmp(rthread, rscratch1);
 439       __ br(Assembler::EQ, L);
 440       __ bind(S);
 441       __ stop("StubRoutines::catch_exception: threads must correspond");
 442       __ bind(L);
 443     }
 444 #endif
 445 
 446     // set pending exception
 447     __ verify_oop(r0);
 448 
 449     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 450     __ mov(rscratch1, (address)__FILE__);
 451     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 452     __ movw(rscratch1, (int)__LINE__);
 453     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 454 
 455     // complete return to VM
 456     assert(StubRoutines::_call_stub_return_address != NULL,
 457            "_call_stub_return_address must have been generated before");
 458     __ b(StubRoutines::_call_stub_return_address);
 459 
 460     return start;
 461   }
 462 
 463   // Continuation point for runtime calls returning with a pending
 464   // exception.  The pending exception check happened in the runtime
 465   // or native call stub.  The pending exception in Thread is
 466   // converted into a Java-level exception.
 467   //
 468   // Contract with Java-level exception handlers:
 469   // r0: exception
 470   // r3: throwing pc
 471   //
 472   // NOTE: At entry of this stub, exception-pc must be in LR !!
 473 
 474   // NOTE: this is always used as a jump target within generated code
 475   // so it just needs to be generated code with no x86 prolog
 476 
 477   address generate_forward_exception() {
 478     StubCodeMark mark(this, "StubRoutines", "forward exception");
 479     address start = __ pc();
 480 
 481     // Upon entry, LR points to the return address returning into
 482     // Java (interpreted or compiled) code; i.e., the return address
 483     // becomes the throwing pc.
 484     //
 485     // Arguments pushed before the runtime call are still on the stack
 486     // but the exception handler will reset the stack pointer ->
 487     // ignore them.  A potential result in registers can be ignored as
 488     // well.
 489 
 490 #ifdef ASSERT
 491     // make sure this code is only executed if there is a pending exception
 492     {
 493       Label L;
 494       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 495       __ cbnz(rscratch1, L);
 496       __ stop("StubRoutines::forward exception: no pending exception (1)");
 497       __ bind(L);
 498     }
 499 #endif
 500 
 501     // compute exception handler into r19
 502 
 503     // call the VM to find the handler address associated with the
 504     // caller address. pass thread in r0 and caller pc (ret address)
 505     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 506     // the stack.
 507     __ mov(c_rarg1, lr);
 508     // lr will be trashed by the VM call so we move it to R19
 509     // (callee-saved) because we also need to pass it to the handler
 510     // returned by this call.
 511     __ mov(r19, lr);
 512     BLOCK_COMMENT("call exception_handler_for_return_address");
 513     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 514                          SharedRuntime::exception_handler_for_return_address),
 515                     rthread, c_rarg1);
 516     // we should not really care that lr is no longer the callee
 517     // address. we saved the value the handler needs in r19 so we can
 518     // just copy it to r3. however, the C2 handler will push its own
 519     // frame and then calls into the VM and the VM code asserts that
 520     // the PC for the frame above the handler belongs to a compiled
 521     // Java method. So, we restore lr here to satisfy that assert.
 522     __ mov(lr, r19);
 523     // setup r0 & r3 & clear pending exception
 524     __ mov(r3, r19);
 525     __ mov(r19, r0);
 526     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 527     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 528 
 529 #ifdef ASSERT
 530     // make sure exception is set
 531     {
 532       Label L;
 533       __ cbnz(r0, L);
 534       __ stop("StubRoutines::forward exception: no pending exception (2)");
 535       __ bind(L);
 536     }
 537 #endif
 538 
 539     // continue at exception handler
 540     // r0: exception
 541     // r3: throwing pc
 542     // r19: exception handler
 543     __ verify_oop(r0);
 544     __ br(r19);
 545 
 546     return start;
 547   }
 548 
 549   // Non-destructive plausibility checks for oops
 550   //
 551   // Arguments:
 552   //    r0: oop to verify
 553   //    rscratch1: error message
 554   //
 555   // Stack after saving c_rarg3:
 556   //    [tos + 0]: saved c_rarg3
 557   //    [tos + 1]: saved c_rarg2
 558   //    [tos + 2]: saved lr
 559   //    [tos + 3]: saved rscratch2
 560   //    [tos + 4]: saved r0
 561   //    [tos + 5]: saved rscratch1
 562   address generate_verify_oop() {
 563 
 564     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 565     address start = __ pc();
 566 
 567     Label exit, error;
 568 
 569     // save c_rarg2 and c_rarg3
 570     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 571 
 572     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 573     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 574     __ ldr(c_rarg3, Address(c_rarg2));
 575     __ add(c_rarg3, c_rarg3, 1);
 576     __ str(c_rarg3, Address(c_rarg2));
 577 
 578     // object is in r0
 579     // make sure object is 'reasonable'
 580     __ cbz(r0, exit); // if obj is NULL it is OK
 581 
 582     // Check if the oop is in the right area of memory
 583     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 584     __ andr(c_rarg2, r0, c_rarg3);
 585     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 586 
 587     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 588     // instruction here because the flags register is live.
 589     __ eor(c_rarg2, c_rarg2, c_rarg3);
 590     __ cbnz(c_rarg2, error);
 591 
 592     // make sure klass is 'reasonable', which is not zero.
 593     __ load_klass(r0, r0);  // get klass
 594     __ cbz(r0, error);      // if klass is NULL it is broken
 595 
 596     // return if everything seems ok
 597     __ bind(exit);
 598 
 599     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 600     __ ret(lr);
 601 
 602     // handle errors
 603     __ bind(error);
 604     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 605 
 606     __ push(RegSet::range(r0, r29), sp);
 607     // debug(char* msg, int64_t pc, int64_t regs[])
 608     __ mov(c_rarg0, rscratch1);      // pass address of error message
 609     __ mov(c_rarg1, lr);             // pass return address
 610     __ mov(c_rarg2, sp);             // pass address of regs on stack
 611 #ifndef PRODUCT
 612     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 613 #endif
 614     BLOCK_COMMENT("call MacroAssembler::debug");
 615     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 616     __ blrt(rscratch1, 3, 0, 1);
 617 
 618     return start;
 619   }
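
  // In C terms the "reasonable oop" test above is roughly
  //
  //   (cast_from_oop<uintptr_t>(obj) & Universe::verify_oop_mask())
  //        == Universe::verify_oop_bits()
  //
  // computed with eor + cbnz rather than cmp + a conditional branch so
  // that the caller's condition flags are left untouched.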
 620 
 621   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 622 
 623   // The inner part of zero_words().  This is the bulk operation,
 624   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 625   // caller is responsible for zeroing the last few words.
 626   //
 627   // Inputs:
 628   // r10: the HeapWord-aligned base address of an array to zero.
 629   // r11: the count in HeapWords, r11 > 0.
 630   //
 631   // Returns r10 and r11, adjusted for the caller to clear.
 632   // r10: the base address of the tail of words left to clear.
 633   // r11: the number of words in the tail.
 634   //      r11 < MacroAssembler::zero_words_block_size.
 635 
 636   address generate_zero_blocks() {
 637     Label done;
 638     Label base_aligned;
 639 
 640     Register base = r10, cnt = r11;
 641 
 642     __ align(CodeEntryAlignment);
 643     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 644     address start = __ pc();
 645 
 646     if (UseBlockZeroing) {
 647       int zva_length = VM_Version::zva_length();
 648 
 649       // Ensure ZVA length can be divided by 16. This is required by
 650       // the subsequent operations.
 651       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 652 
 653       __ tbz(base, 3, base_aligned);
 654       __ str(zr, Address(__ post(base, 8)));
 655       __ sub(cnt, cnt, 1);
 656       __ bind(base_aligned);
 657 
 658       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 659       // alignment.
 660       Label small;
 661       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 662       __ subs(rscratch1, cnt, low_limit >> 3);
 663       __ br(Assembler::LT, small);
 664       __ zero_dcache_blocks(base, cnt);
 665       __ bind(small);
 666     }
 667 
 668     {
 669       // Number of stp instructions we'll unroll
 670       const int unroll =
 671         MacroAssembler::zero_words_block_size / 2;
 672       // Clear the remaining blocks.
 673       Label loop;
 674       __ subs(cnt, cnt, unroll * 2);
 675       __ br(Assembler::LT, done);
 676       __ bind(loop);
 677       for (int i = 0; i < unroll; i++)
 678         __ stp(zr, zr, __ post(base, 16));
 679       __ subs(cnt, cnt, unroll * 2);
 680       __ br(Assembler::GE, loop);
 681       __ bind(done);
 682       __ add(cnt, cnt, unroll * 2);
 683     }
 684 
 685     __ ret(lr);
 686 
 687     return start;
 688   }
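
  // Sketch of the zero_blocks contract in C terms (illustrative only; the
  // function name and loop below are not HotSpot code, and the DC ZVA fast
  // path is elided):
  //
#if 0
  // base is r10, cnt (in words) is r11; both are live on entry and return.
  void zero_blocks_sketch(uintptr_t*& base, size_t& cnt) {
    while (cnt >= (size_t)MacroAssembler::zero_words_block_size) {
      for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) {
        *base++ = 0;
      }
      cnt -= MacroAssembler::zero_words_block_size;
    }
    // the caller (MacroAssembler::zero_words) is left to clear the
    // remaining cnt < zero_words_block_size words.
  }
#endif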
 689 
 690 
 691   typedef enum {
 692     copy_forwards = 1,
 693     copy_backwards = -1
 694   } copy_direction;
 695 
 696   // Bulk copy of blocks of 8 words.
 697   //
 698   // count is a count of words.
 699   //
 700   // Precondition: count >= 8
 701   //
 702   // Postconditions:
 703   //
 704   // The least significant bit of count contains the remaining count
 705   // of words to copy.  The rest of count is trash.
 706   //
 707   // s and d are adjusted to point to the remaining words to copy
 708   //
 709   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 710                            copy_direction direction) {
 711     int unit = wordSize * direction;
 712     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 713 
 714     int offset;
 715     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 716       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 717     const Register stride = r13;
 718 
 719     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 720     assert_different_registers(s, d, count, rscratch1);
 721 
 722     Label again, drain;
 723     const char *stub_name;
 724     if (direction == copy_forwards)
 725       stub_name = "forward_copy_longs";
 726     else
 727       stub_name = "backward_copy_longs";
 728 
 729     __ align(CodeEntryAlignment);
 730 
 731     StubCodeMark mark(this, "StubRoutines", stub_name);
 732 
 733     __ bind(start);
 734 
 735     Label unaligned_copy_long;
 736     if (AvoidUnalignedAccesses) {
 737       __ tbnz(d, 3, unaligned_copy_long);
 738     }
 739 
 740     if (direction == copy_forwards) {
 741       __ sub(s, s, bias);
 742       __ sub(d, d, bias);
 743     }
 744 
 745 #ifdef ASSERT
 746     // Make sure we are never given < 8 words
 747     {
 748       Label L;
 749       __ cmp(count, (u1)8);
 750       __ br(Assembler::GE, L);
 751       __ stop("generate_copy_longs called with < 8 words");
 752       __ bind(L);
 753     }
 754 #endif
 755 
 756     // Fill 8 registers
 757     if (UseSIMDForMemoryOps) {
 758       __ ldpq(v0, v1, Address(s, 4 * unit));
 759       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 760     } else {
 761       __ ldp(t0, t1, Address(s, 2 * unit));
 762       __ ldp(t2, t3, Address(s, 4 * unit));
 763       __ ldp(t4, t5, Address(s, 6 * unit));
 764       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 765     }
 766 
 767     __ subs(count, count, 16);
 768     __ br(Assembler::LO, drain);
 769 
 770     int prefetch = PrefetchCopyIntervalInBytes;
 771     bool use_stride = false;
 772     if (direction == copy_backwards) {
 773        use_stride = prefetch > 256;
 774        prefetch = -prefetch;
 775        if (use_stride) __ mov(stride, prefetch);
 776     }
 777 
 778     __ bind(again);
 779 
 780     if (PrefetchCopyIntervalInBytes > 0)
 781       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 782 
 783     if (UseSIMDForMemoryOps) {
 784       __ stpq(v0, v1, Address(d, 4 * unit));
 785       __ ldpq(v0, v1, Address(s, 4 * unit));
 786       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 787       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 788     } else {
 789       __ stp(t0, t1, Address(d, 2 * unit));
 790       __ ldp(t0, t1, Address(s, 2 * unit));
 791       __ stp(t2, t3, Address(d, 4 * unit));
 792       __ ldp(t2, t3, Address(s, 4 * unit));
 793       __ stp(t4, t5, Address(d, 6 * unit));
 794       __ ldp(t4, t5, Address(s, 6 * unit));
 795       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 796       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 797     }
 798 
 799     __ subs(count, count, 8);
 800     __ br(Assembler::HS, again);
 801 
 802     // Drain
 803     __ bind(drain);
 804     if (UseSIMDForMemoryOps) {
 805       __ stpq(v0, v1, Address(d, 4 * unit));
 806       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 807     } else {
 808       __ stp(t0, t1, Address(d, 2 * unit));
 809       __ stp(t2, t3, Address(d, 4 * unit));
 810       __ stp(t4, t5, Address(d, 6 * unit));
 811       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 812     }
 813 
 814     {
 815       Label L1, L2;
 816       __ tbz(count, exact_log2(4), L1);
 817       if (UseSIMDForMemoryOps) {
 818         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 819         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 820       } else {
 821         __ ldp(t0, t1, Address(s, 2 * unit));
 822         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 823         __ stp(t0, t1, Address(d, 2 * unit));
 824         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 825       }
 826       __ bind(L1);
 827 
 828       if (direction == copy_forwards) {
 829         __ add(s, s, bias);
 830         __ add(d, d, bias);
 831       }
 832 
 833       __ tbz(count, 1, L2);
 834       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 835       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 836       __ bind(L2);
 837     }
 838 
 839     __ ret(lr);
 840 
 841     if (AvoidUnalignedAccesses) {
 842       Label drain, again;
 843       // Register order for storing. Order is different for backward copy.
 844 
 845       __ bind(unaligned_copy_long);
 846 
 847       // source address is even aligned, target odd aligned
 848       //
 849       // when forward copying word pairs we read long pairs at offsets
 850       // {0, 2, 4, 6} (in long words). when backwards copying we read
 851       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 852       // address by -2 in the forwards case so we can compute the
 853       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 854       // or -1.
 855       //
 856       // when forward copying we need to store 1 word, 3 pairs and
 857       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 858       // zero offset we adjust the destination by -1 which means we
 859       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 860       //
 861       // When backwards copying we need to store 1 word, 3 pairs and
 862       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 863       // offsets {1, 3, 5, 7, 8} * unit.
 864 
 865       if (direction == copy_forwards) {
 866         __ sub(s, s, 16);
 867         __ sub(d, d, 8);
 868       }
 869 
 870       // Fill 8 registers
 871       //
 872       // for forwards copy s was offset by -16 from the original input
 873       // value of s so the register contents are at these offsets
 874       // relative to the 64 bit block addressed by that original input
 875       // and so on for each successive 64 byte block when s is updated
 876       //
 877       // t0 at offset 0,  t1 at offset 8
 878       // t2 at offset 16, t3 at offset 24
 879       // t4 at offset 32, t5 at offset 40
 880       // t6 at offset 48, t7 at offset 56
 881 
 882       // for backwards copy s was not offset so the register contents
 883       // are at these offsets into the preceding 64 byte block
 884       // relative to that original input and so on for each successive
 885       // preceding 64 byte block when s is updated. this explains the
 886       // slightly counter-intuitive looking pattern of register usage
 887       // in the stp instructions for backwards copy.
 888       //
 889       // t0 at offset -16, t1 at offset -8
 890       // t2 at offset -32, t3 at offset -24
 891       // t4 at offset -48, t5 at offset -40
 892       // t6 at offset -64, t7 at offset -56
 893 
 894       __ ldp(t0, t1, Address(s, 2 * unit));
 895       __ ldp(t2, t3, Address(s, 4 * unit));
 896       __ ldp(t4, t5, Address(s, 6 * unit));
 897       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 898 
 899       __ subs(count, count, 16);
 900       __ br(Assembler::LO, drain);
 901 
 902       int prefetch = PrefetchCopyIntervalInBytes;
 903       bool use_stride = false;
 904       if (direction == copy_backwards) {
 905          use_stride = prefetch > 256;
 906          prefetch = -prefetch;
 907          if (use_stride) __ mov(stride, prefetch);
 908       }
 909 
 910       __ bind(again);
 911 
 912       if (PrefetchCopyIntervalInBytes > 0)
 913         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 914 
 915       if (direction == copy_forwards) {
 916        // allowing for the offset of -8 the store instructions place
 917        // registers into the target 64 bit block at the following
 918        // offsets
 919        //
 920        // t0 at offset 0
 921        // t1 at offset 8,  t2 at offset 16
 922        // t3 at offset 24, t4 at offset 32
 923        // t5 at offset 40, t6 at offset 48
 924        // t7 at offset 56
 925 
 926         __ str(t0, Address(d, 1 * unit));
 927         __ stp(t1, t2, Address(d, 2 * unit));
 928         __ ldp(t0, t1, Address(s, 2 * unit));
 929         __ stp(t3, t4, Address(d, 4 * unit));
 930         __ ldp(t2, t3, Address(s, 4 * unit));
 931         __ stp(t5, t6, Address(d, 6 * unit));
 932         __ ldp(t4, t5, Address(s, 6 * unit));
 933         __ str(t7, Address(__ pre(d, 8 * unit)));
 934         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 935       } else {
 936        // d was not offset when we started so the registers are
 937        // written into the 64 bit block preceding d with the following
 938        // offsets
 939        //
 940        // t1 at offset -8
 941        // t3 at offset -24, t0 at offset -16
 942        // t5 at offset -40, t2 at offset -32
 943        // t7 at offset -56, t4 at offset -48
 944        //                   t6 at offset -64
 945        //
 946        // note that this matches the offsets previously noted for the
 947        // loads
 948 
 949         __ str(t1, Address(d, 1 * unit));
 950         __ stp(t3, t0, Address(d, 3 * unit));
 951         __ ldp(t0, t1, Address(s, 2 * unit));
 952         __ stp(t5, t2, Address(d, 5 * unit));
 953         __ ldp(t2, t3, Address(s, 4 * unit));
 954         __ stp(t7, t4, Address(d, 7 * unit));
 955         __ ldp(t4, t5, Address(s, 6 * unit));
 956         __ str(t6, Address(__ pre(d, 8 * unit)));
 957         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 958       }
 959 
 960       __ subs(count, count, 8);
 961       __ br(Assembler::HS, again);
 962 
 963       // Drain
 964       //
 965       // this uses the same pattern of offsets and register arguments
 966       // as above
 967       __ bind(drain);
 968       if (direction == copy_forwards) {
 969         __ str(t0, Address(d, 1 * unit));
 970         __ stp(t1, t2, Address(d, 2 * unit));
 971         __ stp(t3, t4, Address(d, 4 * unit));
 972         __ stp(t5, t6, Address(d, 6 * unit));
 973         __ str(t7, Address(__ pre(d, 8 * unit)));
 974       } else {
 975         __ str(t1, Address(d, 1 * unit));
 976         __ stp(t3, t0, Address(d, 3 * unit));
 977         __ stp(t5, t2, Address(d, 5 * unit));
 978         __ stp(t7, t4, Address(d, 7 * unit));
 979         __ str(t6, Address(__ pre(d, 8 * unit)));
 980       }
 981       // now we need to copy any remaining part block which may
 982       // include a 4 word subblock and/or a 2 word subblock.
 983       // bits 2 and 1 in the count are the tell-tale for whether we
 984       // have each such subblock
 985       {
 986         Label L1, L2;
 987         __ tbz(count, exact_log2(4), L1);
 988        // this is the same as above but copying only 4 longs hence
 989        // with only one intervening stp between the str instructions
 990        // but note that the offsets and registers still follow the
 991        // same pattern
 992         __ ldp(t0, t1, Address(s, 2 * unit));
 993         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 994         if (direction == copy_forwards) {
 995           __ str(t0, Address(d, 1 * unit));
 996           __ stp(t1, t2, Address(d, 2 * unit));
 997           __ str(t3, Address(__ pre(d, 4 * unit)));
 998         } else {
 999           __ str(t1, Address(d, 1 * unit));
1000           __ stp(t3, t0, Address(d, 3 * unit));
1001           __ str(t2, Address(__ pre(d, 4 * unit)));
1002         }
1003         __ bind(L1);
1004 
1005         __ tbz(count, 1, L2);
1006        // this is the same as above but copying only 2 longs hence
1007        // there is no intervening stp between the str instructions
1008        // but note that the offset and register patterns are still
1009        // the same
1010         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1011         if (direction == copy_forwards) {
1012           __ str(t0, Address(d, 1 * unit));
1013           __ str(t1, Address(__ pre(d, 2 * unit)));
1014         } else {
1015           __ str(t1, Address(d, 1 * unit));
1016           __ str(t0, Address(__ pre(d, 2 * unit)));
1017         }
1018         __ bind(L2);
1019 
1020        // for forwards copy we need to re-adjust the offsets we
1021        // applied so that s and d follow the last words written
1022 
1023        if (direction == copy_forwards) {
1024          __ add(s, s, 16);
1025          __ add(d, d, 8);
1026        }
1027 
1028       }
1029 
1030       __ ret(lr);
1031       }
1032   }
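
  // The aligned path of generate_copy_longs is a software-pipelined loop:
  // each iteration stores the eight words loaded on the previous iteration
  // while loading the next eight, and the drain block flushes the final
  // batch.  A rough scalar equivalent of the forward case (illustrative
  // only, names are placeholders) is:
  //
#if 0
  void forward_copy_longs_sketch(intptr_t*& s, intptr_t*& d, size_t& count) {
    intptr_t buf[8];
    for (int i = 0; i < 8; i++) buf[i] = *s++;             // fill 8 registers
    count -= 8;
    while (count >= 8) {                                    // "again" loop
      for (int i = 0; i < 8; i++) { *d++ = buf[i]; buf[i] = *s++; }
      count -= 8;
    }
    for (int i = 0; i < 8; i++) *d++ = buf[i];              // drain
    // an optional 4-word and 2-word tail is then copied according to the
    // low bits of count, leaving at most one word for the caller.
  }
#endif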
1033 
1034   // Small copy: less than 16 bytes.
1035   //
1036   // NB: Ignores all of the bits of count which represent more than 15
1037   // bytes, so a caller doesn't have to mask them.
1038 
1039   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1040     bool is_backwards = step < 0;
1041     size_t granularity = uabs(step);
1042     int direction = is_backwards ? -1 : 1;
1043     int unit = wordSize * direction;
1044 
1045     Label Lword, Lint, Lshort, Lbyte;
1046 
1047     assert(granularity
1048            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1049 
1050     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1051 
1052     // ??? I don't know if this bit-test-and-branch is the right thing
1053     // to do.  It does a lot of jumping, resulting in several
1054     // mispredicted branches.  It might make more sense to do this
1055     // with something like Duff's device with a single computed branch.
1056 
1057     __ tbz(count, 3 - exact_log2(granularity), Lword);
1058     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1059     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1060     __ bind(Lword);
1061 
1062     if (granularity <= sizeof (jint)) {
1063       __ tbz(count, 2 - exact_log2(granularity), Lint);
1064       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1065       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1066       __ bind(Lint);
1067     }
1068 
1069     if (granularity <= sizeof (jshort)) {
1070       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1071       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1072       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1073       __ bind(Lshort);
1074     }
1075 
1076     if (granularity <= sizeof (jbyte)) {
1077       __ tbz(count, 0, Lbyte);
1078       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1079       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1080       __ bind(Lbyte);
1081     }
1082   }
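
  // Worked example for the byte copy case (granularity == 1): with
  // count == 13 (0b1101) the tests above copy 8 bytes (bit 3), then
  // 4 bytes (bit 2), skip the 2-byte step (bit 1 clear), and finish with
  // 1 byte (bit 0), for 8 + 4 + 1 == 13 bytes in total.  For larger
  // granularities count is in elements, so fewer bits are tested.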
1083 
1084   Label copy_f, copy_b;
1085 
1086   // All-singing all-dancing memory copy.
1087   //
1088   // Copy count units of memory from s to d.  The size of a unit is
1089   // step, which can be positive or negative depending on the direction
1090   // of copy.  If is_aligned is false, we align the source address.
1091   //
1092 
1093   void copy_memory(bool is_aligned, Register s, Register d,
1094                    Register count, Register tmp, int step) {
1095     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1096     bool is_backwards = step < 0;
1097     int granularity = uabs(step);
1098     const Register t0 = r3, t1 = r4;
1099 
1100     // <= 96 bytes do inline. Direction doesn't matter because we always
1101     // load all the data before writing anything
1102     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1103     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1104     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1105     const Register send = r17, dend = r18;
1106 
1107     if (PrefetchCopyIntervalInBytes > 0)
1108       __ prfm(Address(s, 0), PLDL1KEEP);
1109     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1110     __ br(Assembler::HI, copy_big);
1111 
1112     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1113     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1114 
1115     __ cmp(count, u1(16/granularity));
1116     __ br(Assembler::LS, copy16);
1117 
1118     __ cmp(count, u1(64/granularity));
1119     __ br(Assembler::HI, copy80);
1120 
1121     __ cmp(count, u1(32/granularity));
1122     __ br(Assembler::LS, copy32);
1123 
1124     // 33..64 bytes
1125     if (UseSIMDForMemoryOps) {
1126       __ ldpq(v0, v1, Address(s, 0));
1127       __ ldpq(v2, v3, Address(send, -32));
1128       __ stpq(v0, v1, Address(d, 0));
1129       __ stpq(v2, v3, Address(dend, -32));
1130     } else {
1131       __ ldp(t0, t1, Address(s, 0));
1132       __ ldp(t2, t3, Address(s, 16));
1133       __ ldp(t4, t5, Address(send, -32));
1134       __ ldp(t6, t7, Address(send, -16));
1135 
1136       __ stp(t0, t1, Address(d, 0));
1137       __ stp(t2, t3, Address(d, 16));
1138       __ stp(t4, t5, Address(dend, -32));
1139       __ stp(t6, t7, Address(dend, -16));
1140     }
1141     __ b(finish);
1142 
1143     // 17..32 bytes
1144     __ bind(copy32);
1145     __ ldp(t0, t1, Address(s, 0));
1146     __ ldp(t2, t3, Address(send, -16));
1147     __ stp(t0, t1, Address(d, 0));
1148     __ stp(t2, t3, Address(dend, -16));
1149     __ b(finish);
1150 
1151     // 65..80/96 bytes
1152     // (96 bytes if SIMD because we do 32 bytes per instruction)
1153     __ bind(copy80);
1154     if (UseSIMDForMemoryOps) {
1155       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1156       __ ldpq(v4, v5, Address(send, -32));
1157       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1158       __ stpq(v4, v5, Address(dend, -32));
1159     } else {
1160       __ ldp(t0, t1, Address(s, 0));
1161       __ ldp(t2, t3, Address(s, 16));
1162       __ ldp(t4, t5, Address(s, 32));
1163       __ ldp(t6, t7, Address(s, 48));
1164       __ ldp(t8, t9, Address(send, -16));
1165 
1166       __ stp(t0, t1, Address(d, 0));
1167       __ stp(t2, t3, Address(d, 16));
1168       __ stp(t4, t5, Address(d, 32));
1169       __ stp(t6, t7, Address(d, 48));
1170       __ stp(t8, t9, Address(dend, -16));
1171     }
1172     __ b(finish);
1173 
1174     // 0..16 bytes
1175     __ bind(copy16);
1176     __ cmp(count, u1(8/granularity));
1177     __ br(Assembler::LO, copy8);
1178 
1179     // 8..16 bytes
1180     __ ldr(t0, Address(s, 0));
1181     __ ldr(t1, Address(send, -8));
1182     __ str(t0, Address(d, 0));
1183     __ str(t1, Address(dend, -8));
1184     __ b(finish);
1185 
1186     if (granularity < 8) {
1187       // 4..7 bytes
1188       __ bind(copy8);
1189       __ tbz(count, 2 - exact_log2(granularity), copy4);
1190       __ ldrw(t0, Address(s, 0));
1191       __ ldrw(t1, Address(send, -4));
1192       __ strw(t0, Address(d, 0));
1193       __ strw(t1, Address(dend, -4));
1194       __ b(finish);
1195       if (granularity < 4) {
1196         // 0..3 bytes
1197         __ bind(copy4);
1198         __ cbz(count, finish); // get rid of 0 case
1199         if (granularity == 2) {
1200           __ ldrh(t0, Address(s, 0));
1201           __ strh(t0, Address(d, 0));
1202         } else { // granularity == 1
1203           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1204           // the first and last byte.
1205           // Handle the 3 byte case by loading and storing base + count/2
1206           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1207           // This does mean in the 1 byte case we load/store the same
1208           // byte 3 times.
1209           __ lsr(count, count, 1);
1210           __ ldrb(t0, Address(s, 0));
1211           __ ldrb(t1, Address(send, -1));
1212           __ ldrb(t2, Address(s, count));
1213           __ strb(t0, Address(d, 0));
1214           __ strb(t1, Address(dend, -1));
1215           __ strb(t2, Address(d, count));
1216         }
1217         __ b(finish);
1218       }
1219     }
1220 
1221     __ bind(copy_big);
1222     if (is_backwards) {
1223       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1224       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1225     }
1226 
1227     // Now we've got the small case out of the way, we can align the
1228     // source address on a 2-word boundary.
1229 
1230     Label aligned;
1231 
1232     if (is_aligned) {
1233       // We may have to adjust by 1 word to get s 2-word-aligned.
1234       __ tbz(s, exact_log2(wordSize), aligned);
1235       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1236       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1237       __ sub(count, count, wordSize/granularity);
1238     } else {
1239       if (is_backwards) {
1240         __ andr(rscratch2, s, 2 * wordSize - 1);
1241       } else {
1242         __ neg(rscratch2, s);
1243         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1244       }
1245       // rscratch2 is the byte adjustment needed to align s.
1246       __ cbz(rscratch2, aligned);
1247       int shift = exact_log2(granularity);
1248       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1249       __ sub(count, count, rscratch2);
1250 
1251 #if 0
1252       // ?? This code is only correct for a disjoint copy.  It may or
1253       // may not make sense to use it in that case.
1254 
1255       // Copy the first pair; s and d may not be aligned.
1256       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1257       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1258 
1259       // Align s and d, adjust count
1260       if (is_backwards) {
1261         __ sub(s, s, rscratch2);
1262         __ sub(d, d, rscratch2);
1263       } else {
1264         __ add(s, s, rscratch2);
1265         __ add(d, d, rscratch2);
1266       }
1267 #else
1268       copy_memory_small(s, d, rscratch2, rscratch1, step);
1269 #endif
1270     }
1271 
1272     __ bind(aligned);
1273 
1274     // s is now 2-word-aligned.
1275 
1276     // We have a count of units and some trailing bytes.  Adjust the
1277     // count and do a bulk copy of words.
1278     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1279     if (direction == copy_forwards)
1280       __ bl(copy_f);
1281     else
1282       __ bl(copy_b);
1283 
1284     // And the tail.
1285     copy_memory_small(s, d, count, tmp, step);
1286 
1287     if (granularity >= 8) __ bind(copy8);
1288     if (granularity >= 4) __ bind(copy4);
1289     __ bind(finish);
1290   }
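
  // Overall shape of copy_memory in C terms (illustrative only; the byte
  // thresholds correspond to count * granularity and the helper name is a
  // placeholder):
  //
#if 0
  void copy_memory_sketch(char* d, const char* s, size_t bytes, bool simd) {
    if (bytes <= 16)       { /* copy8/copy16: up to two overlapping moves      */ }
    else if (bytes <= 32)  { /* copy32: a leading and a trailing 16-byte pair  */ }
    else if (bytes <= 64)  { /* four 16-byte pairs, front and back             */ }
    else if (bytes <= (simd ? 96u : 80u)) { /* copy80                          */ }
    else {
      // copy_big: align s to a 2-word boundary using copy_memory_small,
      // bulk-copy eight words at a time through copy_f/copy_b, then let
      // copy_memory_small finish the tail.
    }
  }
#endif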
1291 
1292 
1293   void clobber_registers() {
1294 #ifdef ASSERT
1295     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1296     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1297     for (Register r = r3; r <= r18; r++)
1298       if (r != rscratch1) __ mov(r, rscratch1);
1299 #endif
1300   }
1301 
1302   // Scan over array at a for count oops, verifying each one.
1303   // Preserves a and count, clobbers rscratch1 and rscratch2.
1304   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1305     Label loop, end;
1306     __ mov(rscratch1, a);
1307     __ mov(rscratch2, zr);
1308     __ bind(loop);
1309     __ cmp(rscratch2, count);
1310     __ br(Assembler::HS, end);
1311     if (size == (size_t)wordSize) {
1312       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1313       __ verify_oop(temp);
1314     } else {
1315       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1316       __ decode_heap_oop(temp); // calls verify_oop
1317     }
1318     __ add(rscratch2, rscratch2, size);
1319     __ b(loop);
1320     __ bind(end);
1321   }
1322 
1323   // Arguments:
1324   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1325   //             ignored
1326   //   is_oop  - true => oop array, so generate store check code
1327   //   name    - stub name string
1328   //
1329   // Inputs:
1330   //   c_rarg0   - source array address
1331   //   c_rarg1   - destination array address
1332   //   c_rarg2   - element count, treated as ssize_t, can be zero
1333   //
1334   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1335   // the hardware handle it.  The two dwords within qwords that span
1336   // cache line boundaries will still be loaded and stored atomically.
1337   //
1338   // Side Effects:
1339   //   disjoint_int_copy_entry is set to the no-overlap entry point
1340   //   used by generate_conjoint_int_oop_copy().
1341   //
1342   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1343                                   const char *name, bool dest_uninitialized = false) {
1344     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1345     RegSet saved_reg = RegSet::of(s, d, count);
1346     __ align(CodeEntryAlignment);
1347     StubCodeMark mark(this, "StubRoutines", name);
1348     address start = __ pc();
1349     __ enter();
1350 
1351     if (entry != NULL) {
1352       *entry = __ pc();
1353       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1354       BLOCK_COMMENT("Entry:");
1355     }
1356 
1357     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1358     if (dest_uninitialized) {
1359       decorators |= IS_DEST_UNINITIALIZED;
1360     }
1361     if (aligned) {
1362       decorators |= ARRAYCOPY_ALIGNED;
1363     }
1364 
1365     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1366     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg);
1367 
1368     if (is_oop) {
1369       // save regs before copy_memory
1370       __ push(RegSet::of(d, count), sp);
1371     }
1372     copy_memory(aligned, s, d, count, rscratch1, size);
1373 
1374     if (is_oop) {
1375       __ pop(RegSet::of(d, count), sp);
1376       if (VerifyOops)
1377         verify_oop_array(size, d, count, r16);
1378       __ sub(count, count, 1); // make an inclusive end pointer
1379       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1380     }
1381 
1382     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1383 
1384     __ leave();
1385     __ mov(r0, zr); // return 0
1386     __ ret(lr);
1387 #ifdef BUILTIN_SIM
1388     {
1389       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1390       sim->notifyCompile(const_cast<char*>(name), start);
1391     }
1392 #endif
1393     return start;
1394   }
1395 
1396   // Arguments:
1397   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1398   //             ignored
1399   //   is_oop  - true => oop array, so generate store check code
1400   //   name    - stub name string
1401   //
1402   // Inputs:
1403   //   c_rarg0   - source array address
1404   //   c_rarg1   - destination array address
1405   //   c_rarg2   - element count, treated as ssize_t, can be zero
1406   //
1407   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1408   // the hardware handle it.  The two dwords within qwords that span
1409   // cache line boundaries will still be loaded and stored atomically.
1410   //
1411   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1412                                  address *entry, const char *name,
1413                                  bool dest_uninitialized = false) {
1414     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1415     RegSet saved_regs = RegSet::of(s, d, count);
1416     StubCodeMark mark(this, "StubRoutines", name);
1417     address start = __ pc();
1418     __ enter();
1419 
1420     if (entry != NULL) {
1421       *entry = __ pc();
1422       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1423       BLOCK_COMMENT("Entry:");
1424     }
1425 
1426     // use fwd copy when (d-s) above_equal (count*size)
1427     __ sub(rscratch1, d, s);
1428     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1429     __ br(Assembler::HS, nooverlap_target);
1430 
1431     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1432     if (dest_uninitialized) {
1433       decorators |= IS_DEST_UNINITIALIZED;
1434     }
1435     if (aligned) {
1436       decorators |= ARRAYCOPY_ALIGNED;
1437     }
1438 
1439     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1440     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs);
1441 
1442     if (is_oop) {
1443       // save regs before copy_memory
1444       __ push(RegSet::of(d, count), sp);
1445     }
1446     copy_memory(aligned, s, d, count, rscratch1, -size);
1447     if (is_oop) {
1448       __ pop(RegSet::of(d, count), sp);
1449       if (VerifyOops)
1450         verify_oop_array(size, d, count, r16);
1451       __ sub(count, count, 1); // make an inclusive end pointer
1452       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1453     }
1454     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1455     __ leave();
1456     __ mov(r0, zr); // return 0
1457     __ ret(lr);
1458 #ifdef BUILTIN_SIM
1459     {
1460       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1461       sim->notifyCompile(const_cast<char*>(name), start);
1462     }
1463 #endif
1464     return start;
1465   }
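
  // A note on the overlap test above: it is an unsigned comparison, so the
  // forward (nooverlap) path is taken both when d >= s + count * size (the
  // regions are disjoint) and when d < s (the subtraction wraps around to a
  // very large unsigned value).  In both cases a forward copy never
  // overwrites source bytes before they have been read; only when
  // s <= d < s + count * size do we fall through to the backward copy.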
1466 
1467   // Arguments:
1468   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1469   //             ignored
1470   //   name    - stub name string
1471   //
1472   // Inputs:
1473   //   c_rarg0   - source array address
1474   //   c_rarg1   - destination array address
1475   //   c_rarg2   - element count, treated as ssize_t, can be zero
1476   //
1477   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1478   // we let the hardware handle it.  The one to eight bytes within words,
1479   // dwords or qwords that span cache line boundaries will still be loaded
1480   // and stored atomically.
1481   //
1489   // Side Effects:
1490   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1491   //   used by generate_conjoint_byte_copy().
1492   //
1493   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1494     const bool not_oop = false;
1495     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1496   }
1497 
1498   // Arguments:
1499   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1500   //             ignored
1501   //   name    - stub name string
1502   //
1503   // Inputs:
1504   //   c_rarg0   - source array address
1505   //   c_rarg1   - destination array address
1506   //   c_rarg2   - element count, treated as ssize_t, can be zero
1507   //
1508   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1509   // we let the hardware handle it.  The one to eight bytes within words,
1510   // dwords or qwords that span cache line boundaries will still be loaded
1511   // and stored atomically.
1512   //
1513   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1514                                       address* entry, const char *name) {
1515     const bool not_oop = false;
1516     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1517   }
1518 
1519   // Arguments:
1520   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1521   //             ignored
1522   //   name    - stub name string
1523   //
1524   // Inputs:
1525   //   c_rarg0   - source array address
1526   //   c_rarg1   - destination array address
1527   //   c_rarg2   - element count, treated as ssize_t, can be zero
1528   //
1529   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1530   // let the hardware handle it.  The two or four words within dwords
1531   // or qwords that span cache line boundaries will still be loaded
1532   // and stored atomically.
1533   //
1534   // Side Effects:
1535   //   disjoint_short_copy_entry is set to the no-overlap entry point
1536   //   used by generate_conjoint_short_copy().
1537   //
1538   address generate_disjoint_short_copy(bool aligned,
1539                                        address* entry, const char *name) {
1540     const bool not_oop = false;
1541     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1542   }
1543 
1544   // Arguments:
1545   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1546   //             ignored
1547   //   name    - stub name string
1548   //
1549   // Inputs:
1550   //   c_rarg0   - source array address
1551   //   c_rarg1   - destination array address
1552   //   c_rarg2   - element count, treated as ssize_t, can be zero
1553   //
1554   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1555   // let the hardware handle it.  The two or four words within dwords
1556   // or qwords that span cache line boundaries will still be loaded
1557   // and stored atomically.
1558   //
1559   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1560                                        address *entry, const char *name) {
1561     const bool not_oop = false;
1562     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1563   }
1564
1565   // Arguments:
1566   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1567   //             ignored
1568   //   name    - stub name string
1569   //
1570   // Inputs:
1571   //   c_rarg0   - source array address
1572   //   c_rarg1   - destination array address
1573   //   c_rarg2   - element count, treated as ssize_t, can be zero
1574   //
1575   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1576   // the hardware handle it.  The two dwords within qwords that span
1577   // cache line boundaries will still be loaded and stored atomically.
1578   //
1579   // Side Effects:
1580   //   disjoint_int_copy_entry is set to the no-overlap entry point
1581   //   used by generate_conjoint_int_oop_copy().
1582   //
1583   address generate_disjoint_int_copy(bool aligned, address *entry,
1584                                          const char *name, bool dest_uninitialized = false) {
1585     const bool not_oop = false;
1586     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1587   }
1588 
1589   // Arguments:
1590   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1591   //             ignored
1592   //   name    - stub name string
1593   //
1594   // Inputs:
1595   //   c_rarg0   - source array address
1596   //   c_rarg1   - destination array address
1597   //   c_rarg2   - element count, treated as ssize_t, can be zero
1598   //
1599   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1600   // the hardware handle it.  The two dwords within qwords that span
1601   // cache line boundaries will still be loaded and stored atomically.
1602   //
1603   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1604                                      address *entry, const char *name,
1605                                      bool dest_uninitialized = false) {
1606     const bool not_oop = false;
1607     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1608   }
1609 
1610 
1611   // Arguments:
1612   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1613   //             ignored
1614   //   name    - stub name string
1615   //
1616   // Inputs:
1617   //   c_rarg0   - source array address
1618   //   c_rarg1   - destination array address
1619   //   c_rarg2   - element count, treated as size_t, can be zero
1620   //
1621   // Side Effects:
1622   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1623   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1624   //
1625   address generate_disjoint_long_copy(bool aligned, address *entry,
1626                                           const char *name, bool dest_uninitialized = false) {
1627     const bool not_oop = false;
1628     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1629   }
1630 
1631   // Arguments:
1632   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1633   //             ignored
1634   //   name    - stub name string
1635   //
1636   // Inputs:
1637   //   c_rarg0   - source array address
1638   //   c_rarg1   - destination array address
1639   //   c_rarg2   - element count, treated as size_t, can be zero
1640   //
1641   address generate_conjoint_long_copy(bool aligned,
1642                                       address nooverlap_target, address *entry,
1643                                       const char *name, bool dest_uninitialized = false) {
1644     const bool not_oop = false;
1645     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1646   }
1647 
1648   // Arguments:
1649   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1650   //             ignored
1651   //   name    - stub name string
1652   //
1653   // Inputs:
1654   //   c_rarg0   - source array address
1655   //   c_rarg1   - destination array address
1656   //   c_rarg2   - element count, treated as size_t, can be zero
1657   //
1658   // Side Effects:
1659   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1660   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1661   //
1662   address generate_disjoint_oop_copy(bool aligned, address *entry,
1663                                      const char *name, bool dest_uninitialized) {
1664     const bool is_oop = true;
1665     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1666     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1667   }
1668 
1669   // Arguments:
1670   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1671   //             ignored
1672   //   name    - stub name string
1673   //
1674   // Inputs:
1675   //   c_rarg0   - source array address
1676   //   c_rarg1   - destination array address
1677   //   c_rarg2   - element count, treated as size_t, can be zero
1678   //
1679   address generate_conjoint_oop_copy(bool aligned,
1680                                      address nooverlap_target, address *entry,
1681                                      const char *name, bool dest_uninitialized) {
1682     const bool is_oop = true;
1683     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1684     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1685                                   name, dest_uninitialized);
1686   }
1687 
1688 
1689   // Helper for generating a dynamic type check.
1690   // Smashes rscratch1.
1691   void generate_type_check(Register sub_klass,
1692                            Register super_check_offset,
1693                            Register super_klass,
1694                            Label& L_success) {
1695     assert_different_registers(sub_klass, super_check_offset, super_klass);
1696 
1697     BLOCK_COMMENT("type_check:");
1698 
1699     Label L_miss;
1700 
1701     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1702                                      super_check_offset);
1703     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
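    // The fast path compares the word at sub_klass + super_check_offset with
    // super_klass, jumping to L_success on a hit and to L_miss on a definite
    // miss; if that test is inconclusive, the slow path scans sub_klass's
    // secondary supers array and also jumps to L_success when it finds a match.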
1704 
1705     // Fall through on failure!
1706     __ BIND(L_miss);
1707   }
1708 
1709   //
1710   //  Generate checkcasting array copy stub
1711   //
1712   //  Input:
1713   //    c_rarg0   - source array address
1714   //    c_rarg1   - destination array address
1715   //    c_rarg2   - element count, treated as ssize_t, can be zero
1716   //    c_rarg3   - size_t ckoff (super_check_offset)
1717   //    c_rarg4   - oop ckval (super_klass)
1718   //
1719   //  Output:
1720   //    r0 ==  0  -  success
1721   //    r0 == -1^K - failure, where K is partial transfer count
1722   //
1723   address generate_checkcast_copy(const char *name, address *entry,
1724                                   bool dest_uninitialized = false) {
1725 
1726     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1727 
1728     // Input registers (after setup_arg_regs)
1729     const Register from        = c_rarg0;   // source array address
1730     const Register to          = c_rarg1;   // destination array address
1731     const Register count       = c_rarg2;   // elements count
1732     const Register ckoff       = c_rarg3;   // super_check_offset
1733     const Register ckval       = c_rarg4;   // super_klass
1734 
1735     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1736     RegSet wb_post_saved_regs = RegSet::of(count);
1737 
1738     // Registers used as temps (r18, r19, r20 are save-on-entry)
1739     const Register count_save  = r21;       // orig elements count
1740     const Register start_to    = r20;       // destination array start address
1741     const Register copied_oop  = r18;       // actual oop copied
1742     const Register r19_klass   = r19;       // oop._klass
1743 
1744     //---------------------------------------------------------------
1745     // Assembler stub will be used for this call to arraycopy
1746     // if the two arrays are subtypes of Object[] but the
1747     // destination array type is not equal to or a supertype
1748     // of the source type.  Each element must be separately
1749     // checked.
1750 
1751     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1752                                copied_oop, r19_klass, count_save);
1753 
1754     __ align(CodeEntryAlignment);
1755     StubCodeMark mark(this, "StubRoutines", name);
1756     address start = __ pc();
1757 
1758     __ enter(); // required for proper stackwalking of RuntimeStub frame
1759 
1760 #ifdef ASSERT
1761     // caller guarantees that the arrays really are different
1762     // otherwise, we would have to make conjoint checks
1763     { Label L;
1764       array_overlap_test(L, TIMES_OOP);
1765       __ stop("checkcast_copy within a single array");
1766       __ bind(L);
1767     }
1768 #endif //ASSERT
1769 
1770     // Caller of this entry point must set up the argument registers.
1771     if (entry != NULL) {
1772       *entry = __ pc();
1773       BLOCK_COMMENT("Entry:");
1774     }
1775 
1776      // Empty array:  Nothing to do.
1777     __ cbz(count, L_done);
1778 
1779     __ push(RegSet::of(r18, r19, r20, r21), sp);
1780 
1781 #ifdef ASSERT
1782     BLOCK_COMMENT("assert consistent ckoff/ckval");
1783     // The ckoff and ckval must be mutually consistent,
1784     // even though caller generates both.
1785     { Label L;
1786       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1787       __ ldrw(start_to, Address(ckval, sco_offset));
1788       __ cmpw(ckoff, start_to);
1789       __ br(Assembler::EQ, L);
1790       __ stop("super_check_offset inconsistent");
1791       __ bind(L);
1792     }
1793 #endif //ASSERT
1794 
1795     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
1796     bool is_oop = true;
1797     if (dest_uninitialized) {
1798       decorators |= IS_DEST_UNINITIALIZED;
1799     }
1800 
1801     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1802     bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs);
1803 
1804     // save the original count
1805     __ mov(count_save, count);
1806 
1807     // Copy from low to high addresses
1808     __ mov(start_to, to);              // Save destination array start address
1809     __ b(L_load_element);
1810 
1811     // ======== begin loop ========
1812     // (Loop is rotated; its entry is L_load_element.)
1813     // Loop control:
1814     //   for (; count != 0; count--) {
1815     //     copied_oop = load_heap_oop(from++);
1816     //     ... generate_type_check ...;
1817     //     store_heap_oop(to++, copied_oop);
1818     //   }
1819     __ align(OptoLoopAlignment);
1820 
1821     __ BIND(L_store_element);
1822     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1823     __ sub(count, count, 1);
1824     __ cbz(count, L_do_card_marks);
1825 
1826     // ======== loop entry is here ========
1827     __ BIND(L_load_element);
1828     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1829     __ cbz(copied_oop, L_store_element);
1830 
1831     __ load_klass(r19_klass, copied_oop);// query the object klass
1832     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1833     // ======== end loop ========
1834 
1835     // It was a real error; we must depend on the caller to finish the job.
1836     // Register count = remaining oops, count_save = total oops.
1837     // Emit GC store barriers for the oops we have copied and report
1838     // their number to the caller.
1839 
1840     __ subs(count, count_save, count);     // K = partially copied oop count
1841     __ eon(count, count, zr);                   // report (-1^K) to caller
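    // eon with zr is a bitwise NOT, so count now holds ~K == -1^K as promised
    // in the stub's output contract; the EQ branch still tests the flags set
    // by the subs above, skipping the card marks when no oops were copied.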
1842     __ br(Assembler::EQ, L_done_pop);
1843 
1844     __ BIND(L_do_card_marks);
1845     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1846     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1, wb_post_saved_regs);
1847 
1848     __ bind(L_done_pop);
1849     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1850     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1851 
1852     __ bind(L_done);
1853     __ mov(r0, count);
1854     __ leave();
1855     __ ret(lr);
1856 
1857     return start;
1858   }
1859 
1860   // Perform range checks on the proposed arraycopy.
1861   // Kills temp, but nothing else.
1862   // Also, clean the sign bits of src_pos and dst_pos.
1863   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1864                               Register src_pos, // source position (c_rarg1)
1865                               Register dst,     // destination array oop (c_rarg2)
1866                               Register dst_pos, // destination position (c_rarg3)
1867                               Register length,
1868                               Register temp,
1869                               Label& L_failed) {
1870     BLOCK_COMMENT("arraycopy_range_checks:");
1871 
1872     assert_different_registers(rscratch1, temp);
1873 
1874     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1875     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1876     __ addw(temp, length, src_pos);
1877     __ cmpw(temp, rscratch1);
1878     __ br(Assembler::HI, L_failed);
1879 
1880     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1881     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1882     __ addw(temp, length, dst_pos);
1883     __ cmpw(temp, rscratch1);
1884     __ br(Assembler::HI, L_failed);
1885 
1886     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1887     __ movw(src_pos, src_pos);
1888     __ movw(dst_pos, dst_pos);
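    // A 32-bit register move zero-extends on AArch64, so these clear any stale
    // bits 63:32 left over from the caller's 32-bit position arguments.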
1889 
1890     BLOCK_COMMENT("arraycopy_range_checks done");
1891   }
1892 
1893   // These stubs get called from some dumb test routine.
1894   // I'll write them properly when they're called from
1895   // something that's actually doing something.
1896   static void fake_arraycopy_stub(address src, address dst, int count) {
1897     assert(count == 0, "huh?");
1898   }
1899 
1900 
1901   //
1902   //  Generate 'unsafe' array copy stub
1903   //  Though just as safe as the other stubs, it takes an unscaled
1904   //  size_t argument instead of an element count.
1905   //
1906   //  Input:
1907   //    c_rarg0   - source array address
1908   //    c_rarg1   - destination array address
1909   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1910   //
1911   // Examines the alignment of the operands and dispatches
1912   // to a long, int, short, or byte copy loop.
1913   //
1914   address generate_unsafe_copy(const char *name,
1915                                address byte_copy_entry,
1916                                address short_copy_entry,
1917                                address int_copy_entry,
1918                                address long_copy_entry) {
1919     Label L_long_aligned, L_int_aligned, L_short_aligned;
1920     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1921 
1922     __ align(CodeEntryAlignment);
1923     StubCodeMark mark(this, "StubRoutines", name);
1924     address start = __ pc();
1925     __ enter(); // required for proper stackwalking of RuntimeStub frame
1926 
1927     // bump this on entry, not on exit:
1928     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1929 
1930     __ orr(rscratch1, s, d);
1931     __ orr(rscratch1, rscratch1, count);
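    // rscratch1 = s | d | count; its low-order bits give the strictest
    // alignment shared by both addresses and the byte count, which selects
    // the widest element size that is safe to copy with below.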
1932 
1933     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1934     __ cbz(rscratch1, L_long_aligned);
1935     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1936     __ cbz(rscratch1, L_int_aligned);
1937     __ tbz(rscratch1, 0, L_short_aligned);
1938     __ b(RuntimeAddress(byte_copy_entry));
1939 
1940     __ BIND(L_short_aligned);
1941     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1942     __ b(RuntimeAddress(short_copy_entry));
1943     __ BIND(L_int_aligned);
1944     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1945     __ b(RuntimeAddress(int_copy_entry));
1946     __ BIND(L_long_aligned);
1947     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1948     __ b(RuntimeAddress(long_copy_entry));
1949 
1950     return start;
1951   }
1952 
1953   //
1954   //  Generate generic array copy stubs
1955   //
1956   //  Input:
1957   //    c_rarg0    -  src oop
1958   //    c_rarg1    -  src_pos (32-bits)
1959   //    c_rarg2    -  dst oop
1960   //    c_rarg3    -  dst_pos (32-bits)
1961   //    c_rarg4    -  element count (32-bits)
1962   //
1963   //  Output:
1964   //    r0 ==  0  -  success
1965   //    r0 == -1^K - failure, where K is partial transfer count
1966   //
1967   address generate_generic_copy(const char *name,
1968                                 address byte_copy_entry, address short_copy_entry,
1969                                 address int_copy_entry, address oop_copy_entry,
1970                                 address long_copy_entry, address checkcast_copy_entry) {
1971 
1972     Label L_failed, L_objArray;
1973     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1974 
1975     // Input registers
1976     const Register src        = c_rarg0;  // source array oop
1977     const Register src_pos    = c_rarg1;  // source position
1978     const Register dst        = c_rarg2;  // destination array oop
1979     const Register dst_pos    = c_rarg3;  // destination position
1980     const Register length     = c_rarg4;
1981 
1982     __ align(CodeEntryAlignment);
1983 
1984     StubCodeMark mark(this, "StubRoutines", name);
1985 
1986     address start = __ pc();
1987 
1988     __ enter(); // required for proper stackwalking of RuntimeStub frame
1989 
1990     // bump this on entry, not on exit:
1991     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1992 
1993     //-----------------------------------------------------------------------
1994     // Assembler stub will be used for this call to arraycopy
1995     // if the following conditions are met:
1996     //
1997     // (1) src and dst must not be null.
1998     // (2) src_pos must not be negative.
1999     // (3) dst_pos must not be negative.
2000     // (4) length  must not be negative.
2001     // (5) src klass and dst klass should be the same and not NULL.
2002     // (6) src and dst should be arrays.
2003     // (7) src_pos + length must not exceed length of src.
2004     // (8) dst_pos + length must not exceed length of dst.
2005     //
2006 
2007     //  if (src == NULL) return -1;
2008     __ cbz(src, L_failed);
2009 
2010     //  if (src_pos < 0) return -1;
2011     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2012 
2013     //  if (dst == NULL) return -1;
2014     __ cbz(dst, L_failed);
2015 
2016     //  if (dst_pos < 0) return -1;
2017     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2018 
2019     // registers used as temp
2020     const Register scratch_length    = r16; // elements count to copy
2021     const Register scratch_src_klass = r17; // array klass
2022     const Register lh                = r18; // layout helper
2023 
2024     //  if (length < 0) return -1;
2025     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2026     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2027 
2028     __ load_klass(scratch_src_klass, src);
2029 #ifdef ASSERT
2030     //  assert(src->klass() != NULL);
2031     {
2032       BLOCK_COMMENT("assert klasses not null {");
2033       Label L1, L2;
2034       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2035       __ bind(L1);
2036       __ stop("broken null klass");
2037       __ bind(L2);
2038       __ load_klass(rscratch1, dst);
2039       __ cbz(rscratch1, L1);     // this would be broken also
2040       BLOCK_COMMENT("} assert klasses not null done");
2041     }
2042 #endif
2043 
2044     // Load layout helper (32-bits)
2045     //
2046     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2047     // 32        30    24            16              8     2                 0
2048     //
2049     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2050     //
2051 
2052     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2053 
2054     // Handle objArrays completely differently...
2055     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2056     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2057     __ movw(rscratch1, objArray_lh);
2058     __ eorw(rscratch2, lh, rscratch1);
2059     __ cbzw(rscratch2, L_objArray);
2060 
2061     //  if (src->klass() != dst->klass()) return -1;
2062     __ load_klass(rscratch2, dst);
2063     __ eor(rscratch2, rscratch2, scratch_src_klass);
2064     __ cbnz(rscratch2, L_failed);
2065 
2066     //  if (!src->is_Array()) return -1;
2067     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2068 
2069     // At this point, it is known to be a typeArray (array_tag 0x3).
2070 #ifdef ASSERT
2071     {
2072       BLOCK_COMMENT("assert primitive array {");
2073       Label L;
2074       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2075       __ cmpw(lh, rscratch2);
2076       __ br(Assembler::GE, L);
2077       __ stop("must be a primitive array");
2078       __ bind(L);
2079       BLOCK_COMMENT("} assert primitive array done");
2080     }
2081 #endif
2082 
2083     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2084                            rscratch2, L_failed);
2085 
2086     // TypeArrayKlass
2087     //
2088     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2089     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2090     //
2091 
2092     const Register rscratch1_offset = rscratch1;    // array offset
2093     const Register r18_elsize = lh; // element size
2094 
2095     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2096            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
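    // The extracted field is the array header size in bytes, i.e. the offset
    // of element zero from the start of the array oop.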
2097     __ add(src, src, rscratch1_offset);           // src array offset
2098     __ add(dst, dst, rscratch1_offset);           // dst array offset
2099     BLOCK_COMMENT("choose copy loop based on element size");
2100 
2101     // next registers should be set before the jump to corresponding stub
2102     const Register from     = c_rarg0;  // source array address
2103     const Register to       = c_rarg1;  // destination array address
2104     const Register count    = c_rarg2;  // elements count
2105 
2106     // 'from', 'to' and 'count' must be set in exactly this order: they alias
2107     // 'src', 'src_pos' and 'dst', so each step must read before a later one clobbers.
2108 
2109     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2110 
2111     // The possible values of elsize are 0-3, i.e. exact_log2(element
2112     // size in bytes).  We do a simple bitwise binary search.
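    // Bit 1 of elsize separates {byte, short} from {int, long}; bit 0 then
    // picks within each pair.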
2113   __ BIND(L_copy_bytes);
2114     __ tbnz(r18_elsize, 1, L_copy_ints);
2115     __ tbnz(r18_elsize, 0, L_copy_shorts);
2116     __ lea(from, Address(src, src_pos));// src_addr
2117     __ lea(to,   Address(dst, dst_pos));// dst_addr
2118     __ movw(count, scratch_length); // length
2119     __ b(RuntimeAddress(byte_copy_entry));
2120 
2121   __ BIND(L_copy_shorts);
2122     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2123     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2124     __ movw(count, scratch_length); // length
2125     __ b(RuntimeAddress(short_copy_entry));
2126 
2127   __ BIND(L_copy_ints);
2128     __ tbnz(r18_elsize, 0, L_copy_longs);
2129     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2130     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2131     __ movw(count, scratch_length); // length
2132     __ b(RuntimeAddress(int_copy_entry));
2133 
2134   __ BIND(L_copy_longs);
2135 #ifdef ASSERT
2136     {
2137       BLOCK_COMMENT("assert long copy {");
2138       Label L;
2139       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2140       __ cmpw(r18_elsize, LogBytesPerLong);
2141       __ br(Assembler::EQ, L);
2142       __ stop("must be long copy, but elsize is wrong");
2143       __ bind(L);
2144       BLOCK_COMMENT("} assert long copy done");
2145     }
2146 #endif
2147     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2148     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2149     __ movw(count, scratch_length); // length
2150     __ b(RuntimeAddress(long_copy_entry));
2151 
2152     // ObjArrayKlass
2153   __ BIND(L_objArray);
2154     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2155 
2156     Label L_plain_copy, L_checkcast_copy;
2157     //  test array classes for subtyping
2158     __ load_klass(r18, dst);
2159     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2160     __ br(Assembler::NE, L_checkcast_copy);
2161 
2162     // Identically typed arrays can be copied without element-wise checks.
2163     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2164                            rscratch2, L_failed);
2165 
2166     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2167     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2168     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2169     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2170     __ movw(count, scratch_length); // length
2171   __ BIND(L_plain_copy);
2172     __ b(RuntimeAddress(oop_copy_entry));
2173 
2174   __ BIND(L_checkcast_copy);
2175     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2176     {
2177       // Before looking at dst.length, make sure dst is also an objArray.
2178       __ ldrw(rscratch1, Address(r18, lh_offset));
2179       __ movw(rscratch2, objArray_lh);
2180       __ eorw(rscratch1, rscratch1, rscratch2);
2181       __ cbnzw(rscratch1, L_failed);
2182 
2183       // It is safe to examine both src.length and dst.length.
2184       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2185                              r18, L_failed);
2186 
2187       const Register rscratch2_dst_klass = rscratch2;
2188       __ load_klass(rscratch2_dst_klass, dst); // reload
2189 
2190       // Marshal the base address arguments now, freeing registers.
2191       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2192       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2193       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2194       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2195       __ movw(count, length);           // length (reloaded)
2196       Register sco_temp = c_rarg3;      // this register is free now
2197       assert_different_registers(from, to, count, sco_temp,
2198                                  rscratch2_dst_klass, scratch_src_klass);
2199       // assert_clean_int(count, sco_temp);
2200 
2201       // Generate the type check.
2202       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2203       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2204       // assert_clean_int(sco_temp, r18);
2205       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2206 
2207       // Fetch destination element klass from the ObjArrayKlass header.
2208       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2209       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2210       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2211 
2212       // the checkcast_copy loop needs two extra arguments:
2213       assert(c_rarg3 == sco_temp, "#3 already in place");
2214       // Set up arguments for checkcast_copy_entry.
2215       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2216       __ b(RuntimeAddress(checkcast_copy_entry));
2217     }
2218 
2219   __ BIND(L_failed);
2220     __ mov(r0, -1);
2221     __ leave();   // required for proper stackwalking of RuntimeStub frame
2222     __ ret(lr);
2223 
2224     return start;
2225   }
2226 
2227   //
2228   // Generate stub for array fill. If "aligned" is true, the
2229   // "to" address is assumed to be heapword aligned.
2230   //
2231   // Arguments for generated stub:
2232   //   to:    c_rarg0
2233   //   value: c_rarg1
2234   //   count: c_rarg2 treated as signed
2235   //
2236   address generate_fill(BasicType t, bool aligned, const char *name) {
2237     __ align(CodeEntryAlignment);
2238     StubCodeMark mark(this, "StubRoutines", name);
2239     address start = __ pc();
2240 
2241     BLOCK_COMMENT("Entry:");
2242 
2243     const Register to        = c_rarg0;  // destination array address
2244     const Register value     = c_rarg1;  // value
2245     const Register count     = c_rarg2;  // elements count
2246 
2247     const Register bz_base = r10;        // base for block_zero routine
2248     const Register cnt_words = r11;      // temp register
2249 
2250     __ enter();
2251 
2252     Label L_fill_elements, L_exit1;
2253 
2254     int shift = -1;
2255     switch (t) {
2256       case T_BYTE:
2257         shift = 0;
2258         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2259         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2260         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2261         __ br(Assembler::LO, L_fill_elements);
2262         break;
2263       case T_SHORT:
2264         shift = 1;
2265         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2266         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2267         __ br(Assembler::LO, L_fill_elements);
2268         break;
2269       case T_INT:
2270         shift = 2;
2271         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2272         __ br(Assembler::LO, L_fill_elements);
2273         break;
2274       default: ShouldNotReachHere();
2275     }
2276 
2277     // Align source address at 8 bytes address boundary.
2278     Label L_skip_align1, L_skip_align2, L_skip_align4;
2279     if (!aligned) {
2280       switch (t) {
2281         case T_BYTE:
2282           // One byte misalignment happens only for byte arrays.
2283           __ tbz(to, 0, L_skip_align1);
2284           __ strb(value, Address(__ post(to, 1)));
2285           __ subw(count, count, 1);
2286           __ bind(L_skip_align1);
2287           // Fallthrough
2288         case T_SHORT:
2289           // Two bytes misalignment happens only for byte and short (char) arrays.
2290           __ tbz(to, 1, L_skip_align2);
2291           __ strh(value, Address(__ post(to, 2)));
2292           __ subw(count, count, 2 >> shift);
2293           __ bind(L_skip_align2);
2294           // Fallthrough
2295         case T_INT:
2296           // Align to 8 bytes; we know we are 4-byte aligned to start.
2297           __ tbz(to, 2, L_skip_align4);
2298           __ strw(value, Address(__ post(to, 4)));
2299           __ subw(count, count, 4 >> shift);
2300           __ bind(L_skip_align4);
2301           break;
2302         default: ShouldNotReachHere();
2303       }
2304     }
2305 
2306     //
2307     //  Fill large chunks
2308     //
2309     __ lsrw(cnt_words, count, 3 - shift); // number of words
2310     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2311     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
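    // count now holds only the tail elements left over after the word fill,
    // i.e. fewer than 8 bytes' worth.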
2312     if (UseBlockZeroing) {
2313       Label non_block_zeroing, rest;
2314       // If the fill value is zero we can use the fast zero_words().
2315       __ cbnz(value, non_block_zeroing);
2316       __ mov(bz_base, to);
2317       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2318       __ zero_words(bz_base, cnt_words);
2319       __ b(rest);
2320       __ bind(non_block_zeroing);
2321       __ fill_words(to, cnt_words, value);
2322       __ bind(rest);
2323     } else {
2324       __ fill_words(to, cnt_words, value);
2325     }
2326 
2327     // Remaining count is less than 8 bytes. Fill it by a single store.
2328     // Note that the total length is no less than 8 bytes.
2329     if (t == T_BYTE || t == T_SHORT) {
2330       Label L_exit1;
2331       __ cbzw(count, L_exit1);
2332       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2333       __ str(value, Address(to, -8));    // overwrite some elements
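    // This 8-byte store ends exactly at the last element and may overlap bytes
    // already written by the word fill above; since the 64-bit value is just
    // the element pattern repeated and the store is element aligned, the
    // overlapping bytes are rewritten with the same contents.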
2334       __ bind(L_exit1);
2335       __ leave();
2336       __ ret(lr);
2337     }
2338 
2339     // Handle fills of less than 8 bytes.
2340     Label L_fill_2, L_fill_4, L_exit2;
2341     __ bind(L_fill_elements);
2342     switch (t) {
2343       case T_BYTE:
2344         __ tbz(count, 0, L_fill_2);
2345         __ strb(value, Address(__ post(to, 1)));
2346         __ bind(L_fill_2);
2347         __ tbz(count, 1, L_fill_4);
2348         __ strh(value, Address(__ post(to, 2)));
2349         __ bind(L_fill_4);
2350         __ tbz(count, 2, L_exit2);
2351         __ strw(value, Address(to));
2352         break;
2353       case T_SHORT:
2354         __ tbz(count, 0, L_fill_4);
2355         __ strh(value, Address(__ post(to, 2)));
2356         __ bind(L_fill_4);
2357         __ tbz(count, 1, L_exit2);
2358         __ strw(value, Address(to));
2359         break;
2360       case T_INT:
2361         __ cbzw(count, L_exit2);
2362         __ strw(value, Address(to));
2363         break;
2364       default: ShouldNotReachHere();
2365     }
2366     __ bind(L_exit2);
2367     __ leave();
2368     __ ret(lr);
2369     return start;
2370   }
2371 
2372   void generate_arraycopy_stubs() {
2373     address entry;
2374     address entry_jbyte_arraycopy;
2375     address entry_jshort_arraycopy;
2376     address entry_jint_arraycopy;
2377     address entry_oop_arraycopy;
2378     address entry_jlong_arraycopy;
2379     address entry_checkcast_arraycopy;
2380 
2381     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2382     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2383 
2384     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2385 
2386     //*** jbyte
2387     // Always need aligned and unaligned versions
2388     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2389                                                                                   "jbyte_disjoint_arraycopy");
2390     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2391                                                                                   &entry_jbyte_arraycopy,
2392                                                                                   "jbyte_arraycopy");
2393     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2394                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2395     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2396                                                                                   "arrayof_jbyte_arraycopy");
2397 
2398     //*** jshort
2399     // Always need aligned and unaligned versions
2400     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2401                                                                                     "jshort_disjoint_arraycopy");
2402     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2403                                                                                     &entry_jshort_arraycopy,
2404                                                                                     "jshort_arraycopy");
2405     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2406                                                                                     "arrayof_jshort_disjoint_arraycopy");
2407     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2408                                                                                     "arrayof_jshort_arraycopy");
2409 
2410     //*** jint
2411     // Aligned versions
2412     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2413                                                                                 "arrayof_jint_disjoint_arraycopy");
2414     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2415                                                                                 "arrayof_jint_arraycopy");
2416     // On 64-bit we need both aligned and unaligned versions of jint arraycopy.
2417     // entry_jint_arraycopy always points to the unaligned version
2418     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2419                                                                                 "jint_disjoint_arraycopy");
2420     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2421                                                                                 &entry_jint_arraycopy,
2422                                                                                 "jint_arraycopy");
2423 
2424     //*** jlong
2425     // It is always aligned
2426     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2427                                                                                   "arrayof_jlong_disjoint_arraycopy");
2428     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2429                                                                                   "arrayof_jlong_arraycopy");
2430     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2431     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2432 
2433     //*** oops
2434     {
2435       // With compressed oops we need unaligned versions; notice that
2436       // we overwrite entry_oop_arraycopy.
2437       bool aligned = !UseCompressedOops;
2438 
2439       StubRoutines::_arrayof_oop_disjoint_arraycopy
2440         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2441                                      /*dest_uninitialized*/false);
2442       StubRoutines::_arrayof_oop_arraycopy
2443         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2444                                      /*dest_uninitialized*/false);
2445       // Aligned versions without pre-barriers
2446       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2447         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2448                                      /*dest_uninitialized*/true);
2449       StubRoutines::_arrayof_oop_arraycopy_uninit
2450         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2451                                      /*dest_uninitialized*/true);
2452     }
2453 
2454     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2455     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2456     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2457     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2458 
2459     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2460     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2461                                                                         /*dest_uninitialized*/true);
2462 
2463     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2464                                                               entry_jbyte_arraycopy,
2465                                                               entry_jshort_arraycopy,
2466                                                               entry_jint_arraycopy,
2467                                                               entry_jlong_arraycopy);
2468 
2469     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2470                                                                entry_jbyte_arraycopy,
2471                                                                entry_jshort_arraycopy,
2472                                                                entry_jint_arraycopy,
2473                                                                entry_oop_arraycopy,
2474                                                                entry_jlong_arraycopy,
2475                                                                entry_checkcast_arraycopy);
2476 
2477     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2478     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2479     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2480     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2481     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2482     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2483   }
2484 
2485   void generate_math_stubs() { Unimplemented(); }
2486 
2487   // Arguments:
2488   //
2489   // Inputs:
2490   //   c_rarg0   - source byte array address
2491   //   c_rarg1   - destination byte array address
2492   //   c_rarg2   - K (key) in little endian int array
2493   //
2494   address generate_aescrypt_encryptBlock() {
2495     __ align(CodeEntryAlignment);
2496     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2497 
2498     Label L_doLast;
2499 
2500     const Register from        = c_rarg0;  // source array address
2501     const Register to          = c_rarg1;  // destination array address
2502     const Register key         = c_rarg2;  // key array address
2503     const Register keylen      = rscratch1;
2504 
2505     address start = __ pc();
2506     __ enter();
2507 
2508     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
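    // keylen is the expanded key length in ints: 44, 52 or 60 for AES-128,
    // AES-192 or AES-256 (11, 13 or 15 round keys of 4 ints each).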
2509 
2510     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2511 
2512     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2513     __ rev32(v1, __ T16B, v1);
2514     __ rev32(v2, __ T16B, v2);
2515     __ rev32(v3, __ T16B, v3);
2516     __ rev32(v4, __ T16B, v4);
2517     __ aese(v0, v1);
2518     __ aesmc(v0, v0);
2519     __ aese(v0, v2);
2520     __ aesmc(v0, v0);
2521     __ aese(v0, v3);
2522     __ aesmc(v0, v0);
2523     __ aese(v0, v4);
2524     __ aesmc(v0, v0);
2525 
2526     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2527     __ rev32(v1, __ T16B, v1);
2528     __ rev32(v2, __ T16B, v2);
2529     __ rev32(v3, __ T16B, v3);
2530     __ rev32(v4, __ T16B, v4);
2531     __ aese(v0, v1);
2532     __ aesmc(v0, v0);
2533     __ aese(v0, v2);
2534     __ aesmc(v0, v0);
2535     __ aese(v0, v3);
2536     __ aesmc(v0, v0);
2537     __ aese(v0, v4);
2538     __ aesmc(v0, v0);
2539 
2540     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2541     __ rev32(v1, __ T16B, v1);
2542     __ rev32(v2, __ T16B, v2);
2543 
2544     __ cmpw(keylen, 44);
2545     __ br(Assembler::EQ, L_doLast);
2546 
2547     __ aese(v0, v1);
2548     __ aesmc(v0, v0);
2549     __ aese(v0, v2);
2550     __ aesmc(v0, v0);
2551 
2552     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2553     __ rev32(v1, __ T16B, v1);
2554     __ rev32(v2, __ T16B, v2);
2555 
2556     __ cmpw(keylen, 52);
2557     __ br(Assembler::EQ, L_doLast);
2558 
2559     __ aese(v0, v1);
2560     __ aesmc(v0, v0);
2561     __ aese(v0, v2);
2562     __ aesmc(v0, v0);
2563 
2564     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2565     __ rev32(v1, __ T16B, v1);
2566     __ rev32(v2, __ T16B, v2);
2567 
2568     __ BIND(L_doLast);
2569 
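    // Last two rounds: the final aese is not followed by aesmc because the
    // last AES round omits MixColumns; the final round key is applied with
    // the plain eor below.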
2570     __ aese(v0, v1);
2571     __ aesmc(v0, v0);
2572     __ aese(v0, v2);
2573 
2574     __ ld1(v1, __ T16B, key);
2575     __ rev32(v1, __ T16B, v1);
2576     __ eor(v0, __ T16B, v0, v1);
2577 
2578     __ st1(v0, __ T16B, to);
2579 
2580     __ mov(r0, 0);
2581 
2582     __ leave();
2583     __ ret(lr);
2584 
2585     return start;
2586   }
2587 
2588   // Arguments:
2589   //
2590   // Inputs:
2591   //   c_rarg0   - source byte array address
2592   //   c_rarg1   - destination byte array address
2593   //   c_rarg2   - K (key) in little endian int array
2594   //
2595   address generate_aescrypt_decryptBlock() {
2596     assert(UseAES, "need AES instruction support");
2597     __ align(CodeEntryAlignment);
2598     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2599     Label L_doLast;
2600 
2601     const Register from        = c_rarg0;  // source array address
2602     const Register to          = c_rarg1;  // destination array address
2603     const Register key         = c_rarg2;  // key array address
2604     const Register keylen      = rscratch1;
2605 
2606     address start = __ pc();
2607     __ enter(); // required for proper stackwalking of RuntimeStub frame
2608 
2609     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2610 
2611     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2612 
2613     __ ld1(v5, __ T16B, __ post(key, 16));
2614     __ rev32(v5, __ T16B, v5);
2615 
2616     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2617     __ rev32(v1, __ T16B, v1);
2618     __ rev32(v2, __ T16B, v2);
2619     __ rev32(v3, __ T16B, v3);
2620     __ rev32(v4, __ T16B, v4);
2621     __ aesd(v0, v1);
2622     __ aesimc(v0, v0);
2623     __ aesd(v0, v2);
2624     __ aesimc(v0, v0);
2625     __ aesd(v0, v3);
2626     __ aesimc(v0, v0);
2627     __ aesd(v0, v4);
2628     __ aesimc(v0, v0);
2629 
2630     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2631     __ rev32(v1, __ T16B, v1);
2632     __ rev32(v2, __ T16B, v2);
2633     __ rev32(v3, __ T16B, v3);
2634     __ rev32(v4, __ T16B, v4);
2635     __ aesd(v0, v1);
2636     __ aesimc(v0, v0);
2637     __ aesd(v0, v2);
2638     __ aesimc(v0, v0);
2639     __ aesd(v0, v3);
2640     __ aesimc(v0, v0);
2641     __ aesd(v0, v4);
2642     __ aesimc(v0, v0);
2643 
2644     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2645     __ rev32(v1, __ T16B, v1);
2646     __ rev32(v2, __ T16B, v2);
2647 
2648     __ cmpw(keylen, 44);
2649     __ br(Assembler::EQ, L_doLast);
2650 
2651     __ aesd(v0, v1);
2652     __ aesimc(v0, v0);
2653     __ aesd(v0, v2);
2654     __ aesimc(v0, v0);
2655 
2656     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2657     __ rev32(v1, __ T16B, v1);
2658     __ rev32(v2, __ T16B, v2);
2659 
2660     __ cmpw(keylen, 52);
2661     __ br(Assembler::EQ, L_doLast);
2662 
2663     __ aesd(v0, v1);
2664     __ aesimc(v0, v0);
2665     __ aesd(v0, v2);
2666     __ aesimc(v0, v0);
2667 
2668     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2669     __ rev32(v1, __ T16B, v1);
2670     __ rev32(v2, __ T16B, v2);
2671 
2672     __ BIND(L_doLast);
2673 
2674     __ aesd(v0, v1);
2675     __ aesimc(v0, v0);
2676     __ aesd(v0, v2);
2677 
2678     __ eor(v0, __ T16B, v0, v5);
2679 
2680     __ st1(v0, __ T16B, to);
2681 
2682     __ mov(r0, 0);
2683 
2684     __ leave();
2685     __ ret(lr);
2686 
2687     return start;
2688   }
2689 
2690   // Arguments:
2691   //
2692   // Inputs:
2693   //   c_rarg0   - source byte array address
2694   //   c_rarg1   - destination byte array address
2695   //   c_rarg2   - K (key) in little endian int array
2696   //   c_rarg3   - r vector byte array address
2697   //   c_rarg4   - input length
2698   //
2699   // Output:
2700   //   x0        - input length
2701   //
2702   address generate_cipherBlockChaining_encryptAESCrypt() {
2703     assert(UseAES, "need AES instruction support");
2704     __ align(CodeEntryAlignment);
2705     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2706 
2707     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2708 
2709     const Register from        = c_rarg0;  // source array address
2710     const Register to          = c_rarg1;  // destination array address
2711     const Register key         = c_rarg2;  // key array address
2712     const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector array
2713                                            // address and left holding the last encryption block
2714     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2715     const Register keylen      = rscratch1;
2716 
2717     address start = __ pc();
2718 
2719       __ enter();
2720 
2721       __ movw(rscratch2, len_reg);
2722 
2723       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2724 
2725       __ ld1(v0, __ T16B, rvec);
2726 
2727       __ cmpw(keylen, 52);
2728       __ br(Assembler::CC, L_loadkeys_44);
2729       __ br(Assembler::EQ, L_loadkeys_52);
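      // keylen == 44 (CC): AES-128 needs only v21..v31 (11 round keys);
      // keylen == 52 (EQ): AES-192 also needs v19/v20; otherwise AES-256
      // loads v17/v18 as well.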
2730 
2731       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2732       __ rev32(v17, __ T16B, v17);
2733       __ rev32(v18, __ T16B, v18);
2734     __ BIND(L_loadkeys_52);
2735       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2736       __ rev32(v19, __ T16B, v19);
2737       __ rev32(v20, __ T16B, v20);
2738     __ BIND(L_loadkeys_44);
2739       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2740       __ rev32(v21, __ T16B, v21);
2741       __ rev32(v22, __ T16B, v22);
2742       __ rev32(v23, __ T16B, v23);
2743       __ rev32(v24, __ T16B, v24);
2744       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2745       __ rev32(v25, __ T16B, v25);
2746       __ rev32(v26, __ T16B, v26);
2747       __ rev32(v27, __ T16B, v27);
2748       __ rev32(v28, __ T16B, v28);
2749       __ ld1(v29, v30, v31, __ T16B, key);
2750       __ rev32(v29, __ T16B, v29);
2751       __ rev32(v30, __ T16B, v30);
2752       __ rev32(v31, __ T16B, v31);
2753 
2754     __ BIND(L_aes_loop);
2755       __ ld1(v1, __ T16B, __ post(from, 16));
2756       __ eor(v0, __ T16B, v0, v1);
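      // CBC: XOR the plaintext block with the previous ciphertext block
      // (v0 holds the IV loaded from rvec on the first iteration) before
      // encrypting.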
2757 
2758       __ br(Assembler::CC, L_rounds_44);
2759       __ br(Assembler::EQ, L_rounds_52);
2760 
2761       __ aese(v0, v17); __ aesmc(v0, v0);
2762       __ aese(v0, v18); __ aesmc(v0, v0);
2763     __ BIND(L_rounds_52);
2764       __ aese(v0, v19); __ aesmc(v0, v0);
2765       __ aese(v0, v20); __ aesmc(v0, v0);
2766     __ BIND(L_rounds_44);
2767       __ aese(v0, v21); __ aesmc(v0, v0);
2768       __ aese(v0, v22); __ aesmc(v0, v0);
2769       __ aese(v0, v23); __ aesmc(v0, v0);
2770       __ aese(v0, v24); __ aesmc(v0, v0);
2771       __ aese(v0, v25); __ aesmc(v0, v0);
2772       __ aese(v0, v26); __ aesmc(v0, v0);
2773       __ aese(v0, v27); __ aesmc(v0, v0);
2774       __ aese(v0, v28); __ aesmc(v0, v0);
2775       __ aese(v0, v29); __ aesmc(v0, v0);
2776       __ aese(v0, v30);
2777       __ eor(v0, __ T16B, v0, v31);
2778 
2779       __ st1(v0, __ T16B, __ post(to, 16));
2780 
2781       __ subw(len_reg, len_reg, 16);
2782       __ cbnzw(len_reg, L_aes_loop);
2783 
2784       __ st1(v0, __ T16B, rvec);
2785 
2786       __ mov(r0, rscratch2);
2787 
2788       __ leave();
2789       __ ret(lr);
2790 
2791       return start;
2792   }
2793 
2794   // Arguments:
2795   //
2796   // Inputs:
2797   //   c_rarg0   - source byte array address
2798   //   c_rarg1   - destination byte array address
2799   //   c_rarg2   - K (key) in little endian int array
2800   //   c_rarg3   - r vector byte array address
2801   //   c_rarg4   - input length
2802   //
2803   // Output:
2804   //   r0        - input length
2805   //
2806   address generate_cipherBlockChaining_decryptAESCrypt() {
2807     assert(UseAES, "need AES instructions support");
2808     __ align(CodeEntryAlignment);
2809     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2810 
2811     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2812 
2813     const Register from        = c_rarg0;  // source array address
2814     const Register to          = c_rarg1;  // destination array address
2815     const Register key         = c_rarg2;  // key array address
2816     const Register rvec        = c_rarg3;  // r byte array initialized from the init vector (IV) address,
2817                                            // and left holding the last ciphertext (input) block on exit
2818     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2819     const Register keylen      = rscratch1;
2820 
2821     address start = __ pc();
2822 
2823       __ enter();
2824 
2825       __ movw(rscratch2, len_reg);
2826 
2827       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2828 
2829       __ ld1(v2, __ T16B, rvec);
2830 
2831       __ ld1(v31, __ T16B, __ post(key, 16));
2832       __ rev32(v31, __ T16B, v31);
2833 
2834       __ cmpw(keylen, 52);
2835       __ br(Assembler::CC, L_loadkeys_44);
2836       __ br(Assembler::EQ, L_loadkeys_52);
2837 
2838       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2839       __ rev32(v17, __ T16B, v17);
2840       __ rev32(v18, __ T16B, v18);
2841     __ BIND(L_loadkeys_52);
2842       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2843       __ rev32(v19, __ T16B, v19);
2844       __ rev32(v20, __ T16B, v20);
2845     __ BIND(L_loadkeys_44);
2846       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2847       __ rev32(v21, __ T16B, v21);
2848       __ rev32(v22, __ T16B, v22);
2849       __ rev32(v23, __ T16B, v23);
2850       __ rev32(v24, __ T16B, v24);
2851       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2852       __ rev32(v25, __ T16B, v25);
2853       __ rev32(v26, __ T16B, v26);
2854       __ rev32(v27, __ T16B, v27);
2855       __ rev32(v28, __ T16B, v28);
2856       __ ld1(v29, v30, __ T16B, key);
2857       __ rev32(v29, __ T16B, v29);
2858       __ rev32(v30, __ T16B, v30);
2859 
2860     __ BIND(L_aes_loop);
2861       __ ld1(v0, __ T16B, __ post(from, 16));
2862       __ orr(v1, __ T16B, v0, v0);
2863 
2864       __ br(Assembler::CC, L_rounds_44);
2865       __ br(Assembler::EQ, L_rounds_52);
2866 
2867       __ aesd(v0, v17); __ aesimc(v0, v0);
2868       __ aesd(v0, v18); __ aesimc(v0, v0);
2869     __ BIND(L_rounds_52);
2870       __ aesd(v0, v19); __ aesimc(v0, v0);
2871       __ aesd(v0, v20); __ aesimc(v0, v0);
2872     __ BIND(L_rounds_44);
2873       __ aesd(v0, v21); __ aesimc(v0, v0);
2874       __ aesd(v0, v22); __ aesimc(v0, v0);
2875       __ aesd(v0, v23); __ aesimc(v0, v0);
2876       __ aesd(v0, v24); __ aesimc(v0, v0);
2877       __ aesd(v0, v25); __ aesimc(v0, v0);
2878       __ aesd(v0, v26); __ aesimc(v0, v0);
2879       __ aesd(v0, v27); __ aesimc(v0, v0);
2880       __ aesd(v0, v28); __ aesimc(v0, v0);
2881       __ aesd(v0, v29); __ aesimc(v0, v0);
2882       __ aesd(v0, v30);
2883       __ eor(v0, __ T16B, v0, v31);
2884       __ eor(v0, __ T16B, v0, v2);
2885 
2886       __ st1(v0, __ T16B, __ post(to, 16));
2887       __ orr(v2, __ T16B, v1, v1);
2888 
2889       __ subw(len_reg, len_reg, 16);
2890       __ cbnzw(len_reg, L_aes_loop);
2891 
2892       __ st1(v2, __ T16B, rvec);
2893 
2894       __ mov(r0, rscratch2);
2895 
2896       __ leave();
2897       __ ret(lr);
2898 
2899     return start;
2900   }
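
       // Reference sketch (illustrative only): CBC decryption keeps a copy of the raw
       // ciphertext block (orr v1, v0, v0 / orr v2, v1, v1 above) because it becomes
       // the chaining value for the next block. aes_decrypt_block below is a
       // placeholder for the aesd/aesimc round sequence ending in the eor with v31:
       //
       //   uint8_t prev[16];
       //   memcpy(prev, rvec, 16);                          // ld1 v2, rvec
       //   for (int i = 0; i < len; i += 16) {
       //     uint8_t c[16], p[16];
       //     memcpy(c, from + i, 16);                       // ld1 v0; orr v1, v0, v0
       //     aes_decrypt_block(p, c, key);                  // aesd/aesimc rounds
       //     for (int j = 0; j < 16; j++)
       //       to[i + j] = p[j] ^ prev[j];                  // eor v0, v0, v2
       //     memcpy(prev, c, 16);                           // orr v2, v1, v1
       //   }
       //   memcpy(rvec, prev, 16);                          // final st1 v2, rvec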
2901 
2902   // Arguments:
2903   //
2904   // Inputs:
2905   //   c_rarg0   - byte[]  source+offset
2906   //   c_rarg1   - int[]   SHA.state
2907   //   c_rarg2   - int     offset
2908   //   c_rarg3   - int     limit
2909   //
2910   address generate_sha1_implCompress(bool multi_block, const char *name) {
2911     __ align(CodeEntryAlignment);
2912     StubCodeMark mark(this, "StubRoutines", name);
2913     address start = __ pc();
2914 
2915     Register buf   = c_rarg0;
2916     Register state = c_rarg1;
2917     Register ofs   = c_rarg2;
2918     Register limit = c_rarg3;
2919 
2920     Label keys;
2921     Label sha1_loop;
2922 
2923     // load the keys into v0..v3
2924     __ adr(rscratch1, keys);
2925     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2926     // load the 5-word state into v6, v7
2927     __ ldrq(v6, Address(state, 0));
2928     __ ldrs(v7, Address(state, 16));
2929 
2930 
2931     __ BIND(sha1_loop);
2932     // load 64 bytes of data into v16..v19
2933     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2934     __ rev32(v16, __ T16B, v16);
2935     __ rev32(v17, __ T16B, v17);
2936     __ rev32(v18, __ T16B, v18);
2937     __ rev32(v19, __ T16B, v19);
2938 
2939     // do the sha1
2940     __ addv(v4, __ T4S, v16, v0);
2941     __ orr(v20, __ T16B, v6, v6);
2942 
2943     FloatRegister d0 = v16;
2944     FloatRegister d1 = v17;
2945     FloatRegister d2 = v18;
2946     FloatRegister d3 = v19;
2947 
2948     for (int round = 0; round < 20; round++) {
2949       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2950       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2951       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2952       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2953       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2954 
2955       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2956       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2957       __ sha1h(tmp2, __ T4S, v20);
2958       if (round < 5)
2959         __ sha1c(v20, __ T4S, tmp3, tmp4);
2960       else if (round < 10 || round >= 15)
2961         __ sha1p(v20, __ T4S, tmp3, tmp4);
2962       else
2963         __ sha1m(v20, __ T4S, tmp3, tmp4);
2964       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2965 
2966       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2967     }
2968 
2969     __ addv(v7, __ T2S, v7, v21);
2970     __ addv(v6, __ T4S, v6, v20);
2971 
2972     if (multi_block) {
2973       __ add(ofs, ofs, 64);
2974       __ cmp(ofs, limit);
2975       __ br(Assembler::LE, sha1_loop);
2976       __ mov(c_rarg0, ofs); // return ofs
2977     }
2978 
2979     __ strq(v6, Address(state, 0));
2980     __ strs(v7, Address(state, 16));
2981 
2982     __ ret(lr);
2983 
2984     __ bind(keys);
2985     __ emit_int32(0x5a827999);
2986     __ emit_int32(0x6ed9eba1);
2987     __ emit_int32(0x8f1bbcdc);
2988     __ emit_int32(0xca62c1d6);
2989 
2990     return start;
2991   }
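
       // Note (hedged summary): each of the 20 loop iterations above retires 4 of
       // SHA-1's 80 rounds, so, modulo the one-iteration software pipelining of the
       // addv, the round constant and instruction selection follow the usual schedule:
       //
       //   K = (t < 20) ? 0x5a827999        // f = Ch     -> sha1c
       //     : (t < 40) ? 0x6ed9eba1        // f = Parity -> sha1p
       //     : (t < 60) ? 0x8f1bbcdc        // f = Maj    -> sha1m
       //     :            0xca62c1d6;       // f = Parity -> sha1p
       //
       // where t is the SHA-1 round number; these are the four words broadcast into
       // v0..v3 by the ld4r at 'keys'.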
2992 
2993 
2994   // Arguments:
2995   //
2996   // Inputs:
2997   //   c_rarg0   - byte[]  source+offset
2998   //   c_rarg1   - int[]   SHA.state
2999   //   c_rarg2   - int     offset
3000   //   c_rarg3   - int     limit
3001   //
3002   address generate_sha256_implCompress(bool multi_block, const char *name) {
3003     static const uint32_t round_consts[64] = {
3004       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3005       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3006       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3007       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3008       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3009       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3010       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3011       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3012       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3013       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3014       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3015       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3016       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3017       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3018       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3019       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3020     };
3021     __ align(CodeEntryAlignment);
3022     StubCodeMark mark(this, "StubRoutines", name);
3023     address start = __ pc();
3024 
3025     Register buf   = c_rarg0;
3026     Register state = c_rarg1;
3027     Register ofs   = c_rarg2;
3028     Register limit = c_rarg3;
3029 
3030     Label sha1_loop;
3031 
3032     __ stpd(v8, v9, __ pre(sp, -32));
3033     __ stpd(v10, v11, Address(sp, 16));
3034 
3035 // dga == v0
3036 // dgb == v1
3037 // dg0 == v2
3038 // dg1 == v3
3039 // dg2 == v4
3040 // t0 == v6
3041 // t1 == v7
3042 
3043     // load 16 keys to v16..v31
3044     __ lea(rscratch1, ExternalAddress((address)round_consts));
3045     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3046     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3047     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3048     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3049 
3050     // load the 8-word (256-bit) state
3051     __ ldpq(v0, v1, state);
3052 
3053     __ BIND(sha1_loop);
3054     // load 64 bytes of data into v8..v11
3055     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3056     __ rev32(v8, __ T16B, v8);
3057     __ rev32(v9, __ T16B, v9);
3058     __ rev32(v10, __ T16B, v10);
3059     __ rev32(v11, __ T16B, v11);
3060 
3061     __ addv(v6, __ T4S, v8, v16);
3062     __ orr(v2, __ T16B, v0, v0);
3063     __ orr(v3, __ T16B, v1, v1);
3064 
3065     FloatRegister d0 = v8;
3066     FloatRegister d1 = v9;
3067     FloatRegister d2 = v10;
3068     FloatRegister d3 = v11;
3069 
3070 
3071     for (int round = 0; round < 16; round++) {
3072       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3073       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3074       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3075       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3076 
3077       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3078        __ orr(v4, __ T16B, v2, v2);
3079       if (round < 15)
3080         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3081       __ sha256h(v2, __ T4S, v3, tmp2);
3082       __ sha256h2(v3, __ T4S, v4, tmp2);
3083       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3084 
3085       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3086     }
3087 
3088     __ addv(v0, __ T4S, v0, v2);
3089     __ addv(v1, __ T4S, v1, v3);
3090 
3091     if (multi_block) {
3092       __ add(ofs, ofs, 64);
3093       __ cmp(ofs, limit);
3094       __ br(Assembler::LE, sha1_loop);
3095       __ mov(c_rarg0, ofs); // return ofs
3096     }
3097 
3098     __ ldpd(v10, v11, Address(sp, 16));
3099     __ ldpd(v8, v9, __ post(sp, 32));
3100 
3101     __ stpq(v0, v1, state);
3102 
3103     __ ret(lr);
3104 
3105     return start;
3106   }
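
       // Note (hedged summary): the 16 loop iterations above each consume 4 of the 64
       // round constants preloaded into v16..v31 (as_FloatRegister(round + 17) walks
       // v17..v31), while sha256su0/sha256su1 extend the message schedule in place in
       // v8..v11, i.e. per word roughly
       //
       //   W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
       //
       // with sha256h/sha256h2 applying the compression rounds to the digest halves
       // kept in v2/v3.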
3107 
3108 #ifndef BUILTIN_SIM
3109   // Safefetch stubs.
3110   void generate_safefetch(const char* name, int size, address* entry,
3111                           address* fault_pc, address* continuation_pc) {
3112     // safefetch signatures:
3113     //   int      SafeFetch32(int*      adr, int      errValue);
3114     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3115     //
3116     // arguments:
3117     //   c_rarg0 = adr
3118     //   c_rarg1 = errValue
3119     //
3120     // result:
3121     //   r0       = *adr or errValue
3122 
3123     StubCodeMark mark(this, "StubRoutines", name);
3124 
3125     // Entry point, pc or function descriptor.
3126     *entry = __ pc();
3127 
3128     // Load *adr into c_rarg1, may fault.
3129     *fault_pc = __ pc();
3130     switch (size) {
3131       case 4:
3132         // int32_t
3133         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3134         break;
3135       case 8:
3136         // int64_t
3137         __ ldr(c_rarg1, Address(c_rarg0, 0));
3138         break;
3139       default:
3140         ShouldNotReachHere();
3141     }
3142 
3143     // return errValue or *adr
3144     *continuation_pc = __ pc();
3145     __ mov(r0, c_rarg1);
3146     __ ret(lr);
3147   }
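
       // Usage sketch (illustrative, hypothetical caller code): the entry/fault/
       // continuation PCs recorded above let the signal handler resume a faulting
       // load at continuation_pc, so a caller simply sees the error value:
       //
       //   int v = SafeFetch32((int*) maybe_unmapped_ptr, -1);
       //   if (v == -1) {
       //     // address was (possibly) unreadable; note that -1 is also returned if
       //     // the location really contains -1, which callers must tolerate
       //   }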
3148 #endif
3149 
3150   /**
3151    *  Arguments:
3152    *
3153    * Inputs:
3154    *   c_rarg0   - int crc
3155    *   c_rarg1   - byte* buf
3156    *   c_rarg2   - int length
3157    *
3158    * Output:
3159    *       r0   - int crc result
3160    */
3161   address generate_updateBytesCRC32() {
3162     assert(UseCRC32Intrinsics, "what are we doing here?");
3163 
3164     __ align(CodeEntryAlignment);
3165     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3166 
3167     address start = __ pc();
3168 
3169     const Register crc   = c_rarg0;  // crc
3170     const Register buf   = c_rarg1;  // source java byte array address
3171     const Register len   = c_rarg2;  // length
3172     const Register table0 = c_rarg3; // crc_table address
3173     const Register table1 = c_rarg4;
3174     const Register table2 = c_rarg5;
3175     const Register table3 = c_rarg6;
3176     const Register tmp3 = c_rarg7;
3177 
3178     BLOCK_COMMENT("Entry:");
3179     __ enter(); // required for proper stackwalking of RuntimeStub frame
3180 
3181     __ kernel_crc32(crc, buf, len,
3182               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3183 
3184     __ leave(); // required for proper stackwalking of RuntimeStub frame
3185     __ ret(lr);
3186 
3187     return start;
3188   }
3189 
3190   /**
3191    *  Arguments:
3192    *
3193    * Inputs:
3194    *   c_rarg0   - int crc
3195    *   c_rarg1   - byte* buf
3196    *   c_rarg2   - int length
3197    *   c_rarg3   - int* table
3198    *
3199    * Output:
3200    *       r0   - int crc result
3201    */
3202   address generate_updateBytesCRC32C() {
3203     assert(UseCRC32CIntrinsics, "what are we doing here?");
3204 
3205     __ align(CodeEntryAlignment);
3206     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3207 
3208     address start = __ pc();
3209 
3210     const Register crc   = c_rarg0;  // crc
3211     const Register buf   = c_rarg1;  // source java byte array address
3212     const Register len   = c_rarg2;  // length
3213     const Register table0 = c_rarg3; // crc_table address
3214     const Register table1 = c_rarg4;
3215     const Register table2 = c_rarg5;
3216     const Register table3 = c_rarg6;
3217     const Register tmp3 = c_rarg7;
3218 
3219     BLOCK_COMMENT("Entry:");
3220     __ enter(); // required for proper stackwalking of RuntimeStub frame
3221 
3222     __ kernel_crc32c(crc, buf, len,
3223               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3224 
3225     __ leave(); // required for proper stackwalking of RuntimeStub frame
3226     __ ret(lr);
3227 
3228     return start;
3229   }
3230 
3231   /***
3232    *  Arguments:
3233    *
3234    *  Inputs:
3235    *   c_rarg0   - int   adler
3236    *   c_rarg1   - byte* buff
3237    *   c_rarg2   - int   len
3238    *
3239    * Output:
3240    *   c_rarg0   - int adler result
3241    */
3242   address generate_updateBytesAdler32() {
3243     __ align(CodeEntryAlignment);
3244     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3245     address start = __ pc();
3246 
3247     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3248 
3249     // Aliases
3250     Register adler  = c_rarg0;
3251     Register s1     = c_rarg0;
3252     Register s2     = c_rarg3;
3253     Register buff   = c_rarg1;
3254     Register len    = c_rarg2;
3255     Register nmax  = r4;
3256     Register base = r5;
3257     Register count = r6;
3258     Register temp0 = rscratch1;
3259     Register temp1 = rscratch2;
3260     Register temp2 = r7;
3261 
3262     // Max number of bytes we can process before having to take the mod
3263     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3264     unsigned long BASE = 0xfff1;
3265     unsigned long NMAX = 0x15B0;
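
         // Worked check (for the record): with n = NMAX = 5552,
         //   255*n*(n+1)/2 + (n+1)*(BASE-1) = 255*5552*5553/2 + 5553*65520
         //                                  = 3930857640 + 363832560
         //                                  = 4294690200 <= 2^32 - 1 = 4294967295
         // while n = 5553 already gives 4296171735 > 2^32 - 1, so NMAX is the largest
         // block length for which s2 provably stays within 32 bits between reductions.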
3266 
3267     __ mov(base, BASE);
3268     __ mov(nmax, NMAX);
3269 
3270     // s1 is initialized to the lower 16 bits of adler
3271     // s2 is initialized to the upper 16 bits of adler
3272     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3273     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3274 
3275     // The pipelined loop needs at least 16 elements per iteration.
3276     // It checks this itself, but it is cheaper to branch straight to the cleanup loop here.
3277     __ cmp(len, (u1)16);
3278     __ br(Assembler::HS, L_nmax);
3279     __ cbz(len, L_combine);
3280 
3281     __ bind(L_simple_by1_loop);
3282     __ ldrb(temp0, Address(__ post(buff, 1)));
3283     __ add(s1, s1, temp0);
3284     __ add(s2, s2, s1);
3285     __ subs(len, len, 1);
3286     __ br(Assembler::HI, L_simple_by1_loop);
3287 
3288     // s1 = s1 % BASE
3289     __ subs(temp0, s1, base);
3290     __ csel(s1, temp0, s1, Assembler::HS);
3291 
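         // Reduction identity used below (and in the L_nmax and L_do_mod blocks):
         // since 2^16 mod BASE = 65536 mod 65521 = 15, any 32-bit x satisfies
         //   x = (x >> 16) * 65536 + (x & 0xffff)
         //     = 15 * (x >> 16) + (x & 0xffff)   (mod BASE)
         // where 15*t is computed as (t << 4) - t; one or two folding steps plus a
         // final conditional subtract bring the running sums back below BASE.
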
3292     // s2 = s2 % BASE
3293     __ lsr(temp0, s2, 16);
3294     __ lsl(temp1, temp0, 4);
3295     __ sub(temp1, temp1, temp0);
3296     __ add(s2, temp1, s2, ext::uxth);
3297 
3298     __ subs(temp0, s2, base);
3299     __ csel(s2, temp0, s2, Assembler::HS);
3300 
3301     __ b(L_combine);
3302 
3303     __ bind(L_nmax);
3304     __ subs(len, len, nmax);
3305     __ sub(count, nmax, 16);
3306     __ br(Assembler::LO, L_by16);
3307 
3308     __ bind(L_nmax_loop);
3309 
3310     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3311 
3312     __ add(s1, s1, temp0, ext::uxtb);
3313     __ ubfx(temp2, temp0, 8, 8);
3314     __ add(s2, s2, s1);
3315     __ add(s1, s1, temp2);
3316     __ ubfx(temp2, temp0, 16, 8);
3317     __ add(s2, s2, s1);
3318     __ add(s1, s1, temp2);
3319     __ ubfx(temp2, temp0, 24, 8);
3320     __ add(s2, s2, s1);
3321     __ add(s1, s1, temp2);
3322     __ ubfx(temp2, temp0, 32, 8);
3323     __ add(s2, s2, s1);
3324     __ add(s1, s1, temp2);
3325     __ ubfx(temp2, temp0, 40, 8);
3326     __ add(s2, s2, s1);
3327     __ add(s1, s1, temp2);
3328     __ ubfx(temp2, temp0, 48, 8);
3329     __ add(s2, s2, s1);
3330     __ add(s1, s1, temp2);
3331     __ add(s2, s2, s1);
3332     __ add(s1, s1, temp0, Assembler::LSR, 56);
3333     __ add(s2, s2, s1);
3334 
3335     __ add(s1, s1, temp1, ext::uxtb);
3336     __ ubfx(temp2, temp1, 8, 8);
3337     __ add(s2, s2, s1);
3338     __ add(s1, s1, temp2);
3339     __ ubfx(temp2, temp1, 16, 8);
3340     __ add(s2, s2, s1);
3341     __ add(s1, s1, temp2);
3342     __ ubfx(temp2, temp1, 24, 8);
3343     __ add(s2, s2, s1);
3344     __ add(s1, s1, temp2);
3345     __ ubfx(temp2, temp1, 32, 8);
3346     __ add(s2, s2, s1);
3347     __ add(s1, s1, temp2);
3348     __ ubfx(temp2, temp1, 40, 8);
3349     __ add(s2, s2, s1);
3350     __ add(s1, s1, temp2);
3351     __ ubfx(temp2, temp1, 48, 8);
3352     __ add(s2, s2, s1);
3353     __ add(s1, s1, temp2);
3354     __ add(s2, s2, s1);
3355     __ add(s1, s1, temp1, Assembler::LSR, 56);
3356     __ add(s2, s2, s1);
3357 
3358     __ subs(count, count, 16);
3359     __ br(Assembler::HS, L_nmax_loop);
3360 
3361     // s1 = s1 % BASE
3362     __ lsr(temp0, s1, 16);
3363     __ lsl(temp1, temp0, 4);
3364     __ sub(temp1, temp1, temp0);
3365     __ add(temp1, temp1, s1, ext::uxth);
3366 
3367     __ lsr(temp0, temp1, 16);
3368     __ lsl(s1, temp0, 4);
3369     __ sub(s1, s1, temp0);
3370     __ add(s1, s1, temp1, ext::uxth);
3371 
3372     __ subs(temp0, s1, base);
3373     __ csel(s1, temp0, s1, Assembler::HS);
3374 
3375     // s2 = s2 % BASE
3376     __ lsr(temp0, s2, 16);
3377     __ lsl(temp1, temp0, 4);
3378     __ sub(temp1, temp1, temp0);
3379     __ add(temp1, temp1, s2, ext::uxth);
3380 
3381     __ lsr(temp0, temp1, 16);
3382     __ lsl(s2, temp0, 4);
3383     __ sub(s2, s2, temp0);
3384     __ add(s2, s2, temp1, ext::uxth);
3385 
3386     __ subs(temp0, s2, base);
3387     __ csel(s2, temp0, s2, Assembler::HS);
3388 
3389     __ subs(len, len, nmax);
3390     __ sub(count, nmax, 16);
3391     __ br(Assembler::HS, L_nmax_loop);
3392 
3393     __ bind(L_by16);
3394     __ adds(len, len, count);
3395     __ br(Assembler::LO, L_by1);
3396 
3397     __ bind(L_by16_loop);
3398 
3399     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3400 
3401     __ add(s1, s1, temp0, ext::uxtb);
3402     __ ubfx(temp2, temp0, 8, 8);
3403     __ add(s2, s2, s1);
3404     __ add(s1, s1, temp2);
3405     __ ubfx(temp2, temp0, 16, 8);
3406     __ add(s2, s2, s1);
3407     __ add(s1, s1, temp2);
3408     __ ubfx(temp2, temp0, 24, 8);
3409     __ add(s2, s2, s1);
3410     __ add(s1, s1, temp2);
3411     __ ubfx(temp2, temp0, 32, 8);
3412     __ add(s2, s2, s1);
3413     __ add(s1, s1, temp2);
3414     __ ubfx(temp2, temp0, 40, 8);
3415     __ add(s2, s2, s1);
3416     __ add(s1, s1, temp2);
3417     __ ubfx(temp2, temp0, 48, 8);
3418     __ add(s2, s2, s1);
3419     __ add(s1, s1, temp2);
3420     __ add(s2, s2, s1);
3421     __ add(s1, s1, temp0, Assembler::LSR, 56);
3422     __ add(s2, s2, s1);
3423 
3424     __ add(s1, s1, temp1, ext::uxtb);
3425     __ ubfx(temp2, temp1, 8, 8);
3426     __ add(s2, s2, s1);
3427     __ add(s1, s1, temp2);
3428     __ ubfx(temp2, temp1, 16, 8);
3429     __ add(s2, s2, s1);
3430     __ add(s1, s1, temp2);
3431     __ ubfx(temp2, temp1, 24, 8);
3432     __ add(s2, s2, s1);
3433     __ add(s1, s1, temp2);
3434     __ ubfx(temp2, temp1, 32, 8);
3435     __ add(s2, s2, s1);
3436     __ add(s1, s1, temp2);
3437     __ ubfx(temp2, temp1, 40, 8);
3438     __ add(s2, s2, s1);
3439     __ add(s1, s1, temp2);
3440     __ ubfx(temp2, temp1, 48, 8);
3441     __ add(s2, s2, s1);
3442     __ add(s1, s1, temp2);
3443     __ add(s2, s2, s1);
3444     __ add(s1, s1, temp1, Assembler::LSR, 56);
3445     __ add(s2, s2, s1);
3446 
3447     __ subs(len, len, 16);
3448     __ br(Assembler::HS, L_by16_loop);
3449 
3450     __ bind(L_by1);
3451     __ adds(len, len, 15);
3452     __ br(Assembler::LO, L_do_mod);
3453 
3454     __ bind(L_by1_loop);
3455     __ ldrb(temp0, Address(__ post(buff, 1)));
3456     __ add(s1, temp0, s1);
3457     __ add(s2, s2, s1);
3458     __ subs(len, len, 1);
3459     __ br(Assembler::HS, L_by1_loop);
3460 
3461     __ bind(L_do_mod);
3462     // s1 = s1 % BASE
3463     __ lsr(temp0, s1, 16);
3464     __ lsl(temp1, temp0, 4);
3465     __ sub(temp1, temp1, temp0);
3466     __ add(temp1, temp1, s1, ext::uxth);
3467 
3468     __ lsr(temp0, temp1, 16);
3469     __ lsl(s1, temp0, 4);
3470     __ sub(s1, s1, temp0);
3471     __ add(s1, s1, temp1, ext::uxth);
3472 
3473     __ subs(temp0, s1, base);
3474     __ csel(s1, temp0, s1, Assembler::HS);
3475 
3476     // s2 = s2 % BASE
3477     __ lsr(temp0, s2, 16);
3478     __ lsl(temp1, temp0, 4);
3479     __ sub(temp1, temp1, temp0);
3480     __ add(temp1, temp1, s2, ext::uxth);
3481 
3482     __ lsr(temp0, temp1, 16);
3483     __ lsl(s2, temp0, 4);
3484     __ sub(s2, s2, temp0);
3485     __ add(s2, s2, temp1, ext::uxth);
3486 
3487     __ subs(temp0, s2, base);
3488     __ csel(s2, temp0, s2, Assembler::HS);
3489 
3490     // Combine lower bits and higher bits
3491     __ bind(L_combine);
3492     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3493 
3494     __ ret(lr);
3495 
3496     return start;
3497   }
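
       // Scalar reference for the stub above (hedged sketch in plain C, mirroring the
       // usual zlib-style formulation; the name adler32_ref is illustrative only):
       //
       //   uint32_t adler32_ref(uint32_t adler, const uint8_t* buf, int len) {
       //     uint32_t s1 = adler & 0xffff, s2 = (adler >> 16) & 0xffff;
       //     while (len > 0) {
       //       int n = len < 5552 ? len : 5552;     // NMAX bytes between reductions
       //       len -= n;
       //       while (n-- > 0) { s1 += *buf++; s2 += s1; }
       //       s1 %= 65521; s2 %= 65521;            // BASE
       //     }
       //     return (s2 << 16) | s1;
       //   }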
3498 
3499   /**
3500    *  Arguments:
3501    *
3502    *  Input:
3503    *    c_rarg0   - x address
3504    *    c_rarg1   - x length
3505    *    c_rarg2   - y address
3506    *    c_rarg3   - y length
3507    *    c_rarg4   - z address
3508    *    c_rarg5   - z length
3509    */
3510   address generate_multiplyToLen() {
3511     __ align(CodeEntryAlignment);
3512     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3513 
3514     address start = __ pc();
3515     const Register x     = r0;
3516     const Register xlen  = r1;
3517     const Register y     = r2;
3518     const Register ylen  = r3;
3519     const Register z     = r4;
3520     const Register zlen  = r5;
3521 
3522     const Register tmp1  = r10;
3523     const Register tmp2  = r11;
3524     const Register tmp3  = r12;
3525     const Register tmp4  = r13;
3526     const Register tmp5  = r14;
3527     const Register tmp6  = r15;
3528     const Register tmp7  = r16;
3529 
3530     BLOCK_COMMENT("Entry:");
3531     __ enter(); // required for proper stackwalking of RuntimeStub frame
3532     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3533     __ leave(); // required for proper stackwalking of RuntimeStub frame
3534     __ ret(lr);
3535 
3536     return start;
3537   }
3538 
3539   address generate_squareToLen() {
3540     // The squareToLen algorithm for sizes 1..127, as described in the Java code,
3541     // is faster than multiply_to_len on some CPUs and slower on others, but
3542     // multiply_to_len gives slightly better results overall.
3543     __ align(CodeEntryAlignment);
3544     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3545     address start = __ pc();
3546 
3547     const Register x     = r0;
3548     const Register xlen  = r1;
3549     const Register z     = r2;
3550     const Register zlen  = r3;
3551     const Register y     = r4; // == x
3552     const Register ylen  = r5; // == xlen
3553 
3554     const Register tmp1  = r10;
3555     const Register tmp2  = r11;
3556     const Register tmp3  = r12;
3557     const Register tmp4  = r13;
3558     const Register tmp5  = r14;
3559     const Register tmp6  = r15;
3560     const Register tmp7  = r16;
3561 
3562     RegSet spilled_regs = RegSet::of(y, ylen);
3563     BLOCK_COMMENT("Entry:");
3564     __ enter();
3565     __ push(spilled_regs, sp);
3566     __ mov(y, x);
3567     __ mov(ylen, xlen);
3568     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3569     __ pop(spilled_regs, sp);
3570     __ leave();
3571     __ ret(lr);
3572     return start;
3573   }
3574 
3575   address generate_mulAdd() {
3576     __ align(CodeEntryAlignment);
3577     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3578 
3579     address start = __ pc();
3580 
3581     const Register out     = r0;
3582     const Register in      = r1;
3583     const Register offset  = r2;
3584     const Register len     = r3;
3585     const Register k       = r4;
3586 
3587     BLOCK_COMMENT("Entry:");
3588     __ enter();
3589     __ mul_add(out, in, offset, len, k);
3590     __ leave();
3591     __ ret(lr);
3592 
3593     return start;
3594   }
3595 
3596   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3597                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3598                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3599     // Karatsuba multiplication performs a 128*128 -> 256-bit
3600     // multiplication in three 128-bit multiplications and a few
3601     // additions.
3602     //
3603     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3604     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3605     //
3606     // Inputs:
3607     //
3608     // A0 in a.d[0]     (subkey)
3609     // A1 in a.d[1]
3610     // (A1+A0) in a1_xor_a0.d[0]
3611     //
3612     // B0 in b.d[0]     (state)
3613     // B1 in b.d[1]
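    //
    // In GF(2)[x] addition is XOR, so expanding the middle product gives
    //   (A1+A0)*(B1+B0) = A1*B1 + A1*B0 + A0*B1 + A0*B0
    // hence A1*B0 + A0*B1 = E ^ C ^ D, which is what the eor instructions below
    // accumulate into tmp2 before it is split across the two result halves.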
3614 
3615     __ ext(tmp1, __ T16B, b, b, 0x08);
3616     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3617     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3618     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3619     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3620 
3621     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3622     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3623     __ eor(tmp2, __ T16B, tmp2, tmp4);
3624     __ eor(tmp2, __ T16B, tmp2, tmp3);
3625 
3626     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3627     __ ins(result_hi, __ D, tmp2, 0, 1);
3628     __ ins(result_lo, __ D, tmp2, 1, 0);
3629   }
3630 
3631   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3632                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3633     const FloatRegister t0 = result;
3634 
3635     // The GCM field polynomial f is z^128 + p(z), where p =
3636     // z^7+z^2+z+1.
3637     //
3638     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3639     //
3640     // so, given that the product we're reducing is
3641     //    a == lo + hi * z^128
3642     // substituting,
3643     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3644     //
3645     // we reduce by multiplying hi by p(z) and subtracting the result
3646     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3647     // bits we can do this with two 64-bit multiplications, lo*p and
3648     // hi*p.
3649 
3650     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3651     __ ext(t1, __ T16B, t0, z, 8);
3652     __ eor(hi, __ T16B, hi, t1);
3653     __ ext(t1, __ T16B, z, t0, 8);
3654     __ eor(lo, __ T16B, lo, t1);
3655     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3656     __ eor(result, __ T16B, lo, t0);
3657   }
3658 
3659   address generate_has_negatives(address &has_negatives_long) {
3660     const u1 large_loop_size = 64;
3661     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3662     int dcache_line = VM_Version::dcache_line_size();
3663 
3664     Register ary1 = r1, len = r2, result = r0;
3665 
3666     __ align(CodeEntryAlignment);
3667 
3668     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3669 
3670     address entry = __ pc();
3671 
3672     __ enter();
3673 
3674   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3675         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3676 
3677   __ cmp(len, (u1)15);
3678   __ br(Assembler::GT, LEN_OVER_15);
3679   // Execution only falls into this code when the pointer is near the end of a
3680   // memory page and we must avoid reading past the page boundary.
3681   __ add(ary1, ary1, len);
3682   __ subs(len, len, 8);
3683   __ br(Assembler::GT, LEN_OVER_8);
3684   __ ldr(rscratch2, Address(ary1, -8));
3685   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3686   __ lsrv(rscratch2, rscratch2, rscratch1);
3687   __ tst(rscratch2, UPPER_BIT_MASK);
3688   __ cset(result, Assembler::NE);
3689   __ leave();
3690   __ ret(lr);
3691   __ bind(LEN_OVER_8);
3692   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3693   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
3694   __ tst(rscratch2, UPPER_BIT_MASK);
3695   __ br(Assembler::NE, RET_TRUE_NO_POP);
3696   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3697   __ lsrv(rscratch1, rscratch1, rscratch2);
3698   __ tst(rscratch1, UPPER_BIT_MASK);
3699   __ cset(result, Assembler::NE);
3700   __ leave();
3701   __ ret(lr);
3702 
3703   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3704   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3705 
3706   has_negatives_long = __ pc(); // 2nd entry point
3707 
3708   __ enter();
3709 
3710   __ bind(LEN_OVER_15);
3711     __ push(spilled_regs, sp);
3712     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3713     __ cbz(rscratch2, ALIGNED);
3714     __ ldp(tmp6, tmp1, Address(ary1));
3715     __ mov(tmp5, 16);
3716     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3717     __ add(ary1, ary1, rscratch1);
3718     __ sub(len, len, rscratch1);
3719     __ orr(tmp6, tmp6, tmp1);
3720     __ tst(tmp6, UPPER_BIT_MASK);
3721     __ br(Assembler::NE, RET_TRUE);
3722 
3723   __ bind(ALIGNED);
3724     __ cmp(len, large_loop_size);
3725     __ br(Assembler::LT, CHECK_16);
3726     // Perform a 16-byte load here as an early return: if an initially aligned
3727     // large array has negative values in its starting bytes, LARGE_LOOP would
3728     // otherwise do 4 reads instead of 1 in the worst case, which is slower.
3729     // Cases with negative bytes further ahead are barely affected; in fact they
3730     // get faster thanks to the early loads and the smaller number of
3731     // instructions and branches in LARGE_LOOP.
3732     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3733     __ sub(len, len, 16);
3734     __ orr(tmp6, tmp6, tmp1);
3735     __ tst(tmp6, UPPER_BIT_MASK);
3736     __ br(Assembler::NE, RET_TRUE);
3737     __ cmp(len, large_loop_size);
3738     __ br(Assembler::LT, CHECK_16);
3739 
3740     if (SoftwarePrefetchHintDistance >= 0
3741         && SoftwarePrefetchHintDistance >= dcache_line) {
3742       // initial prefetch
3743       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3744     }
3745   __ bind(LARGE_LOOP);
3746     if (SoftwarePrefetchHintDistance >= 0) {
3747       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3748     }
3749     // Issue the load instructions first, since that can save a few CPU/memory
3750     // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one
3751     // per ldp) it is better to generate 7 orr(...) + 1 andr(...) + 1 cbnz(...),
3752     // which saves 3 instructions per iteration and has fewer branches, but this
3753     // approach disables early return, so all 64 bytes are loaded and checked every time.
3754     __ ldp(tmp2, tmp3, Address(ary1));
3755     __ ldp(tmp4, tmp5, Address(ary1, 16));
3756     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3757     __ ldp(tmp6, tmp1, Address(ary1, 48));
3758     __ add(ary1, ary1, large_loop_size);
3759     __ sub(len, len, large_loop_size);
3760     __ orr(tmp2, tmp2, tmp3);
3761     __ orr(tmp4, tmp4, tmp5);
3762     __ orr(rscratch1, rscratch1, rscratch2);
3763     __ orr(tmp6, tmp6, tmp1);
3764     __ orr(tmp2, tmp2, tmp4);
3765     __ orr(rscratch1, rscratch1, tmp6);
3766     __ orr(tmp2, tmp2, rscratch1);
3767     __ tst(tmp2, UPPER_BIT_MASK);
3768     __ br(Assembler::NE, RET_TRUE);
3769     __ cmp(len, large_loop_size);
3770     __ br(Assembler::GE, LARGE_LOOP);
3771 
3772   __ bind(CHECK_16); // small 16-byte load pre-loop
3773     __ cmp(len, (u1)16);
3774     __ br(Assembler::LT, POST_LOOP16);
3775 
3776   __ bind(LOOP16); // small 16-byte load loop
3777     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3778     __ sub(len, len, 16);
3779     __ orr(tmp2, tmp2, tmp3);
3780     __ tst(tmp2, UPPER_BIT_MASK);
3781     __ br(Assembler::NE, RET_TRUE);
3782     __ cmp(len, (u1)16);
3783     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3784 
3785   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3786     __ cmp(len, (u1)8);
3787     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3788     __ ldr(tmp3, Address(__ post(ary1, 8)));
3789     __ sub(len, len, 8);
3790     __ tst(tmp3, UPPER_BIT_MASK);
3791     __ br(Assembler::NE, RET_TRUE);
3792 
3793   __ bind(POST_LOOP16_LOAD_TAIL);
3794     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3795     __ ldr(tmp1, Address(ary1));
3796     __ mov(tmp2, 64);
3797     __ sub(tmp4, tmp2, len, __ LSL, 3);
3798     __ lslv(tmp1, tmp1, tmp4);
3799     __ tst(tmp1, UPPER_BIT_MASK);
3800     __ br(Assembler::NE, RET_TRUE);
3801     // Fallthrough
3802 
3803   __ bind(RET_FALSE);
3804     __ pop(spilled_regs, sp);
3805     __ leave();
3806     __ mov(result, zr);
3807     __ ret(lr);
3808 
3809   __ bind(RET_TRUE);
3810     __ pop(spilled_regs, sp);
3811   __ bind(RET_TRUE_NO_POP);
3812     __ leave();
3813     __ mov(result, 1);
3814     __ ret(lr);
3815 
3816   __ bind(DONE);
3817     __ pop(spilled_regs, sp);
3818     __ leave();
3819     __ ret(lr);
3820     return entry;
3821   }
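
       // Reference semantics (hedged sketch; has_negatives_ref is an illustrative
       // name): both entry points compute the same predicate, just tuned for short
       // vs. long arrays:
       //
       //   bool has_negatives_ref(const int8_t* ary, size_t len) {
       //     for (size_t i = 0; i < len; i++)
       //       if (ary[i] < 0) return true;   // i.e. top bit (0x80) set in some byte
       //     return false;
       //   }
       //
       // The stub instead tests 8/16/64 bytes at a time against UPPER_BIT_MASK.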
3822 
3823   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3824         bool usePrefetch, Label &NOT_EQUAL) {
3825     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3826         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3827         tmp7 = r12, tmp8 = r13;
3828     Label LOOP;
3829 
3830     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3831     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3832     __ bind(LOOP);
3833     if (usePrefetch) {
3834       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3835       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3836     }
3837     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3838     __ eor(tmp1, tmp1, tmp2);
3839     __ eor(tmp3, tmp3, tmp4);
3840     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3841     __ orr(tmp1, tmp1, tmp3);
3842     __ cbnz(tmp1, NOT_EQUAL);
3843     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3844     __ eor(tmp5, tmp5, tmp6);
3845     __ eor(tmp7, tmp7, tmp8);
3846     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3847     __ orr(tmp5, tmp5, tmp7);
3848     __ cbnz(tmp5, NOT_EQUAL);
3849     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3850     __ eor(tmp1, tmp1, tmp2);
3851     __ eor(tmp3, tmp3, tmp4);
3852     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3853     __ orr(tmp1, tmp1, tmp3);
3854     __ cbnz(tmp1, NOT_EQUAL);
3855     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3856     __ eor(tmp5, tmp5, tmp6);
3857     __ sub(cnt1, cnt1, 8 * wordSize);
3858     __ eor(tmp7, tmp7, tmp8);
3859     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3860     // tmp6 is not used. MacroAssembler::subs is used here (rather than
3861     // cmp) because subs accepts an unlimited range of immediate operands.
3862     __ subs(tmp6, cnt1, loopThreshold);
3863     __ orr(tmp5, tmp5, tmp7);
3864     __ cbnz(tmp5, NOT_EQUAL);
3865     __ br(__ GE, LOOP);
3866     // post-loop
3867     __ eor(tmp1, tmp1, tmp2);
3868     __ eor(tmp3, tmp3, tmp4);
3869     __ orr(tmp1, tmp1, tmp3);
3870     __ sub(cnt1, cnt1, 2 * wordSize);
3871     __ cbnz(tmp1, NOT_EQUAL);
3872   }
3873 
3874   void generate_large_array_equals_loop_simd(int loopThreshold,
3875         bool usePrefetch, Label &NOT_EQUAL) {
3876     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3877         tmp2 = rscratch2;
3878     Label LOOP;
3879 
3880     __ bind(LOOP);
3881     if (usePrefetch) {
3882       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3883       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3884     }
3885     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3886     __ sub(cnt1, cnt1, 8 * wordSize);
3887     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3888     __ subs(tmp1, cnt1, loopThreshold);
3889     __ eor(v0, __ T16B, v0, v4);
3890     __ eor(v1, __ T16B, v1, v5);
3891     __ eor(v2, __ T16B, v2, v6);
3892     __ eor(v3, __ T16B, v3, v7);
3893     __ orr(v0, __ T16B, v0, v1);
3894     __ orr(v1, __ T16B, v2, v3);
3895     __ orr(v0, __ T16B, v0, v1);
3896     __ umov(tmp1, v0, __ D, 0);
3897     __ umov(tmp2, v0, __ D, 1);
3898     __ orr(tmp1, tmp1, tmp2);
3899     __ cbnz(tmp1, NOT_EQUAL);
3900     __ br(__ GE, LOOP);
3901   }
3902 
3903   // a1 = r1 - array1 address
3904   // a2 = r2 - array2 address
3905   // result = r0 - return value. Already contains "false"
3906   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
3907   // r3-r5 are reserved temporary registers
3908   address generate_large_array_equals() {
3909     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3910         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3911         tmp7 = r12, tmp8 = r13;
3912     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3913         SMALL_LOOP, POST_LOOP;
3914     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3915     // calculate if at least 32 prefetched bytes are used
3916     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3917     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3918     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3919     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3920         tmp5, tmp6, tmp7, tmp8);
3921 
3922     __ align(CodeEntryAlignment);
3923 
3924     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3925 
3926     address entry = __ pc();
3927     __ enter();
3928     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
3929     // also advance pointers to use post-increment instead of pre-increment
3930     __ add(a1, a1, wordSize);
3931     __ add(a2, a2, wordSize);
3932     if (AvoidUnalignedAccesses) {
3933       // Both implementations (SIMD/non-SIMD) use relatively large load
3934       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
3935       // time) on some CPUs when the address is not at least 16-byte aligned.
3936       // Arrays are currently 8-byte aligned, so do an additional 8-byte load if
3937       // needed to make at least the first address 16-byte aligned.
3938       Label ALIGNED16;
3939       __ tbz(a1, 3, ALIGNED16);
3940       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3941       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3942       __ sub(cnt1, cnt1, wordSize);
3943       __ eor(tmp1, tmp1, tmp2);
3944       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3945       __ bind(ALIGNED16);
3946     }
3947     if (UseSIMDForArrayEquals) {
3948       if (SoftwarePrefetchHintDistance >= 0) {
3949         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3950         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3951         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3952             /* prfm = */ true, NOT_EQUAL);
3953         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3954         __ br(__ LT, TAIL);
3955       }
3956       __ bind(NO_PREFETCH_LARGE_LOOP);
3957       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3958           /* prfm = */ false, NOT_EQUAL);
3959     } else {
3960       __ push(spilled_regs, sp);
3961       if (SoftwarePrefetchHintDistance >= 0) {
3962         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3963         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3964         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3965             /* prfm = */ true, NOT_EQUAL);
3966         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3967         __ br(__ LT, TAIL);
3968       }
3969       __ bind(NO_PREFETCH_LARGE_LOOP);
3970       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3971           /* prfm = */ false, NOT_EQUAL);
3972     }
3973     __ bind(TAIL);
3974       __ cbz(cnt1, EQUAL);
3975       __ subs(cnt1, cnt1, wordSize);
3976       __ br(__ LE, POST_LOOP);
3977     __ bind(SMALL_LOOP);
3978       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3979       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3980       __ subs(cnt1, cnt1, wordSize);
3981       __ eor(tmp1, tmp1, tmp2);
3982       __ cbnz(tmp1, NOT_EQUAL);
3983       __ br(__ GT, SMALL_LOOP);
3984     __ bind(POST_LOOP);
3985       __ ldr(tmp1, Address(a1, cnt1));
3986       __ ldr(tmp2, Address(a2, cnt1));
3987       __ eor(tmp1, tmp1, tmp2);
3988       __ cbnz(tmp1, NOT_EQUAL);
3989     __ bind(EQUAL);
3990       __ mov(result, true);
3991     __ bind(NOT_EQUAL);
3992       if (!UseSIMDForArrayEquals) {
3993         __ pop(spilled_regs, sp);
3994       }
3995     __ bind(NOT_EQUAL_NO_POP);
3996     __ leave();
3997     __ ret(lr);
3998     return entry;
3999   }
4000 
4001   address generate_dsin_dcos(bool isCos) {
4002     __ align(CodeEntryAlignment);
4003     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
4004     address start = __ pc();
4005     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
4006         (address)StubRoutines::aarch64::_two_over_pi,
4007         (address)StubRoutines::aarch64::_pio2,
4008         (address)StubRoutines::aarch64::_dsin_coef,
4009         (address)StubRoutines::aarch64::_dcos_coef);
4010     return start;
4011   }
4012 
4013   address generate_dlog() {
4014     __ align(CodeEntryAlignment);
4015     StubCodeMark mark(this, "StubRoutines", "dlog");
4016     address entry = __ pc();
4017     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
4018         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
4019     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
4020     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
4021         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
4022     return entry;
4023   }
4024 
4025   // code for comparing 16 bytes of strings with same encoding
4026   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
4027     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
4028     __ ldr(rscratch1, Address(__ post(str1, 8)));
4029     __ eor(rscratch2, tmp1, tmp2);
4030     __ ldr(cnt1, Address(__ post(str2, 8)));
4031     __ cbnz(rscratch2, DIFF1);
4032     __ ldr(tmp1, Address(__ post(str1, 8)));
4033     __ eor(rscratch2, rscratch1, cnt1);
4034     __ ldr(tmp2, Address(__ post(str2, 8)));
4035     __ cbnz(rscratch2, DIFF2);
4036   }
4037 
4038   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
4039   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4040       Label &DIFF2) {
4041     Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12;
4042     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
4043 
4044     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4045     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4046     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4047     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4048 
4049     __ fmovd(tmpL, vtmp3);
4050     __ eor(rscratch2, tmp3, tmpL);
4051     __ cbnz(rscratch2, DIFF2);
4052 
4053     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4054     __ umov(tmpL, vtmp3, __ D, 1);
4055     __ eor(rscratch2, tmpU, tmpL);
4056     __ cbnz(rscratch2, DIFF1);
4057 
4058     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4059     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4060     __ fmovd(tmpL, vtmp);
4061     __ eor(rscratch2, tmp3, tmpL);
4062     __ cbnz(rscratch2, DIFF2);
4063 
4064     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4065     __ umov(tmpL, vtmp, __ D, 1);
4066     __ eor(rscratch2, tmpU, tmpL);
4067     __ cbnz(rscratch2, DIFF1);
4068   }
4069 
4070   // r0  = result
4071   // r1  = str1
4072   // r2  = cnt1
4073   // r3  = str2
4074   // r4  = cnt2
4075   // r10 = tmp1
4076   // r11 = tmp2
4077   address generate_compare_long_string_different_encoding(bool isLU) {
4078     __ align(CodeEntryAlignment);
4079     StubCodeMark mark(this, "StubRoutines", isLU
4080         ? "compare_long_string_different_encoding LU"
4081         : "compare_long_string_different_encoding UL");
4082     address entry = __ pc();
4083     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4084         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, SMALL_LOOP_ENTER,
4085         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4086     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4087         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4088     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4089     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4090 
4091     int prefetchLoopExitCondition = MAX(32, SoftwarePrefetchHintDistance/2);
4092 
4093     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4094     // cnt2 == amount of characters left to compare
4095     // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL))
4096     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4097     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4098     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4099     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4100     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4101     __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1);
4102     __ eor(rscratch2, tmp1, tmp2);
4103     __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0);
4104     __ mov(rscratch1, tmp2);
4105     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4106     Register strU = isLU ? str2 : str1,
4107              strL = isLU ? str1 : str2,
4108              tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4109              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4110     __ push(spilled_regs, sp);
4111     __ sub(tmp2, strL, cnt2); // strL pointer to load from
4112     __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from
4113 
4114     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4115 
4116     if (SoftwarePrefetchHintDistance >= 0) {
4117       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4118       __ br(__ LT, SMALL_LOOP);
4119       __ bind(LARGE_LOOP_PREFETCH);
4120         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4121         __ mov(tmp4, 2);
4122         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4123         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4124           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4125           __ subs(tmp4, tmp4, 1);
4126           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4127           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4128           __ mov(tmp4, 2);
4129         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4130           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4131           __ subs(tmp4, tmp4, 1);
4132           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4133           __ sub(cnt2, cnt2, 64);
4134           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4135           __ br(__ GE, LARGE_LOOP_PREFETCH);
4136     }
4137     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4138     __ subs(cnt2, cnt2, 16);
4139     __ br(__ LT, TAIL);
4140     __ b(SMALL_LOOP_ENTER);
4141     __ bind(SMALL_LOOP); // smaller loop
4142       __ subs(cnt2, cnt2, 16);
4143     __ bind(SMALL_LOOP_ENTER);
4144       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4145       __ br(__ GE, SMALL_LOOP);
4146       __ cbz(cnt2, LOAD_LAST);
4147     __ bind(TAIL); // 1..15 characters left
4148       __ subs(zr, cnt2, -8);
4149       __ br(__ GT, TAIL_LOAD_16);
4150       __ ldrd(vtmp, Address(tmp2));
4151       __ zip1(vtmp3, __ T8B, vtmp, vtmpZ);
4152 
4153       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4154       __ fmovd(tmpL, vtmp3);
4155       __ eor(rscratch2, tmp3, tmpL);
4156       __ cbnz(rscratch2, DIFF2);
4157       __ umov(tmpL, vtmp3, __ D, 1);
4158       __ eor(rscratch2, tmpU, tmpL);
4159       __ cbnz(rscratch2, DIFF1);
4160       __ b(LOAD_LAST);
4161     __ bind(TAIL_LOAD_16);
4162       __ ldrq(vtmp, Address(tmp2));
4163       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4164       __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4165       __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4166       __ fmovd(tmpL, vtmp3);
4167       __ eor(rscratch2, tmp3, tmpL);
4168       __ cbnz(rscratch2, DIFF2);
4169 
4170       __ ldr(tmp3, Address(__ post(cnt1, 8)));
4171       __ umov(tmpL, vtmp3, __ D, 1);
4172       __ eor(rscratch2, tmpU, tmpL);
4173       __ cbnz(rscratch2, DIFF1);
4174 
4175       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4176       __ fmovd(tmpL, vtmp);
4177       __ eor(rscratch2, tmp3, tmpL);
4178       __ cbnz(rscratch2, DIFF2);
4179 
4180       __ umov(tmpL, vtmp, __ D, 1);
4181       __ eor(rscratch2, tmpU, tmpL);
4182       __ cbnz(rscratch2, DIFF1);
4183       __ b(LOAD_LAST);
4184     __ bind(DIFF2);
4185       __ mov(tmpU, tmp3);
4186     __ bind(DIFF1);
4187       __ pop(spilled_regs, sp);
4188       __ b(CALCULATE_DIFFERENCE);
4189     __ bind(LOAD_LAST);
4190       __ pop(spilled_regs, sp);
4191 
4192       __ ldrs(vtmp, Address(strL));
4193       __ ldr(tmpU, Address(strU));
4194       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4195       __ fmovd(tmpL, vtmp);
4196 
4197       __ eor(rscratch2, tmpU, tmpL);
4198       __ cbz(rscratch2, DONE);
4199 
4200     // Find the first different characters in the longwords and
4201     // compute their difference.
4202     __ bind(CALCULATE_DIFFERENCE);
4203       __ rev(rscratch2, rscratch2);
4204       __ clz(rscratch2, rscratch2);
4205       __ andr(rscratch2, rscratch2, -16);
4206       __ lsrv(tmp1, tmp1, rscratch2);
4207       __ uxthw(tmp1, tmp1);
4208       __ lsrv(rscratch1, rscratch1, rscratch2);
4209       __ uxthw(rscratch1, rscratch1);
4210       __ subw(result, tmp1, rscratch1);
4211     __ bind(DONE);
4212       __ ret(lr);
4213     return entry;
4214   }
4215 
4216   // r0  = result
4217   // r1  = str1
4218   // r2  = cnt1
4219   // r3  = str2
4220   // r4  = cnt2
4221   // r10 = tmp1
4222   // r11 = tmp2
4223   address generate_compare_long_string_same_encoding(bool isLL) {
4224     __ align(CodeEntryAlignment);
4225     StubCodeMark mark(this, "StubRoutines", isLL
4226         ? "compare_long_string_same_encoding LL"
4227         : "compare_long_string_same_encoding UU");
4228     address entry = __ pc();
4229     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4230         tmp1 = r10, tmp2 = r11;
4231     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4232         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4233         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4234     // exit the large loop when fewer than 64 bytes are left to read or we are
4235     // about to prefetch memory past the end of the array
4236     int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4237     // cnt1/cnt2 contain the number of characters to compare; cnt1 can be re-used.
4238     // Update the cnt2 counter to account for the 8 bytes already loaded.
4239     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4240     // update pointers, because of previous read
4241     __ add(str1, str1, wordSize);
4242     __ add(str2, str2, wordSize);
4243     if (SoftwarePrefetchHintDistance >= 0) {
4244       __ bind(LARGE_LOOP_PREFETCH);
4245         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4246         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4247         compare_string_16_bytes_same(DIFF, DIFF2);
4248         compare_string_16_bytes_same(DIFF, DIFF2);
4249         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4250         compare_string_16_bytes_same(DIFF, DIFF2);
4251         __ subs(rscratch2, cnt2, largeLoopExitCondition);
4252         compare_string_16_bytes_same(DIFF, DIFF2);
4253         __ br(__ GT, LARGE_LOOP_PREFETCH);
4254         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4255         // less than 16 bytes left?
4256         __ subs(cnt2, cnt2, isLL ? 16 : 8);
4257         __ br(__ LT, TAIL);
4258     }
4259     __ bind(SMALL_LOOP);
4260       compare_string_16_bytes_same(DIFF, DIFF2);
4261       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4262       __ br(__ GE, SMALL_LOOP);
4263     __ bind(TAIL);
4264       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4265       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4266       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4267       __ br(__ LE, CHECK_LAST);
4268       __ eor(rscratch2, tmp1, tmp2);
4269       __ cbnz(rscratch2, DIFF);
4270       __ ldr(tmp1, Address(__ post(str1, 8)));
4271       __ ldr(tmp2, Address(__ post(str2, 8)));
4272       __ sub(cnt2, cnt2, isLL ? 8 : 4);
4273     __ bind(CHECK_LAST);
4274       if (!isLL) {
4275         __ add(cnt2, cnt2, cnt2); // now in bytes
4276       }
4277       __ eor(rscratch2, tmp1, tmp2);
4278       __ cbnz(rscratch2, DIFF);
4279       __ ldr(rscratch1, Address(str1, cnt2));
4280       __ ldr(cnt1, Address(str2, cnt2));
4281       __ eor(rscratch2, rscratch1, cnt1);
4282       __ cbz(rscratch2, LENGTH_DIFF);
4283       // Find the first different characters in the longwords and
4284       // compute their difference.
4285     __ bind(DIFF2);
4286       __ rev(rscratch2, rscratch2);
4287       __ clz(rscratch2, rscratch2);
4288       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4289       __ lsrv(rscratch1, rscratch1, rscratch2);
4290       if (isLL) {
4291         __ lsrv(cnt1, cnt1, rscratch2);
4292         __ uxtbw(rscratch1, rscratch1);
4293         __ uxtbw(cnt1, cnt1);
4294       } else {
4295         __ lsrv(cnt1, cnt1, rscratch2);
4296         __ uxthw(rscratch1, rscratch1);
4297         __ uxthw(cnt1, cnt1);
4298       }
4299       __ subw(result, rscratch1, cnt1);
4300       __ b(LENGTH_DIFF);
4301     __ bind(DIFF);
4302       __ rev(rscratch2, rscratch2);
4303       __ clz(rscratch2, rscratch2);
4304       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4305       __ lsrv(tmp1, tmp1, rscratch2);
4306       if (isLL) {
4307         __ lsrv(tmp2, tmp2, rscratch2);
4308         __ uxtbw(tmp1, tmp1);
4309         __ uxtbw(tmp2, tmp2);
4310       } else {
4311         __ lsrv(tmp2, tmp2, rscratch2);
4312         __ uxthw(tmp1, tmp1);
4313         __ uxthw(tmp2, tmp2);
4314       }
4315       __ subw(result, tmp1, tmp2);
4316       __ b(LENGTH_DIFF);
4317     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4318       __ eor(rscratch2, tmp1, tmp2);
4319       __ cbnz(rscratch2, DIFF);
4320     __ bind(LENGTH_DIFF);
4321       __ ret(lr);
4322     return entry;
4323   }
4324 
4325   void generate_compare_long_strings() {
4326       StubRoutines::aarch64::_compare_long_string_LL
4327           = generate_compare_long_string_same_encoding(true);
4328       StubRoutines::aarch64::_compare_long_string_UU
4329           = generate_compare_long_string_same_encoding(false);
4330       StubRoutines::aarch64::_compare_long_string_LU
4331           = generate_compare_long_string_different_encoding(true);
4332       StubRoutines::aarch64::_compare_long_string_UL
4333           = generate_compare_long_string_different_encoding(false);
4334   }
4335 
4336   // R0 = result
4337   // R1 = str2
4338   // R2 = cnt1
4339   // R3 = str1
4340   // R4 = cnt2
4341   // This generic linear code uses a few additional ideas that make it faster:
4342   // 1) we can safely keep at least the first register of the pattern (since
4343   // length >= 8), skipping its initial load (helps with a single load pipeline)
4344   // 2) we can use a "fast" algorithm for finding the first character, with one
4345   // branch per loaded register instead of one per symbol; this is where the
4346   // constants 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff
4347   // come from (see the C sketch below)
4348   // 3) after loading and analyzing the first register of the source string, it
4349   // can be used to search for every occurrence of the first character, saving
4350   // a few loads compared to a simpler-but-slower implementation
4351   // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
4352   // re-initializes and compresses register values, which makes the code larger
4353   // and a bit less readable; however, most of the extra operations are issued
4354   // during loads or branches, so the penalty is minimal
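       //
       // In C, approximately, the "fast" first-character search from (2) above,
       // shown for the Latin-1 case (an illustrative sketch only; load8() is a
       // hypothetical unaligned little-endian 8-byte load, and the UTF-16 case
       // uses the 0x0001...0001/0x7fff...7fff constants analogously):
       //   uint64_t first8 = (uint64_t)first_char * 0x0101010101010101ULL; // broadcast
       //   uint64_t v      = load8(str2) ^ first8;          // matching bytes become 0
       //   uint64_t zeros  = (v - 0x0101010101010101ULL) & ~(v | 0x7f7f7f7f7f7f7f7fULL);
       //   // zeros != 0 iff some byte of str2 equals first_char; the lowest set
       //   // 0x80 bit marks the first match (higher lanes may be flagged
       //   // spuriously, which the code below resolves with explicit compares).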
4355   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4356     const char* stubName = str1_isL
4357         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4358         : "indexof_linear_uu";
4359     __ align(CodeEntryAlignment);
4360     StubCodeMark mark(this, "StubRoutines", stubName);
4361     address entry = __ pc();
4362 
4363     int str1_chr_size = str1_isL ? 1 : 2;
4364     int str2_chr_size = str2_isL ? 1 : 2;
4365     int str1_chr_shift = str1_isL ? 0 : 1;
4366     int str2_chr_shift = str2_isL ? 0 : 1;
4367     bool isL = str1_isL && str2_isL;
4368     // parameters
4369     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4370     // temporary registers
4371     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4372     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4373     // redefinitions
4374     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4375 
4376     __ push(spilled_regs, sp);
4377     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
4378         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4379         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4380         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4381         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4382         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
4383     // Read whole register from str1. It is safe, because length >=8 here
4384     __ ldr(ch1, Address(str1));
4385     // Read whole register from str2. It is safe, because length >=8 here
4386     __ ldr(ch2, Address(str2));
4387     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4388     if (str1_isL != str2_isL) {
4389       __ eor(v0, __ T16B, v0, v0);
4390     }
4391     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4392     __ mul(first, first, tmp1);
4393     // check whether we have less than one full register of characters to check
4394     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4395     if (str1_isL != str2_isL) {
4396       __ fmovd(v1, ch1);
4397     }
4398     __ br(__ LE, L_SMALL);
4399     __ eor(ch2, first, ch2);
4400     if (str1_isL != str2_isL) {
4401       __ zip1(v1, __ T16B, v1, v0);
4402     }
4403     __ sub(tmp2, ch2, tmp1);
4404     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4405     __ bics(tmp2, tmp2, ch2);
4406     if (str1_isL != str2_isL) {
4407       __ fmovd(ch1, v1);
4408     }
4409     __ br(__ NE, L_HAS_ZERO);
4410     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4411     __ add(result, result, wordSize/str2_chr_size);
4412     __ add(str2, str2, wordSize);
4413     __ br(__ LT, L_POST_LOOP);
4414     __ BIND(L_LOOP);
4415       __ ldr(ch2, Address(str2));
4416       __ eor(ch2, first, ch2);
4417       __ sub(tmp2, ch2, tmp1);
4418       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4419       __ bics(tmp2, tmp2, ch2);
4420       __ br(__ NE, L_HAS_ZERO);
4421     __ BIND(L_LOOP_PROCEED);
4422       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4423       __ add(str2, str2, wordSize);
4424       __ add(result, result, wordSize/str2_chr_size);
4425       __ br(__ GE, L_LOOP);
4426     __ BIND(L_POST_LOOP);
4427       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
4428       __ br(__ LE, NOMATCH);
4429       __ ldr(ch2, Address(str2));
4430       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4431       __ eor(ch2, first, ch2);
4432       __ sub(tmp2, ch2, tmp1);
4433       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4434       __ mov(tmp4, -1); // all bits set
4435       __ b(L_SMALL_PROCEED);
4436     __ align(OptoLoopAlignment);
4437     __ BIND(L_SMALL);
4438       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4439       __ eor(ch2, first, ch2);
4440       if (str1_isL != str2_isL) {
4441         __ zip1(v1, __ T16B, v1, v0);
4442       }
4443       __ sub(tmp2, ch2, tmp1);
4444       __ mov(tmp4, -1); // all bits set
4445       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4446       if (str1_isL != str2_isL) {
4447         __ fmovd(ch1, v1); // move converted 4 symbols
4448       }
4449     __ BIND(L_SMALL_PROCEED);
4450       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
4451       __ bic(tmp2, tmp2, ch2);
4452       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
4453       __ rbit(tmp2, tmp2);
4454       __ br(__ EQ, NOMATCH);
4455     __ BIND(L_SMALL_HAS_ZERO_LOOP);
4456       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
4457       __ cmp(cnt1, u1(wordSize/str2_chr_size));
4458       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4459       if (str2_isL) { // LL
4460         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4461         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4462         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4463         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4464         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4465       } else {
4466         __ mov(ch2, 0xE); // all bits in byte set except last one
4467         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4468         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4469         __ lslv(tmp2, tmp2, tmp4);
4470         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4471         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4472         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4473         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4474       }
4475       __ cmp(ch1, ch2);
4476       __ mov(tmp4, wordSize/str2_chr_size);
4477       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4478     __ BIND(L_SMALL_CMP_LOOP);
4479       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4480                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4481       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4482                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4483       __ add(tmp4, tmp4, 1);
4484       __ cmp(tmp4, cnt1);
4485       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4486       __ cmp(first, ch2);
4487       __ br(__ EQ, L_SMALL_CMP_LOOP);
4488     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4489       __ cbz(tmp2, NOMATCH); // no more matches. exit
4490       __ clz(tmp4, tmp2);
4491       __ add(result, result, 1); // advance index
4492       __ add(str2, str2, str2_chr_size); // advance pointer
4493       __ b(L_SMALL_HAS_ZERO_LOOP);
4494     __ align(OptoLoopAlignment);
4495     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4496       __ cmp(first, ch2);
4497       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4498       __ b(DONE);
4499     __ align(OptoLoopAlignment);
4500     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4501       if (str2_isL) { // LL
4502         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4503         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4504         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4505         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4506         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4507       } else {
4508         __ mov(ch2, 0xE); // all bits in byte set except last one
4509         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4510         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4511         __ lslv(tmp2, tmp2, tmp4);
4512         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4513         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4514         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4515         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4516       }
4517       __ cmp(ch1, ch2);
4518       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4519       __ b(DONE);
4520     __ align(OptoLoopAlignment);
4521     __ BIND(L_HAS_ZERO);
4522       __ rbit(tmp2, tmp2);
4523       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
4524       // Now compress the counters (cnt2 and cnt1) into one register. This is
4525       // fine because both counters are 32-bit and are not changed in this loop;
4526       // they are simply restored on exit, so cnt1 can be re-used in this loop.
4527       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
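           // (cnt2 now holds the original cnt1 in its upper 32 bits and the
           //  original cnt2 in its lower 32 bits; L_HAS_ZERO_LOOP_NOMATCH
           //  unpacks them again)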
4528       __ sub(result, result, 1);
4529     __ BIND(L_HAS_ZERO_LOOP);
4530       __ mov(cnt1, wordSize/str2_chr_size);
4531       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4532       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4533       if (str2_isL) {
4534         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4535         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4536         __ lslv(tmp2, tmp2, tmp4);
4537         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4538         __ add(tmp4, tmp4, 1);
4539         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4540         __ lsl(tmp2, tmp2, 1);
4541         __ mov(tmp4, wordSize/str2_chr_size);
4542       } else {
4543         __ mov(ch2, 0xE);
4544         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4545         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4546         __ lslv(tmp2, tmp2, tmp4);
4547         __ add(tmp4, tmp4, 1);
4548         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4549         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4550         __ lsl(tmp2, tmp2, 1);
4551         __ mov(tmp4, wordSize/str2_chr_size);
4552         __ sub(str2, str2, str2_chr_size);
4553       }
4554       __ cmp(ch1, ch2);
4555       __ mov(tmp4, wordSize/str2_chr_size);
4556       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4557     __ BIND(L_CMP_LOOP);
4558       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4559                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4560       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4561                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4562       __ add(tmp4, tmp4, 1);
4563       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4564       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4565       __ cmp(cnt1, ch2);
4566       __ br(__ EQ, L_CMP_LOOP);
4567     __ BIND(L_CMP_LOOP_NOMATCH);
4568       // no match at this position
4569       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4570       __ clz(tmp4, tmp2);
4571       __ add(str2, str2, str2_chr_size); // advance pointer
4572       __ b(L_HAS_ZERO_LOOP);
4573     __ align(OptoLoopAlignment);
4574     __ BIND(L_CMP_LOOP_LAST_CMP);
4575       __ cmp(cnt1, ch2);
4576       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4577       __ b(DONE);
4578     __ align(OptoLoopAlignment);
4579     __ BIND(L_CMP_LOOP_LAST_CMP2);
4580       if (str2_isL) {
4581         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4582         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4583         __ lslv(tmp2, tmp2, tmp4);
4584         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4585         __ add(tmp4, tmp4, 1);
4586         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4587         __ lsl(tmp2, tmp2, 1);
4588       } else {
4589         __ mov(ch2, 0xE);
4590         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4591         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4592         __ lslv(tmp2, tmp2, tmp4);
4593         __ add(tmp4, tmp4, 1);
4594         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4595         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4596         __ lsl(tmp2, tmp2, 1);
4597         __ sub(str2, str2, str2_chr_size);
4598       }
4599       __ cmp(ch1, ch2);
4600       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4601       __ b(DONE);
4602     __ align(OptoLoopAlignment);
4603     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
4604       // 1) Restore the "result" index. The index was a multiple of
4605       // wordSize/str2_chr_size until the L_HAS_ZERO block. The byte octet was
4606       // analyzed in L_HAS_ZERO_LOOP, so result was increased by at most
4607       // wordSize/str2_chr_size - 1 and its higher bits were not changed.
4608       // L_LOOP_PROCEED will add the number of characters analyzed, so it is
4609       // enough to clear the lower bits of result here: 2 bits for UU/UL and
4610       // 3 bits for LL.
4611       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
4612       // 3) Advance str2 past the current octet: str2 still points at its start
4613       // (result & (7 or 3) is the index of the last analyzed substring in it).
4614       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4615       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4616       __ bfm(result, zr, 0, 2 - str2_chr_shift);
4617       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4618       __ movw(cnt2, cnt2);
4619       __ b(L_LOOP_PROCEED);
4620     __ align(OptoLoopAlignment);
4621     __ BIND(NOMATCH);
4622       __ mov(result, -1);
4623     __ BIND(DONE);
4624       __ pop(spilled_regs, sp);
4625       __ ret(lr);
4626     return entry;
4627   }
4628 
4629   void generate_string_indexof_stubs() {
4630     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
4631     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
4632     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
4633   }
4634 
4635   void inflate_and_store_2_fp_registers(bool generatePrfm,
4636       FloatRegister src1, FloatRegister src2) {
4637     Register dst = r1;
4638     __ zip1(v1, __ T16B, src1, v0);
4639     __ zip2(v2, __ T16B, src1, v0);
4640     if (generatePrfm) {
4641       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4642     }
4643     __ zip1(v3, __ T16B, src2, v0);
4644     __ zip2(v4, __ T16B, src2, v0);
4645     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4646   }
4647 
4648   // R0 = src
4649   // R1 = dst
4650   // R2 = len
4651   // R3 = len >> 3
4652   // V0 = 0
4653   // v1 = loaded 8 bytes
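       //
       // In C, the inflation performed here amounts to (a sketch of the data
       // transformation only, ignoring the blocking and prefetching done below):
       //   void inflate(const uint8_t *src, uint16_t *dst, size_t len) {
       //     for (size_t i = 0; i < len; i++)
       //       dst[i] = src[i];  // zero-extend each Latin-1 byte to a UTF-16 char
       //   }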
4654   address generate_large_byte_array_inflate() {
4655     __ align(CodeEntryAlignment);
4656     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4657     address entry = __ pc();
4658     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4659     Register src = r0, dst = r1, len = r2, octetCounter = r3;
4660     const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
4661 
4662     // do one more 8-byte read so that the address is 16-byte aligned in most
4663     // cases and both chunks can be written with a single store instruction
4664     __ ldrd(v2, __ post(src, 8));
4665     __ sub(octetCounter, octetCounter, 2);
4666     __ zip1(v1, __ T16B, v1, v0);
4667     __ zip1(v2, __ T16B, v2, v0);
4668     __ st1(v1, v2, __ T16B, __ post(dst, 32));
4669     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4670     __ subs(rscratch1, octetCounter, large_loop_threshold);
4671     __ br(__ LE, LOOP_START);
4672     __ b(LOOP_PRFM_START);
4673     __ bind(LOOP_PRFM);
4674       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4675     __ bind(LOOP_PRFM_START);
4676       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4677       __ sub(octetCounter, octetCounter, 8);
4678       __ subs(rscratch1, octetCounter, large_loop_threshold);
4679       inflate_and_store_2_fp_registers(true, v3, v4);
4680       inflate_and_store_2_fp_registers(true, v5, v6);
4681       __ br(__ GT, LOOP_PRFM);
4682       __ cmp(octetCounter, (u1)8);
4683       __ br(__ LT, DONE);
4684     __ bind(LOOP);
4685       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4686       __ bind(LOOP_START);
4687       __ sub(octetCounter, octetCounter, 8);
4688       __ cmp(octetCounter, (u1)8);
4689       inflate_and_store_2_fp_registers(false, v3, v4);
4690       inflate_and_store_2_fp_registers(false, v5, v6);
4691       __ br(__ GE, LOOP);
4692     __ bind(DONE);
4693       __ ret(lr);
4694     return entry;
4695   }
4696 
4697   /**
4698    *  Arguments:
4699    *
4700    *  Input:
4701    *  c_rarg0   - current state address
4702    *  c_rarg1   - H key address
4703    *  c_rarg2   - data address
4704    *  c_rarg3   - number of blocks
4705    *
4706    *  Output:
4707    *  Updated state at c_rarg0
4708    */
4709   address generate_ghash_processBlocks() {
4710     // Bafflingly, GCM uses little-endian for the byte order, but
4711     // big-endian for the bit order.  For example, the polynomial 1 is
4712     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4713     //
4714     // So, we must either reverse the bytes in each word and do
4715     // everything big-endian or reverse the bits in each byte and do
4716     // it little-endian.  On AArch64 it's more idiomatic to reverse
4717     // the bits in each byte (we have an instruction, RBIT, to do
4718     // that) and keep the data in little-endian bit order throughout the
4719     // calculation, bit-reversing the inputs and outputs.
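         //
         // For illustration only, reversing the bits within one byte looks like
         // this in C (the stub itself uses the vector RBIT/REV64 instructions,
         // not per-byte scalar code):
         //   static inline uint8_t rbit8(uint8_t b) {
         //     b = (uint8_t)(((b & 0xF0) >> 4) | ((b & 0x0F) << 4));
         //     b = (uint8_t)(((b & 0xCC) >> 2) | ((b & 0x33) << 2));
         //     b = (uint8_t)(((b & 0xAA) >> 1) | ((b & 0x55) << 1));
         //     return b;
         //   }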
4720 
4721     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4722     __ align(wordSize * 2);
4723     address p = __ pc();
4724     __ emit_int64(0x87);  // The low-order bits of the field
4725                           // polynomial (i.e. p = z^7+z^2+z+1)
4726                           // repeated in the low and high parts of a
4727                           // 128-bit vector
4728     __ emit_int64(0x87);
4729 
4730     __ align(CodeEntryAlignment);
4731     address start = __ pc();
4732 
4733     Register state   = c_rarg0;
4734     Register subkeyH = c_rarg1;
4735     Register data    = c_rarg2;
4736     Register blocks  = c_rarg3;
4737 
4738     FloatRegister vzr = v30;
4739     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4740 
4741     __ ldrq(v0, Address(state));
4742     __ ldrq(v1, Address(subkeyH));
4743 
4744     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4745     __ rbit(v0, __ T16B, v0);
4746     __ rev64(v1, __ T16B, v1);
4747     __ rbit(v1, __ T16B, v1);
4748 
4749     __ ldrq(v26, p);
4750 
4751     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4752     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4753 
4754     {
4755       Label L_ghash_loop;
4756       __ bind(L_ghash_loop);
4757 
4758       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4759                                                  // reversing each byte
4760       __ rbit(v2, __ T16B, v2);
4761       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4762 
4763       // Multiply state in v2 by subkey in v1
4764       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4765                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4766                      /*temps*/v6, v20, v18, v21);
4767       // Reduce v7:v5 by the field polynomial
4768       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4769 
4770       __ sub(blocks, blocks, 1);
4771       __ cbnz(blocks, L_ghash_loop);
4772     }
4773 
4774     // The bit-reversed result is at this point in v0
4775     __ rev64(v1, __ T16B, v0);
4776     __ rbit(v1, __ T16B, v1);
4777 
4778     __ st1(v1, __ T16B, state);
4779     __ ret(lr);
4780 
4781     return start;
4782   }
4783 
4784   // Continuation point for throwing of implicit exceptions that are
4785   // not handled in the current activation. Fabricates an exception
4786   // oop and initiates normal exception dispatching in this
4787   // frame. Since we need to preserve callee-saved values (currently
4788   // only for C2, but done for C1 as well) we need a callee-saved oop
4789   // map and therefore have to make these stubs into RuntimeStubs
4790   // rather than BufferBlobs.  If the compiler needs all registers to
4791   // be preserved between the fault point and the exception handler
4792   // then it must assume responsibility for that in
4793   // AbstractCompiler::continuation_for_implicit_null_exception or
4794   // continuation_for_implicit_division_by_zero_exception. All other
4795   // implicit exceptions (e.g., NullPointerException or
4796   // AbstractMethodError on entry) are either at call sites or
4797   // otherwise assume that stack unwinding will be initiated, so
4798   // caller saved registers were assumed volatile in the compiler.
4799 
4800 #undef __
4801 #define __ masm->
4802 
4803   address generate_throw_exception(const char* name,
4804                                    address runtime_entry,
4805                                    Register arg1 = noreg,
4806                                    Register arg2 = noreg) {
4807     // Information about frame layout at time of blocking runtime call.
4808     // Note that we only have to preserve callee-saved registers since
4809     // the compilers are responsible for supplying a continuation point
4810     // if they expect all registers to be preserved.
4811     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4812     enum layout {
4813       rfp_off = 0,
4814       rfp_off2,
4815       return_off,
4816       return_off2,
4817       framesize // inclusive of return address
4818     };
4819 
4820     int insts_size = 512;
4821     int locs_size  = 64;
4822 
4823     CodeBuffer code(name, insts_size, locs_size);
4824     OopMapSet* oop_maps  = new OopMapSet();
4825     MacroAssembler* masm = new MacroAssembler(&code);
4826 
4827     address start = __ pc();
4828 
4829     // This is an inlined and slightly modified version of call_VM
4830     // which has the ability to fetch the return PC out of
4831     // thread-local storage and also sets up last_Java_sp slightly
4832     // differently than the real call_VM
4833 
4834     __ enter(); // Save FP and LR before call
4835 
4836     assert(is_even(framesize/2), "sp not 16-byte aligned");
4837 
4838     // lr and fp are already in place
4839     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4840 
4841     int frame_complete = __ pc() - start;
4842 
4843     // Set up last_Java_sp and last_Java_fp
4844     address the_pc = __ pc();
4845     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
4846 
4847     // Call runtime
4848     if (arg1 != noreg) {
4849       assert(arg2 != c_rarg1, "clobbered");
4850       __ mov(c_rarg1, arg1);
4851     }
4852     if (arg2 != noreg) {
4853       __ mov(c_rarg2, arg2);
4854     }
4855     __ mov(c_rarg0, rthread);
4856     BLOCK_COMMENT("call runtime_entry");
4857     __ mov(rscratch1, runtime_entry);
4858     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
4859 
4860     // Generate oop map
4861     OopMap* map = new OopMap(framesize, 0);
4862 
4863     oop_maps->add_gc_map(the_pc - start, map);
4864 
4865     __ reset_last_Java_frame(true);
4866     __ maybe_isb();
4867 
4868     __ leave();
4869 
4870     // check for pending exceptions
4871 #ifdef ASSERT
4872     Label L;
4873     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4874     __ cbnz(rscratch1, L);
4875     __ should_not_reach_here();
4876     __ bind(L);
4877 #endif // ASSERT
4878     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4879 
4880 
4881     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4882     RuntimeStub* stub =
4883       RuntimeStub::new_runtime_stub(name,
4884                                     &code,
4885                                     frame_complete,
4886                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4887                                     oop_maps, false);
4888     return stub->entry_point();
4889   }
4890 
4891   class MontgomeryMultiplyGenerator : public MacroAssembler {
4892 
4893     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4894       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4895 
4896     RegSet _toSave;
4897     bool _squaring;
4898 
4899   public:
4900     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4901       : MacroAssembler(as->code()), _squaring(squaring) {
4902 
4903       // Register allocation
4904 
4905       Register reg = c_rarg0;
4906       Pa_base = reg;       // Argument registers
4907       if (squaring)
4908         Pb_base = Pa_base;
4909       else
4910         Pb_base = ++reg;
4911       Pn_base = ++reg;
4912       Rlen= ++reg;
4913       inv = ++reg;
4914       Pm_base = ++reg;
4915 
4916                           // Working registers:
4917       Ra =  ++reg;        // The current digit of a, b, n, and m.
4918       Rb =  ++reg;
4919       Rm =  ++reg;
4920       Rn =  ++reg;
4921 
4922       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4923       Pb =  ++reg;
4924       Pm =  ++reg;
4925       Pn =  ++reg;
4926 
4927       t0 =  ++reg;        // Three registers which form a
4928       t1 =  ++reg;        // triple-precision accumulator.
4929       t2 =  ++reg;
4930 
4931       Ri =  ++reg;        // Inner and outer loop indexes.
4932       Rj =  ++reg;
4933 
4934       Rhi_ab = ++reg;     // Product registers: low and high parts
4935       Rlo_ab = ++reg;     // of a*b and m*n.
4936       Rhi_mn = ++reg;
4937       Rlo_mn = ++reg;
4938 
4939       // r19 and up are callee-saved.
4940       _toSave = RegSet::range(r19, reg) + Pm_base;
4941     }
4942 
4943   private:
4944     void save_regs() {
4945       push(_toSave, sp);
4946     }
4947 
4948     void restore_regs() {
4949       pop(_toSave, sp);
4950     }
4951 
4952     template <typename T>
4953     void unroll_2(Register count, T block) {
4954       Label loop, end, odd;
4955       tbnz(count, 0, odd);
4956       cbz(count, end);
4957       align(16);
4958       bind(loop);
4959       (this->*block)();
4960       bind(odd);
4961       (this->*block)();
4962       subs(count, count, 2);
4963       br(Assembler::GT, loop);
4964       bind(end);
4965     }
4966 
4967     template <typename T>
4968     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4969       Label loop, end, odd;
4970       tbnz(count, 0, odd);
4971       cbz(count, end);
4972       align(16);
4973       bind(loop);
4974       (this->*block)(d, s, tmp);
4975       bind(odd);
4976       (this->*block)(d, s, tmp);
4977       subs(count, count, 2);
4978       br(Assembler::GT, loop);
4979       bind(end);
4980     }
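         // Both unroll_2 overloads above emit the same 2x-unrolled control flow;
         // in C, approximately:
         //   if (count & 1) goto odd;
         //   if (count == 0) goto end;
         //   loop: block();
         //   odd:  block();
         //         count -= 2;
         //         if ((long)count > 0) goto loop;
         //   end:  ;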
4981 
4982     void pre1(RegisterOrConstant i) {
4983       block_comment("pre1");
4984       // Pa = Pa_base;
4985       // Pb = Pb_base + i;
4986       // Pm = Pm_base;
4987       // Pn = Pn_base + i;
4988       // Ra = *Pa;
4989       // Rb = *Pb;
4990       // Rm = *Pm;
4991       // Rn = *Pn;
4992       ldr(Ra, Address(Pa_base));
4993       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4994       ldr(Rm, Address(Pm_base));
4995       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4996       lea(Pa, Address(Pa_base));
4997       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4998       lea(Pm, Address(Pm_base));
4999       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5000 
5001       // Zero the m*n result.
5002       mov(Rhi_mn, zr);
5003       mov(Rlo_mn, zr);
5004     }
5005 
5006     // The core multiply-accumulate step of a Montgomery
5007     // multiplication.  The idea is to schedule operations as a
5008     // pipeline so that instructions with long latencies (loads and
5009     // multiplies) have time to complete before their results are
5010     // used.  This most benefits in-order implementations of the
5011     // architecture but out-of-order ones also benefit.
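         //
         // The MACC(A, B, t0, t1, t2) referenced in the comments below adds the
         // 128-bit product A*B into the triple-precision accumulator t2:t1:t0.
         // In C, approximately (assuming a compiler with unsigned __int128):
         //   static inline void MACC(uint64_t a, uint64_t b,
         //                           uint64_t *t0, uint64_t *t1, uint64_t *t2) {
         //     unsigned __int128 p = (unsigned __int128)a * b;
         //     unsigned __int128 s = (unsigned __int128)*t0 + (uint64_t)p;
         //     *t0 = (uint64_t)s;
         //     s = (s >> 64) + *t1 + (uint64_t)(p >> 64);
         //     *t1 = (uint64_t)s;
         //     *t2 += (uint64_t)(s >> 64);
         //   }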
5012     void step() {
5013       block_comment("step");
5014       // MACC(Ra, Rb, t0, t1, t2);
5015       // Ra = *++Pa;
5016       // Rb = *--Pb;
5017       umulh(Rhi_ab, Ra, Rb);
5018       mul(Rlo_ab, Ra, Rb);
5019       ldr(Ra, pre(Pa, wordSize));
5020       ldr(Rb, pre(Pb, -wordSize));
5021       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
5022                                        // previous iteration.
5023       // MACC(Rm, Rn, t0, t1, t2);
5024       // Rm = *++Pm;
5025       // Rn = *--Pn;
5026       umulh(Rhi_mn, Rm, Rn);
5027       mul(Rlo_mn, Rm, Rn);
5028       ldr(Rm, pre(Pm, wordSize));
5029       ldr(Rn, pre(Pn, -wordSize));
5030       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5031     }
5032 
5033     void post1() {
5034       block_comment("post1");
5035 
5036       // MACC(Ra, Rb, t0, t1, t2);
5037       // Ra = *++Pa;
5038       // Rb = *--Pb;
5039       umulh(Rhi_ab, Ra, Rb);
5040       mul(Rlo_ab, Ra, Rb);
5041       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5042       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5043 
5044       // *Pm = Rm = t0 * inv;
5045       mul(Rm, t0, inv);
5046       str(Rm, Address(Pm));
5047 
5048       // MACC(Rm, Rn, t0, t1, t2);
5049       // t0 = t1; t1 = t2; t2 = 0;
5050       umulh(Rhi_mn, Rm, Rn);
5051 
5052 #ifndef PRODUCT
5053       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5054       {
5055         mul(Rlo_mn, Rm, Rn);
5056         add(Rlo_mn, t0, Rlo_mn);
5057         Label ok;
5058         cbz(Rlo_mn, ok); {
5059           stop("broken Montgomery multiply");
5060         } bind(ok);
5061       }
5062 #endif
5063       // We have very carefully set things up so that
5064       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5065       // the lower half of Rm * Rn because we know the result already:
5066       // it must be -t0.  t0 + (-t0) must generate a carry iff
5067       // t0 != 0.  So, rather than do a mul and an adds we just set
5068       // the carry flag iff t0 is nonzero.
5069       //
5070       // mul(Rlo_mn, Rm, Rn);
5071       // adds(zr, t0, Rlo_mn);
5072       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5073       adcs(t0, t1, Rhi_mn);
5074       adc(t1, t2, zr);
5075       mov(t2, zr);
5076     }
5077 
5078     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
5079       block_comment("pre2");
5080       // Pa = Pa_base + i-len;
5081       // Pb = Pb_base + len;
5082       // Pm = Pm_base + i-len;
5083       // Pn = Pn_base + len;
5084 
5085       if (i.is_register()) {
5086         sub(Rj, i.as_register(), len);
5087       } else {
5088         mov(Rj, i.as_constant());
5089         sub(Rj, Rj, len);
5090       }
5091       // Rj == i-len
5092 
5093       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5094       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5095       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5096       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5097 
5098       // Ra = *++Pa;
5099       // Rb = *--Pb;
5100       // Rm = *++Pm;
5101       // Rn = *--Pn;
5102       ldr(Ra, pre(Pa, wordSize));
5103       ldr(Rb, pre(Pb, -wordSize));
5104       ldr(Rm, pre(Pm, wordSize));
5105       ldr(Rn, pre(Pn, -wordSize));
5106 
5107       mov(Rhi_mn, zr);
5108       mov(Rlo_mn, zr);
5109     }
5110 
5111     void post2(RegisterOrConstant i, RegisterOrConstant len) {
5112       block_comment("post2");
5113       if (i.is_constant()) {
5114         mov(Rj, i.as_constant()-len.as_constant());
5115       } else {
5116         sub(Rj, i.as_register(), len);
5117       }
5118 
5119       adds(t0, t0, Rlo_mn); // The pending m*n, low part
5120 
5121       // As soon as we know the least significant digit of our result,
5122       // store it.
5123       // Pm_base[i-len] = t0;
5124       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5125 
5126       // t0 = t1; t1 = t2; t2 = 0;
5127       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5128       adc(t1, t2, zr);
5129       mov(t2, zr);
5130     }
5131 
5132     // A carry in t0 after Montgomery multiplication means that we
5133     // should subtract multiples of n from our result in m.  We'll
5134     // keep doing that until there is no carry.
5135     void normalize(RegisterOrConstant len) {
5136       block_comment("normalize");
5137       // while (t0)
5138       //   t0 = sub(Pm_base, Pn_base, t0, len);
5139       Label loop, post, again;
5140       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5141       cbz(t0, post); {
5142         bind(again); {
5143           mov(i, zr);
5144           mov(cnt, len);
5145           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5146           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5147           subs(zr, zr, zr); // set carry flag, i.e. no borrow
5148           align(16);
5149           bind(loop); {
5150             sbcs(Rm, Rm, Rn);
5151             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5152             add(i, i, 1);
5153             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5154             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5155             sub(cnt, cnt, 1);
5156           } cbnz(cnt, loop);
5157           sbc(t0, t0, zr);
5158         } cbnz(t0, again);
5159       } bind(post);
5160     }
5161 
5162     // Move memory at s to d, reversing words.
5163     //    Increments d to end of copied memory
5164     //    Destroys tmp1, tmp2
5165     //    Preserves len
5166     //    Leaves s pointing to the address which was in d at start
5167     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
5168       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5169 
5170       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
5171       mov(tmp1, len);
5172       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
5173       sub(s, d, len, ext::uxtw, LogBytesPerWord);
5174     }
5175     // where
5176     void reverse1(Register d, Register s, Register tmp) {
5177       ldr(tmp, pre(s, -wordSize));
5178       ror(tmp, tmp, 32);
5179       str(tmp, post(d, wordSize));
5180     }
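         // In C, the data movement performed by reverse/reverse1 above is
         // approximately (a sketch only; the real code also advances d and
         // resets s as described in the comment before reverse()):
         //   for (int i = 0; i < len; i++) {
         //     uint64_t w = s[len - 1 - i];
         //     d[i] = (w << 32) | (w >> 32);  // swap 32-bit halves (ror by 32)
         //   }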
5181 
5182     void step_squaring() {
5183       // An extra ACC
5184       step();
5185       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5186     }
5187 
5188     void last_squaring(RegisterOrConstant i) {
5189       Label dont;
5190       // if ((i & 1) == 0) {
5191       tbnz(i.as_register(), 0, dont); {
5192         // MACC(Ra, Rb, t0, t1, t2);
5193         // Ra = *++Pa;
5194         // Rb = *--Pb;
5195         umulh(Rhi_ab, Ra, Rb);
5196         mul(Rlo_ab, Ra, Rb);
5197         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5198       } bind(dont);
5199     }
5200 
5201     void extra_step_squaring() {
5202       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5203 
5204       // MACC(Rm, Rn, t0, t1, t2);
5205       // Rm = *++Pm;
5206       // Rn = *--Pn;
5207       umulh(Rhi_mn, Rm, Rn);
5208       mul(Rlo_mn, Rm, Rn);
5209       ldr(Rm, pre(Pm, wordSize));
5210       ldr(Rn, pre(Pn, -wordSize));
5211     }
5212 
5213     void post1_squaring() {
5214       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5215 
5216       // *Pm = Rm = t0 * inv;
5217       mul(Rm, t0, inv);
5218       str(Rm, Address(Pm));
5219 
5220       // MACC(Rm, Rn, t0, t1, t2);
5221       // t0 = t1; t1 = t2; t2 = 0;
5222       umulh(Rhi_mn, Rm, Rn);
5223 
5224 #ifndef PRODUCT
5225       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5226       {
5227         mul(Rlo_mn, Rm, Rn);
5228         add(Rlo_mn, t0, Rlo_mn);
5229         Label ok;
5230         cbz(Rlo_mn, ok); {
5231           stop("broken Montgomery multiply");
5232         } bind(ok);
5233       }
5234 #endif
5235       // We have very carefully set things up so that
5236       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5237       // the lower half of Rm * Rn because we know the result already:
5238       // it must be -t0.  t0 + (-t0) must generate a carry iff
5239       // t0 != 0.  So, rather than do a mul and an adds we just set
5240       // the carry flag iff t0 is nonzero.
5241       //
5242       // mul(Rlo_mn, Rm, Rn);
5243       // adds(zr, t0, Rlo_mn);
5244       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5245       adcs(t0, t1, Rhi_mn);
5246       adc(t1, t2, zr);
5247       mov(t2, zr);
5248     }
5249 
5250     void acc(Register Rhi, Register Rlo,
5251              Register t0, Register t1, Register t2) {
5252       adds(t0, t0, Rlo);
5253       adcs(t1, t1, Rhi);
5254       adc(t2, t2, zr);
5255     }
5256 
5257   public:
5258     /**
5259      * Fast Montgomery multiplication.  The derivation of the
5260      * algorithm is in A Cryptographic Library for the Motorola
5261      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5262      *
5263      * Arguments:
5264      *
5265      * Inputs for multiplication:
5266      *   c_rarg0   - int array elements a
5267      *   c_rarg1   - int array elements b
5268      *   c_rarg2   - int array elements n (the modulus)
5269      *   c_rarg3   - int length
5270      *   c_rarg4   - int inv
5271      *   c_rarg5   - int array elements m (the result)
5272      *
5273      * Inputs for squaring:
5274      *   c_rarg0   - int array elements a
5275      *   c_rarg1   - int array elements n (the modulus)
5276      *   c_rarg2   - int length
5277      *   c_rarg3   - int inv
5278      *   c_rarg4   - int array elements m (the result)
5279      *
5280      */
5281     address generate_multiply() {
5282       Label argh, nothing;
5283       bind(argh);
5284       stop("MontgomeryMultiply total_allocation must be <= 8192");
5285 
5286       align(CodeEntryAlignment);
5287       address entry = pc();
5288 
5289       cbzw(Rlen, nothing);
5290 
5291       enter();
5292 
5293       // Make room.
5294       cmpw(Rlen, 512);
5295       br(Assembler::HI, argh);
5296       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5297       andr(sp, Ra, -2 * wordSize);
5298 
5299       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5300 
5301       {
5302         // Copy input args, reversing as we go.  We use Ra as a
5303         // temporary variable.
5304         reverse(Ra, Pa_base, Rlen, t0, t1);
5305         if (!_squaring)
5306           reverse(Ra, Pb_base, Rlen, t0, t1);
5307         reverse(Ra, Pn_base, Rlen, t0, t1);
5308       }
5309 
5310       // Push all call-saved registers and also Pm_base which we'll need
5311       // at the end.
5312       save_regs();
5313 
5314 #ifndef PRODUCT
5315       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
5316       {
5317         ldr(Rn, Address(Pn_base, 0));
5318         mul(Rlo_mn, Rn, inv);
5319         subs(zr, Rlo_mn, -1);
5320         Label ok;
5321         br(EQ, ok); {
5322           stop("broken inverse in Montgomery multiply");
5323         } bind(ok);
5324       }
5325 #endif
5326 
5327       mov(Pm_base, Ra);
5328 
5329       mov(t0, zr);
5330       mov(t1, zr);
5331       mov(t2, zr);
5332 
5333       block_comment("for (int i = 0; i < len; i++) {");
5334       mov(Ri, zr); {
5335         Label loop, end;
5336         cmpw(Ri, Rlen);
5337         br(Assembler::GE, end);
5338 
5339         bind(loop);
5340         pre1(Ri);
5341 
5342         block_comment("  for (j = i; j; j--) {"); {
5343           movw(Rj, Ri);
5344           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5345         } block_comment("  } // j");
5346 
5347         post1();
5348         addw(Ri, Ri, 1);
5349         cmpw(Ri, Rlen);
5350         br(Assembler::LT, loop);
5351         bind(end);
5352         block_comment("} // i");
5353       }
5354 
5355       block_comment("for (int i = len; i < 2*len; i++) {");
5356       mov(Ri, Rlen); {
5357         Label loop, end;
5358         cmpw(Ri, Rlen, Assembler::LSL, 1);
5359         br(Assembler::GE, end);
5360 
5361         bind(loop);
5362         pre2(Ri, Rlen);
5363 
5364         block_comment("  for (j = len*2-i-1; j; j--) {"); {
5365           lslw(Rj, Rlen, 1);
5366           subw(Rj, Rj, Ri);
5367           subw(Rj, Rj, 1);
5368           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5369         } block_comment("  } // j");
5370 
5371         post2(Ri, Rlen);
5372         addw(Ri, Ri, 1);
5373         cmpw(Ri, Rlen, Assembler::LSL, 1);
5374         br(Assembler::LT, loop);
5375         bind(end);
5376       }
5377       block_comment("} // i");
5378 
5379       normalize(Rlen);
5380 
5381       mov(Ra, Pm_base);  // Save Pm_base in Ra
5382       restore_regs();  // Restore caller's Pm_base
5383 
5384       // Copy our result into caller's Pm_base
5385       reverse(Pm_base, Ra, Rlen, t0, t1);
5386 
5387       leave();
5388       bind(nothing);
5389       ret(lr);
5390 
5391       return entry;
5392     }
5393     // In C, approximately:
5394 
5395     // void
5396     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
5397     //                     unsigned long Pn_base[], unsigned long Pm_base[],
5398     //                     unsigned long inv, int len) {
5399     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5400     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5401     //   unsigned long Ra, Rb, Rn, Rm;
5402 
5403     //   int i;
5404 
5405     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5406 
5407     //   for (i = 0; i < len; i++) {
5408     //     int j;
5409 
5410     //     Pa = Pa_base;
5411     //     Pb = Pb_base + i;
5412     //     Pm = Pm_base;
5413     //     Pn = Pn_base + i;
5414 
5415     //     Ra = *Pa;
5416     //     Rb = *Pb;
5417     //     Rm = *Pm;
5418     //     Rn = *Pn;
5419 
5420     //     int iters = i;
5421     //     for (j = 0; iters--; j++) {
5422     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5423     //       MACC(Ra, Rb, t0, t1, t2);
5424     //       Ra = *++Pa;
5425     //       Rb = *--Pb;
5426     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5427     //       MACC(Rm, Rn, t0, t1, t2);
5428     //       Rm = *++Pm;
5429     //       Rn = *--Pn;
5430     //     }
5431 
5432     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
5433     //     MACC(Ra, Rb, t0, t1, t2);
5434     //     *Pm = Rm = t0 * inv;
5435     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5436     //     MACC(Rm, Rn, t0, t1, t2);
5437 
5438     //     assert(t0 == 0, "broken Montgomery multiply");
5439 
5440     //     t0 = t1; t1 = t2; t2 = 0;
5441     //   }
5442 
5443     //   for (i = len; i < 2*len; i++) {
5444     //     int j;
5445 
5446     //     Pa = Pa_base + i-len;
5447     //     Pb = Pb_base + len;
5448     //     Pm = Pm_base + i-len;
5449     //     Pn = Pn_base + len;
5450 
5451     //     Ra = *++Pa;
5452     //     Rb = *--Pb;
5453     //     Rm = *++Pm;
5454     //     Rn = *--Pn;
5455 
5456     //     int iters = len*2-i-1;
5457     //     for (j = i-len+1; iters--; j++) {
5458     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5459     //       MACC(Ra, Rb, t0, t1, t2);
5460     //       Ra = *++Pa;
5461     //       Rb = *--Pb;
5462     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5463     //       MACC(Rm, Rn, t0, t1, t2);
5464     //       Rm = *++Pm;
5465     //       Rn = *--Pn;
5466     //     }
5467 
5468     //     Pm_base[i-len] = t0;
5469     //     t0 = t1; t1 = t2; t2 = 0;
5470     //   }
5471 
5472     //   while (t0)
5473     //     t0 = sub(Pm_base, Pn_base, t0, len);
5474     // }
5475 
5476     /**
5477      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
5478      * multiplies than Montgomery multiplication so it should be up to
5479      * 25% faster.  However, its loop control is more complex and it
5480      * may actually run slower on some machines.
5481      *
5482      * Arguments:
5483      *
5484      * Inputs:
5485      *   c_rarg0   - int array elements a
5486      *   c_rarg1   - int array elements n (the modulus)
5487      *   c_rarg2   - int length
5488      *   c_rarg3   - int inv
5489      *   c_rarg4   - int array elements m (the result)
5490      *
5491      */
5492     address generate_square() {
5493       Label argh;
5494       bind(argh);
5495       stop("MontgomeryMultiply total_allocation must be <= 8192");
5496 
5497       align(CodeEntryAlignment);
5498       address entry = pc();
5499 
5500       enter();
5501 
5502       // Make room.
5503       cmpw(Rlen, 512);
5504       br(Assembler::HI, argh);
5505       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5506       andr(sp, Ra, -2 * wordSize);
5507 
5508       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5509 
5510       {
5511         // Copy input args, reversing as we go.  We use Ra as a
5512         // temporary variable.
5513         reverse(Ra, Pa_base, Rlen, t0, t1);
5514         reverse(Ra, Pn_base, Rlen, t0, t1);
5515       }
5516 
5517       // Push all call-saved registers and also Pm_base which we'll need
5518       // at the end.
5519       save_regs();
5520 
5521       mov(Pm_base, Ra);
5522 
5523       mov(t0, zr);
5524       mov(t1, zr);
5525       mov(t2, zr);
5526 
5527       block_comment("for (int i = 0; i < len; i++) {");
5528       mov(Ri, zr); {
5529         Label loop, end;
5530         bind(loop);
5531         cmp(Ri, Rlen);
5532         br(Assembler::GE, end);
5533 
5534         pre1(Ri);
5535 
5536         block_comment("for (j = (i+1)/2; j; j--) {"); {
5537           add(Rj, Ri, 1);
5538           lsr(Rj, Rj, 1);
5539           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5540         } block_comment("  } // j");
5541 
5542         last_squaring(Ri);
5543 
5544         block_comment("  for (j = i/2; j; j--) {"); {
5545           lsr(Rj, Ri, 1);
5546           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5547         } block_comment("  } // j");
5548 
5549         post1_squaring();
5550         add(Ri, Ri, 1);
5551         cmp(Ri, Rlen);
5552         br(Assembler::LT, loop);
5553 
5554         bind(end);
5555         block_comment("} // i");
5556       }
5557 
5558       block_comment("for (int i = len; i < 2*len; i++) {");
5559       mov(Ri, Rlen); {
5560         Label loop, end;
5561         bind(loop);
5562         cmp(Ri, Rlen, Assembler::LSL, 1);
5563         br(Assembler::GE, end);
5564 
5565         pre2(Ri, Rlen);
5566 
5567         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
5568           lsl(Rj, Rlen, 1);
5569           sub(Rj, Rj, Ri);
5570           sub(Rj, Rj, 1);
5571           lsr(Rj, Rj, 1);
5572           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5573         } block_comment("  } // j");
5574 
5575         last_squaring(Ri);
5576 
5577         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
5578           lsl(Rj, Rlen, 1);
5579           sub(Rj, Rj, Ri);
5580           lsr(Rj, Rj, 1);
5581           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5582         } block_comment("  } // j");
5583 
5584         post2(Ri, Rlen);
5585         add(Ri, Ri, 1);
5586         cmp(Ri, Rlen, Assembler::LSL, 1);
5587 
5588         br(Assembler::LT, loop);
5589         bind(end);
5590         block_comment("} // i");
5591       }
5592 
5593       normalize(Rlen);
5594 
5595       mov(Ra, Pm_base);  // Save Pm_base in Ra
5596       restore_regs();  // Restore caller's Pm_base
5597 
5598       // Copy our result into caller's Pm_base
5599       reverse(Pm_base, Ra, Rlen, t0, t1);
5600 
5601       leave();
5602       ret(lr);
5603 
5604       return entry;
5605     }
5606     // In C, approximately:
5607 
5608     // void
5609     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
5610     //                   unsigned long Pm_base[], unsigned long inv, int len) {
5611     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5612     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5613     //   unsigned long Ra, Rb, Rn, Rm;
5614 
5615     //   int i;
5616 
5617     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5618 
5619     //   for (i = 0; i < len; i++) {
5620     //     int j;
5621 
5622     //     Pa = Pa_base;
5623     //     Pb = Pa_base + i;
5624     //     Pm = Pm_base;
5625     //     Pn = Pn_base + i;
5626 
5627     //     Ra = *Pa;
5628     //     Rb = *Pb;
5629     //     Rm = *Pm;
5630     //     Rn = *Pn;
5631 
5632     //     int iters = (i+1)/2;
5633     //     for (j = 0; iters--; j++) {
5634     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5635     //       MACC2(Ra, Rb, t0, t1, t2);
5636     //       Ra = *++Pa;
5637     //       Rb = *--Pb;
5638     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5639     //       MACC(Rm, Rn, t0, t1, t2);
5640     //       Rm = *++Pm;
5641     //       Rn = *--Pn;
5642     //     }
5643     //     if ((i & 1) == 0) {
5644     //       assert(Ra == Pa_base[j], "must be");
5645     //       MACC(Ra, Ra, t0, t1, t2);
5646     //     }
5647     //     iters = i/2;
5648     //     assert(iters == i-j, "must be");
5649     //     for (; iters--; j++) {
5650     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5651     //       MACC(Rm, Rn, t0, t1, t2);
5652     //       Rm = *++Pm;
5653     //       Rn = *--Pn;
5654     //     }
5655 
5656     //     *Pm = Rm = t0 * inv;
5657     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5658     //     MACC(Rm, Rn, t0, t1, t2);
5659 
5660     //     assert(t0 == 0, "broken Montgomery multiply");
5661 
5662     //     t0 = t1; t1 = t2; t2 = 0;
5663     //   }
5664 
5665     //   for (i = len; i < 2*len; i++) {
5666     //     int start = i-len+1;
5667     //     int end = start + (len - start)/2;
5668     //     int j;
5669 
5670     //     Pa = Pa_base + i-len;
5671     //     Pb = Pa_base + len;
5672     //     Pm = Pm_base + i-len;
5673     //     Pn = Pn_base + len;
5674 
5675     //     Ra = *++Pa;
5676     //     Rb = *--Pb;
5677     //     Rm = *++Pm;
5678     //     Rn = *--Pn;
5679 
5680     //     int iters = (2*len-i-1)/2;
5681     //     assert(iters == end-start, "must be");
5682     //     for (j = start; iters--; j++) {
5683     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5684     //       MACC2(Ra, Rb, t0, t1, t2);
5685     //       Ra = *++Pa;
5686     //       Rb = *--Pb;
5687     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5688     //       MACC(Rm, Rn, t0, t1, t2);
5689     //       Rm = *++Pm;
5690     //       Rn = *--Pn;
5691     //     }
5692     //     if ((i & 1) == 0) {
5693     //       assert(Ra == Pa_base[j], "must be");
5694     //       MACC(Ra, Ra, t0, t1, t2);
5695     //     }
5696     //     iters =  (2*len-i)/2;
5697     //     assert(iters == len-j, "must be");
5698     //     for (; iters--; j++) {
5699     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5700     //       MACC(Rm, Rn, t0, t1, t2);
5701     //       Rm = *++Pm;
5702     //       Rn = *--Pn;
5703     //     }
5704     //     Pm_base[i-len] = t0;
5705     //     t0 = t1; t1 = t2; t2 = 0;
5706     //   }
5707 
5708     //   while (t0)
5709     //     t0 = sub(Pm_base, Pn_base, t0, len);
5710     // }
5711   };
5712 
5713 
5714   // Initialization
5715   void generate_initial() {
5716     // Generate initial stubs and initialize the entry points
5717 
5718     // Entry points that exist on all platforms. Note: This is code
5719     // that could be shared among different platforms - however the
5720     // benefit seems to be smaller than the disadvantage of having a
5721     // much more complicated generator structure. See also the comment in
5722     // stubRoutines.hpp.
5723 
5724     StubRoutines::_forward_exception_entry = generate_forward_exception();
5725 
5726     StubRoutines::_call_stub_entry =
5727       generate_call_stub(StubRoutines::_call_stub_return_address);
5728 
5729     // is referenced by megamorphic call
5730     StubRoutines::_catch_exception_entry = generate_catch_exception();
5731 
5732     // Build this early so it's available for the interpreter.
5733     StubRoutines::_throw_StackOverflowError_entry =
5734       generate_throw_exception("StackOverflowError throw_exception",
5735                                CAST_FROM_FN_PTR(address,
5736                                                 SharedRuntime::throw_StackOverflowError));
5737     StubRoutines::_throw_delayed_StackOverflowError_entry =
5738       generate_throw_exception("delayed StackOverflowError throw_exception",
5739                                CAST_FROM_FN_PTR(address,
5740                                                 SharedRuntime::throw_delayed_StackOverflowError));
5741     if (UseCRC32Intrinsics) {
5742       // set the table address before generating the stubs that use it
5743       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5744       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5745     }
5746 
5747     if (UseCRC32CIntrinsics) {
5748       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5749     }
5750 
5751     // Disabled until JDK-8210858 is fixed
5752     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5753     //   StubRoutines::_dlog = generate_dlog();
5754     // }
5755 
5756     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5757       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5758     }
5759 
5760     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5761       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5762     }
5763   }
5764 
5765   void generate_all() {
5766     // support for verify_oop (must happen after universe_init)
5767     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
5768     StubRoutines::_throw_AbstractMethodError_entry =
5769       generate_throw_exception("AbstractMethodError throw_exception",
5770                                CAST_FROM_FN_PTR(address,
5771                                                 SharedRuntime::
5772                                                 throw_AbstractMethodError));
5773 
5774     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5775       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5776                                CAST_FROM_FN_PTR(address,
5777                                                 SharedRuntime::
5778                                                 throw_IncompatibleClassChangeError));
5779 
5780     StubRoutines::_throw_NullPointerException_at_call_entry =
5781       generate_throw_exception("NullPointerException at call throw_exception",
5782                                CAST_FROM_FN_PTR(address,
5783                                                 SharedRuntime::
5784                                                 throw_NullPointerException_at_call));
5785 
5786     // arraycopy stubs used by compilers
5787     generate_arraycopy_stubs();
5788 
5789     // has_negatives stub for large arrays.
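    // (Reports whether a byte array contains any bytes with the sign bit set;
    //  it backs the hasNegatives intrinsic used by the String Latin-1/UTF-16
    //  coder checks.)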
5790     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5791 
5792     // array equals stub for large arrays.
5793     if (!UseSimpleArrayEquals) {
5794       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5795     }
5796 
5797     generate_compare_long_strings();
5798 
5799     generate_string_indexof_stubs();
5800 
5801     // byte_array_inflate stub for large arrays.
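    // (Inflation widens a Latin-1 byte[] into a UTF-16 char[]; this backs the
    //  String inflate intrinsic for large arrays.)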
5802     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
5803 
5804 #ifdef COMPILER2
5805     if (UseMultiplyToLenIntrinsic) {
5806       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5807     }
5808 
5809     if (UseSquareToLenIntrinsic) {
5810       StubRoutines::_squareToLen = generate_squareToLen();
5811     }
5812 
5813     if (UseMulAddIntrinsic) {
5814       StubRoutines::_mulAdd = generate_mulAdd();
5815     }
5816 
5817     if (UseMontgomeryMultiplyIntrinsic) {
5818       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5819       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5820       StubRoutines::_montgomeryMultiply = g.generate_multiply();
5821     }
5822 
5823     if (UseMontgomerySquareIntrinsic) {
5824       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5825       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5826       // We use generate_multiply() rather than generate_square()
5827       // because it's faster for the sizes of modulus we care about.
5828       StubRoutines::_montgomerySquare = g.generate_multiply();
5829     }
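    // (For reference: a Montgomery multiply computes a * b * r^-1 mod n with
    //  r = 2^(64*len); the trailing "while (t0)" loop in the reference code
    //  above subtracts n while the result still overflows len words.  These
    //  stubs back the BigInteger Montgomery multiply/square intrinsics.)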
5830 #endif // COMPILER2
5831 
5832 #ifndef BUILTIN_SIM
5833     // generate GHASH intrinsics code
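    // (GHASH is the carry-less-multiplication-based universal hash used by
    //  AES-GCM; the stub processes full 128-bit blocks on behalf of the JCE
    //  GHASH implementation.)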
5834     if (UseGHASHIntrinsics) {
5835       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5836     }
5837 
5838     if (UseAESIntrinsics) {
5839       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5840       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5841       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5842       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
5843     }
5844 
5845     if (UseSHA1Intrinsics) {
5846       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5847       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5848     }
5849     if (UseSHA256Intrinsics) {
5850       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5851       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5852     }
5853 
5854     // generate Adler32 intrinsics code
5855     if (UseAdler32Intrinsics) {
5856       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
5857     }
5858 
5859     // Safefetch stubs.
5860     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5861                                                        &StubRoutines::_safefetch32_fault_pc,
5862                                                        &StubRoutines::_safefetch32_continuation_pc);
5863     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5864                                                        &StubRoutines::_safefetchN_fault_pc,
5865                                                        &StubRoutines::_safefetchN_continuation_pc);
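    // (SafeFetch32(adr, errValue) returns *adr, or errValue if the load
    //  faults: the signal handler recognizes the recorded fault pc and
    //  resumes execution at the continuation pc, which returns the error
    //  value.  SafeFetchN is the intptr_t-sized variant.)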
5866 #endif
5867     StubRoutines::aarch64::set_completed();
5868   }
5869 
5870  public:
5871   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5872     if (all) {
5873       generate_all();
5874     } else {
5875       generate_initial();
5876     }
5877   }
5878 }; // end class declaration
5879 
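// StubGenerator_generate is called twice during VM initialization: early in
// start-up with all == false for the initial stubs the interpreter depends on,
// and again after universe_init with all == true for the remaining stubs.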
5880 void StubGenerator_generate(CodeBuffer* code, bool all) {
5881   StubGenerator g(code, all);
5882 }