1 /*
   2  * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "nativeInst_aarch64.hpp"
  33 #include "oops/instanceOop.hpp"
  34 #include "oops/method.hpp"
  35 #include "oops/objArrayKlass.hpp"
  36 #include "oops/oop.inline.hpp"
  37 #include "prims/methodHandles.hpp"
  38 #include "runtime/frame.inline.hpp"
  39 #include "runtime/handles.inline.hpp"
  40 #include "runtime/sharedRuntime.hpp"
  41 #include "runtime/stubCodeGenerator.hpp"
  42 #include "runtime/stubRoutines.hpp"
  43 #include "runtime/thread.inline.hpp"
  44 #include "utilities/align.hpp"
  45 #ifdef COMPILER2
  46 #include "opto/runtime.hpp"
  47 #endif
  48 
  49 #ifdef BUILTIN_SIM
  50 #include "../../../../../../simulator/simulator.hpp"
  51 #endif
  52 
  53 // Declaration and definition of StubGenerator (no .hpp file).
  54 // For a more detailed description of the stub routine structure
  55 // see the comment in stubRoutines.hpp
  56 
  57 #undef __
  58 #define __ _masm->
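// TIMES_OOP scales an index register by the size of a heap oop:
// 4 bytes with compressed oops enabled, 8 bytes otherwise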
  59 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  60 
  61 #ifdef PRODUCT
  62 #define BLOCK_COMMENT(str) /* nothing */
  63 #else
  64 #define BLOCK_COMMENT(str) __ block_comment(str)
  65 #endif
  66 
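// BIND binds the label and, in non-product builds, also emits a block
// comment carrying the label name into the disassembly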
  67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  68 
  69 // Stub Code definitions
  70 
  71 class StubGenerator: public StubCodeGenerator {
  72  private:
  73 
  74 #ifdef PRODUCT
  75 #define inc_counter_np(counter) ((void)0)
  76 #else
  77   void inc_counter_np_(int& counter) {
  78     __ lea(rscratch2, ExternalAddress((address)&counter));
  79     __ ldrw(rscratch1, Address(rscratch2));
  80     __ addw(rscratch1, rscratch1, 1);
  81     __ strw(rscratch1, Address(rscratch2));
  82   }
  83 #define inc_counter_np(counter) \
  84   BLOCK_COMMENT("inc_counter " #counter); \
  85   inc_counter_np_(counter);
  86 #endif
  87 
  88   // Call stubs are used to call Java from C
  89   //
  90   // Arguments:
  91   //    c_rarg0:   call wrapper address                   address
  92   //    c_rarg1:   result                                 address
  93   //    c_rarg2:   result type                            BasicType
  94   //    c_rarg3:   method                                 Method*
  95   //    c_rarg4:   (interpreter) entry point              address
  96   //    c_rarg5:   parameters                             intptr_t*
  97   //    c_rarg6:   parameter size (in words)              int
  98   //    c_rarg7:   thread                                 Thread*
  99   //
 100   // There is no return from the stub itself as any Java result
 101   // is written to result
 102   //
 103   // we save r30 (lr) as the return PC at the base of the frame and
 104   // link r29 (fp) below it as the frame pointer installing sp (r31)
 105   // into fp.
 106   //
 107   // we save r0-r7, which accounts for all the c arguments.
 108   //
 109   // TODO: strictly do we need to save them all? they are treated as
 110   // volatile by C so could we omit saving the ones we are going to
 111   // place in global registers (thread? method?) or those we only use
 112   // during setup of the Java call?
 113   //
 114   // we don't need to save r8 which C uses as an indirect result location
 115   // return register.
 116   //
 117   // we don't need to save r9-r15 which both C and Java treat as
 118   // volatile
 119   //
 120   // we don't need to save r16-18 because Java does not use them
 121   //
 122   // we save r19-r28 which Java uses as scratch registers and C
 123   // expects to be callee-save
 124   //
 125   // we save the bottom 64 bits of each value stored in v8-v15; it is
 126   // the responsibility of the caller to preserve larger values.
 127   //
 128   // so the stub frame looks like this when we enter Java code
 129   //
 130   //     [ return_from_Java     ] <--- sp
 131   //     [ argument word n      ]
 132   //      ...
 133   // -27 [ argument word 1      ]
 134   // -26 [ saved v15            ] <--- sp_after_call
 135   // -25 [ saved v14            ]
 136   // -24 [ saved v13            ]
 137   // -23 [ saved v12            ]
 138   // -22 [ saved v11            ]
 139   // -21 [ saved v10            ]
 140   // -20 [ saved v9             ]
 141   // -19 [ saved v8             ]
 142   // -18 [ saved r28            ]
 143   // -17 [ saved r27            ]
 144   // -16 [ saved r26            ]
 145   // -15 [ saved r25            ]
 146   // -14 [ saved r24            ]
 147   // -13 [ saved r23            ]
 148   // -12 [ saved r22            ]
 149   // -11 [ saved r21            ]
 150   // -10 [ saved r20            ]
 151   //  -9 [ saved r19            ]
 152   //  -8 [ call wrapper    (r0) ]
 153   //  -7 [ result          (r1) ]
 154   //  -6 [ result type     (r2) ]
 155   //  -5 [ method          (r3) ]
 156   //  -4 [ entry point     (r4) ]
 157   //  -3 [ parameters      (r5) ]
 158   //  -2 [ parameter size  (r6) ]
 159   //  -1 [ thread (r7)          ]
 160   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 161   //   1 [ saved lr       (r30) ]
 162 
 163   // Call stub stack layout word offsets from fp
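  // n.b. registers saved in pairs with stp/stpd share a single *_off
  // entry: the named register lands at that offset and its partner in
  // the word above it (e.g. r19 ends up at r20_off + 1)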
 164   enum call_stub_layout {
 165     sp_after_call_off = -26,
 166 
 167     d15_off            = -26,
 168     d13_off            = -24,
 169     d11_off            = -22,
 170     d9_off             = -20,
 171 
 172     r28_off            = -18,
 173     r26_off            = -16,
 174     r24_off            = -14,
 175     r22_off            = -12,
 176     r20_off            = -10,
 177     call_wrapper_off   =  -8,
 178     result_off         =  -7,
 179     result_type_off    =  -6,
 180     method_off         =  -5,
 181     entry_point_off    =  -4,
 182     parameter_size_off =  -2,
 183     thread_off         =  -1,
 184     fp_f               =   0,
 185     retaddr_off        =   1,
 186   };
 187 
 188   address generate_call_stub(address& return_address) {
 189     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 190            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 191            "adjust this code");
 192 
 193     StubCodeMark mark(this, "StubRoutines", "call_stub");
 194     address start = __ pc();
 195 
 196     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 197 
 198     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 199     const Address result        (rfp, result_off         * wordSize);
 200     const Address result_type   (rfp, result_type_off    * wordSize);
 201     const Address method        (rfp, method_off         * wordSize);
 202     const Address entry_point   (rfp, entry_point_off    * wordSize);
 203     const Address parameter_size(rfp, parameter_size_off * wordSize);
 204 
 205     const Address thread        (rfp, thread_off         * wordSize);
 206 
 207     const Address d15_save      (rfp, d15_off * wordSize);
 208     const Address d13_save      (rfp, d13_off * wordSize);
 209     const Address d11_save      (rfp, d11_off * wordSize);
 210     const Address d9_save       (rfp, d9_off * wordSize);
 211 
 212     const Address r28_save      (rfp, r28_off * wordSize);
 213     const Address r26_save      (rfp, r26_off * wordSize);
 214     const Address r24_save      (rfp, r24_off * wordSize);
 215     const Address r22_save      (rfp, r22_off * wordSize);
 216     const Address r20_save      (rfp, r20_off * wordSize);
 217 
 218     // stub code
 219 
 220     // we need a C prolog to bootstrap the x86 caller into the sim
 221     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 222 
 223     address aarch64_entry = __ pc();
 224 
 225 #ifdef BUILTIN_SIM
 226     // Save sender's SP for stack traces.
 227     __ mov(rscratch1, sp);
 228     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 229 #endif
 230     // set up frame and move sp to end of save area
 231     __ enter();
 232     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 233 
 234     // save register parameters and Java scratch/global registers
 235     // n.b. we save thread even though it gets installed in
 236     // rthread because we want to sanity check rthread later
 237     __ str(c_rarg7,  thread);
 238     __ strw(c_rarg6, parameter_size);
 239     __ stp(c_rarg4, c_rarg5,  entry_point);
 240     __ stp(c_rarg2, c_rarg3,  result_type);
 241     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 242 
 243     __ stp(r20, r19,   r20_save);
 244     __ stp(r22, r21,   r22_save);
 245     __ stp(r24, r23,   r24_save);
 246     __ stp(r26, r25,   r26_save);
 247     __ stp(r28, r27,   r28_save);
 248 
 249     __ stpd(v9,  v8,   d9_save);
 250     __ stpd(v11, v10,  d11_save);
 251     __ stpd(v13, v12,  d13_save);
 252     __ stpd(v15, v14,  d15_save);
 253 
 254     // install Java thread in global register now we have saved
 255     // whatever value it held
 256     __ mov(rthread, c_rarg7);
 257     // And method
 258     __ mov(rmethod, c_rarg3);
 259 
 260     // set up the heapbase register
 261     __ reinit_heapbase();
 262 
 263 #ifdef ASSERT
 264     // make sure we have no pending exceptions
 265     {
 266       Label L;
 267       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 268       __ cmp(rscratch1, (u1)NULL_WORD);
 269       __ br(Assembler::EQ, L);
 270       __ stop("StubRoutines::call_stub: entered with pending exception");
 271       __ BIND(L);
 272     }
 273 #endif
 274     // pass parameters if any
 275     __ mov(esp, sp);
 276     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 277     __ andr(sp, rscratch1, -2 * wordSize);
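    // n.b. the andr keeps the hardware sp 16-byte aligned, as AArch64
    // requires, while reserving space for the parameter words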
 278 
 279     BLOCK_COMMENT("pass parameters if any");
 280     Label parameters_done;
 281     // parameter count is still in c_rarg6
 282     // and parameter pointer identifying param 1 is in c_rarg5
 283     __ cbzw(c_rarg6, parameters_done);
 284 
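    // copy the parameters one word at a time from the array at c_rarg5,
    // pushing each onto the stack until the count is exhausted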
 285     address loop = __ pc();
 286     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 287     __ subsw(c_rarg6, c_rarg6, 1);
 288     __ push(rscratch1);
 289     __ br(Assembler::GT, loop);
 290 
 291     __ BIND(parameters_done);
 292 
 293     // call Java entry -- passing Method* and current sp
 294     //      rmethod: Method*
 295     //      r13: sender sp
 296     BLOCK_COMMENT("call Java function");
 297     __ mov(r13, sp);
 298     __ blr(c_rarg4);
 299 
 300     // tell the simulator we have returned to the stub
 301 
 302     // we do this here because the notify will already have been done
 303     // if we get to the next instruction via an exception
 304     //
 305     // n.b. adding this instruction here affects the calculation of
 306     // whether or not a routine returns to the call stub (used when
 307     // doing stack walks) since the normal test is to check the return
 308     // pc against the address saved below. so we may need to allow for
 309     // this extra instruction in the check.
 310 
 311     if (NotifySimulator) {
 312       __ notify(Assembler::method_reentry);
 313     }
 314     // save current address for use by exception handling code
 315 
 316     return_address = __ pc();
 317 
 318     // store result depending on type (everything that is not
 319     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 320     // n.b. this assumes Java returns an integral result in r0
 321     // and a floating result in j_farg0
 322     __ ldr(j_rarg2, result);
 323     Label is_long, is_float, is_double, exit;
 324     __ ldr(j_rarg1, result_type);
 325     __ cmp(j_rarg1, (u1)T_OBJECT);
 326     __ br(Assembler::EQ, is_long);
 327     __ cmp(j_rarg1, (u1)T_LONG);
 328     __ br(Assembler::EQ, is_long);
 329     __ cmp(j_rarg1, (u1)T_FLOAT);
 330     __ br(Assembler::EQ, is_float);
 331     __ cmp(j_rarg1, (u1)T_DOUBLE);
 332     __ br(Assembler::EQ, is_double);
 333 
 334     // handle T_INT case
 335     __ strw(r0, Address(j_rarg2));
 336 
 337     __ BIND(exit);
 338 
 339     // pop parameters
 340     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 341 
 342 #ifdef ASSERT
 343     // verify that threads correspond
 344     {
 345       Label L, S;
 346       __ ldr(rscratch1, thread);
 347       __ cmp(rthread, rscratch1);
 348       __ br(Assembler::NE, S);
 349       __ get_thread(rscratch1);
 350       __ cmp(rthread, rscratch1);
 351       __ br(Assembler::EQ, L);
 352       __ BIND(S);
 353       __ stop("StubRoutines::call_stub: threads must correspond");
 354       __ BIND(L);
 355     }
 356 #endif
 357 
 358     // restore callee-save registers
 359     __ ldpd(v15, v14,  d15_save);
 360     __ ldpd(v13, v12,  d13_save);
 361     __ ldpd(v11, v10,  d11_save);
 362     __ ldpd(v9,  v8,   d9_save);
 363 
 364     __ ldp(r28, r27,   r28_save);
 365     __ ldp(r26, r25,   r26_save);
 366     __ ldp(r24, r23,   r24_save);
 367     __ ldp(r22, r21,   r22_save);
 368     __ ldp(r20, r19,   r20_save);
 369 
 370     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 371     __ ldrw(c_rarg2, result_type);
 372     __ ldr(c_rarg3,  method);
 373     __ ldp(c_rarg4, c_rarg5,  entry_point);
 374     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 375 
 376 #ifndef PRODUCT
 377     // tell the simulator we are about to end Java execution
 378     if (NotifySimulator) {
 379       __ notify(Assembler::method_exit);
 380     }
 381 #endif
 382     // leave frame and return to caller
 383     __ leave();
 384     __ ret(lr);
 385 
 386     // handle return types different from T_INT
 387 
 388     __ BIND(is_long);
 389     __ str(r0, Address(j_rarg2, 0));
 390     __ br(Assembler::AL, exit);
 391 
 392     __ BIND(is_float);
 393     __ strs(j_farg0, Address(j_rarg2, 0));
 394     __ br(Assembler::AL, exit);
 395 
 396     __ BIND(is_double);
 397     __ strd(j_farg0, Address(j_rarg2, 0));
 398     __ br(Assembler::AL, exit);
 399 
 400     return start;
 401   }
 402 
 403   // Return point for a Java call if there's an exception thrown in
 404   // Java code.  The exception is caught and transformed into a
 405   // pending exception stored in JavaThread that can be tested from
 406   // within the VM.
 407   //
 408   // Note: Usually the parameters are removed by the callee. In case
 409   // of an exception crossing an activation frame boundary, that is
 410   // not the case if the callee is compiled code => need to setup the
 411   // rsp.
 412   //
 413   // r0: exception oop
 414 
 415   // NOTE: this is used as a target from the signal handler so it
 416   // needs an x86 prolog which returns into the current simulator
 417   // executing the generated catch_exception code. so the prolog
 418   // needs to install rax in a sim register and adjust the sim's
 419   // restart pc to enter the generated code at the start position
 420   // then return from native to simulated execution.
 421 
 422   address generate_catch_exception() {
 423     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 424     address start = __ pc();
 425 
 426     // same as in generate_call_stub():
 427     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 428     const Address thread        (rfp, thread_off         * wordSize);
 429 
 430 #ifdef ASSERT
 431     // verify that threads correspond
 432     {
 433       Label L, S;
 434       __ ldr(rscratch1, thread);
 435       __ cmp(rthread, rscratch1);
 436       __ br(Assembler::NE, S);
 437       __ get_thread(rscratch1);
 438       __ cmp(rthread, rscratch1);
 439       __ br(Assembler::EQ, L);
 440       __ bind(S);
 441       __ stop("StubRoutines::catch_exception: threads must correspond");
 442       __ bind(L);
 443     }
 444 #endif
 445 
 446     // set pending exception
 447     __ verify_oop(r0);
 448 
 449     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 450     __ mov(rscratch1, (address)__FILE__);
 451     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 452     __ movw(rscratch1, (int)__LINE__);
 453     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 454 
 455     // complete return to VM
 456     assert(StubRoutines::_call_stub_return_address != NULL,
 457            "_call_stub_return_address must have been generated before");
 458     __ b(StubRoutines::_call_stub_return_address);
 459 
 460     return start;
 461   }
 462 
 463   // Continuation point for runtime calls returning with a pending
 464   // exception.  The pending exception check happened in the runtime
 465   // or native call stub.  The pending exception in Thread is
 466   // converted into a Java-level exception.
 467   //
 468   // Contract with Java-level exception handlers:
 469   // r0: exception
 470   // r3: throwing pc
 471   //
 472   // NOTE: At entry of this stub, exception-pc must be in LR !!
 473 
 474   // NOTE: this is always used as a jump target within generated code
 475   // so it just needs to be generated code with no x86 prolog
 476 
 477   address generate_forward_exception() {
 478     StubCodeMark mark(this, "StubRoutines", "forward exception");
 479     address start = __ pc();
 480 
 481     // Upon entry, LR points to the return address returning into
 482     // Java (interpreted or compiled) code; i.e., the return address
 483     // becomes the throwing pc.
 484     //
 485     // Arguments pushed before the runtime call are still on the stack
 486     // but the exception handler will reset the stack pointer ->
 487     // ignore them.  A potential result in registers can be ignored as
 488     // well.
 489 
 490 #ifdef ASSERT
 491     // make sure this code is only executed if there is a pending exception
 492     {
 493       Label L;
 494       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 495       __ cbnz(rscratch1, L);
 496       __ stop("StubRoutines::forward exception: no pending exception (1)");
 497       __ bind(L);
 498     }
 499 #endif
 500 
 501     // compute exception handler into r19
 502 
 503     // call the VM to find the handler address associated with the
 504     // caller address. pass thread in r0 and caller pc (ret address)
 505     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 506     // the stack.
 507     __ mov(c_rarg1, lr);
 508     // lr will be trashed by the VM call so we move it to R19
 509     // (callee-saved) because we also need to pass it to the handler
 510     // returned by this call.
 511     __ mov(r19, lr);
 512     BLOCK_COMMENT("call exception_handler_for_return_address");
 513     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 514                          SharedRuntime::exception_handler_for_return_address),
 515                     rthread, c_rarg1);
 516     // we should not really care that lr is no longer the callee
 517     // address. we saved the value the handler needs in r19 so we can
 518     // just copy it to r3. however, the C2 handler will push its own
 519     // frame and then call into the VM, and the VM code asserts that
 520     // the PC for the frame above the handler belongs to a compiled
 521     // Java method. So, we restore lr here to satisfy that assert.
 522     __ mov(lr, r19);
 523     // setup r0 & r3 & clear pending exception
 524     __ mov(r3, r19);
 525     __ mov(r19, r0);
 526     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 527     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 528 
 529 #ifdef ASSERT
 530     // make sure exception is set
 531     {
 532       Label L;
 533       __ cbnz(r0, L);
 534       __ stop("StubRoutines::forward exception: no pending exception (2)");
 535       __ bind(L);
 536     }
 537 #endif
 538 
 539     // continue at exception handler
 540     // r0: exception
 541     // r3: throwing pc
 542     // r19: exception handler
 543     __ verify_oop(r0);
 544     __ br(r19);
 545 
 546     return start;
 547   }
 548 
 549   // Non-destructive plausibility checks for oops
 550   //
 551   // Arguments:
 552   //    r0: oop to verify
 553   //    rscratch1: error message
 554   //
 555   // Stack after saving c_rarg3:
 556   //    [tos + 0]: saved c_rarg3
 557   //    [tos + 1]: saved c_rarg2
 558   //    [tos + 2]: saved lr
 559   //    [tos + 3]: saved rscratch2
 560   //    [tos + 4]: saved r0
 561   //    [tos + 5]: saved rscratch1
 562   address generate_verify_oop() {
 563 
 564     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 565     address start = __ pc();
 566 
 567     Label exit, error;
 568 
 569     // save c_rarg2 and c_rarg3
 570     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 571 
 572     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 573     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 574     __ ldr(c_rarg3, Address(c_rarg2));
 575     __ add(c_rarg3, c_rarg3, 1);
 576     __ str(c_rarg3, Address(c_rarg2));
 577 
 578     // object is in r0
 579     // make sure object is 'reasonable'
 580     __ cbz(r0, exit); // if obj is NULL it is OK
 581 
 582     // Check if the oop is in the right area of memory
 583     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 584     __ andr(c_rarg2, r0, c_rarg3);
 585     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 586 
 587     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 588     // instruction here because the flags register is live.
 589     __ eor(c_rarg2, c_rarg2, c_rarg3);
 590     __ cbnz(c_rarg2, error);
 591 
 592     // make sure klass is 'reasonable', which is not zero.
 593     __ load_klass(r0, r0);  // get klass
 594     __ cbz(r0, error);      // if klass is NULL it is broken
 595 
 596     // return if everything seems ok
 597     __ bind(exit);
 598 
 599     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 600     __ ret(lr);
 601 
 602     // handle errors
 603     __ bind(error);
 604     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 605 
 606     __ push(RegSet::range(r0, r29), sp);
 607     // debug(char* msg, int64_t pc, int64_t regs[])
 608     __ mov(c_rarg0, rscratch1);      // pass address of error message
 609     __ mov(c_rarg1, lr);             // pass return address
 610     __ mov(c_rarg2, sp);             // pass address of regs on stack
 611 #ifndef PRODUCT
 612     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 613 #endif
 614     BLOCK_COMMENT("call MacroAssembler::debug");
 615     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 616     __ blrt(rscratch1, 3, 0, 1);
 617 
 618     return start;
 619   }
 620 
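  // n.b. this helper unconditionally takes the no-overlap path; the
  // real overlap check for conjoint copies is emitted inline in
  // generate_conjoint_copy below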
 621   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 622 
 623   // The inner part of zero_words().  This is the bulk operation,
 624   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 625   // caller is responsible for zeroing the last few words.
 626   //
 627   // Inputs:
 628   // r10: the HeapWord-aligned base address of an array to zero.
 629   // r11: the count in HeapWords, r11 > 0.
 630   //
 631   // Returns r10 and r11, adjusted for the caller to clear.
 632   // r10: the base address of the tail of words left to clear.
 633   // r11: the number of words in the tail.
 634   //      r11 < MacroAssembler::zero_words_block_size.
 635 
 636   address generate_zero_blocks() {
 637     Label store_pair, loop_store_pair, done;
 638     Label base_aligned;
 639 
 640     Register base = r10, cnt = r11;
 641 
 642     __ align(CodeEntryAlignment);
 643     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 644     address start = __ pc();
 645 
 646     if (UseBlockZeroing) {
 647       int zva_length = VM_Version::zva_length();
 648 
 649       // Ensure ZVA length can be divided by 16. This is required by
 650       // the subsequent operations.
 651       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 652 
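      // align base to 16 bytes: if bit 3 is set, zero a single word by
      // hand and drop it from the count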
 653       __ tbz(base, 3, base_aligned);
 654       __ str(zr, Address(__ post(base, 8)));
 655       __ sub(cnt, cnt, 1);
 656       __ bind(base_aligned);
 657 
 658       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 659       // alignment.
 660       Label small;
 661       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 662       __ subs(rscratch1, cnt, low_limit >> 3);
 663       __ br(Assembler::LT, small);
 664       __ zero_dcache_blocks(base, cnt);
 665       __ bind(small);
 666     }
 667 
 668     {
 669       // Number of stp instructions we'll unroll
 670       const int unroll =
 671         MacroAssembler::zero_words_block_size / 2;
 672       // Clear the remaining blocks.
 673       Label loop;
 674       __ subs(cnt, cnt, unroll * 2);
 675       __ br(Assembler::LT, done);
 676       __ bind(loop);
 677       for (int i = 0; i < unroll; i++)
 678         __ stp(zr, zr, __ post(base, 16));
 679       __ subs(cnt, cnt, unroll * 2);
 680       __ br(Assembler::GE, loop);
 681       __ bind(done);
 682       __ add(cnt, cnt, unroll * 2);
 683     }
 684 
 685     __ ret(lr);
 686 
 687     return start;
 688   }
 689 
 690 
 691   typedef enum {
 692     copy_forwards = 1,
 693     copy_backwards = -1
 694   } copy_direction;
 695 
 696   // Bulk copy of blocks of 8 words.
 697   //
 698   // count is a count of words.
 699   //
 700   // Precondition: count >= 8
 701   //
 702   // Postconditions:
 703   //
 704   // The least significant bit of count contains the remaining count
 705   // of words to copy.  The rest of count is trash.
 706   //
 707   // s and d are adjusted to point to the remaining words to copy
 708   //
 709   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 710                            copy_direction direction) {
 711     int unit = wordSize * direction;
 712     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
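    // bias backs s and d up in the forwards case so that the
    // 2/4/6/8 * unit offsets used by the load/store pairs below all
    // address the current block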
 713 
 714     int offset;
 715     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 716       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 717     const Register stride = r13;
 718 
 719     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 720     assert_different_registers(s, d, count, rscratch1);
 721 
 722     Label again, drain;
 723     const char *stub_name;
 724     if (direction == copy_forwards)
 725       stub_name = "forward_copy_longs";
 726     else
 727       stub_name = "backward_copy_longs";
 728     StubCodeMark mark(this, "StubRoutines", stub_name);
 729     __ align(CodeEntryAlignment);
 730     __ bind(start);
 731 
 732     Label unaligned_copy_long;
 733     if (AvoidUnalignedAccesses) {
 734       __ tbnz(d, 3, unaligned_copy_long);
 735     }
 736 
 737     if (direction == copy_forwards) {
 738       __ sub(s, s, bias);
 739       __ sub(d, d, bias);
 740     }
 741 
 742 #ifdef ASSERT
 743     // Make sure we are never given < 8 words
 744     {
 745       Label L;
 746       __ cmp(count, (u1)8);
 747       __ br(Assembler::GE, L);
 748       __ stop("generate_copy_longs called with < 8 words");
 749       __ bind(L);
 750     }
 751 #endif
 752 
 753     // Fill 8 registers
 754     if (UseSIMDForMemoryOps) {
 755       __ ldpq(v0, v1, Address(s, 4 * unit));
 756       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 757     } else {
 758       __ ldp(t0, t1, Address(s, 2 * unit));
 759       __ ldp(t2, t3, Address(s, 4 * unit));
 760       __ ldp(t4, t5, Address(s, 6 * unit));
 761       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 762     }
 763 
 764     __ subs(count, count, 16);
 765     __ br(Assembler::LO, drain);
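    // eight words are already loaded; if fewer than 16 remain we cannot
    // run the main loop, so go straight to the drain which stores them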
 766 
 767     int prefetch = PrefetchCopyIntervalInBytes;
 768     bool use_stride = false;
 769     if (direction == copy_backwards) {
 770        use_stride = prefetch > 256;
 771        prefetch = -prefetch;
 772        if (use_stride) __ mov(stride, prefetch);
 773     }
 774 
 775     __ bind(again);
 776 
 777     if (PrefetchCopyIntervalInBytes > 0)
 778       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 779 
 780     if (UseSIMDForMemoryOps) {
 781       __ stpq(v0, v1, Address(d, 4 * unit));
 782       __ ldpq(v0, v1, Address(s, 4 * unit));
 783       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 784       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 785     } else {
 786       __ stp(t0, t1, Address(d, 2 * unit));
 787       __ ldp(t0, t1, Address(s, 2 * unit));
 788       __ stp(t2, t3, Address(d, 4 * unit));
 789       __ ldp(t2, t3, Address(s, 4 * unit));
 790       __ stp(t4, t5, Address(d, 6 * unit));
 791       __ ldp(t4, t5, Address(s, 6 * unit));
 792       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 793       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 794     }
 795 
 796     __ subs(count, count, 8);
 797     __ br(Assembler::HS, again);
 798 
 799     // Drain
 800     __ bind(drain);
 801     if (UseSIMDForMemoryOps) {
 802       __ stpq(v0, v1, Address(d, 4 * unit));
 803       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 804     } else {
 805       __ stp(t0, t1, Address(d, 2 * unit));
 806       __ stp(t2, t3, Address(d, 4 * unit));
 807       __ stp(t4, t5, Address(d, 6 * unit));
 808       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 809     }
 810 
 811     {
 812       Label L1, L2;
 813       __ tbz(count, exact_log2(4), L1);
 814       if (UseSIMDForMemoryOps) {
 815         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 816         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 817       } else {
 818         __ ldp(t0, t1, Address(s, 2 * unit));
 819         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 820         __ stp(t0, t1, Address(d, 2 * unit));
 821         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 822       }
 823       __ bind(L1);
 824 
 825       if (direction == copy_forwards) {
 826         __ add(s, s, bias);
 827         __ add(d, d, bias);
 828       }
 829 
 830       __ tbz(count, 1, L2);
 831       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 832       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 833       __ bind(L2);
 834     }
 835 
 836     __ ret(lr);
 837 
 838     if (AvoidUnalignedAccesses) {
 839       Label drain, again;
 840       // Register order for storing. Order is different for backward copy.
 841 
 842       __ bind(unaligned_copy_long);
 843 
 844       // source address is even aligned, target odd aligned
 845       //
 846       // when forward copying word pairs we read long pairs at offsets
 847       // {0, 2, 4, 6} (in long words). when backwards copying we read
 848       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 849       // address by -2 in the forwards case so we can compute the
 850       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 851       // or -1.
 852       //
 853       // when forward copying we need to store 1 word, 3 pairs and
 854       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 855       // zero offset we adjust the destination by -1, which means we
 856       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 857       //
 858       // When backwards copying we need to store 1 word, 3 pairs and
 859       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 860       // offsets {1, 3, 5, 7, 8} * unit.
 861 
 862       if (direction == copy_forwards) {
 863         __ sub(s, s, 16);
 864         __ sub(d, d, 8);
 865       }
 866 
 867       // Fill 8 registers
 868       //
 869       // for forwards copy s was offset by -16 from the original input
 870       // value of s so the register contents are at these offsets
 871       // relative to the 64 byte block addressed by that original input
 872       // and so on for each successive 64 byte block when s is updated
 873       //
 874       // t0 at offset 0,  t1 at offset 8
 875       // t2 at offset 16, t3 at offset 24
 876       // t4 at offset 32, t5 at offset 40
 877       // t6 at offset 48, t7 at offset 56
 878 
 879       // for backwards copy s was not offset so the register contents
 880       // are at these offsets into the preceding 64 byte block
 881       // relative to that original input and so on for each successive
 882       // preceding 64 byte block when s is updated. this explains the
 883       // slightly counter-intuitive looking pattern of register usage
 884       // in the stp instructions for backwards copy.
 885       //
 886       // t0 at offset -16, t1 at offset -8
 887       // t2 at offset -32, t3 at offset -24
 888       // t4 at offset -48, t5 at offset -40
 889       // t6 at offset -64, t7 at offset -56
 890 
 891       __ ldp(t0, t1, Address(s, 2 * unit));
 892       __ ldp(t2, t3, Address(s, 4 * unit));
 893       __ ldp(t4, t5, Address(s, 6 * unit));
 894       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 895 
 896       __ subs(count, count, 16);
 897       __ br(Assembler::LO, drain);
 898 
 899       int prefetch = PrefetchCopyIntervalInBytes;
 900       bool use_stride = false;
 901       if (direction == copy_backwards) {
 902          use_stride = prefetch > 256;
 903          prefetch = -prefetch;
 904          if (use_stride) __ mov(stride, prefetch);
 905       }
 906 
 907       __ bind(again);
 908 
 909       if (PrefetchCopyIntervalInBytes > 0)
 910         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 911 
 912       if (direction == copy_forwards) {
 913        // allowing for the offset of -8 the store instructions place
 914        // registers into the target 64 byte block at the following
 915        // offsets
 916        //
 917        // t0 at offset 0
 918        // t1 at offset 8,  t2 at offset 16
 919        // t3 at offset 24, t4 at offset 32
 920        // t5 at offset 40, t6 at offset 48
 921        // t7 at offset 56
 922 
 923         __ str(t0, Address(d, 1 * unit));
 924         __ stp(t1, t2, Address(d, 2 * unit));
 925         __ ldp(t0, t1, Address(s, 2 * unit));
 926         __ stp(t3, t4, Address(d, 4 * unit));
 927         __ ldp(t2, t3, Address(s, 4 * unit));
 928         __ stp(t5, t6, Address(d, 6 * unit));
 929         __ ldp(t4, t5, Address(s, 6 * unit));
 930         __ str(t7, Address(__ pre(d, 8 * unit)));
 931         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 932       } else {
 933        // d was not offset when we started so the registers are
 934        // written into the 64 byte block preceding d with the following
 935        // offsets
 936        //
 937        // t1 at offset -8
 938        // t3 at offset -24, t0 at offset -16
 939        // t5 at offset -40, t2 at offset -32
 940        // t7 at offset -56, t4 at offset -48
 941        //                   t6 at offset -64
 942        //
 943        // note that this matches the offsets previously noted for the
 944        // loads
 945 
 946         __ str(t1, Address(d, 1 * unit));
 947         __ stp(t3, t0, Address(d, 3 * unit));
 948         __ ldp(t0, t1, Address(s, 2 * unit));
 949         __ stp(t5, t2, Address(d, 5 * unit));
 950         __ ldp(t2, t3, Address(s, 4 * unit));
 951         __ stp(t7, t4, Address(d, 7 * unit));
 952         __ ldp(t4, t5, Address(s, 6 * unit));
 953         __ str(t6, Address(__ pre(d, 8 * unit)));
 954         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 955       }
 956 
 957       __ subs(count, count, 8);
 958       __ br(Assembler::HS, again);
 959 
 960       // Drain
 961       //
 962       // this uses the same pattern of offsets and register arguments
 963       // as above
 964       __ bind(drain);
 965       if (direction == copy_forwards) {
 966         __ str(t0, Address(d, 1 * unit));
 967         __ stp(t1, t2, Address(d, 2 * unit));
 968         __ stp(t3, t4, Address(d, 4 * unit));
 969         __ stp(t5, t6, Address(d, 6 * unit));
 970         __ str(t7, Address(__ pre(d, 8 * unit)));
 971       } else {
 972         __ str(t1, Address(d, 1 * unit));
 973         __ stp(t3, t0, Address(d, 3 * unit));
 974         __ stp(t5, t2, Address(d, 5 * unit));
 975         __ stp(t7, t4, Address(d, 7 * unit));
 976         __ str(t6, Address(__ pre(d, 8 * unit)));
 977       }
 978       // now we need to copy any remaining part block which may
 979       // include a 4 word subblock and/or a 2 word subblock.
 980       // bits 2 and 1 in the count are the tell-tale for whether we
 981       // have each such subblock
 982       {
 983         Label L1, L2;
 984         __ tbz(count, exact_log2(4), L1);
 985        // this is the same as above but copying only 4 longs hence
 986        // with only one intervening stp between the str instructions
 987        // but note that the offsets and registers still follow the
 988        // same pattern
 989         __ ldp(t0, t1, Address(s, 2 * unit));
 990         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 991         if (direction == copy_forwards) {
 992           __ str(t0, Address(d, 1 * unit));
 993           __ stp(t1, t2, Address(d, 2 * unit));
 994           __ str(t3, Address(__ pre(d, 4 * unit)));
 995         } else {
 996           __ str(t1, Address(d, 1 * unit));
 997           __ stp(t3, t0, Address(d, 3 * unit));
 998           __ str(t2, Address(__ pre(d, 4 * unit)));
 999         }
1000         __ bind(L1);
1001 
1002         __ tbz(count, 1, L2);
1003        // this is the same as above but copying only 2 longs hence
1004        // there is no intervening stp between the str instructions
1005        // but note that the offset and register patterns are still
1006        // the same
1007         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1008         if (direction == copy_forwards) {
1009           __ str(t0, Address(d, 1 * unit));
1010           __ str(t1, Address(__ pre(d, 2 * unit)));
1011         } else {
1012           __ str(t1, Address(d, 1 * unit));
1013           __ str(t0, Address(__ pre(d, 2 * unit)));
1014         }
1015         __ bind(L2);
1016 
1017        // for forwards copy we need to re-adjust the offsets we
1018        // applied so that s and d follow the last words written
1019 
1020        if (direction == copy_forwards) {
1021          __ add(s, s, 16);
1022          __ add(d, d, 8);
1023        }
1024 
1025       }
1026 
1027       __ ret(lr);
1028       }
1029   }
1030 
1031   // Small copy: less than 16 bytes.
1032   //
1033   // NB: Ignores all of the bits of count which represent more than 15
1034   // bytes, so a caller doesn't have to mask them.
1035 
1036   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1037     bool is_backwards = step < 0;
1038     size_t granularity = uabs(step);
1039     int direction = is_backwards ? -1 : 1;
1040     int unit = wordSize * direction;
1041 
1042     Label Lpair, Lword, Lint, Lshort, Lbyte;
1043 
1044     assert(granularity
1045            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1046 
1047     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1048 
1049     // ??? I don't know if this bit-test-and-branch is the right thing
1050     // to do.  It does a lot of jumping, resulting in several
1051     // mispredicted branches.  It might make more sense to do this
1052     // with something like Duff's device with a single computed branch.
1053 
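    // each tbz below tests the bit of count corresponding to one
    // power-of-two chunk (8, 4, 2 then 1 bytes), scaled by the copy
    // granularity, and copies that chunk only if the bit is set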
1054     __ tbz(count, 3 - exact_log2(granularity), Lword);
1055     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1056     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1057     __ bind(Lword);
1058 
1059     if (granularity <= sizeof (jint)) {
1060       __ tbz(count, 2 - exact_log2(granularity), Lint);
1061       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1062       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1063       __ bind(Lint);
1064     }
1065 
1066     if (granularity <= sizeof (jshort)) {
1067       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1068       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1069       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1070       __ bind(Lshort);
1071     }
1072 
1073     if (granularity <= sizeof (jbyte)) {
1074       __ tbz(count, 0, Lbyte);
1075       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1076       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1077       __ bind(Lbyte);
1078     }
1079   }
1080 
1081   Label copy_f, copy_b;
1082 
1083   // All-singing all-dancing memory copy.
1084   //
1085   // Copy count units of memory from s to d.  The size of a unit is
1086   // step, which can be positive or negative depending on the direction
1087   // of copy.  If is_aligned is false, we align the source address.
1088   //
1089 
1090   void copy_memory(bool is_aligned, Register s, Register d,
1091                    Register count, Register tmp, int step) {
1092     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1093     bool is_backwards = step < 0;
1094     int granularity = uabs(step);
1095     const Register t0 = r3, t1 = r4;
1096 
1097     // <= 96 bytes do inline. Direction doesn't matter because we always
1098     // load all the data before writing anything
1099     Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
1100     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1101     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1102     const Register send = r17, dend = r18;
1103 
1104     if (PrefetchCopyIntervalInBytes > 0)
1105       __ prfm(Address(s, 0), PLDL1KEEP);
1106     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1107     __ br(Assembler::HI, copy_big);
1108 
1109     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1110     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
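    // send and dend point just past the last source and destination
    // bytes; the cases below copy from both ends, which is safe even
    // when the regions overlap because all loads precede all stores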
1111 
1112     __ cmp(count, u1(16/granularity));
1113     __ br(Assembler::LS, copy16);
1114 
1115     __ cmp(count, u1(64/granularity));
1116     __ br(Assembler::HI, copy80);
1117 
1118     __ cmp(count, u1(32/granularity));
1119     __ br(Assembler::LS, copy32);
1120 
1121     // 33..64 bytes
1122     if (UseSIMDForMemoryOps) {
1123       __ ldpq(v0, v1, Address(s, 0));
1124       __ ldpq(v2, v3, Address(send, -32));
1125       __ stpq(v0, v1, Address(d, 0));
1126       __ stpq(v2, v3, Address(dend, -32));
1127     } else {
1128       __ ldp(t0, t1, Address(s, 0));
1129       __ ldp(t2, t3, Address(s, 16));
1130       __ ldp(t4, t5, Address(send, -32));
1131       __ ldp(t6, t7, Address(send, -16));
1132 
1133       __ stp(t0, t1, Address(d, 0));
1134       __ stp(t2, t3, Address(d, 16));
1135       __ stp(t4, t5, Address(dend, -32));
1136       __ stp(t6, t7, Address(dend, -16));
1137     }
1138     __ b(finish);
1139 
1140     // 17..32 bytes
1141     __ bind(copy32);
1142     __ ldp(t0, t1, Address(s, 0));
1143     __ ldp(t2, t3, Address(send, -16));
1144     __ stp(t0, t1, Address(d, 0));
1145     __ stp(t2, t3, Address(dend, -16));
1146     __ b(finish);
1147 
1148     // 65..80/96 bytes
1149     // (96 bytes if SIMD because we do 32 bytes per instruction)
1150     __ bind(copy80);
1151     if (UseSIMDForMemoryOps) {
1152       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1153       __ ldpq(v4, v5, Address(send, -32));
1154       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1155       __ stpq(v4, v5, Address(dend, -32));
1156     } else {
1157       __ ldp(t0, t1, Address(s, 0));
1158       __ ldp(t2, t3, Address(s, 16));
1159       __ ldp(t4, t5, Address(s, 32));
1160       __ ldp(t6, t7, Address(s, 48));
1161       __ ldp(t8, t9, Address(send, -16));
1162 
1163       __ stp(t0, t1, Address(d, 0));
1164       __ stp(t2, t3, Address(d, 16));
1165       __ stp(t4, t5, Address(d, 32));
1166       __ stp(t6, t7, Address(d, 48));
1167       __ stp(t8, t9, Address(dend, -16));
1168     }
1169     __ b(finish);
1170 
1171     // 0..16 bytes
1172     __ bind(copy16);
1173     __ cmp(count, u1(8/granularity));
1174     __ br(Assembler::LO, copy8);
1175 
1176     // 8..16 bytes
1177     __ ldr(t0, Address(s, 0));
1178     __ ldr(t1, Address(send, -8));
1179     __ str(t0, Address(d, 0));
1180     __ str(t1, Address(dend, -8));
1181     __ b(finish);
1182 
1183     if (granularity < 8) {
1184       // 4..7 bytes
1185       __ bind(copy8);
1186       __ tbz(count, 2 - exact_log2(granularity), copy4);
1187       __ ldrw(t0, Address(s, 0));
1188       __ ldrw(t1, Address(send, -4));
1189       __ strw(t0, Address(d, 0));
1190       __ strw(t1, Address(dend, -4));
1191       __ b(finish);
1192       if (granularity < 4) {
1193         // 0..3 bytes
1194         __ bind(copy4);
1195         __ cbz(count, finish); // get rid of 0 case
1196         if (granularity == 2) {
1197           __ ldrh(t0, Address(s, 0));
1198           __ strh(t0, Address(d, 0));
1199         } else { // granularity == 1
1200           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1201           // the first and last byte.
1202           // Handle the 3 byte case by loading and storing base + count/2
1203           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1204           // This does mean in the 1 byte case we load/store the same
1205           // byte 3 times.
1206           __ lsr(count, count, 1);
1207           __ ldrb(t0, Address(s, 0));
1208           __ ldrb(t1, Address(send, -1));
1209           __ ldrb(t2, Address(s, count));
1210           __ strb(t0, Address(d, 0));
1211           __ strb(t1, Address(dend, -1));
1212           __ strb(t2, Address(d, count));
1213         }
1214         __ b(finish);
1215       }
1216     }
1217 
1218     __ bind(copy_big);
1219     if (is_backwards) {
1220       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1221       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1222     }
1223 
1224     // Now that we've got the small case out of the way we can align the
1225     // source address on a 2-word boundary.
1226 
1227     Label aligned;
1228 
1229     if (is_aligned) {
1230       // We may have to adjust by 1 word to get s 2-word-aligned.
1231       __ tbz(s, exact_log2(wordSize), aligned);
1232       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1233       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1234       __ sub(count, count, wordSize/granularity);
1235     } else {
1236       if (is_backwards) {
1237         __ andr(rscratch2, s, 2 * wordSize - 1);
1238       } else {
1239         __ neg(rscratch2, s);
1240         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1241       }
1242       // rscratch2 is the byte adjustment needed to align s.
1243       __ cbz(rscratch2, aligned);
1244       int shift = exact_log2(granularity);
1245       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1246       __ sub(count, count, rscratch2);
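      // rscratch2 now holds the element count of the unaligned prefix,
      // which is copied below with copy_memory_small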
1247 
1248 #if 0
1249       // ?? This code is only correct for a disjoint copy.  It may or
1250       // may not make sense to use it in that case.
1251 
1252       // Copy the first pair; s and d may not be aligned.
1253       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1254       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1255 
1256       // Align s and d, adjust count
1257       if (is_backwards) {
1258         __ sub(s, s, rscratch2);
1259         __ sub(d, d, rscratch2);
1260       } else {
1261         __ add(s, s, rscratch2);
1262         __ add(d, d, rscratch2);
1263       }
1264 #else
1265       copy_memory_small(s, d, rscratch2, rscratch1, step);
1266 #endif
1267     }
1268 
1269     __ bind(aligned);
1270 
1271     // s is now 2-word-aligned.
1272 
1273     // We have a count of units and some trailing bytes.  Adjust the
1274     // count and do a bulk copy of words.
1275     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1276     if (direction == copy_forwards)
1277       __ bl(copy_f);
1278     else
1279       __ bl(copy_b);
1280 
1281     // And the tail.
1282     copy_memory_small(s, d, count, tmp, step);
1283 
1284     if (granularity >= 8) __ bind(copy8);
1285     if (granularity >= 4) __ bind(copy4);
1286     __ bind(finish);
1287   }
1288 
1289 
1290   void clobber_registers() {
1291 #ifdef ASSERT
1292     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1293     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
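    // rscratch1 now holds the 64-bit poison pattern 0xdeadbeefdeadbeef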
1294     for (Register r = r3; r <= r18; r++)
1295       if (r != rscratch1) __ mov(r, rscratch1);
1296 #endif
1297   }
1298 
1299   // Scan over array at a for count oops, verifying each one.
1300   // Preserves a and count, clobbers rscratch1 and rscratch2.
1301   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1302     Label loop, end;
1303     __ mov(rscratch1, a);
1304     __ mov(rscratch2, zr);
1305     __ bind(loop);
1306     __ cmp(rscratch2, count);
1307     __ br(Assembler::HS, end);
1308     if (size == (size_t)wordSize) {
1309       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1310       __ verify_oop(temp);
1311     } else {
1312       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1313       __ decode_heap_oop(temp); // calls verify_oop
1314     }
1315     __ add(rscratch2, rscratch2, size);
1316     __ b(loop);
1317     __ bind(end);
1318   }
1319 
1320   // Arguments:
1321   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1322   //             ignored
1323   //   is_oop  - true => oop array, so generate store check code
1324   //   name    - stub name string
1325   //
1326   // Inputs:
1327   //   c_rarg0   - source array address
1328   //   c_rarg1   - destination array address
1329   //   c_rarg2   - element count, treated as ssize_t, can be zero
1330   //
1331   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1332   // the hardware handle it.  The two dwords within qwords that span
1333   // cache line boundaries will still be loaded and stored atomically.
1334   //
1335   // Side Effects:
1336   //   disjoint_int_copy_entry is set to the no-overlap entry point
1337   //   used by generate_conjoint_int_oop_copy().
1338   //
1339   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1340                                   const char *name, bool dest_uninitialized = false) {
1341     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1342     RegSet saved_reg = RegSet::of(s, d, count);
1343     __ align(CodeEntryAlignment);
1344     StubCodeMark mark(this, "StubRoutines", name);
1345     address start = __ pc();
1346     __ enter();
1347 
1348     if (entry != NULL) {
1349       *entry = __ pc();
1350       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1351       BLOCK_COMMENT("Entry:");
1352     }
1353 
1354     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1355     if (dest_uninitialized) {
1356       decorators |= IS_DEST_UNINITIALIZED;
1357     }
1358     if (aligned) {
1359       decorators |= ARRAYCOPY_ALIGNED;
1360     }
1361 
1362     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1363     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg);
1364 
1365     if (is_oop) {
1366       // save regs before copy_memory
1367       __ push(RegSet::of(d, count), sp);
1368     }
1369     copy_memory(aligned, s, d, count, rscratch1, size);
1370 
1371     if (is_oop) {
1372       __ pop(RegSet::of(d, count), sp);
1373       if (VerifyOops)
1374         verify_oop_array(size, d, count, r16);
1375       __ sub(count, count, 1); // make an inclusive end pointer
1376       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
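      // d and the inclusive end address in count are what the GC
      // barrier epilogue below consumes for oop arrays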
1377     }
1378 
1379     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1380 
1381     __ leave();
1382     __ mov(r0, zr); // return 0
1383     __ ret(lr);
1384 #ifdef BUILTIN_SIM
1385     {
1386       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1387       sim->notifyCompile(const_cast<char*>(name), start);
1388     }
1389 #endif
1390     return start;
1391   }
1392 
1393   // Arguments:
1394   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1395   //             ignored
1396   //   is_oop  - true => oop array, so generate store check code
1397   //   name    - stub name string
1398   //
1399   // Inputs:
1400   //   c_rarg0   - source array address
1401   //   c_rarg1   - destination array address
1402   //   c_rarg2   - element count, treated as ssize_t, can be zero
1403   //
1404   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1405   // the hardware handle it.  The two dwords within qwords that span
1406   // cache line boundaries will still be loaded and stored atomically.
1407   //
1408   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1409                                  address *entry, const char *name,
1410                                  bool dest_uninitialized = false) {
1411     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1412     RegSet saved_regs = RegSet::of(s, d, count);
1413     StubCodeMark mark(this, "StubRoutines", name);
1414     address start = __ pc();
1415     __ enter();
1416 
1417     if (entry != NULL) {
1418       *entry = __ pc();
1419       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1420       BLOCK_COMMENT("Entry:");
1421     }
1422 
1423     // use fwd copy when (d-s) above_equal (count*size)
1424     __ sub(rscratch1, d, s);
1425     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1426     __ br(Assembler::HS, nooverlap_target);
1427 
1428     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1429     if (dest_uninitialized) {
1430       decorators |= IS_DEST_UNINITIALIZED;
1431     }
1432     if (aligned) {
1433       decorators |= ARRAYCOPY_ALIGNED;
1434     }
1435 
1436     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1437     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs);
1438 
1439     if (is_oop) {
1440       // save regs before copy_memory
1441       __ push(RegSet::of(d, count), sp);
1442     }
1443     copy_memory(aligned, s, d, count, rscratch1, -size);
1444     if (is_oop) {
1445       __ pop(RegSet::of(d, count), sp);
1446       if (VerifyOops)
1447         verify_oop_array(size, d, count, r16);
1448       __ sub(count, count, 1); // make an inclusive end pointer
1449       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1450     }
1451     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1452     __ leave();
1453     __ mov(r0, zr); // return 0
1454     __ ret(lr);
1455 #ifdef BUILTIN_SIM
1456     {
1457       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1458       sim->notifyCompile(const_cast<char*>(name), start);
1459     }
1460 #endif
1461     return start;
1462   }
1463 
1464   // Arguments:
1465   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1466   //             ignored
1467   //   name    - stub name string
1468   //
1469   // Inputs:
1470   //   c_rarg0   - source array address
1471   //   c_rarg1   - destination array address
1472   //   c_rarg2   - element count, treated as ssize_t, can be zero
1473   //
1474   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1475   // we let the hardware handle it.  The one to eight bytes within words,
1476   // dwords or qwords that span cache line boundaries will still be loaded
1477   // and stored atomically.
1478   //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
1489   //
1490   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1491     const bool not_oop = false;
1492     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1493   }
1494 
1495   // Arguments:
1496   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1497   //             ignored
1498   //   name    - stub name string
1499   //
1500   // Inputs:
1501   //   c_rarg0   - source array address
1502   //   c_rarg1   - destination array address
1503   //   c_rarg2   - element count, treated as ssize_t, can be zero
1504   //
1505   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1506   // we let the hardware handle it.  The one to eight bytes within words,
1507   // dwords or qwords that span cache line boundaries will still be loaded
1508   // and stored atomically.
1509   //
1510   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1511                                       address* entry, const char *name) {
1512     const bool not_oop = false;
1513     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1514   }
1515 
1516   // Arguments:
1517   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1518   //             ignored
1519   //   name    - stub name string
1520   //
1521   // Inputs:
1522   //   c_rarg0   - source array address
1523   //   c_rarg1   - destination array address
1524   //   c_rarg2   - element count, treated as ssize_t, can be zero
1525   //
1526   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1527   // let the hardware handle it.  The two or four words within dwords
1528   // or qwords that span cache line boundaries will still be loaded
1529   // and stored atomically.
1530   //
1531   // Side Effects:
1532   //   disjoint_short_copy_entry is set to the no-overlap entry point
1533   //   used by generate_conjoint_short_copy().
1534   //
1535   address generate_disjoint_short_copy(bool aligned,
1536                                        address* entry, const char *name) {
1537     const bool not_oop = false;
1538     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1539   }
1540 
1541   // Arguments:
1542   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1543   //             ignored
1544   //   name    - stub name string
1545   //
1546   // Inputs:
1547   //   c_rarg0   - source array address
1548   //   c_rarg1   - destination array address
1549   //   c_rarg2   - element count, treated as ssize_t, can be zero
1550   //
1551   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1552   // let the hardware handle it.  The two or four words within dwords
1553   // or qwords that span cache line boundaries will still be loaded
1554   // and stored atomically.
1555   //
1556   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1557                                        address *entry, const char *name) {
1558     const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
1563   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1564   //             ignored
1565   //   name    - stub name string
1566   //
1567   // Inputs:
1568   //   c_rarg0   - source array address
1569   //   c_rarg1   - destination array address
1570   //   c_rarg2   - element count, treated as ssize_t, can be zero
1571   //
1572   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1573   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1575   //
1576   // Side Effects:
1577   //   disjoint_int_copy_entry is set to the no-overlap entry point
1578   //   used by generate_conjoint_int_oop_copy().
1579   //
1580   address generate_disjoint_int_copy(bool aligned, address *entry,
1581                                          const char *name, bool dest_uninitialized = false) {
1582     const bool not_oop = false;
1583     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1584   }
1585 
1586   // Arguments:
1587   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1588   //             ignored
1589   //   name    - stub name string
1590   //
1591   // Inputs:
1592   //   c_rarg0   - source array address
1593   //   c_rarg1   - destination array address
1594   //   c_rarg2   - element count, treated as ssize_t, can be zero
1595   //
1596   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1597   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1599   //
1600   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1601                                      address *entry, const char *name,
1602                                      bool dest_uninitialized = false) {
1603     const bool not_oop = false;
1604     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1605   }
1606 
1607 
1608   // Arguments:
1609   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1610   //             ignored
1611   //   name    - stub name string
1612   //
1613   // Inputs:
1614   //   c_rarg0   - source array address
1615   //   c_rarg1   - destination array address
1616   //   c_rarg2   - element count, treated as size_t, can be zero
1617   //
1618   // Side Effects:
1619   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1620   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1621   //
1622   address generate_disjoint_long_copy(bool aligned, address *entry,
1623                                           const char *name, bool dest_uninitialized = false) {
1624     const bool not_oop = false;
1625     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1626   }
1627 
1628   // Arguments:
1629   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1630   //             ignored
1631   //   name    - stub name string
1632   //
1633   // Inputs:
1634   //   c_rarg0   - source array address
1635   //   c_rarg1   - destination array address
1636   //   c_rarg2   - element count, treated as size_t, can be zero
1637   //
1638   address generate_conjoint_long_copy(bool aligned,
1639                                       address nooverlap_target, address *entry,
1640                                       const char *name, bool dest_uninitialized = false) {
1641     const bool not_oop = false;
1642     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1643   }
1644 
1645   // Arguments:
1646   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1647   //             ignored
1648   //   name    - stub name string
1649   //
1650   // Inputs:
1651   //   c_rarg0   - source array address
1652   //   c_rarg1   - destination array address
1653   //   c_rarg2   - element count, treated as size_t, can be zero
1654   //
1655   // Side Effects:
1656   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1657   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1658   //
1659   address generate_disjoint_oop_copy(bool aligned, address *entry,
1660                                      const char *name, bool dest_uninitialized) {
1661     const bool is_oop = true;
1662     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1663     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1664   }
1665 
1666   // Arguments:
1667   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1668   //             ignored
1669   //   name    - stub name string
1670   //
1671   // Inputs:
1672   //   c_rarg0   - source array address
1673   //   c_rarg1   - destination array address
1674   //   c_rarg2   - element count, treated as size_t, can be zero
1675   //
1676   address generate_conjoint_oop_copy(bool aligned,
1677                                      address nooverlap_target, address *entry,
1678                                      const char *name, bool dest_uninitialized) {
1679     const bool is_oop = true;
1680     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1681     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1682                                   name, dest_uninitialized);
1683   }
1684 
1685 
1686   // Helper for generating a dynamic type check.
1687   // Smashes rscratch1.
1688   void generate_type_check(Register sub_klass,
1689                            Register super_check_offset,
1690                            Register super_klass,
1691                            Label& L_success) {
1692     assert_different_registers(sub_klass, super_check_offset, super_klass);
1693 
1694     BLOCK_COMMENT("type_check:");
1695 
1696     Label L_miss;
1697 
1698     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1699                                      super_check_offset);
1700     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1701 
1702     // Fall through on failure!
1703     __ BIND(L_miss);
1704   }
1705 
1706   //
1707   //  Generate checkcasting array copy stub
1708   //
1709   //  Input:
1710   //    c_rarg0   - source array address
1711   //    c_rarg1   - destination array address
1712   //    c_rarg2   - element count, treated as ssize_t, can be zero
1713   //    c_rarg3   - size_t ckoff (super_check_offset)
1714   //    c_rarg4   - oop ckval (super_klass)
1715   //
1716   //  Output:
1717   //    r0 ==  0  -  success
1718   //    r0 == -1^K - failure, where K is partial transfer count
1719   //
1720   address generate_checkcast_copy(const char *name, address *entry,
1721                                   bool dest_uninitialized = false) {
1722 
1723     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1724 
1725     // Input registers (after setup_arg_regs)
1726     const Register from        = c_rarg0;   // source array address
1727     const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
1729     const Register ckoff       = c_rarg3;   // super_check_offset
1730     const Register ckval       = c_rarg4;   // super_klass
1731 
1732     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1733     RegSet wb_post_saved_regs = RegSet::of(count);
1734 
1735     // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // orig elements count
1737     const Register start_to    = r20;       // destination array start address
1738     const Register copied_oop  = r18;       // actual oop copied
1739     const Register r19_klass   = r19;       // oop._klass
1740 
1741     //---------------------------------------------------------------
1742     // Assembler stub will be used for this call to arraycopy
1743     // if the two arrays are subtypes of Object[] but the
1744     // destination array type is not equal to or a supertype
1745     // of the source type.  Each element must be separately
1746     // checked.
1747 
1748     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1749                                copied_oop, r19_klass, count_save);
1750 
1751     __ align(CodeEntryAlignment);
1752     StubCodeMark mark(this, "StubRoutines", name);
1753     address start = __ pc();
1754 
1755     __ enter(); // required for proper stackwalking of RuntimeStub frame
1756 
1757 #ifdef ASSERT
1758     // caller guarantees that the arrays really are different
1759     // otherwise, we would have to make conjoint checks
1760     { Label L;
1761       array_overlap_test(L, TIMES_OOP);
1762       __ stop("checkcast_copy within a single array");
1763       __ bind(L);
1764     }
1765 #endif //ASSERT
1766 
1767     // Caller of this entry point must set up the argument registers.
1768     if (entry != NULL) {
1769       *entry = __ pc();
1770       BLOCK_COMMENT("Entry:");
1771     }
1772 
    // Empty array: nothing to do.
1774     __ cbz(count, L_done);
1775 
1776     __ push(RegSet::of(r18, r19, r20, r21), sp);
1777 
1778 #ifdef ASSERT
1779     BLOCK_COMMENT("assert consistent ckoff/ckval");
1780     // The ckoff and ckval must be mutually consistent,
1781     // even though caller generates both.
1782     { Label L;
1783       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1784       __ ldrw(start_to, Address(ckval, sco_offset));
1785       __ cmpw(ckoff, start_to);
1786       __ br(Assembler::EQ, L);
1787       __ stop("super_check_offset inconsistent");
1788       __ bind(L);
1789     }
1790 #endif //ASSERT
1791 
1792     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
1793     bool is_oop = true;
1794     if (dest_uninitialized) {
1795       decorators |= IS_DEST_UNINITIALIZED;
1796     }
1797 
1798     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1799     bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs);
1800 
1801     // save the original count
1802     __ mov(count_save, count);
1803 
1804     // Copy from low to high addresses
1805     __ mov(start_to, to);              // Save destination array start address
1806     __ b(L_load_element);
1807 
1808     // ======== begin loop ========
1809     // (Loop is rotated; its entry is L_load_element.)
1810     // Loop control:
1811     //   for (; count != 0; count--) {
1812     //     copied_oop = load_heap_oop(from++);
1813     //     ... generate_type_check ...;
1814     //     store_heap_oop(to++, copied_oop);
1815     //   }
1816     __ align(OptoLoopAlignment);
1817 
1818     __ BIND(L_store_element);
1819     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1820     __ sub(count, count, 1);
1821     __ cbz(count, L_do_card_marks);
1822 
1823     // ======== loop entry is here ========
1824     __ BIND(L_load_element);
1825     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1826     __ cbz(copied_oop, L_store_element);
1827 
1828     __ load_klass(r19_klass, copied_oop);// query the object klass
1829     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1830     // ======== end loop ========
1831 
1832     // It was a real error; we must depend on the caller to finish the job.
1833     // Register count = remaining oops, count_orig = total oops.
1834     // Emit GC store barriers for the oops we have copied and report
1835     // their number to the caller.
1836 
1837     __ subs(count, count_save, count);     // K = partially copied oop count
1838     __ eon(count, count, zr);                   // report (-1^K) to caller
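    // 'eon' with zr is bitwise NOT, so count = ~K = -1 ^ K; the caller can
    // recover the partial transfer count as K = ~r0.  The EQ branch below
    // reuses the flags from the 'subs' above: K == 0 means no oops were
    // stored, so the card-marking epilogue can be skipped.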
1839     __ br(Assembler::EQ, L_done_pop);
1840 
1841     __ BIND(L_do_card_marks);
1842     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1843     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1, wb_post_saved_regs);
1844 
1845     __ bind(L_done_pop);
1846     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1847     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1848 
1849     __ bind(L_done);
1850     __ mov(r0, count);
1851     __ leave();
1852     __ ret(lr);
1853 
1854     return start;
1855   }
1856 
1857   // Perform range checks on the proposed arraycopy.
1858   // Kills temp, but nothing else.
1859   // Also, clean the sign bits of src_pos and dst_pos.
1860   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1861                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
1863                               Register dst_pos, // destination position (c_rarg3)
1864                               Register length,
1865                               Register temp,
1866                               Label& L_failed) {
1867     BLOCK_COMMENT("arraycopy_range_checks:");
1868 
1869     assert_different_registers(rscratch1, temp);
1870 
1871     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1872     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1873     __ addw(temp, length, src_pos);
1874     __ cmpw(temp, rscratch1);
1875     __ br(Assembler::HI, L_failed);
1876 
1877     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1878     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1879     __ addw(temp, length, dst_pos);
1880     __ cmpw(temp, rscratch1);
1881     __ br(Assembler::HI, L_failed);
1882 
1883     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1884     __ movw(src_pos, src_pos);
1885     __ movw(dst_pos, dst_pos);
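    // A 32-bit register write zero-extends into the full 64-bit register, so
    // 'movw reg, reg' clears bits 63:32.  The positions arrive as 32-bit ints
    // and must not leak garbage into the 64-bit address arithmetic performed
    // by the callers.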
1886 
1887     BLOCK_COMMENT("arraycopy_range_checks done");
1888   }
1889 
1890   // These stubs get called from some dumb test routine.
1891   // I'll write them properly when they're called from
1892   // something that's actually doing something.
1893   static void fake_arraycopy_stub(address src, address dst, int count) {
1894     assert(count == 0, "huh?");
1895   }
1896 
1897 
1898   //
1899   //  Generate 'unsafe' array copy stub
1900   //  Though just as safe as the other stubs, it takes an unscaled
1901   //  size_t argument instead of an element count.
1902   //
1903   //  Input:
1904   //    c_rarg0   - source array address
1905   //    c_rarg1   - destination array address
1906   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1907   //
1908   // Examines the alignment of the operands and dispatches
1909   // to a long, int, short, or byte copy loop.
1910   //
1911   address generate_unsafe_copy(const char *name,
1912                                address byte_copy_entry,
1913                                address short_copy_entry,
1914                                address int_copy_entry,
1915                                address long_copy_entry) {
1916     Label L_long_aligned, L_int_aligned, L_short_aligned;
1917     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1918 
1919     __ align(CodeEntryAlignment);
1920     StubCodeMark mark(this, "StubRoutines", name);
1921     address start = __ pc();
1922     __ enter(); // required for proper stackwalking of RuntimeStub frame
1923 
1924     // bump this on entry, not on exit:
1925     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1926 
1927     __ orr(rscratch1, s, d);
1928     __ orr(rscratch1, rscratch1, count);
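    // The tests below are equivalent to (illustrative C, not generated code):
    //
    //   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
    //   if      ((bits & 7) == 0) goto L_long_aligned;
    //   else if ((bits & 3) == 0) goto L_int_aligned;
    //   else if ((bits & 1) == 0) goto L_short_aligned;
    //   else                      jump to the byte copy stub;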
1929 
1930     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1931     __ cbz(rscratch1, L_long_aligned);
1932     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1933     __ cbz(rscratch1, L_int_aligned);
1934     __ tbz(rscratch1, 0, L_short_aligned);
1935     __ b(RuntimeAddress(byte_copy_entry));
1936 
1937     __ BIND(L_short_aligned);
1938     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1939     __ b(RuntimeAddress(short_copy_entry));
1940     __ BIND(L_int_aligned);
1941     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1942     __ b(RuntimeAddress(int_copy_entry));
1943     __ BIND(L_long_aligned);
1944     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1945     __ b(RuntimeAddress(long_copy_entry));
1946 
1947     return start;
1948   }
1949 
1950   //
1951   //  Generate generic array copy stubs
1952   //
1953   //  Input:
1954   //    c_rarg0    -  src oop
1955   //    c_rarg1    -  src_pos (32-bits)
1956   //    c_rarg2    -  dst oop
1957   //    c_rarg3    -  dst_pos (32-bits)
1958   //    c_rarg4    -  element count (32-bits)
1959   //
1960   //  Output:
1961   //    r0 ==  0  -  success
1962   //    r0 == -1^K - failure, where K is partial transfer count
1963   //
1964   address generate_generic_copy(const char *name,
1965                                 address byte_copy_entry, address short_copy_entry,
1966                                 address int_copy_entry, address oop_copy_entry,
1967                                 address long_copy_entry, address checkcast_copy_entry) {
1968 
1969     Label L_failed, L_failed_0, L_objArray;
1970     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1971 
1972     // Input registers
1973     const Register src        = c_rarg0;  // source array oop
1974     const Register src_pos    = c_rarg1;  // source position
1975     const Register dst        = c_rarg2;  // destination array oop
1976     const Register dst_pos    = c_rarg3;  // destination position
1977     const Register length     = c_rarg4;
1978 
1979     StubCodeMark mark(this, "StubRoutines", name);
1980 
1981     __ align(CodeEntryAlignment);
1982     address start = __ pc();
1983 
1984     __ enter(); // required for proper stackwalking of RuntimeStub frame
1985 
1986     // bump this on entry, not on exit:
1987     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1988 
1989     //-----------------------------------------------------------------------
1990     // Assembler stub will be used for this call to arraycopy
1991     // if the following conditions are met:
1992     //
1993     // (1) src and dst must not be null.
1994     // (2) src_pos must not be negative.
1995     // (3) dst_pos must not be negative.
1996     // (4) length  must not be negative.
1997     // (5) src klass and dst klass should be the same and not NULL.
1998     // (6) src and dst should be arrays.
1999     // (7) src_pos + length must not exceed length of src.
2000     // (8) dst_pos + length must not exceed length of dst.
2001     //
2002 
2003     //  if (src == NULL) return -1;
2004     __ cbz(src, L_failed);
2005 
2006     //  if (src_pos < 0) return -1;
2007     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2008 
2009     //  if (dst == NULL) return -1;
2010     __ cbz(dst, L_failed);
2011 
2012     //  if (dst_pos < 0) return -1;
2013     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2014 
2015     // registers used as temp
2016     const Register scratch_length    = r16; // elements count to copy
2017     const Register scratch_src_klass = r17; // array klass
2018     const Register lh                = r18; // layout helper
2019 
2020     //  if (length < 0) return -1;
2021     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2022     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2023 
2024     __ load_klass(scratch_src_klass, src);
2025 #ifdef ASSERT
2026     //  assert(src->klass() != NULL);
2027     {
2028       BLOCK_COMMENT("assert klasses not null {");
2029       Label L1, L2;
2030       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2031       __ bind(L1);
2032       __ stop("broken null klass");
2033       __ bind(L2);
2034       __ load_klass(rscratch1, dst);
2035       __ cbz(rscratch1, L1);     // this would be broken also
2036       BLOCK_COMMENT("} assert klasses not null done");
2037     }
2038 #endif
2039 
2040     // Load layout helper (32-bits)
2041     //
2042     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2043     // 32        30    24            16              8     2                 0
2044     //
2045     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2046     //
2047 
2048     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2049 
2050     // Handle objArrays completely differently...
2051     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2052     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2053     __ movw(rscratch1, objArray_lh);
2054     __ eorw(rscratch2, lh, rscratch1);
2055     __ cbzw(rscratch2, L_objArray);
2056 
2057     //  if (src->klass() != dst->klass()) return -1;
2058     __ load_klass(rscratch2, dst);
2059     __ eor(rscratch2, rscratch2, scratch_src_klass);
2060     __ cbnz(rscratch2, L_failed);
2061 
2062     //  if (!src->is_Array()) return -1;
2063     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2064 
2065     // At this point, it is known to be a typeArray (array_tag 0x3).
2066 #ifdef ASSERT
2067     {
2068       BLOCK_COMMENT("assert primitive array {");
2069       Label L;
2070       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2071       __ cmpw(lh, rscratch2);
2072       __ br(Assembler::GE, L);
2073       __ stop("must be a primitive array");
2074       __ bind(L);
2075       BLOCK_COMMENT("} assert primitive array done");
2076     }
2077 #endif
2078 
2079     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2080                            rscratch2, L_failed);
2081 
2082     // TypeArrayKlass
2083     //
2084     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2085     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2086     //
2087 
2088     const Register rscratch1_offset = rscratch1;    // array offset
2089     const Register r18_elsize = lh; // element size
2090 
2091     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2092            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2093     __ add(src, src, rscratch1_offset);           // src array offset
2094     __ add(dst, dst, rscratch1_offset);           // dst array offset
2095     BLOCK_COMMENT("choose copy loop based on element size");
2096 
2097     // next registers should be set before the jump to corresponding stub
2098     const Register from     = c_rarg0;  // source array address
2099     const Register to       = c_rarg1;  // destination array address
2100     const Register count    = c_rarg2;  // elements count
2101 
    // 'from', 'to' and 'count' must be set in this order, because they
    // alias 'src', 'src_pos' and 'dst' (the same argument registers).
2104 
2105     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2106 
2107     // The possible values of elsize are 0-3, i.e. exact_log2(element
2108     // size in bytes).  We do a simple bitwise binary search.
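    // Only bits 0 and 1 of the layout helper are examined here (the
    // log2_element_size field sits at shift 0).  Equivalent to (illustrative):
    //
    //   switch (elsize) {
    //     case 0: goto L_copy_bytes;   case 1: goto L_copy_shorts;
    //     case 2: goto L_copy_ints;    case 3: goto L_copy_longs;
    //   }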
2109   __ BIND(L_copy_bytes);
2110     __ tbnz(r18_elsize, 1, L_copy_ints);
2111     __ tbnz(r18_elsize, 0, L_copy_shorts);
2112     __ lea(from, Address(src, src_pos));// src_addr
2113     __ lea(to,   Address(dst, dst_pos));// dst_addr
2114     __ movw(count, scratch_length); // length
2115     __ b(RuntimeAddress(byte_copy_entry));
2116 
2117   __ BIND(L_copy_shorts);
2118     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2119     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2120     __ movw(count, scratch_length); // length
2121     __ b(RuntimeAddress(short_copy_entry));
2122 
2123   __ BIND(L_copy_ints);
2124     __ tbnz(r18_elsize, 0, L_copy_longs);
2125     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2126     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2127     __ movw(count, scratch_length); // length
2128     __ b(RuntimeAddress(int_copy_entry));
2129 
2130   __ BIND(L_copy_longs);
2131 #ifdef ASSERT
2132     {
2133       BLOCK_COMMENT("assert long copy {");
2134       Label L;
2135       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2136       __ cmpw(r18_elsize, LogBytesPerLong);
2137       __ br(Assembler::EQ, L);
2138       __ stop("must be long copy, but elsize is wrong");
2139       __ bind(L);
2140       BLOCK_COMMENT("} assert long copy done");
2141     }
2142 #endif
2143     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2144     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2145     __ movw(count, scratch_length); // length
2146     __ b(RuntimeAddress(long_copy_entry));
2147 
2148     // ObjArrayKlass
2149   __ BIND(L_objArray);
2150     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2151 
2152     Label L_plain_copy, L_checkcast_copy;
2153     //  test array classes for subtyping
2154     __ load_klass(r18, dst);
2155     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2156     __ br(Assembler::NE, L_checkcast_copy);
2157 
2158     // Identically typed arrays can be copied without element-wise checks.
2159     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2160                            rscratch2, L_failed);
2161 
2162     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2163     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2164     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2165     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2166     __ movw(count, scratch_length); // length
2167   __ BIND(L_plain_copy);
2168     __ b(RuntimeAddress(oop_copy_entry));
2169 
2170   __ BIND(L_checkcast_copy);
2171     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2172     {
2173       // Before looking at dst.length, make sure dst is also an objArray.
2174       __ ldrw(rscratch1, Address(r18, lh_offset));
2175       __ movw(rscratch2, objArray_lh);
2176       __ eorw(rscratch1, rscratch1, rscratch2);
2177       __ cbnzw(rscratch1, L_failed);
2178 
2179       // It is safe to examine both src.length and dst.length.
2180       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2181                              r18, L_failed);
2182 
2183       const Register rscratch2_dst_klass = rscratch2;
2184       __ load_klass(rscratch2_dst_klass, dst); // reload
2185 
2186       // Marshal the base address arguments now, freeing registers.
2187       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2188       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2189       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2190       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2191       __ movw(count, length);           // length (reloaded)
2192       Register sco_temp = c_rarg3;      // this register is free now
2193       assert_different_registers(from, to, count, sco_temp,
2194                                  rscratch2_dst_klass, scratch_src_klass);
2195       // assert_clean_int(count, sco_temp);
2196 
2197       // Generate the type check.
2198       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2199       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2200       // assert_clean_int(sco_temp, r18);
2201       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2202 
2203       // Fetch destination element klass from the ObjArrayKlass header.
2204       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2205       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2206       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2207 
2208       // the checkcast_copy loop needs two extra arguments:
2209       assert(c_rarg3 == sco_temp, "#3 already in place");
2210       // Set up arguments for checkcast_copy_entry.
2211       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2212       __ b(RuntimeAddress(checkcast_copy_entry));
2213     }
2214 
2215   __ BIND(L_failed);
2216     __ mov(r0, -1);
2217     __ leave();   // required for proper stackwalking of RuntimeStub frame
2218     __ ret(lr);
2219 
2220     return start;
2221   }
2222 
2223   //
2224   // Generate stub for array fill. If "aligned" is true, the
2225   // "to" address is assumed to be heapword aligned.
2226   //
2227   // Arguments for generated stub:
2228   //   to:    c_rarg0
2229   //   value: c_rarg1
2230   //   count: c_rarg2 treated as signed
2231   //
2232   address generate_fill(BasicType t, bool aligned, const char *name) {
2233     __ align(CodeEntryAlignment);
2234     StubCodeMark mark(this, "StubRoutines", name);
2235     address start = __ pc();
2236 
2237     BLOCK_COMMENT("Entry:");
2238 
2239     const Register to        = c_rarg0;  // source array address
2240     const Register value     = c_rarg1;  // value
2241     const Register count     = c_rarg2;  // elements count
2242 
2243     const Register bz_base = r10;        // base for block_zero routine
2244     const Register cnt_words = r11;      // temp register
2245 
2246     __ enter();
2247 
2248     Label L_fill_elements, L_exit1;
2249 
2250     int shift = -1;
2251     switch (t) {
2252       case T_BYTE:
2253         shift = 0;
2254         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2255         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2256         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2257         __ br(Assembler::LO, L_fill_elements);
2258         break;
2259       case T_SHORT:
2260         shift = 1;
2261         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2262         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2263         __ br(Assembler::LO, L_fill_elements);
2264         break;
2265       case T_INT:
2266         shift = 2;
2267         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2268         __ br(Assembler::LO, L_fill_elements);
2269         break;
2270       default: ShouldNotReachHere();
2271     }
2272 
2273     // Align source address at 8 bytes address boundary.
2274     Label L_skip_align1, L_skip_align2, L_skip_align4;
2275     if (!aligned) {
2276       switch (t) {
2277         case T_BYTE:
2278           // One byte misalignment happens only for byte arrays.
2279           __ tbz(to, 0, L_skip_align1);
2280           __ strb(value, Address(__ post(to, 1)));
2281           __ subw(count, count, 1);
2282           __ bind(L_skip_align1);
2283           // Fallthrough
2284         case T_SHORT:
2285           // Two bytes misalignment happens only for byte and short (char) arrays.
2286           __ tbz(to, 1, L_skip_align2);
2287           __ strh(value, Address(__ post(to, 2)));
2288           __ subw(count, count, 2 >> shift);
2289           __ bind(L_skip_align2);
2290           // Fallthrough
2291         case T_INT:
2292           // Align to 8 bytes, we know we are 4 byte aligned to start.
2293           __ tbz(to, 2, L_skip_align4);
2294           __ strw(value, Address(__ post(to, 4)));
2295           __ subw(count, count, 4 >> shift);
2296           __ bind(L_skip_align4);
2297           break;
2298         default: ShouldNotReachHere();
2299       }
2300     }
2301 
2302     //
2303     //  Fill large chunks
2304     //
2305     __ lsrw(cnt_words, count, 3 - shift); // number of words
2306     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2307     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
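    // In C terms (illustrative): cnt_words = count >> (3 - shift) is the number
    // of 8-byte words to fill, and count -= cnt_words << (3 - shift) leaves the
    // remaining elements (strictly fewer than 8 bytes' worth).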
2308     if (UseBlockZeroing) {
2309       Label non_block_zeroing, rest;
2310       // If the fill value is zero we can use the fast zero_words().
2311       __ cbnz(value, non_block_zeroing);
2312       __ mov(bz_base, to);
2313       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2314       __ zero_words(bz_base, cnt_words);
2315       __ b(rest);
2316       __ bind(non_block_zeroing);
2317       __ fill_words(to, cnt_words, value);
2318       __ bind(rest);
2319     } else {
2320       __ fill_words(to, cnt_words, value);
2321     }
2322 
2323     // Remaining count is less than 8 bytes. Fill it by a single store.
2324     // Note that the total length is no less than 8 bytes.
2325     if (t == T_BYTE || t == T_SHORT) {
2326       Label L_exit1;
2327       __ cbzw(count, L_exit1);
2328       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2329       __ str(value, Address(to, -8));    // overwrite some elements
2330       __ bind(L_exit1);
2331       __ leave();
2332       __ ret(lr);
2333     }
2334 
2335     // Handle copies less than 8 bytes.
2336     Label L_fill_2, L_fill_4, L_exit2;
2337     __ bind(L_fill_elements);
2338     switch (t) {
2339       case T_BYTE:
2340         __ tbz(count, 0, L_fill_2);
2341         __ strb(value, Address(__ post(to, 1)));
2342         __ bind(L_fill_2);
2343         __ tbz(count, 1, L_fill_4);
2344         __ strh(value, Address(__ post(to, 2)));
2345         __ bind(L_fill_4);
2346         __ tbz(count, 2, L_exit2);
2347         __ strw(value, Address(to));
2348         break;
2349       case T_SHORT:
2350         __ tbz(count, 0, L_fill_4);
2351         __ strh(value, Address(__ post(to, 2)));
2352         __ bind(L_fill_4);
2353         __ tbz(count, 1, L_exit2);
2354         __ strw(value, Address(to));
2355         break;
2356       case T_INT:
2357         __ cbzw(count, L_exit2);
2358         __ strw(value, Address(to));
2359         break;
2360       default: ShouldNotReachHere();
2361     }
2362     __ bind(L_exit2);
2363     __ leave();
2364     __ ret(lr);
2365     return start;
2366   }
2367 
2368   void generate_arraycopy_stubs() {
2369     address entry;
2370     address entry_jbyte_arraycopy;
2371     address entry_jshort_arraycopy;
2372     address entry_jint_arraycopy;
2373     address entry_oop_arraycopy;
2374     address entry_jlong_arraycopy;
2375     address entry_checkcast_arraycopy;
2376 
2377     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2378     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2379 
2380     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2381 
2382     //*** jbyte
2383     // Always need aligned and unaligned versions
2384     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2385                                                                                   "jbyte_disjoint_arraycopy");
2386     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2387                                                                                   &entry_jbyte_arraycopy,
2388                                                                                   "jbyte_arraycopy");
2389     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2390                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2391     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2392                                                                                   "arrayof_jbyte_arraycopy");
2393 
2394     //*** jshort
2395     // Always need aligned and unaligned versions
2396     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2397                                                                                     "jshort_disjoint_arraycopy");
2398     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2399                                                                                     &entry_jshort_arraycopy,
2400                                                                                     "jshort_arraycopy");
2401     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2402                                                                                     "arrayof_jshort_disjoint_arraycopy");
2403     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2404                                                                                     "arrayof_jshort_arraycopy");
2405 
2406     //*** jint
2407     // Aligned versions
2408     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2409                                                                                 "arrayof_jint_disjoint_arraycopy");
2410     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2411                                                                                 "arrayof_jint_arraycopy");
2412     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2413     // entry_jint_arraycopy always points to the unaligned version
2414     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2415                                                                                 "jint_disjoint_arraycopy");
2416     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2417                                                                                 &entry_jint_arraycopy,
2418                                                                                 "jint_arraycopy");
2419 
2420     //*** jlong
2421     // It is always aligned
2422     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2423                                                                                   "arrayof_jlong_disjoint_arraycopy");
2424     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2425                                                                                   "arrayof_jlong_arraycopy");
2426     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2427     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2428 
2429     //*** oops
2430     {
2431       // With compressed oops we need unaligned versions; notice that
2432       // we overwrite entry_oop_arraycopy.
2433       bool aligned = !UseCompressedOops;
2434 
2435       StubRoutines::_arrayof_oop_disjoint_arraycopy
2436         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2437                                      /*dest_uninitialized*/false);
2438       StubRoutines::_arrayof_oop_arraycopy
2439         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2440                                      /*dest_uninitialized*/false);
2441       // Aligned versions without pre-barriers
2442       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2443         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2444                                      /*dest_uninitialized*/true);
2445       StubRoutines::_arrayof_oop_arraycopy_uninit
2446         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2447                                      /*dest_uninitialized*/true);
2448     }
2449 
2450     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2451     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2452     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2453     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2454 
2455     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2456     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2457                                                                         /*dest_uninitialized*/true);
2458 
2459     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2460                                                               entry_jbyte_arraycopy,
2461                                                               entry_jshort_arraycopy,
2462                                                               entry_jint_arraycopy,
2463                                                               entry_jlong_arraycopy);
2464 
2465     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2466                                                                entry_jbyte_arraycopy,
2467                                                                entry_jshort_arraycopy,
2468                                                                entry_jint_arraycopy,
2469                                                                entry_oop_arraycopy,
2470                                                                entry_jlong_arraycopy,
2471                                                                entry_checkcast_arraycopy);
2472 
2473     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2474     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2475     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2476     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2477     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2478     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2479   }
2480 
2481   void generate_math_stubs() { Unimplemented(); }
2482 
2483   // Arguments:
2484   //
2485   // Inputs:
2486   //   c_rarg0   - source byte array address
2487   //   c_rarg1   - destination byte array address
2488   //   c_rarg2   - K (key) in little endian int array
2489   //
2490   address generate_aescrypt_encryptBlock() {
2491     __ align(CodeEntryAlignment);
2492     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2493 
2494     Label L_doLast;
2495 
2496     const Register from        = c_rarg0;  // source array address
2497     const Register to          = c_rarg1;  // destination array address
2498     const Register key         = c_rarg2;  // key array address
2499     const Register keylen      = rscratch1;
2500 
2501     address start = __ pc();
2502     __ enter();
2503 
2504     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
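    // keylen is the expanded key length in 32-bit words: 44, 52 or 60 for
    // AES-128, AES-192 and AES-256 respectively, i.e. (rounds + 1) * 4.  The
    // comparisons against 44 and 52 below select how many of the optional
    // round-key pairs are applied.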
2505 
2506     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2507 
2508     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2509     __ rev32(v1, __ T16B, v1);
2510     __ rev32(v2, __ T16B, v2);
2511     __ rev32(v3, __ T16B, v3);
2512     __ rev32(v4, __ T16B, v4);
2513     __ aese(v0, v1);
2514     __ aesmc(v0, v0);
2515     __ aese(v0, v2);
2516     __ aesmc(v0, v0);
2517     __ aese(v0, v3);
2518     __ aesmc(v0, v0);
2519     __ aese(v0, v4);
2520     __ aesmc(v0, v0);
2521 
2522     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2523     __ rev32(v1, __ T16B, v1);
2524     __ rev32(v2, __ T16B, v2);
2525     __ rev32(v3, __ T16B, v3);
2526     __ rev32(v4, __ T16B, v4);
2527     __ aese(v0, v1);
2528     __ aesmc(v0, v0);
2529     __ aese(v0, v2);
2530     __ aesmc(v0, v0);
2531     __ aese(v0, v3);
2532     __ aesmc(v0, v0);
2533     __ aese(v0, v4);
2534     __ aesmc(v0, v0);
2535 
2536     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2537     __ rev32(v1, __ T16B, v1);
2538     __ rev32(v2, __ T16B, v2);
2539 
2540     __ cmpw(keylen, 44);
2541     __ br(Assembler::EQ, L_doLast);
2542 
2543     __ aese(v0, v1);
2544     __ aesmc(v0, v0);
2545     __ aese(v0, v2);
2546     __ aesmc(v0, v0);
2547 
2548     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2549     __ rev32(v1, __ T16B, v1);
2550     __ rev32(v2, __ T16B, v2);
2551 
2552     __ cmpw(keylen, 52);
2553     __ br(Assembler::EQ, L_doLast);
2554 
2555     __ aese(v0, v1);
2556     __ aesmc(v0, v0);
2557     __ aese(v0, v2);
2558     __ aesmc(v0, v0);
2559 
2560     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2561     __ rev32(v1, __ T16B, v1);
2562     __ rev32(v2, __ T16B, v2);
2563 
2564     __ BIND(L_doLast);
2565 
2566     __ aese(v0, v1);
2567     __ aesmc(v0, v0);
2568     __ aese(v0, v2);
2569 
2570     __ ld1(v1, __ T16B, key);
2571     __ rev32(v1, __ T16B, v1);
2572     __ eor(v0, __ T16B, v0, v1);
2573 
2574     __ st1(v0, __ T16B, to);
2575 
2576     __ mov(r0, 0);
2577 
2578     __ leave();
2579     __ ret(lr);
2580 
2581     return start;
2582   }
2583 
2584   // Arguments:
2585   //
2586   // Inputs:
2587   //   c_rarg0   - source byte array address
2588   //   c_rarg1   - destination byte array address
2589   //   c_rarg2   - K (key) in little endian int array
2590   //
2591   address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES instruction support");
2593     __ align(CodeEntryAlignment);
2594     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2595     Label L_doLast;
2596 
2597     const Register from        = c_rarg0;  // source array address
2598     const Register to          = c_rarg1;  // destination array address
2599     const Register key         = c_rarg2;  // key array address
2600     const Register keylen      = rscratch1;
2601 
2602     address start = __ pc();
2603     __ enter(); // required for proper stackwalking of RuntimeStub frame
2604 
2605     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2606 
2607     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2608 
2609     __ ld1(v5, __ T16B, __ post(key, 16));
2610     __ rev32(v5, __ T16B, v5);
2611 
2612     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2613     __ rev32(v1, __ T16B, v1);
2614     __ rev32(v2, __ T16B, v2);
2615     __ rev32(v3, __ T16B, v3);
2616     __ rev32(v4, __ T16B, v4);
2617     __ aesd(v0, v1);
2618     __ aesimc(v0, v0);
2619     __ aesd(v0, v2);
2620     __ aesimc(v0, v0);
2621     __ aesd(v0, v3);
2622     __ aesimc(v0, v0);
2623     __ aesd(v0, v4);
2624     __ aesimc(v0, v0);
2625 
2626     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2627     __ rev32(v1, __ T16B, v1);
2628     __ rev32(v2, __ T16B, v2);
2629     __ rev32(v3, __ T16B, v3);
2630     __ rev32(v4, __ T16B, v4);
2631     __ aesd(v0, v1);
2632     __ aesimc(v0, v0);
2633     __ aesd(v0, v2);
2634     __ aesimc(v0, v0);
2635     __ aesd(v0, v3);
2636     __ aesimc(v0, v0);
2637     __ aesd(v0, v4);
2638     __ aesimc(v0, v0);
2639 
2640     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2641     __ rev32(v1, __ T16B, v1);
2642     __ rev32(v2, __ T16B, v2);
2643 
2644     __ cmpw(keylen, 44);
2645     __ br(Assembler::EQ, L_doLast);
2646 
2647     __ aesd(v0, v1);
2648     __ aesimc(v0, v0);
2649     __ aesd(v0, v2);
2650     __ aesimc(v0, v0);
2651 
2652     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2653     __ rev32(v1, __ T16B, v1);
2654     __ rev32(v2, __ T16B, v2);
2655 
2656     __ cmpw(keylen, 52);
2657     __ br(Assembler::EQ, L_doLast);
2658 
2659     __ aesd(v0, v1);
2660     __ aesimc(v0, v0);
2661     __ aesd(v0, v2);
2662     __ aesimc(v0, v0);
2663 
2664     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2665     __ rev32(v1, __ T16B, v1);
2666     __ rev32(v2, __ T16B, v2);
2667 
2668     __ BIND(L_doLast);
2669 
2670     __ aesd(v0, v1);
2671     __ aesimc(v0, v0);
2672     __ aesd(v0, v2);
2673 
2674     __ eor(v0, __ T16B, v0, v5);
2675 
2676     __ st1(v0, __ T16B, to);
2677 
2678     __ mov(r0, 0);
2679 
2680     __ leave();
2681     __ ret(lr);
2682 
2683     return start;
2684   }
2685 
2686   // Arguments:
2687   //
2688   // Inputs:
2689   //   c_rarg0   - source byte array address
2690   //   c_rarg1   - destination byte array address
2691   //   c_rarg2   - K (key) in little endian int array
2692   //   c_rarg3   - r vector byte array address
2693   //   c_rarg4   - input length
2694   //
2695   // Output:
2696   //   x0        - input length
2697   //
2698   address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES instruction support");
2700     __ align(CodeEntryAlignment);
2701     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2702 
2703     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2704 
2705     const Register from        = c_rarg0;  // source array address
2706     const Register to          = c_rarg1;  // destination array address
2707     const Register key         = c_rarg2;  // key array address
2708     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2709                                            // and left with the results of the last encryption block
2710     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2711     const Register keylen      = rscratch1;
2712 
2713     address start = __ pc();
2714 
2715       __ enter();
2716 
2717       __ movw(rscratch2, len_reg);
2718 
2719       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2720 
2721       __ ld1(v0, __ T16B, rvec);
2722 
2723       __ cmpw(keylen, 52);
2724       __ br(Assembler::CC, L_loadkeys_44);
2725       __ br(Assembler::EQ, L_loadkeys_52);
2726 
2727       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2728       __ rev32(v17, __ T16B, v17);
2729       __ rev32(v18, __ T16B, v18);
2730     __ BIND(L_loadkeys_52);
2731       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2732       __ rev32(v19, __ T16B, v19);
2733       __ rev32(v20, __ T16B, v20);
2734     __ BIND(L_loadkeys_44);
2735       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2736       __ rev32(v21, __ T16B, v21);
2737       __ rev32(v22, __ T16B, v22);
2738       __ rev32(v23, __ T16B, v23);
2739       __ rev32(v24, __ T16B, v24);
2740       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2741       __ rev32(v25, __ T16B, v25);
2742       __ rev32(v26, __ T16B, v26);
2743       __ rev32(v27, __ T16B, v27);
2744       __ rev32(v28, __ T16B, v28);
2745       __ ld1(v29, v30, v31, __ T16B, key);
2746       __ rev32(v29, __ T16B, v29);
2747       __ rev32(v30, __ T16B, v30);
2748       __ rev32(v31, __ T16B, v31);
2749 
2750     __ BIND(L_aes_loop);
2751       __ ld1(v1, __ T16B, __ post(from, 16));
2752       __ eor(v0, __ T16B, v0, v1);
2753 
2754       __ br(Assembler::CC, L_rounds_44);
2755       __ br(Assembler::EQ, L_rounds_52);
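      // The CC/EQ branches above reuse the NZCV flags set by cmpw(keylen, 52)
      // before the loop: nothing in the loop body (ld1, eor, aese/aesmc, st1,
      // subw, cbnzw) modifies the flags, so the key-size dispatch remains
      // valid on every iteration.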
2756 
2757       __ aese(v0, v17); __ aesmc(v0, v0);
2758       __ aese(v0, v18); __ aesmc(v0, v0);
2759     __ BIND(L_rounds_52);
2760       __ aese(v0, v19); __ aesmc(v0, v0);
2761       __ aese(v0, v20); __ aesmc(v0, v0);
2762     __ BIND(L_rounds_44);
2763       __ aese(v0, v21); __ aesmc(v0, v0);
2764       __ aese(v0, v22); __ aesmc(v0, v0);
2765       __ aese(v0, v23); __ aesmc(v0, v0);
2766       __ aese(v0, v24); __ aesmc(v0, v0);
2767       __ aese(v0, v25); __ aesmc(v0, v0);
2768       __ aese(v0, v26); __ aesmc(v0, v0);
2769       __ aese(v0, v27); __ aesmc(v0, v0);
2770       __ aese(v0, v28); __ aesmc(v0, v0);
2771       __ aese(v0, v29); __ aesmc(v0, v0);
2772       __ aese(v0, v30);
2773       __ eor(v0, __ T16B, v0, v31);
2774 
2775       __ st1(v0, __ T16B, __ post(to, 16));
2776 
2777       __ subw(len_reg, len_reg, 16);
2778       __ cbnzw(len_reg, L_aes_loop);
2779 
2780       __ st1(v0, __ T16B, rvec);
2781 
2782       __ mov(r0, rscratch2);
2783 
2784       __ leave();
2785       __ ret(lr);
2786 
2787       return start;
2788   }
2789 
2790   // Arguments:
2791   //
2792   // Inputs:
2793   //   c_rarg0   - source byte array address
2794   //   c_rarg1   - destination byte array address
2795   //   c_rarg2   - K (key) in little endian int array
2796   //   c_rarg3   - r vector byte array address
2797   //   c_rarg4   - input length
2798   //
2799   // Output:
2800   //   r0        - input length
2801   //
2802   address generate_cipherBlockChaining_decryptAESCrypt() {
2803     assert(UseAES, "need AES cryptographic extension support");
2804     __ align(CodeEntryAlignment);
2805     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2806 
2807     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2808 
2809     const Register from        = c_rarg0;  // source array address
2810     const Register to          = c_rarg1;  // destination array address
2811     const Register key         = c_rarg2;  // key array address
2812     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address;
2813                                            // on exit it holds the last ciphertext block processed
2814     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2815     const Register keylen      = rscratch1;
2816 
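         // CBC decryption: P[i] = D_K(C[i]) ^ C[i-1], with C[-1] = IV; the chaining
         // value is the original ciphertext, which is why the loop below saves each
         // input block before decrypting it in place. Illustrative sketch only (not
         // the generated code), assuming a block cipher D_K and hypothetical helpers
         // xor128/aes_decrypt_block:
         //   for (int i = 0; i < nblocks; i++) {
         //     memcpy(saved, in + 16*i, 16);          // keep C[i] for the next block
         //     aes_decrypt_block(block, in + 16*i, key);
         //     xor128(out + 16*i, block, rvec);       // ^ C[i-1]; rvec holds IV/C[i-1]
         //     memcpy(rvec, saved, 16);
         //   }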
2817     address start = __ pc();
2818 
2819       __ enter();
2820 
2821       __ movw(rscratch2, len_reg);
2822 
2823       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2824 
2825       __ ld1(v2, __ T16B, rvec);
2826 
2827       __ ld1(v31, __ T16B, __ post(key, 16));
2828       __ rev32(v31, __ T16B, v31);
2829 
2830       __ cmpw(keylen, 52);
2831       __ br(Assembler::CC, L_loadkeys_44);
2832       __ br(Assembler::EQ, L_loadkeys_52);
2833 
2834       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2835       __ rev32(v17, __ T16B, v17);
2836       __ rev32(v18, __ T16B, v18);
2837     __ BIND(L_loadkeys_52);
2838       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2839       __ rev32(v19, __ T16B, v19);
2840       __ rev32(v20, __ T16B, v20);
2841     __ BIND(L_loadkeys_44);
2842       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2843       __ rev32(v21, __ T16B, v21);
2844       __ rev32(v22, __ T16B, v22);
2845       __ rev32(v23, __ T16B, v23);
2846       __ rev32(v24, __ T16B, v24);
2847       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2848       __ rev32(v25, __ T16B, v25);
2849       __ rev32(v26, __ T16B, v26);
2850       __ rev32(v27, __ T16B, v27);
2851       __ rev32(v28, __ T16B, v28);
2852       __ ld1(v29, v30, __ T16B, key);
2853       __ rev32(v29, __ T16B, v29);
2854       __ rev32(v30, __ T16B, v30);
2855 
2856     __ BIND(L_aes_loop);
2857       __ ld1(v0, __ T16B, __ post(from, 16));
2858       __ orr(v1, __ T16B, v0, v0);
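           // v1 keeps the original ciphertext block; once v0 has been decrypted in
           // place it becomes the chaining value for the next block (copied to v2
           // below).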
2859 
2860       __ br(Assembler::CC, L_rounds_44);
2861       __ br(Assembler::EQ, L_rounds_52);
2862 
2863       __ aesd(v0, v17); __ aesimc(v0, v0);
2864       __ aesd(v0, v18); __ aesimc(v0, v0);
2865     __ BIND(L_rounds_52);
2866       __ aesd(v0, v19); __ aesimc(v0, v0);
2867       __ aesd(v0, v20); __ aesimc(v0, v0);
2868     __ BIND(L_rounds_44);
2869       __ aesd(v0, v21); __ aesimc(v0, v0);
2870       __ aesd(v0, v22); __ aesimc(v0, v0);
2871       __ aesd(v0, v23); __ aesimc(v0, v0);
2872       __ aesd(v0, v24); __ aesimc(v0, v0);
2873       __ aesd(v0, v25); __ aesimc(v0, v0);
2874       __ aesd(v0, v26); __ aesimc(v0, v0);
2875       __ aesd(v0, v27); __ aesimc(v0, v0);
2876       __ aesd(v0, v28); __ aesimc(v0, v0);
2877       __ aesd(v0, v29); __ aesimc(v0, v0);
2878       __ aesd(v0, v30);
2879       __ eor(v0, __ T16B, v0, v31);
2880       __ eor(v0, __ T16B, v0, v2);
2881 
2882       __ st1(v0, __ T16B, __ post(to, 16));
2883       __ orr(v2, __ T16B, v1, v1);
2884 
2885       __ subw(len_reg, len_reg, 16);
2886       __ cbnzw(len_reg, L_aes_loop);
2887 
2888       __ st1(v2, __ T16B, rvec);
2889 
2890       __ mov(r0, rscratch2);
2891 
2892       __ leave();
2893       __ ret(lr);
2894 
2895     return start;
2896   }
2897 
2898   // Arguments:
2899   //
2900   // Inputs:
2901   //   c_rarg0   - byte[]  source+offset
2902   //   c_rarg1   - int[]   SHA.state
2903   //   c_rarg2   - int     offset
2904   //   c_rarg3   - int     limit
2905   //
2906   address generate_sha1_implCompress(bool multi_block, const char *name) {
2907     __ align(CodeEntryAlignment);
2908     StubCodeMark mark(this, "StubRoutines", name);
2909     address start = __ pc();
2910 
2911     Register buf   = c_rarg0;
2912     Register state = c_rarg1;
2913     Register ofs   = c_rarg2;
2914     Register limit = c_rarg3;
2915 
2916     Label keys;
2917     Label sha1_loop;
2918 
2919     // load the SHA-1 round constants at 'keys' into v0..v3 (ld4r replicates each across lanes)
2920     __ adr(rscratch1, keys);
2921     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2922     // load the 5-word SHA-1 state into v6, v7
2923     __ ldrq(v6, Address(state, 0));
2924     __ ldrs(v7, Address(state, 16));
2925 
2926 
2927     __ BIND(sha1_loop);
2928     // load 64 bytes of data into v16..v19
2929     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2930     __ rev32(v16, __ T16B, v16);
2931     __ rev32(v17, __ T16B, v17);
2932     __ rev32(v18, __ T16B, v18);
2933     __ rev32(v19, __ T16B, v19);
2934 
2935     // do the sha1
2936     __ addv(v4, __ T4S, v16, v0);
2937     __ orr(v20, __ T16B, v6, v6);
2938 
2939     FloatRegister d0 = v16;
2940     FloatRegister d1 = v17;
2941     FloatRegister d2 = v18;
2942     FloatRegister d3 = v19;
2943 
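         // Each iteration of this loop performs 4 of SHA-1's 80 rounds: sha1c/sha1p/
         // sha1m apply the Ch/Parity/Maj round functions (Ch for rounds 0-19, Parity
         // for 20-39 and 60-79, Maj for 40-59), sha1su0/sha1su1 extend the message
         // schedule, and addv pre-computes W+K for the next iteration using the
         // constant selected from v0..v3; d0..d3 rotate so d0 always names the
         // oldest schedule words.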
2944     for (int round = 0; round < 20; round++) {
2945       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2946       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2947       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2948       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2949       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2950 
2951       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2952       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2953       __ sha1h(tmp2, __ T4S, v20);
2954       if (round < 5)
2955         __ sha1c(v20, __ T4S, tmp3, tmp4);
2956       else if (round < 10 || round >= 15)
2957         __ sha1p(v20, __ T4S, tmp3, tmp4);
2958       else
2959         __ sha1m(v20, __ T4S, tmp3, tmp4);
2960       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2961 
2962       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2963     }
2964 
2965     __ addv(v7, __ T2S, v7, v21);
2966     __ addv(v6, __ T4S, v6, v20);
2967 
2968     if (multi_block) {
2969       __ add(ofs, ofs, 64);
2970       __ cmp(ofs, limit);
2971       __ br(Assembler::LE, sha1_loop);
2972       __ mov(c_rarg0, ofs); // return ofs
2973     }
2974 
2975     __ strq(v6, Address(state, 0));
2976     __ strs(v7, Address(state, 16));
2977 
2978     __ ret(lr);
2979 
2980     __ bind(keys);
2981     __ emit_int32(0x5a827999);
2982     __ emit_int32(0x6ed9eba1);
2983     __ emit_int32(0x8f1bbcdc);
2984     __ emit_int32(0xca62c1d6);
2985 
2986     return start;
2987   }
2988 
2989 
2990   // Arguments:
2991   //
2992   // Inputs:
2993   //   c_rarg0   - byte[]  source+offset
2994   //   c_rarg1   - int[]   SHA.state
2995   //   c_rarg2   - int     offset
2996   //   c_rarg3   - int     limit
2997   //
2998   address generate_sha256_implCompress(bool multi_block, const char *name) {
2999     static const uint32_t round_consts[64] = {
3000       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3001       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3002       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3003       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3004       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3005       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3006       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3007       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3008       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3009       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3010       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3011       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3012       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3013       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3014       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3015       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3016     };
3017     __ align(CodeEntryAlignment);
3018     StubCodeMark mark(this, "StubRoutines", name);
3019     address start = __ pc();
3020 
3021     Register buf   = c_rarg0;
3022     Register state = c_rarg1;
3023     Register ofs   = c_rarg2;
3024     Register limit = c_rarg3;
3025 
3026     Label sha1_loop;
3027 
3028     __ stpd(v8, v9, __ pre(sp, -32));
3029     __ stpd(v10, v11, Address(sp, 16));
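         // v8..v11 have callee-saved low halves under the AAPCS64 procedure call
         // standard, so they are saved above (and restored before the return below)
         // before being used as data registers.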
3030 
3031 // dga == v0
3032 // dgb == v1
3033 // dg0 == v2
3034 // dg1 == v3
3035 // dg2 == v4
3036 // t0 == v6
3037 // t1 == v7
3038 
3039     // load the 64 SHA-256 round constants into v16..v31
3040     __ lea(rscratch1, ExternalAddress((address)round_consts));
3041     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3042     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3043     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3044     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3045 
3046     // load 8 words (256 bits) state
3047     __ ldpq(v0, v1, state);
3048 
3049     __ BIND(sha1_loop);
3050     // load 64 bytes of data into v8..v11
3051     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3052     __ rev32(v8, __ T16B, v8);
3053     __ rev32(v9, __ T16B, v9);
3054     __ rev32(v10, __ T16B, v10);
3055     __ rev32(v11, __ T16B, v11);
3056 
3057     __ addv(v6, __ T4S, v8, v16);
3058     __ orr(v2, __ T16B, v0, v0);
3059     __ orr(v3, __ T16B, v1, v1);
3060 
3061     FloatRegister d0 = v8;
3062     FloatRegister d1 = v9;
3063     FloatRegister d2 = v10;
3064     FloatRegister d3 = v11;
3065 
3066 
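         // Each iteration of this loop performs 4 of SHA-256's 64 rounds: sha256h/
         // sha256h2 update the two halves of the working state (v2/v3, with v4
         // holding the copy of v2 that sha256h2 consumes), sha256su0/sha256su1
         // extend the message schedule, and addv pre-computes W+K for the next
         // iteration from the round constants in v16..v31.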
3067     for (int round = 0; round < 16; round++) {
3068       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3069       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3070       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3071       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3072 
3073       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3074        __ orr(v4, __ T16B, v2, v2);
3075       if (round < 15)
3076         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3077       __ sha256h(v2, __ T4S, v3, tmp2);
3078       __ sha256h2(v3, __ T4S, v4, tmp2);
3079       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3080 
3081       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3082     }
3083 
3084     __ addv(v0, __ T4S, v0, v2);
3085     __ addv(v1, __ T4S, v1, v3);
3086 
3087     if (multi_block) {
3088       __ add(ofs, ofs, 64);
3089       __ cmp(ofs, limit);
3090       __ br(Assembler::LE, sha1_loop);
3091       __ mov(c_rarg0, ofs); // return ofs
3092     }
3093 
3094     __ ldpd(v10, v11, Address(sp, 16));
3095     __ ldpd(v8, v9, __ post(sp, 32));
3096 
3097     __ stpq(v0, v1, state);
3098 
3099     __ ret(lr);
3100 
3101     return start;
3102   }
3103 
3104 #ifndef BUILTIN_SIM
3105   // Safefetch stubs.
3106   void generate_safefetch(const char* name, int size, address* entry,
3107                           address* fault_pc, address* continuation_pc) {
3108     // safefetch signatures:
3109     //   int      SafeFetch32(int*      adr, int      errValue);
3110     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3111     //
3112     // arguments:
3113     //   c_rarg0 = adr
3114     //   c_rarg1 = errValue
3115     //
3116     // result:
3117     //   r0       = *adr or errValue
3118 
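         // If the load at *fault_pc faults, the VM's signal handler resumes
         // execution at *continuation_pc with errValue still in c_rarg1, so the
         // stub returns errValue instead of crashing.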
3119     StubCodeMark mark(this, "StubRoutines", name);
3120 
3121     // Entry point, pc or function descriptor.
3122     *entry = __ pc();
3123 
3124     // Load *adr into c_rarg1, may fault.
3125     *fault_pc = __ pc();
3126     switch (size) {
3127       case 4:
3128         // int32_t
3129         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3130         break;
3131       case 8:
3132         // int64_t
3133         __ ldr(c_rarg1, Address(c_rarg0, 0));
3134         break;
3135       default:
3136         ShouldNotReachHere();
3137     }
3138 
3139     // return errValue or *adr
3140     *continuation_pc = __ pc();
3141     __ mov(r0, c_rarg1);
3142     __ ret(lr);
3143   }
3144 #endif
3145 
3146   /**
3147    *  Arguments:
3148    *
3149    * Inputs:
3150    *   c_rarg0   - int crc
3151    *   c_rarg1   - byte* buf
3152    *   c_rarg2   - int length
3153    *
3154    * Output:
3155    *       r0    - int crc result
3156    */
3157   address generate_updateBytesCRC32() {
3158     assert(UseCRC32Intrinsics, "what are we doing here?");
3159 
3160     __ align(CodeEntryAlignment);
3161     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3162 
3163     address start = __ pc();
3164 
3165     const Register crc   = c_rarg0;  // crc
3166     const Register buf   = c_rarg1;  // source java byte array address
3167     const Register len   = c_rarg2;  // length
3168     const Register table0 = c_rarg3; // crc_table address
3169     const Register table1 = c_rarg4;
3170     const Register table2 = c_rarg5;
3171     const Register table3 = c_rarg6;
3172     const Register tmp3 = c_rarg7;
3173 
3174     BLOCK_COMMENT("Entry:");
3175     __ enter(); // required for proper stackwalking of RuntimeStub frame
3176 
3177     __ kernel_crc32(crc, buf, len,
3178               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3179 
3180     __ leave(); // required for proper stackwalking of RuntimeStub frame
3181     __ ret(lr);
3182 
3183     return start;
3184   }
3185 
3186   /**
3187    *  Arguments:
3188    *
3189    * Inputs:
3190    *   c_rarg0   - int crc
3191    *   c_rarg1   - byte* buf
3192    *   c_rarg2   - int length
3193    *   c_rarg3   - int* table
3194    *
3195    * Output:
3196    *       r0   - int crc result
3197    */
3198   address generate_updateBytesCRC32C() {
3199     assert(UseCRC32CIntrinsics, "what are we doing here?");
3200 
3201     __ align(CodeEntryAlignment);
3202     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3203 
3204     address start = __ pc();
3205 
3206     const Register crc   = c_rarg0;  // crc
3207     const Register buf   = c_rarg1;  // source java byte array address
3208     const Register len   = c_rarg2;  // length
3209     const Register table0 = c_rarg3; // crc_table address
3210     const Register table1 = c_rarg4;
3211     const Register table2 = c_rarg5;
3212     const Register table3 = c_rarg6;
3213     const Register tmp3 = c_rarg7;
3214 
3215     BLOCK_COMMENT("Entry:");
3216     __ enter(); // required for proper stackwalking of RuntimeStub frame
3217 
3218     __ kernel_crc32c(crc, buf, len,
3219               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3220 
3221     __ leave(); // required for proper stackwalking of RuntimeStub frame
3222     __ ret(lr);
3223 
3224     return start;
3225   }
3226 
3227   /***
3228    *  Arguments:
3229    *
3230    *  Inputs:
3231    *   c_rarg0   - int   adler
3232    *   c_rarg1   - byte* buff
3233    *   c_rarg2   - int   len
3234    *
3235    * Output:
3236    *   c_rarg0   - int adler result
3237    */
3238   address generate_updateBytesAdler32() {
3239     __ align(CodeEntryAlignment);
3240     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3241     address start = __ pc();
3242 
3243     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3244 
3245     // Aliases
3246     Register adler  = c_rarg0;
3247     Register s1     = c_rarg0;
3248     Register s2     = c_rarg3;
3249     Register buff   = c_rarg1;
3250     Register len    = c_rarg2;
3251     Register nmax  = r4;
3252     Register base = r5;
3253     Register count = r6;
3254     Register temp0 = rscratch1;
3255     Register temp1 = rscratch2;
3256     Register temp2 = r7;
3257 
3258     // Max number of bytes we can process before having to take the mod
3259     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3260     unsigned long BASE = 0xfff1;
3261     unsigned long NMAX = 0x15B0;
3262 
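         // Adler-32 maintains two sums modulo BASE: s1 = 1 + the sum of all bytes and
         // s2 = the sum of the successive s1 values; the result is (s2 << 16) | s1.
         // The reductions below avoid division by using 2^16 == 15 (mod 65521);
         // an illustrative sketch of the folding (not the generated code):
         //   uint32_t mod_base(uint32_t x) {
         //     x = (x & 0xffff) + (x >> 16) * 15;  // fold the high half back in
         //     x = (x & 0xffff) + (x >> 16) * 15;  // now x < 2 * BASE
         //     return x >= 0xfff1 ? x - 0xfff1 : x;
         //   }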
3263     __ mov(base, BASE);
3264     __ mov(nmax, NMAX);
3265 
3266     // s1 is initialized to the lower 16 bits of adler
3267     // s2 is initialized to the upper 16 bits of adler
3268     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3269     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3270 
3271     // The pipelined loop needs at least 16 elements for one iteration
3272     // (it checks this itself, but it is cheaper to branch straight to the cleanup loop)
3273     __ cmp(len, (u1)16);
3274     __ br(Assembler::HS, L_nmax);
3275     __ cbz(len, L_combine);
3276 
3277     __ bind(L_simple_by1_loop);
3278     __ ldrb(temp0, Address(__ post(buff, 1)));
3279     __ add(s1, s1, temp0);
3280     __ add(s2, s2, s1);
3281     __ subs(len, len, 1);
3282     __ br(Assembler::HI, L_simple_by1_loop);
3283 
3284     // s1 = s1 % BASE
3285     __ subs(temp0, s1, base);
3286     __ csel(s1, temp0, s1, Assembler::HS);
3287 
3288     // s2 = s2 % BASE
3289     __ lsr(temp0, s2, 16);
3290     __ lsl(temp1, temp0, 4);
3291     __ sub(temp1, temp1, temp0);
3292     __ add(s2, temp1, s2, ext::uxth);
3293 
3294     __ subs(temp0, s2, base);
3295     __ csel(s2, temp0, s2, Assembler::HS);
3296 
3297     __ b(L_combine);
3298 
3299     __ bind(L_nmax);
3300     __ subs(len, len, nmax);
3301     __ sub(count, nmax, 16);
3302     __ br(Assembler::LO, L_by16);
3303 
3304     __ bind(L_nmax_loop);
3305 
3306     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3307 
3308     __ add(s1, s1, temp0, ext::uxtb);
3309     __ ubfx(temp2, temp0, 8, 8);
3310     __ add(s2, s2, s1);
3311     __ add(s1, s1, temp2);
3312     __ ubfx(temp2, temp0, 16, 8);
3313     __ add(s2, s2, s1);
3314     __ add(s1, s1, temp2);
3315     __ ubfx(temp2, temp0, 24, 8);
3316     __ add(s2, s2, s1);
3317     __ add(s1, s1, temp2);
3318     __ ubfx(temp2, temp0, 32, 8);
3319     __ add(s2, s2, s1);
3320     __ add(s1, s1, temp2);
3321     __ ubfx(temp2, temp0, 40, 8);
3322     __ add(s2, s2, s1);
3323     __ add(s1, s1, temp2);
3324     __ ubfx(temp2, temp0, 48, 8);
3325     __ add(s2, s2, s1);
3326     __ add(s1, s1, temp2);
3327     __ add(s2, s2, s1);
3328     __ add(s1, s1, temp0, Assembler::LSR, 56);
3329     __ add(s2, s2, s1);
3330 
3331     __ add(s1, s1, temp1, ext::uxtb);
3332     __ ubfx(temp2, temp1, 8, 8);
3333     __ add(s2, s2, s1);
3334     __ add(s1, s1, temp2);
3335     __ ubfx(temp2, temp1, 16, 8);
3336     __ add(s2, s2, s1);
3337     __ add(s1, s1, temp2);
3338     __ ubfx(temp2, temp1, 24, 8);
3339     __ add(s2, s2, s1);
3340     __ add(s1, s1, temp2);
3341     __ ubfx(temp2, temp1, 32, 8);
3342     __ add(s2, s2, s1);
3343     __ add(s1, s1, temp2);
3344     __ ubfx(temp2, temp1, 40, 8);
3345     __ add(s2, s2, s1);
3346     __ add(s1, s1, temp2);
3347     __ ubfx(temp2, temp1, 48, 8);
3348     __ add(s2, s2, s1);
3349     __ add(s1, s1, temp2);
3350     __ add(s2, s2, s1);
3351     __ add(s1, s1, temp1, Assembler::LSR, 56);
3352     __ add(s2, s2, s1);
3353 
3354     __ subs(count, count, 16);
3355     __ br(Assembler::HS, L_nmax_loop);
3356 
3357     // s1 = s1 % BASE
3358     __ lsr(temp0, s1, 16);
3359     __ lsl(temp1, temp0, 4);
3360     __ sub(temp1, temp1, temp0);
3361     __ add(temp1, temp1, s1, ext::uxth);
3362 
3363     __ lsr(temp0, temp1, 16);
3364     __ lsl(s1, temp0, 4);
3365     __ sub(s1, s1, temp0);
3366     __ add(s1, s1, temp1, ext::uxth);
3367 
3368     __ subs(temp0, s1, base);
3369     __ csel(s1, temp0, s1, Assembler::HS);
3370 
3371     // s2 = s2 % BASE
3372     __ lsr(temp0, s2, 16);
3373     __ lsl(temp1, temp0, 4);
3374     __ sub(temp1, temp1, temp0);
3375     __ add(temp1, temp1, s2, ext::uxth);
3376 
3377     __ lsr(temp0, temp1, 16);
3378     __ lsl(s2, temp0, 4);
3379     __ sub(s2, s2, temp0);
3380     __ add(s2, s2, temp1, ext::uxth);
3381 
3382     __ subs(temp0, s2, base);
3383     __ csel(s2, temp0, s2, Assembler::HS);
3384 
3385     __ subs(len, len, nmax);
3386     __ sub(count, nmax, 16);
3387     __ br(Assembler::HS, L_nmax_loop);
3388 
3389     __ bind(L_by16);
3390     __ adds(len, len, count);
3391     __ br(Assembler::LO, L_by1);
3392 
3393     __ bind(L_by16_loop);
3394 
3395     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3396 
3397     __ add(s1, s1, temp0, ext::uxtb);
3398     __ ubfx(temp2, temp0, 8, 8);
3399     __ add(s2, s2, s1);
3400     __ add(s1, s1, temp2);
3401     __ ubfx(temp2, temp0, 16, 8);
3402     __ add(s2, s2, s1);
3403     __ add(s1, s1, temp2);
3404     __ ubfx(temp2, temp0, 24, 8);
3405     __ add(s2, s2, s1);
3406     __ add(s1, s1, temp2);
3407     __ ubfx(temp2, temp0, 32, 8);
3408     __ add(s2, s2, s1);
3409     __ add(s1, s1, temp2);
3410     __ ubfx(temp2, temp0, 40, 8);
3411     __ add(s2, s2, s1);
3412     __ add(s1, s1, temp2);
3413     __ ubfx(temp2, temp0, 48, 8);
3414     __ add(s2, s2, s1);
3415     __ add(s1, s1, temp2);
3416     __ add(s2, s2, s1);
3417     __ add(s1, s1, temp0, Assembler::LSR, 56);
3418     __ add(s2, s2, s1);
3419 
3420     __ add(s1, s1, temp1, ext::uxtb);
3421     __ ubfx(temp2, temp1, 8, 8);
3422     __ add(s2, s2, s1);
3423     __ add(s1, s1, temp2);
3424     __ ubfx(temp2, temp1, 16, 8);
3425     __ add(s2, s2, s1);
3426     __ add(s1, s1, temp2);
3427     __ ubfx(temp2, temp1, 24, 8);
3428     __ add(s2, s2, s1);
3429     __ add(s1, s1, temp2);
3430     __ ubfx(temp2, temp1, 32, 8);
3431     __ add(s2, s2, s1);
3432     __ add(s1, s1, temp2);
3433     __ ubfx(temp2, temp1, 40, 8);
3434     __ add(s2, s2, s1);
3435     __ add(s1, s1, temp2);
3436     __ ubfx(temp2, temp1, 48, 8);
3437     __ add(s2, s2, s1);
3438     __ add(s1, s1, temp2);
3439     __ add(s2, s2, s1);
3440     __ add(s1, s1, temp1, Assembler::LSR, 56);
3441     __ add(s2, s2, s1);
3442 
3443     __ subs(len, len, 16);
3444     __ br(Assembler::HS, L_by16_loop);
3445 
3446     __ bind(L_by1);
3447     __ adds(len, len, 15);
3448     __ br(Assembler::LO, L_do_mod);
3449 
3450     __ bind(L_by1_loop);
3451     __ ldrb(temp0, Address(__ post(buff, 1)));
3452     __ add(s1, temp0, s1);
3453     __ add(s2, s2, s1);
3454     __ subs(len, len, 1);
3455     __ br(Assembler::HS, L_by1_loop);
3456 
3457     __ bind(L_do_mod);
3458     // s1 = s1 % BASE
3459     __ lsr(temp0, s1, 16);
3460     __ lsl(temp1, temp0, 4);
3461     __ sub(temp1, temp1, temp0);
3462     __ add(temp1, temp1, s1, ext::uxth);
3463 
3464     __ lsr(temp0, temp1, 16);
3465     __ lsl(s1, temp0, 4);
3466     __ sub(s1, s1, temp0);
3467     __ add(s1, s1, temp1, ext::uxth);
3468 
3469     __ subs(temp0, s1, base);
3470     __ csel(s1, temp0, s1, Assembler::HS);
3471 
3472     // s2 = s2 % BASE
3473     __ lsr(temp0, s2, 16);
3474     __ lsl(temp1, temp0, 4);
3475     __ sub(temp1, temp1, temp0);
3476     __ add(temp1, temp1, s2, ext::uxth);
3477 
3478     __ lsr(temp0, temp1, 16);
3479     __ lsl(s2, temp0, 4);
3480     __ sub(s2, s2, temp0);
3481     __ add(s2, s2, temp1, ext::uxth);
3482 
3483     __ subs(temp0, s2, base);
3484     __ csel(s2, temp0, s2, Assembler::HS);
3485 
3486     // Combine lower bits and higher bits
3487     __ bind(L_combine);
3488     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3489 
3490     __ ret(lr);
3491 
3492     return start;
3493   }
3494 
3495   /**
3496    *  Arguments:
3497    *
3498    *  Input:
3499    *    c_rarg0   - x address
3500    *    c_rarg1   - x length
3501    *    c_rarg2   - y address
3502    *    c_rarg3   - y length
3503    *    c_rarg4   - z address
3504    *    c_rarg5   - z length
3505    */
3506   address generate_multiplyToLen() {
3507     __ align(CodeEntryAlignment);
3508     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3509 
3510     address start = __ pc();
3511     const Register x     = r0;
3512     const Register xlen  = r1;
3513     const Register y     = r2;
3514     const Register ylen  = r3;
3515     const Register z     = r4;
3516     const Register zlen  = r5;
3517 
3518     const Register tmp1  = r10;
3519     const Register tmp2  = r11;
3520     const Register tmp3  = r12;
3521     const Register tmp4  = r13;
3522     const Register tmp5  = r14;
3523     const Register tmp6  = r15;
3524     const Register tmp7  = r16;
3525 
3526     BLOCK_COMMENT("Entry:");
3527     __ enter(); // required for proper stackwalking of RuntimeStub frame
3528     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3529     __ leave(); // required for proper stackwalking of RuntimeStub frame
3530     __ ret(lr);
3531 
3532     return start;
3533   }
3534 
3535   address generate_squareToLen() {
3536     // The squareToLen algorithm for sizes 1..127, described in the Java code,
3537     // is faster than multiply_to_len on some CPUs and slower on others, but
3538     // multiply_to_len gives slightly better results overall
3539     __ align(CodeEntryAlignment);
3540     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3541     address start = __ pc();
3542 
3543     const Register x     = r0;
3544     const Register xlen  = r1;
3545     const Register z     = r2;
3546     const Register zlen  = r3;
3547     const Register y     = r4; // == x
3548     const Register ylen  = r5; // == xlen
3549 
3550     const Register tmp1  = r10;
3551     const Register tmp2  = r11;
3552     const Register tmp3  = r12;
3553     const Register tmp4  = r13;
3554     const Register tmp5  = r14;
3555     const Register tmp6  = r15;
3556     const Register tmp7  = r16;
3557 
3558     RegSet spilled_regs = RegSet::of(y, ylen);
3559     BLOCK_COMMENT("Entry:");
3560     __ enter();
3561     __ push(spilled_regs, sp);
3562     __ mov(y, x);
3563     __ mov(ylen, xlen);
3564     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3565     __ pop(spilled_regs, sp);
3566     __ leave();
3567     __ ret(lr);
3568     return start;
3569   }
3570 
3571   address generate_mulAdd() {
3572     __ align(CodeEntryAlignment);
3573     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3574 
3575     address start = __ pc();
3576 
3577     const Register out     = r0;
3578     const Register in      = r1;
3579     const Register offset  = r2;
3580     const Register len     = r3;
3581     const Register k       = r4;
3582 
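         // Implements the BigInteger::mulAdd intrinsic: multiply the int array 'in'
         // by the int 'k' and add the product into 'out' (see BigInteger::implMulAdd
         // for the exact indexing); the final carry is returned in r0.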
3583     BLOCK_COMMENT("Entry:");
3584     __ enter();
3585     __ mul_add(out, in, offset, len, k);
3586     __ leave();
3587     __ ret(lr);
3588 
3589     return start;
3590   }
3591 
3592   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3593                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3594                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3595     // Karatsuba multiplication performs a 128*128 -> 256-bit
3596     // multiplication in three 128-bit multiplications and a few
3597     // additions.
3598     //
3599     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3600     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3601     //
3602     // Inputs:
3603     //
3604     // A0 in a.d[0]     (subkey)
3605     // A1 in a.d[1]
3606     // (A1+A0) in a1_xor_a0.d[0]
3607     //
3608     // B0 in b.d[0]     (state)
3609     // B1 in b.d[1]
3610 
3611     __ ext(tmp1, __ T16B, b, b, 0x08);
3612     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3613     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3614     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3615     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3616 
3617     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3618     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3619     __ eor(tmp2, __ T16B, tmp2, tmp4);
3620     __ eor(tmp2, __ T16B, tmp2, tmp3);
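         // tmp2 now holds the two middle 64-bit words of the 256-bit product,
         // (C0+C1+D1+E1):(D1+C0+D0+E0); the ins() pair below merges them into the
         // high half of result_lo and the low half of result_hi.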
3621 
3622     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3623     __ ins(result_hi, __ D, tmp2, 0, 1);
3624     __ ins(result_lo, __ D, tmp2, 1, 0);
3625   }
3626 
3627   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3628                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3629     const FloatRegister t0 = result;
3630 
3631     // The GCM field polynomial f is z^128 + p(z), where p =
3632     // z^7+z^2+z+1.
3633     //
3634     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3635     //
3636     // so, given that the product we're reducing is
3637     //    a == lo + hi * z^128
3638     // substituting,
3639     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3640     //
3641     // we reduce by multiplying hi by p(z) and subtracting the result
3642     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3643     // bits we can do this with two 64-bit multiplications, lo*p and
3644     // hi*p.
3645 
3646     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3647     __ ext(t1, __ T16B, t0, z, 8);
3648     __ eor(hi, __ T16B, hi, t1);
3649     __ ext(t1, __ T16B, z, t0, 8);
3650     __ eor(lo, __ T16B, lo, t1);
3651     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3652     __ eor(result, __ T16B, lo, t0);
3653   }
3654 
3655   address generate_has_negatives(address &has_negatives_long) {
3656     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3657     const u1 large_loop_size = 64;
3658     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3659     int dcache_line = VM_Version::dcache_line_size();
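         // A byte is negative iff its top bit is set, so the loops below just OR
         // words together and test the result against UPPER_BIT_MASK; illustrative
         // sketch (not the generated code):
         //   bool any_negative(uint64_t w) { return (w & 0x8080808080808080ULL) != 0; }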
3660 
3661     Register ary1 = r1, len = r2, result = r0;
3662 
3663     __ align(CodeEntryAlignment);
3664     address entry = __ pc();
3665 
3666     __ enter();
3667 
3668   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3669         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3670 
3671   __ cmp(len, (u1)15);
3672   __ br(Assembler::GT, LEN_OVER_15);
3673   // Execution only falls into this code when the pointer is near the end of a
3674   // memory page and we must avoid reading past it
3675   __ add(ary1, ary1, len);
3676   __ subs(len, len, 8);
3677   __ br(Assembler::GT, LEN_OVER_8);
3678   __ ldr(rscratch2, Address(ary1, -8));
3679   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3680   __ lsrv(rscratch2, rscratch2, rscratch1);
3681   __ tst(rscratch2, UPPER_BIT_MASK);
3682   __ cset(result, Assembler::NE);
3683   __ leave();
3684   __ ret(lr);
3685   __ bind(LEN_OVER_8);
3686   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3687   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3688   __ tst(rscratch2, UPPER_BIT_MASK);
3689   __ br(Assembler::NE, RET_TRUE_NO_POP);
3690   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3691   __ lsrv(rscratch1, rscratch1, rscratch2);
3692   __ tst(rscratch1, UPPER_BIT_MASK);
3693   __ cset(result, Assembler::NE);
3694   __ leave();
3695   __ ret(lr);
3696 
3697   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3698   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3699 
3700   has_negatives_long = __ pc(); // 2nd entry point
3701 
3702   __ enter();
3703 
3704   __ bind(LEN_OVER_15);
3705     __ push(spilled_regs, sp);
3706     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3707     __ cbz(rscratch2, ALIGNED);
3708     __ ldp(tmp6, tmp1, Address(ary1));
3709     __ mov(tmp5, 16);
3710     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3711     __ add(ary1, ary1, rscratch1);
3712     __ sub(len, len, rscratch1);
3713     __ orr(tmp6, tmp6, tmp1);
3714     __ tst(tmp6, UPPER_BIT_MASK);
3715     __ br(Assembler::NE, RET_TRUE);
3716 
3717   __ bind(ALIGNED);
3718     __ cmp(len, large_loop_size);
3719     __ br(Assembler::LT, CHECK_16);
3720     // Perform a 16-byte load in the pre-loop as an early return, to handle the
3721     // case where an initially aligned large array has negative values in its
3722     // starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in the
3723     // worst case, which is slower. Cases with negative bytes further ahead are
3724     // barely affected; in fact they run faster due to the early loads, fewer
3725     // instructions and fewer branches in LARGE_LOOP.
3726     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3727     __ sub(len, len, 16);
3728     __ orr(tmp6, tmp6, tmp1);
3729     __ tst(tmp6, UPPER_BIT_MASK);
3730     __ br(Assembler::NE, RET_TRUE);
3731     __ cmp(len, large_loop_size);
3732     __ br(Assembler::LT, CHECK_16);
3733 
3734     if (SoftwarePrefetchHintDistance >= 0
3735         && SoftwarePrefetchHintDistance >= dcache_line) {
3736       // initial prefetch
3737       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3738     }
3739   __ bind(LARGE_LOOP);
3740     if (SoftwarePrefetchHintDistance >= 0) {
3741       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3742     }
3743     // Issue the load instructions first, since that can save a few CPU/MEM cycles.
3744     // Also, instead of 4 triples of "orr(...); addr(...); cbnz(...);" (one per ldp),
3745     // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves
3746     // 3 instructions per iteration and has fewer branches. The downside is that it
3747     // disables early return, so all 64 bytes are loaded and checked every time.
3748     __ ldp(tmp2, tmp3, Address(ary1));
3749     __ ldp(tmp4, tmp5, Address(ary1, 16));
3750     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3751     __ ldp(tmp6, tmp1, Address(ary1, 48));
3752     __ add(ary1, ary1, large_loop_size);
3753     __ sub(len, len, large_loop_size);
3754     __ orr(tmp2, tmp2, tmp3);
3755     __ orr(tmp4, tmp4, tmp5);
3756     __ orr(rscratch1, rscratch1, rscratch2);
3757     __ orr(tmp6, tmp6, tmp1);
3758     __ orr(tmp2, tmp2, tmp4);
3759     __ orr(rscratch1, rscratch1, tmp6);
3760     __ orr(tmp2, tmp2, rscratch1);
3761     __ tst(tmp2, UPPER_BIT_MASK);
3762     __ br(Assembler::NE, RET_TRUE);
3763     __ cmp(len, large_loop_size);
3764     __ br(Assembler::GE, LARGE_LOOP);
3765 
3766   __ bind(CHECK_16); // small 16-byte load pre-loop
3767     __ cmp(len, (u1)16);
3768     __ br(Assembler::LT, POST_LOOP16);
3769 
3770   __ bind(LOOP16); // small 16-byte load loop
3771     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3772     __ sub(len, len, 16);
3773     __ orr(tmp2, tmp2, tmp3);
3774     __ tst(tmp2, UPPER_BIT_MASK);
3775     __ br(Assembler::NE, RET_TRUE);
3776     __ cmp(len, (u1)16);
3777     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3778 
3779   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3780     __ cmp(len, (u1)8);
3781     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3782     __ ldr(tmp3, Address(__ post(ary1, 8)));
3783     __ sub(len, len, 8);
3784     __ tst(tmp3, UPPER_BIT_MASK);
3785     __ br(Assembler::NE, RET_TRUE);
3786 
3787   __ bind(POST_LOOP16_LOAD_TAIL);
3788     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3789     __ ldr(tmp1, Address(ary1));
3790     __ mov(tmp2, 64);
3791     __ sub(tmp4, tmp2, len, __ LSL, 3);
3792     __ lslv(tmp1, tmp1, tmp4);
3793     __ tst(tmp1, UPPER_BIT_MASK);
3794     __ br(Assembler::NE, RET_TRUE);
3795     // Fallthrough
3796 
3797   __ bind(RET_FALSE);
3798     __ pop(spilled_regs, sp);
3799     __ leave();
3800     __ mov(result, zr);
3801     __ ret(lr);
3802 
3803   __ bind(RET_TRUE);
3804     __ pop(spilled_regs, sp);
3805   __ bind(RET_TRUE_NO_POP);
3806     __ leave();
3807     __ mov(result, 1);
3808     __ ret(lr);
3809 
3810   __ bind(DONE);
3811     __ pop(spilled_regs, sp);
3812     __ leave();
3813     __ ret(lr);
3814     return entry;
3815   }
3816 
3817   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3818         bool usePrefetch, Label &NOT_EQUAL) {
3819     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3820         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3821         tmp7 = r12, tmp8 = r13;
3822     Label LOOP;
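         // Each iteration compares 64 bytes from each array using four ldp pairs; the
         // eor/orr checks of one pair are interleaved with the loads of the next pair
         // to hide load latency, and cbnz exits on the first difference found.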
3823 
3824     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3825     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3826     __ bind(LOOP);
3827     if (usePrefetch) {
3828       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3829       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3830     }
3831     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3832     __ eor(tmp1, tmp1, tmp2);
3833     __ eor(tmp3, tmp3, tmp4);
3834     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3835     __ orr(tmp1, tmp1, tmp3);
3836     __ cbnz(tmp1, NOT_EQUAL);
3837     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3838     __ eor(tmp5, tmp5, tmp6);
3839     __ eor(tmp7, tmp7, tmp8);
3840     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3841     __ orr(tmp5, tmp5, tmp7);
3842     __ cbnz(tmp5, NOT_EQUAL);
3843     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3844     __ eor(tmp1, tmp1, tmp2);
3845     __ eor(tmp3, tmp3, tmp4);
3846     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3847     __ orr(tmp1, tmp1, tmp3);
3848     __ cbnz(tmp1, NOT_EQUAL);
3849     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3850     __ eor(tmp5, tmp5, tmp6);
3851     __ sub(cnt1, cnt1, 8 * wordSize);
3852     __ eor(tmp7, tmp7, tmp8);
3853     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3854     // tmp6 is not used. MacroAssembler::subs is used here (rather than
3855     // cmp) because subs allows an unlimited range of immediate operands.
3856     __ subs(tmp6, cnt1, loopThreshold);
3857     __ orr(tmp5, tmp5, tmp7);
3858     __ cbnz(tmp5, NOT_EQUAL);
3859     __ br(__ GE, LOOP);
3860     // post-loop
3861     __ eor(tmp1, tmp1, tmp2);
3862     __ eor(tmp3, tmp3, tmp4);
3863     __ orr(tmp1, tmp1, tmp3);
3864     __ sub(cnt1, cnt1, 2 * wordSize);
3865     __ cbnz(tmp1, NOT_EQUAL);
3866   }
3867 
3868   void generate_large_array_equals_loop_simd(int loopThreshold,
3869         bool usePrefetch, Label &NOT_EQUAL) {
3870     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3871         tmp2 = rscratch2;
3872     Label LOOP;
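         // Each iteration loads 64 bytes from each array into v0..v3 and v4..v7, XORs
         // the two sets, ORs everything down to a single vector, and moves both 64-bit
         // halves to general registers so one cbnz detects any difference.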
3873 
3874     __ bind(LOOP);
3875     if (usePrefetch) {
3876       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3877       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3878     }
3879     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3880     __ sub(cnt1, cnt1, 8 * wordSize);
3881     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3882     __ subs(tmp1, cnt1, loopThreshold);
3883     __ eor(v0, __ T16B, v0, v4);
3884     __ eor(v1, __ T16B, v1, v5);
3885     __ eor(v2, __ T16B, v2, v6);
3886     __ eor(v3, __ T16B, v3, v7);
3887     __ orr(v0, __ T16B, v0, v1);
3888     __ orr(v1, __ T16B, v2, v3);
3889     __ orr(v0, __ T16B, v0, v1);
3890     __ umov(tmp1, v0, __ D, 0);
3891     __ umov(tmp2, v0, __ D, 1);
3892     __ orr(tmp1, tmp1, tmp2);
3893     __ cbnz(tmp1, NOT_EQUAL);
3894     __ br(__ GE, LOOP);
3895   }
3896 
3897   // a1 = r1 - array1 address
3898   // a2 = r2 - array2 address
3899   // result = r0 - return value. Already contains "false"
3900   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
3901   // r3-r5 are reserved temporary registers
3902   address generate_large_array_equals() {
3903     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3904     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3905         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3906         tmp7 = r12, tmp8 = r13;
3907     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3908         SMALL_LOOP, POST_LOOP;
3909     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3910     // threshold chosen so that at least 32 of the prefetched bytes are actually used
3911     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3912     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3913     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3914     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3915         tmp5, tmp6, tmp7, tmp8);
3916 
3917     __ align(CodeEntryAlignment);
3918     address entry = __ pc();
3919     __ enter();
3920     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
3921     // also advance pointers to use post-increment instead of pre-increment
3922     __ add(a1, a1, wordSize);
3923     __ add(a2, a2, wordSize);
3924     if (AvoidUnalignedAccesses) {
3925       // Both implementations (SIMD/non-SIMD) use relatively large load
3926       // instructions (ld1/ldp), which carry a big penalty (up to 2x execution
3927       // time) on some CPUs when the address is not at least 16-byte aligned.
3928       // Arrays are currently 8-byte aligned, so if needed do one extra 8-byte
3929       // load for the first address to make it 16-byte aligned.
3930       Label ALIGNED16;
3931       __ tbz(a1, 3, ALIGNED16);
3932       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3933       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3934       __ sub(cnt1, cnt1, wordSize);
3935       __ eor(tmp1, tmp1, tmp2);
3936       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3937       __ bind(ALIGNED16);
3938     }
3939     if (UseSIMDForArrayEquals) {
3940       if (SoftwarePrefetchHintDistance >= 0) {
3941         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3942         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3943         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3944             /* prfm = */ true, NOT_EQUAL);
3945         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3946         __ br(__ LT, TAIL);
3947       }
3948       __ bind(NO_PREFETCH_LARGE_LOOP);
3949       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3950           /* prfm = */ false, NOT_EQUAL);
3951     } else {
3952       __ push(spilled_regs, sp);
3953       if (SoftwarePrefetchHintDistance >= 0) {
3954         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3955         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3956         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3957             /* prfm = */ true, NOT_EQUAL);
3958         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3959         __ br(__ LT, TAIL);
3960       }
3961       __ bind(NO_PREFETCH_LARGE_LOOP);
3962       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3963           /* prfm = */ false, NOT_EQUAL);
3964     }
3965     __ bind(TAIL);
3966       __ cbz(cnt1, EQUAL);
3967       __ subs(cnt1, cnt1, wordSize);
3968       __ br(__ LE, POST_LOOP);
3969     __ bind(SMALL_LOOP);
3970       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3971       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3972       __ subs(cnt1, cnt1, wordSize);
3973       __ eor(tmp1, tmp1, tmp2);
3974       __ cbnz(tmp1, NOT_EQUAL);
3975       __ br(__ GT, SMALL_LOOP);
3976     __ bind(POST_LOOP);
3977       __ ldr(tmp1, Address(a1, cnt1));
3978       __ ldr(tmp2, Address(a2, cnt1));
3979       __ eor(tmp1, tmp1, tmp2);
3980       __ cbnz(tmp1, NOT_EQUAL);
3981     __ bind(EQUAL);
3982       __ mov(result, true);
3983     __ bind(NOT_EQUAL);
3984       if (!UseSIMDForArrayEquals) {
3985         __ pop(spilled_regs, sp);
3986       }
3987     __ bind(NOT_EQUAL_NO_POP);
3988     __ leave();
3989     __ ret(lr);
3990     return entry;
3991   }
3992 
3993   address generate_dsin_dcos(bool isCos) {
3994     __ align(CodeEntryAlignment);
3995     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
3996     address start = __ pc();
3997     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
3998         (address)StubRoutines::aarch64::_two_over_pi,
3999         (address)StubRoutines::aarch64::_pio2,
4000         (address)StubRoutines::aarch64::_dsin_coef,
4001         (address)StubRoutines::aarch64::_dcos_coef);
4002     return start;
4003   }
4004 
4005   address generate_dlog() {
4006     __ align(CodeEntryAlignment);
4007     StubCodeMark mark(this, "StubRoutines", "dlog");
4008     address entry = __ pc();
4009     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
4010         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
4011     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
4012     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
4013         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
4014     return entry;
4015   }
4016 
4017   // code for comparing 16 bytes of strings with same encoding
4018   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
4019     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
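         // Assumes tmp1/tmp2 already hold the previously loaded 8 bytes of str1/str2.
         // The loads and compares are software-pipelined: while tmp1/tmp2 are compared,
         // the next 8 bytes are loaded into rscratch1/cnt1, and vice versa.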
4020     __ ldr(rscratch1, Address(__ post(str1, 8)));
4021     __ eor(rscratch2, tmp1, tmp2);
4022     __ ldr(cnt1, Address(__ post(str2, 8)));
4023     __ cbnz(rscratch2, DIFF1);
4024     __ ldr(tmp1, Address(__ post(str1, 8)));
4025     __ eor(rscratch2, rscratch1, cnt1);
4026     __ ldr(tmp2, Address(__ post(str2, 8)));
4027     __ cbnz(rscratch2, DIFF2);
4028   }
4029 
4030   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
4031   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4032       Label &DIFF2) {
4033     Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12;
4034     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
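         // Loads 16 Latin-1 bytes through tmp2 and widens them to UTF-16 by
         // interleaving with the zero register (zip1/zip2), then compares the four
         // resulting 64-bit chunks against 32 bytes of the UTF-16 string loaded
         // through cnt1.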
4035 
4036     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4037     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4038     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4039     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4040 
4041     __ fmovd(tmpL, vtmp3);
4042     __ eor(rscratch2, tmp3, tmpL);
4043     __ cbnz(rscratch2, DIFF2);
4044 
4045     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4046     __ umov(tmpL, vtmp3, __ D, 1);
4047     __ eor(rscratch2, tmpU, tmpL);
4048     __ cbnz(rscratch2, DIFF1);
4049 
4050     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4051     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4052     __ fmovd(tmpL, vtmp);
4053     __ eor(rscratch2, tmp3, tmpL);
4054     __ cbnz(rscratch2, DIFF2);
4055 
4056     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4057     __ umov(tmpL, vtmp, __ D, 1);
4058     __ eor(rscratch2, tmpU, tmpL);
4059     __ cbnz(rscratch2, DIFF1);
4060   }
4061 
4062   // r0  = result
4063   // r1  = str1
4064   // r2  = cnt1
4065   // r3  = str2
4066   // r4  = cnt2
4067   // r10 = tmp1
4068   // r11 = tmp2
4069   address generate_compare_long_string_different_encoding(bool isLU) {
4070     __ align(CodeEntryAlignment);
4071     StubCodeMark mark(this, "StubRoutines", isLU
4072         ? "compare_long_string_different_encoding LU"
4073         : "compare_long_string_different_encoding UL");
4074     address entry = __ pc();
4075     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4076         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, SMALL_LOOP_ENTER,
4077         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4078     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4079         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4080     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4081     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4082 
4083     int prefetchLoopExitCondition = MAX(32, SoftwarePrefetchHintDistance/2);
4084 
4085     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4086     // cnt2 == number of characters left to compare
4087     // Check the first 4 symbols, which are already loaded (vtmp and tmp2(LU)/tmp1(UL))
4088     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4089     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4090     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4091     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4092     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4093     __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1);
4094     __ eor(rscratch2, tmp1, tmp2);
4095     __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0);
4096     __ mov(rscratch1, tmp2);
4097     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4098     Register strU = isLU ? str2 : str1,
4099              strL = isLU ? str1 : str2,
4100              tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4101              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4102     __ push(spilled_regs, sp);
4103     __ sub(tmp2, strL, cnt2); // strL pointer to load from
4104     __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from
4105 
4106     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4107 
4108     if (SoftwarePrefetchHintDistance >= 0) {
4109       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4110       __ br(__ LT, SMALL_LOOP);
4111       __ bind(LARGE_LOOP_PREFETCH);
4112         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4113         __ mov(tmp4, 2);
4114         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4115         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4116           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4117           __ subs(tmp4, tmp4, 1);
4118           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4119           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4120           __ mov(tmp4, 2);
4121         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4122           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4123           __ subs(tmp4, tmp4, 1);
4124           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4125           __ sub(cnt2, cnt2, 64);
4126           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4127           __ br(__ GE, LARGE_LOOP_PREFETCH);
4128     }
4129     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4130     __ subs(cnt2, cnt2, 16);
4131     __ br(__ LT, TAIL);
4132     __ b(SMALL_LOOP_ENTER);
4133     __ bind(SMALL_LOOP); // smaller loop
4134       __ subs(cnt2, cnt2, 16);
4135     __ bind(SMALL_LOOP_ENTER);
4136       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4137       __ br(__ GE, SMALL_LOOP);
4138       __ cbz(cnt2, LOAD_LAST);
4139     __ bind(TAIL); // 1..15 characters left
4140       __ subs(zr, cnt2, -8);
4141       __ br(__ GT, TAIL_LOAD_16);
4142       __ ldrd(vtmp, Address(tmp2));
4143       __ zip1(vtmp3, __ T8B, vtmp, vtmpZ);
4144 
4145       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4146       __ fmovd(tmpL, vtmp3);
4147       __ eor(rscratch2, tmp3, tmpL);
4148       __ cbnz(rscratch2, DIFF2);
4149       __ umov(tmpL, vtmp3, __ D, 1);
4150       __ eor(rscratch2, tmpU, tmpL);
4151       __ cbnz(rscratch2, DIFF1);
4152       __ b(LOAD_LAST);
4153     __ bind(TAIL_LOAD_16);
4154       __ ldrq(vtmp, Address(tmp2));
4155       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4156       __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4157       __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4158       __ fmovd(tmpL, vtmp3);
4159       __ eor(rscratch2, tmp3, tmpL);
4160       __ cbnz(rscratch2, DIFF2);
4161 
4162       __ ldr(tmp3, Address(__ post(cnt1, 8)));
4163       __ umov(tmpL, vtmp3, __ D, 1);
4164       __ eor(rscratch2, tmpU, tmpL);
4165       __ cbnz(rscratch2, DIFF1);
4166 
4167       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4168       __ fmovd(tmpL, vtmp);
4169       __ eor(rscratch2, tmp3, tmpL);
4170       __ cbnz(rscratch2, DIFF2);
4171 
4172       __ umov(tmpL, vtmp, __ D, 1);
4173       __ eor(rscratch2, tmpU, tmpL);
4174       __ cbnz(rscratch2, DIFF1);
4175       __ b(LOAD_LAST);
4176     __ bind(DIFF2);
4177       __ mov(tmpU, tmp3);
4178     __ bind(DIFF1);
4179       __ pop(spilled_regs, sp);
4180       __ b(CALCULATE_DIFFERENCE);
4181     __ bind(LOAD_LAST);
4182       __ pop(spilled_regs, sp);
4183 
4184       __ ldrs(vtmp, Address(strL));
4185       __ ldr(tmpU, Address(strU));
4186       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4187       __ fmovd(tmpL, vtmp);
4188 
4189       __ eor(rscratch2, tmpU, tmpL);
4190       __ cbz(rscratch2, DONE);
4191 
4192     // Find the first different characters in the longwords and
4193     // compute their difference.
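         // rev+clz finds the bit position of the lowest differing byte (the strings
         // sit little-endian in the registers); andr(..., -16) rounds it down to a
         // 16-bit character boundary, and lsrv/uxthw extract the two characters.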
4194     __ bind(CALCULATE_DIFFERENCE);
4195       __ rev(rscratch2, rscratch2);
4196       __ clz(rscratch2, rscratch2);
4197       __ andr(rscratch2, rscratch2, -16);
4198       __ lsrv(tmp1, tmp1, rscratch2);
4199       __ uxthw(tmp1, tmp1);
4200       __ lsrv(rscratch1, rscratch1, rscratch2);
4201       __ uxthw(rscratch1, rscratch1);
4202       __ subw(result, tmp1, rscratch1);
4203     __ bind(DONE);
4204       __ ret(lr);
4205     return entry;
4206   }
4207 
4208   // r0  = result
4209   // r1  = str1
4210   // r2  = cnt1
4211   // r3  = str2
4212   // r4  = cnt2
4213   // r10 = tmp1
4214   // r11 = tmp2
4215   address generate_compare_long_string_same_encoding(bool isLL) {
4216     __ align(CodeEntryAlignment);
4217     StubCodeMark mark(this, "StubRoutines", isLL
4218         ? "compare_long_string_same_encoding LL"
4219         : "compare_long_string_same_encoding UU");
4220     address entry = __ pc();
4221     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4222         tmp1 = r10, tmp2 = r11;
4223     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4224         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4225         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4226     // exit the large loop when fewer than 64 bytes are left to read or we are
4227     // about to prefetch memory past the end of the array
4228     int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4229     // cnt1/cnt2 contain the number of characters to compare; cnt1 can be re-used
4230     // update the cnt2 counter for the 8 bytes already loaded
4231     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4232     // update pointers, because of previous read
4233     __ add(str1, str1, wordSize);
4234     __ add(str2, str2, wordSize);
4235     if (SoftwarePrefetchHintDistance >= 0) {
4236       __ bind(LARGE_LOOP_PREFETCH);
4237         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4238         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4239         compare_string_16_bytes_same(DIFF, DIFF2);
4240         compare_string_16_bytes_same(DIFF, DIFF2);
4241         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4242         compare_string_16_bytes_same(DIFF, DIFF2);
4243         __ subs(rscratch2, cnt2, largeLoopExitCondition);
4244         compare_string_16_bytes_same(DIFF, DIFF2);
4245         __ br(__ GT, LARGE_LOOP_PREFETCH);
4246         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4247         // less than 16 bytes left?
4248         __ subs(cnt2, cnt2, isLL ? 16 : 8);
4249         __ br(__ LT, TAIL);
4250     }
4251     __ bind(SMALL_LOOP);
4252       compare_string_16_bytes_same(DIFF, DIFF2);
4253       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4254       __ br(__ GE, SMALL_LOOP);
4255     __ bind(TAIL);
4256       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4257       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4258       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4259       __ br(__ LE, CHECK_LAST);
4260       __ eor(rscratch2, tmp1, tmp2);
4261       __ cbnz(rscratch2, DIFF);
4262       __ ldr(tmp1, Address(__ post(str1, 8)));
4263       __ ldr(tmp2, Address(__ post(str2, 8)));
4264       __ sub(cnt2, cnt2, isLL ? 8 : 4);
4265     __ bind(CHECK_LAST);
4266       if (!isLL) {
4267         __ add(cnt2, cnt2, cnt2); // now in bytes
4268       }
4269       __ eor(rscratch2, tmp1, tmp2);
4270       __ cbnz(rscratch2, DIFF);
4271       __ ldr(rscratch1, Address(str1, cnt2));
4272       __ ldr(cnt1, Address(str2, cnt2));
4273       __ eor(rscratch2, rscratch1, cnt1);
4274       __ cbz(rscratch2, LENGTH_DIFF);
4275       // Find the first different characters in the longwords and
4276       // compute their difference.
4277     __ bind(DIFF2);
4278       __ rev(rscratch2, rscratch2);
4279       __ clz(rscratch2, rscratch2);
4280       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4281       __ lsrv(rscratch1, rscratch1, rscratch2);
4282       if (isLL) {
4283         __ lsrv(cnt1, cnt1, rscratch2);
4284         __ uxtbw(rscratch1, rscratch1);
4285         __ uxtbw(cnt1, cnt1);
4286       } else {
4287         __ lsrv(cnt1, cnt1, rscratch2);
4288         __ uxthw(rscratch1, rscratch1);
4289         __ uxthw(cnt1, cnt1);
4290       }
4291       __ subw(result, rscratch1, cnt1);
4292       __ b(LENGTH_DIFF);
4293     __ bind(DIFF);
4294       __ rev(rscratch2, rscratch2);
4295       __ clz(rscratch2, rscratch2);
4296       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4297       __ lsrv(tmp1, tmp1, rscratch2);
4298       if (isLL) {
4299         __ lsrv(tmp2, tmp2, rscratch2);
4300         __ uxtbw(tmp1, tmp1);
4301         __ uxtbw(tmp2, tmp2);
4302       } else {
4303         __ lsrv(tmp2, tmp2, rscratch2);
4304         __ uxthw(tmp1, tmp1);
4305         __ uxthw(tmp2, tmp2);
4306       }
4307       __ subw(result, tmp1, tmp2);
4308       __ b(LENGTH_DIFF);
4309     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4310       __ eor(rscratch2, tmp1, tmp2);
4311       __ cbnz(rscratch2, DIFF);
4312     __ bind(LENGTH_DIFF);
4313       __ ret(lr);
4314     return entry;
4315   }
4316 
4317   void generate_compare_long_strings() {
4318       StubRoutines::aarch64::_compare_long_string_LL
4319           = generate_compare_long_string_same_encoding(true);
4320       StubRoutines::aarch64::_compare_long_string_UU
4321           = generate_compare_long_string_same_encoding(false);
4322       StubRoutines::aarch64::_compare_long_string_LU
4323           = generate_compare_long_string_different_encoding(true);
4324       StubRoutines::aarch64::_compare_long_string_UL
4325           = generate_compare_long_string_different_encoding(false);
4326   }
4327 
4328   // R0 = result
4329   // R1 = str2
4330   // R2 = cnt1
4331   // R3 = str1
4332   // R4 = cnt2
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the first register of the pattern (since
  // length >= 8) in order to skip its initial loading (this helps on systems
  // with a single load pipeline)
  // 2) we can use a "fast" algorithm for finding the first occurrence of the
  // search character, with one branch per loaded register instead of one
  // branch per symbol; this is where constants like 0x0101...01,
  // 0x00010001...0001, 0x7f7f...7f and 0x7fff7fff...7fff come from (a rough C
  // model of this idea follows below)
  // 3) after loading and analyzing the first register of the source string,
  // it can be used to search for every occurrence of the first character,
  // saving a few loads compared with a simpler-but-slower implementation
  // 4) in order to avoid lots of push/pop operations, the code below heavily
  // re-uses/re-initializes/compresses register values, which makes the code
  // larger and a bit less readable; however, most of the extra operations are
  // issued during loads or branches, so the penalty is minimal
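  //
  // A rough C model of idea 2 for the LL case (illustration only): a byte of
  //   v = loaded ^ pattern_of_first_char
  // is zero exactly where a byte of the loaded word matches the first pattern
  // character, and
  //   uint64_t hit = (v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL;
  // sets the top bit of each matching byte. The code below computes the same
  // value with sub/orr/bics and then locates the first match with rbit + clz.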
4347   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4348     const char* stubName = str1_isL
4349         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4350         : "indexof_linear_uu";
4351     __ align(CodeEntryAlignment);
4352     StubCodeMark mark(this, "StubRoutines", stubName);
4353     address entry = __ pc();
4354 
4355     int str1_chr_size = str1_isL ? 1 : 2;
4356     int str2_chr_size = str2_isL ? 1 : 2;
4357     int str1_chr_shift = str1_isL ? 0 : 1;
4358     int str2_chr_shift = str2_isL ? 0 : 1;
4359     bool isL = str1_isL && str2_isL;
    // parameters
4361     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4362     // temporary registers
4363     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4364     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4365     // redefinitions
4366     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4367 
4368     __ push(spilled_regs, sp);
4369     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, L_SMALL_MATCH_LOOP,
4370         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4371         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4372         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4373         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4374         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
    // Read a whole register from str1. This is safe because length >= 8 here
4376     __ ldr(ch1, Address(str1));
    // Read a whole register from str2. This is safe because length >= 8 here
4378     __ ldr(ch2, Address(str2));
4379     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4380     if (str1_isL != str2_isL) {
4381       __ eor(v0, __ T16B, v0, v0);
4382     }
4383     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4384     __ mul(first, first, tmp1);
    // check whether we have less than one full register of characters left to check
4386     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4387     if (str1_isL != str2_isL) {
4388       __ fmovd(v1, ch1);
4389     }
4390     __ br(__ LE, L_SMALL);
4391     __ eor(ch2, first, ch2);
4392     if (str1_isL != str2_isL) {
4393       __ zip1(v1, __ T16B, v1, v0);
4394     }
4395     __ sub(tmp2, ch2, tmp1);
4396     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4397     __ bics(tmp2, tmp2, ch2);
4398     if (str1_isL != str2_isL) {
4399       __ fmovd(ch1, v1);
4400     }
4401     __ br(__ NE, L_HAS_ZERO);
4402     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4403     __ add(result, result, wordSize/str2_chr_size);
4404     __ add(str2, str2, wordSize);
4405     __ br(__ LT, L_POST_LOOP);
4406     __ BIND(L_LOOP);
4407       __ ldr(ch2, Address(str2));
4408       __ eor(ch2, first, ch2);
4409       __ sub(tmp2, ch2, tmp1);
4410       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4411       __ bics(tmp2, tmp2, ch2);
4412       __ br(__ NE, L_HAS_ZERO);
4413     __ BIND(L_LOOP_PROCEED);
4414       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4415       __ add(str2, str2, wordSize);
4416       __ add(result, result, wordSize/str2_chr_size);
4417       __ br(__ GE, L_LOOP);
4418     __ BIND(L_POST_LOOP);
4419       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
4420       __ br(__ LE, NOMATCH);
4421       __ ldr(ch2, Address(str2));
4422       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4423       __ eor(ch2, first, ch2);
4424       __ sub(tmp2, ch2, tmp1);
4425       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4426       __ mov(tmp4, -1); // all bits set
4427       __ b(L_SMALL_PROCEED);
4428     __ align(OptoLoopAlignment);
4429     __ BIND(L_SMALL);
4430       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4431       __ eor(ch2, first, ch2);
4432       if (str1_isL != str2_isL) {
4433         __ zip1(v1, __ T16B, v1, v0);
4434       }
4435       __ sub(tmp2, ch2, tmp1);
4436       __ mov(tmp4, -1); // all bits set
4437       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4438       if (str1_isL != str2_isL) {
4439         __ fmovd(ch1, v1); // move converted 4 symbols
4440       }
4441     __ BIND(L_SMALL_PROCEED);
      __ lsrv(tmp4, tmp4, cnt2); // mask: zeroes in the useless bits
4443       __ bic(tmp2, tmp2, ch2);
4444       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
4445       __ rbit(tmp2, tmp2);
4446       __ br(__ EQ, NOMATCH);
4447     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
4449       __ cmp(cnt1, u1(wordSize/str2_chr_size));
4450       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4451       if (str2_isL) { // LL
4452         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4453         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4454         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4455         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4456         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4457       } else {
        __ mov(ch2, 0xE); // 0b1110: keeps the byte offset char-aligned (clears bit 0)
4459         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4460         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4461         __ lslv(tmp2, tmp2, tmp4);
4462         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4463         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4464         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4465         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4466       }
4467       __ cmp(ch1, ch2);
4468       __ mov(tmp4, wordSize/str2_chr_size);
4469       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4470     __ BIND(L_SMALL_CMP_LOOP);
4471       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4472                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4473       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4474                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4475       __ add(tmp4, tmp4, 1);
4476       __ cmp(tmp4, cnt1);
4477       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4478       __ cmp(first, ch2);
4479       __ br(__ EQ, L_SMALL_CMP_LOOP);
4480     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4481       __ cbz(tmp2, NOMATCH); // no more matches. exit
4482       __ clz(tmp4, tmp2);
4483       __ add(result, result, 1); // advance index
4484       __ add(str2, str2, str2_chr_size); // advance pointer
4485       __ b(L_SMALL_HAS_ZERO_LOOP);
4486     __ align(OptoLoopAlignment);
4487     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4488       __ cmp(first, ch2);
4489       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4490       __ b(DONE);
4491     __ align(OptoLoopAlignment);
4492     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4493       if (str2_isL) { // LL
4494         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4495         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4496         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4497         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4498         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4499       } else {
        __ mov(ch2, 0xE); // 0b1110: keeps the byte offset char-aligned (clears bit 0)
4501         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4502         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4503         __ lslv(tmp2, tmp2, tmp4);
4504         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4505         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4506         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4507         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4508       }
4509       __ cmp(ch1, ch2);
4510       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4511       __ b(DONE);
4512     __ align(OptoLoopAlignment);
4513     __ BIND(L_HAS_ZERO);
4514       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
      // Now compress the counters (cnt2 and cnt1) into one register: cnt1 is
      // placed in the upper 32 bits of cnt2. This is fine because both
      // counters are 32-bit and are not changed in this loop; they are
      // restored on exit, so cnt1 can be re-used inside this loop.
4519       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
4520       __ sub(result, result, 1);
4521     __ BIND(L_HAS_ZERO_LOOP);
4522       __ mov(cnt1, wordSize/str2_chr_size);
4523       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4524       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4525       if (str2_isL) {
4526         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4527         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4528         __ lslv(tmp2, tmp2, tmp4);
4529         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4530         __ add(tmp4, tmp4, 1);
4531         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4532         __ lsl(tmp2, tmp2, 1);
4533         __ mov(tmp4, wordSize/str2_chr_size);
4534       } else {
4535         __ mov(ch2, 0xE);
4536         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4537         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4538         __ lslv(tmp2, tmp2, tmp4);
4539         __ add(tmp4, tmp4, 1);
4540         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4541         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4542         __ lsl(tmp2, tmp2, 1);
4543         __ mov(tmp4, wordSize/str2_chr_size);
4544         __ sub(str2, str2, str2_chr_size);
4545       }
4546       __ cmp(ch1, ch2);
4547       __ mov(tmp4, wordSize/str2_chr_size);
4548       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4549     __ BIND(L_CMP_LOOP);
4550       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4551                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4552       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4553                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4554       __ add(tmp4, tmp4, 1);
4555       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4556       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4557       __ cmp(cnt1, ch2);
4558       __ br(__ EQ, L_CMP_LOOP);
4559     __ BIND(L_CMP_LOOP_NOMATCH);
      // we did not match here
4561       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4562       __ clz(tmp4, tmp2);
4563       __ add(str2, str2, str2_chr_size); // advance pointer
4564       __ b(L_HAS_ZERO_LOOP);
4565     __ align(OptoLoopAlignment);
4566     __ BIND(L_CMP_LOOP_LAST_CMP);
4567       __ cmp(cnt1, ch2);
4568       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4569       __ b(DONE);
4570     __ align(OptoLoopAlignment);
4571     __ BIND(L_CMP_LOOP_LAST_CMP2);
4572       if (str2_isL) {
4573         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4574         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4575         __ lslv(tmp2, tmp2, tmp4);
4576         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4577         __ add(tmp4, tmp4, 1);
4578         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4579         __ lsl(tmp2, tmp2, 1);
4580       } else {
4581         __ mov(ch2, 0xE);
4582         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4583         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4584         __ lslv(tmp2, tmp2, tmp4);
4585         __ add(tmp4, tmp4, 1);
4586         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4587         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4588         __ lsl(tmp2, tmp2, 1);
4589         __ sub(str2, str2, str2_chr_size);
4590       }
4591       __ cmp(ch1, ch2);
4592       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4593       __ b(DONE);
4594     __ align(OptoLoopAlignment);
4595     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
      // until the L_HAS_ZERO block. One byte octet was analyzed in
      // L_HAS_ZERO_LOOP, so result was increased by at most
      // wordSize/str2_chr_size - 1 and its higher bits were not changed.
      // L_LOOP_PROCEED will increase result by the number of analyzed
      // characters, so we can just reset the lower bits of result here
      // (clear the 2 lower bits for UU/UL and 3 bits for LL).
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (LL) or result & 3
      // (UU/UL) is the index of the last analyzed substring inside the current
      // octet, so str2 is at the respective start address; we need to advance
      // it to the next octet.
4606       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4607       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4608       __ bfm(result, zr, 0, 2 - str2_chr_shift);
4609       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4610       __ movw(cnt2, cnt2);
4611       __ b(L_LOOP_PROCEED);
4612     __ align(OptoLoopAlignment);
4613     __ BIND(NOMATCH);
4614       __ mov(result, -1);
4615     __ BIND(DONE);
4616       __ pop(spilled_regs, sp);
4617       __ ret(lr);
4618     return entry;
4619   }
4620 
4621   void generate_string_indexof_stubs() {
4622     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
4623     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
4624     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
4625   }
4626 
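  // Inflate 2 * 16 Latin-1 bytes from src1/src2 into 2 * 32 bytes of UTF-16
  // characters by interleaving them with the zero bytes held in v0
  // (zip1/zip2), then store all 64 bytes through r1 with a single st1,
  // optionally prefetching ahead of the destination first.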
4627   void inflate_and_store_2_fp_registers(bool generatePrfm,
4628       FloatRegister src1, FloatRegister src2) {
4629     Register dst = r1;
4630     __ zip1(v1, __ T16B, src1, v0);
4631     __ zip2(v2, __ T16B, src1, v0);
4632     if (generatePrfm) {
4633       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4634     }
4635     __ zip1(v3, __ T16B, src2, v0);
4636     __ zip2(v4, __ T16B, src2, v0);
4637     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4638   }
4639 
4640   // R0 = src
4641   // R1 = dst
4642   // R2 = len
4643   // R3 = len >> 3
  // v0 = 0
4645   // v1 = loaded 8 bytes
4646   address generate_large_byte_array_inflate() {
4647     __ align(CodeEntryAlignment);
4648     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4649     address entry = __ pc();
4650     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4651     Register src = r0, dst = r1, len = r2, octetCounter = r3;
4652     const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
4653 
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases and a single store instruction can be used.
4656     __ ldrd(v2, __ post(src, 8));
4657     __ sub(octetCounter, octetCounter, 2);
4658     __ zip1(v1, __ T16B, v1, v0);
4659     __ zip1(v2, __ T16B, v2, v0);
4660     __ st1(v1, v2, __ T16B, __ post(dst, 32));
4661     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4662     __ subs(rscratch1, octetCounter, large_loop_threshold);
4663     __ br(__ LE, LOOP_START);
4664     __ b(LOOP_PRFM_START);
4665     __ bind(LOOP_PRFM);
4666       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4667     __ bind(LOOP_PRFM_START);
4668       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4669       __ sub(octetCounter, octetCounter, 8);
4670       __ subs(rscratch1, octetCounter, large_loop_threshold);
4671       inflate_and_store_2_fp_registers(true, v3, v4);
4672       inflate_and_store_2_fp_registers(true, v5, v6);
4673       __ br(__ GT, LOOP_PRFM);
4674       __ cmp(octetCounter, (u1)8);
4675       __ br(__ LT, DONE);
4676     __ bind(LOOP);
4677       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4678       __ bind(LOOP_START);
4679       __ sub(octetCounter, octetCounter, 8);
4680       __ cmp(octetCounter, (u1)8);
4681       inflate_and_store_2_fp_registers(false, v3, v4);
4682       inflate_and_store_2_fp_registers(false, v5, v6);
4683       __ br(__ GE, LOOP);
4684     __ bind(DONE);
4685       __ ret(lr);
4686     return entry;
4687   }
4688 
4689   /**
4690    *  Arguments:
4691    *
4692    *  Input:
4693    *  c_rarg0   - current state address
4694    *  c_rarg1   - H key address
4695    *  c_rarg2   - data address
4696    *  c_rarg3   - number of blocks
4697    *
4698    *  Output:
4699    *  Updated state at c_rarg0
4700    */
4701   address generate_ghash_processBlocks() {
4702     // Bafflingly, GCM uses little-endian for the byte order, but
4703     // big-endian for the bit order.  For example, the polynomial 1 is
4704     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4705     //
4706     // So, we must either reverse the bytes in each word and do
4707     // everything big-endian or reverse the bits in each byte and do
4708     // it little-endian.  On AArch64 it's more idiomatic to reverse
4709     // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order throughout the
4711     // calculation, bit-reversing the inputs and outputs.
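    // For example, the polynomial 1 above is the byte 0x80 followed by
    // fifteen zero bytes; RBIT maps 0x80 to 0x01, which is the natural
    // little-endian encoding of 1.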
4712 
4713     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4714     __ align(wordSize * 2);
4715     address p = __ pc();
4716     __ emit_int64(0x87);  // The low-order bits of the field
4717                           // polynomial (i.e. p = z^7+z^2+z+1)
4718                           // repeated in the low and high parts of a
4719                           // 128-bit vector
4720     __ emit_int64(0x87);
4721 
4722     __ align(CodeEntryAlignment);
4723     address start = __ pc();
4724 
4725     Register state   = c_rarg0;
4726     Register subkeyH = c_rarg1;
4727     Register data    = c_rarg2;
4728     Register blocks  = c_rarg3;
4729 
4730     FloatRegister vzr = v30;
4731     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4732 
4733     __ ldrq(v0, Address(state));
4734     __ ldrq(v1, Address(subkeyH));
4735 
4736     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4737     __ rbit(v0, __ T16B, v0);
4738     __ rev64(v1, __ T16B, v1);
4739     __ rbit(v1, __ T16B, v1);
4740 
4741     __ ldrq(v26, p);
4742 
4743     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4744     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4745 
4746     {
4747       Label L_ghash_loop;
4748       __ bind(L_ghash_loop);
4749 
4750       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4751                                                  // reversing each byte
4752       __ rbit(v2, __ T16B, v2);
4753       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4754 
4755       // Multiply state in v2 by subkey in v1
4756       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4757                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4758                      /*temps*/v6, v20, v18, v21);
4759       // Reduce v7:v5 by the field polynomial
4760       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4761 
4762       __ sub(blocks, blocks, 1);
4763       __ cbnz(blocks, L_ghash_loop);
4764     }
4765 
4766     // The bit-reversed result is at this point in v0
4767     __ rev64(v1, __ T16B, v0);
4768     __ rbit(v1, __ T16B, v1);
4769 
4770     __ st1(v1, __ T16B, state);
4771     __ ret(lr);
4772 
4773     return start;
4774   }
4775 
4776   // Continuation point for throwing of implicit exceptions that are
4777   // not handled in the current activation. Fabricates an exception
4778   // oop and initiates normal exception dispatching in this
4779   // frame. Since we need to preserve callee-saved values (currently
4780   // only for C2, but done for C1 as well) we need a callee-saved oop
4781   // map and therefore have to make these stubs into RuntimeStubs
4782   // rather than BufferBlobs.  If the compiler needs all registers to
4783   // be preserved between the fault point and the exception handler
4784   // then it must assume responsibility for that in
4785   // AbstractCompiler::continuation_for_implicit_null_exception or
4786   // continuation_for_implicit_division_by_zero_exception. All other
4787   // implicit exceptions (e.g., NullPointerException or
4788   // AbstractMethodError on entry) are either at call sites or
4789   // otherwise assume that stack unwinding will be initiated, so
4790   // caller saved registers were assumed volatile in the compiler.
4791 
4792 #undef __
4793 #define __ masm->
4794 
4795   address generate_throw_exception(const char* name,
4796                                    address runtime_entry,
4797                                    Register arg1 = noreg,
4798                                    Register arg2 = noreg) {
4799     // Information about frame layout at time of blocking runtime call.
4800     // Note that we only have to preserve callee-saved registers since
4801     // the compilers are responsible for supplying a continuation point
4802     // if they expect all registers to be preserved.
4803     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4804     enum layout {
4805       rfp_off = 0,
4806       rfp_off2,
4807       return_off,
4808       return_off2,
4809       framesize // inclusive of return address
4810     };
4811 
4812     int insts_size = 512;
4813     int locs_size  = 64;
4814 
4815     CodeBuffer code(name, insts_size, locs_size);
4816     OopMapSet* oop_maps  = new OopMapSet();
4817     MacroAssembler* masm = new MacroAssembler(&code);
4818 
4819     address start = __ pc();
4820 
4821     // This is an inlined and slightly modified version of call_VM
4822     // which has the ability to fetch the return PC out of
4823     // thread-local storage and also sets up last_Java_sp slightly
4824     // differently than the real call_VM
4825 
4826     __ enter(); // Save FP and LR before call
4827 
4828     assert(is_even(framesize/2), "sp not 16-byte aligned");
4829 
4830     // lr and fp are already in place
4831     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4832 
4833     int frame_complete = __ pc() - start;
4834 
4835     // Set up last_Java_sp and last_Java_fp
4836     address the_pc = __ pc();
4837     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
4838 
4839     // Call runtime
4840     if (arg1 != noreg) {
4841       assert(arg2 != c_rarg1, "clobbered");
4842       __ mov(c_rarg1, arg1);
4843     }
4844     if (arg2 != noreg) {
4845       __ mov(c_rarg2, arg2);
4846     }
4847     __ mov(c_rarg0, rthread);
4848     BLOCK_COMMENT("call runtime_entry");
4849     __ mov(rscratch1, runtime_entry);
4850     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
4851 
4852     // Generate oop map
4853     OopMap* map = new OopMap(framesize, 0);
4854 
4855     oop_maps->add_gc_map(the_pc - start, map);
4856 
4857     __ reset_last_Java_frame(true);
4858     __ maybe_isb();
4859 
4860     __ leave();
4861 
4862     // check for pending exceptions
4863 #ifdef ASSERT
4864     Label L;
4865     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4866     __ cbnz(rscratch1, L);
4867     __ should_not_reach_here();
4868     __ bind(L);
4869 #endif // ASSERT
4870     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4871 
4872 
4873     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4874     RuntimeStub* stub =
4875       RuntimeStub::new_runtime_stub(name,
4876                                     &code,
4877                                     frame_complete,
4878                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4879                                     oop_maps, false);
4880     return stub->entry_point();
4881   }
4882 
4883   class MontgomeryMultiplyGenerator : public MacroAssembler {
4884 
4885     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4886       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4887 
4888     RegSet _toSave;
4889     bool _squaring;
4890 
4891   public:
4892     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4893       : MacroAssembler(as->code()), _squaring(squaring) {
4894 
4895       // Register allocation
4896 
4897       Register reg = c_rarg0;
4898       Pa_base = reg;       // Argument registers
4899       if (squaring)
4900         Pb_base = Pa_base;
4901       else
4902         Pb_base = ++reg;
4903       Pn_base = ++reg;
4904       Rlen= ++reg;
4905       inv = ++reg;
4906       Pm_base = ++reg;
4907 
4908                           // Working registers:
4909       Ra =  ++reg;        // The current digit of a, b, n, and m.
4910       Rb =  ++reg;
4911       Rm =  ++reg;
4912       Rn =  ++reg;
4913 
4914       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4915       Pb =  ++reg;
4916       Pm =  ++reg;
4917       Pn =  ++reg;
4918 
4919       t0 =  ++reg;        // Three registers which form a
      t1 =  ++reg;        // triple-precision accumulator.
4921       t2 =  ++reg;
4922 
4923       Ri =  ++reg;        // Inner and outer loop indexes.
4924       Rj =  ++reg;
4925 
4926       Rhi_ab = ++reg;     // Product registers: low and high parts
4927       Rlo_ab = ++reg;     // of a*b and m*n.
4928       Rhi_mn = ++reg;
4929       Rlo_mn = ++reg;
4930 
4931       // r19 and up are callee-saved.
4932       _toSave = RegSet::range(r19, reg) + Pm_base;
4933     }
4934 
4935   private:
4936     void save_regs() {
4937       push(_toSave, sp);
4938     }
4939 
4940     void restore_regs() {
4941       pop(_toSave, sp);
4942     }
4943 
4944     template <typename T>
4945     void unroll_2(Register count, T block) {
4946       Label loop, end, odd;
4947       tbnz(count, 0, odd);
4948       cbz(count, end);
4949       align(16);
4950       bind(loop);
4951       (this->*block)();
4952       bind(odd);
4953       (this->*block)();
4954       subs(count, count, 2);
4955       br(Assembler::GT, loop);
4956       bind(end);
4957     }
4958 
4959     template <typename T>
4960     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4961       Label loop, end, odd;
4962       tbnz(count, 0, odd);
4963       cbz(count, end);
4964       align(16);
4965       bind(loop);
4966       (this->*block)(d, s, tmp);
4967       bind(odd);
4968       (this->*block)(d, s, tmp);
4969       subs(count, count, 2);
4970       br(Assembler::GT, loop);
4971       bind(end);
4972     }
4973 
4974     void pre1(RegisterOrConstant i) {
4975       block_comment("pre1");
4976       // Pa = Pa_base;
4977       // Pb = Pb_base + i;
4978       // Pm = Pm_base;
4979       // Pn = Pn_base + i;
4980       // Ra = *Pa;
4981       // Rb = *Pb;
4982       // Rm = *Pm;
4983       // Rn = *Pn;
4984       ldr(Ra, Address(Pa_base));
4985       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4986       ldr(Rm, Address(Pm_base));
4987       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4988       lea(Pa, Address(Pa_base));
4989       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4990       lea(Pm, Address(Pm_base));
4991       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4992 
4993       // Zero the m*n result.
4994       mov(Rhi_mn, zr);
4995       mov(Rlo_mn, zr);
4996     }
4997 
4998     // The core multiply-accumulate step of a Montgomery
4999     // multiplication.  The idea is to schedule operations as a
5000     // pipeline so that instructions with long latencies (loads and
5001     // multiplies) have time to complete before their results are
5002     // used.  This most benefits in-order implementations of the
5003     // architecture but out-of-order ones also benefit.
5004     void step() {
5005       block_comment("step");
5006       // MACC(Ra, Rb, t0, t1, t2);
5007       // Ra = *++Pa;
5008       // Rb = *--Pb;
5009       umulh(Rhi_ab, Ra, Rb);
5010       mul(Rlo_ab, Ra, Rb);
5011       ldr(Ra, pre(Pa, wordSize));
5012       ldr(Rb, pre(Pb, -wordSize));
5013       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
5014                                        // previous iteration.
5015       // MACC(Rm, Rn, t0, t1, t2);
5016       // Rm = *++Pm;
5017       // Rn = *--Pn;
5018       umulh(Rhi_mn, Rm, Rn);
5019       mul(Rlo_mn, Rm, Rn);
5020       ldr(Rm, pre(Pm, wordSize));
5021       ldr(Rn, pre(Pn, -wordSize));
5022       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5023     }
5024 
5025     void post1() {
5026       block_comment("post1");
5027 
5028       // MACC(Ra, Rb, t0, t1, t2);
5029       // Ra = *++Pa;
5030       // Rb = *--Pb;
5031       umulh(Rhi_ab, Ra, Rb);
5032       mul(Rlo_ab, Ra, Rb);
5033       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5034       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5035 
5036       // *Pm = Rm = t0 * inv;
5037       mul(Rm, t0, inv);
5038       str(Rm, Address(Pm));
5039 
5040       // MACC(Rm, Rn, t0, t1, t2);
5041       // t0 = t1; t1 = t2; t2 = 0;
5042       umulh(Rhi_mn, Rm, Rn);
5043 
5044 #ifndef PRODUCT
5045       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5046       {
5047         mul(Rlo_mn, Rm, Rn);
5048         add(Rlo_mn, t0, Rlo_mn);
5049         Label ok;
5050         cbz(Rlo_mn, ok); {
5051           stop("broken Montgomery multiply");
5052         } bind(ok);
5053       }
5054 #endif
5055       // We have very carefully set things up so that
5056       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5057       // the lower half of Rm * Rn because we know the result already:
5058       // it must be -t0.  t0 + (-t0) must generate a carry iff
5059       // t0 != 0.  So, rather than do a mul and an adds we just set
5060       // the carry flag iff t0 is nonzero.
5061       //
5062       // mul(Rlo_mn, Rm, Rn);
5063       // adds(zr, t0, Rlo_mn);
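      // (For example, if t0 == 5 then the low half of Rm * Rn is 2^64 - 5,
      // and 5 + (2^64 - 5) wraps to zero and sets the carry, exactly as
      // "subs zr, t0, 1" does for any nonzero t0.)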
5064       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5065       adcs(t0, t1, Rhi_mn);
5066       adc(t1, t2, zr);
5067       mov(t2, zr);
5068     }
5069 
5070     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
5071       block_comment("pre2");
5072       // Pa = Pa_base + i-len;
5073       // Pb = Pb_base + len;
5074       // Pm = Pm_base + i-len;
5075       // Pn = Pn_base + len;
5076 
5077       if (i.is_register()) {
5078         sub(Rj, i.as_register(), len);
5079       } else {
5080         mov(Rj, i.as_constant());
5081         sub(Rj, Rj, len);
5082       }
5083       // Rj == i-len
5084 
5085       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5086       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5087       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5088       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5089 
5090       // Ra = *++Pa;
5091       // Rb = *--Pb;
5092       // Rm = *++Pm;
5093       // Rn = *--Pn;
5094       ldr(Ra, pre(Pa, wordSize));
5095       ldr(Rb, pre(Pb, -wordSize));
5096       ldr(Rm, pre(Pm, wordSize));
5097       ldr(Rn, pre(Pn, -wordSize));
5098 
5099       mov(Rhi_mn, zr);
5100       mov(Rlo_mn, zr);
5101     }
5102 
5103     void post2(RegisterOrConstant i, RegisterOrConstant len) {
5104       block_comment("post2");
5105       if (i.is_constant()) {
5106         mov(Rj, i.as_constant()-len.as_constant());
5107       } else {
5108         sub(Rj, i.as_register(), len);
5109       }
5110 
5111       adds(t0, t0, Rlo_mn); // The pending m*n, low part
5112 
5113       // As soon as we know the least significant digit of our result,
5114       // store it.
5115       // Pm_base[i-len] = t0;
5116       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5117 
5118       // t0 = t1; t1 = t2; t2 = 0;
5119       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5120       adc(t1, t2, zr);
5121       mov(t2, zr);
5122     }
5123 
5124     // A carry in t0 after Montgomery multiplication means that we
5125     // should subtract multiples of n from our result in m.  We'll
5126     // keep doing that until there is no carry.
5127     void normalize(RegisterOrConstant len) {
5128       block_comment("normalize");
5129       // while (t0)
5130       //   t0 = sub(Pm_base, Pn_base, t0, len);
5131       Label loop, post, again;
5132       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5133       cbz(t0, post); {
5134         bind(again); {
5135           mov(i, zr);
5136           mov(cnt, len);
5137           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5138           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5139           subs(zr, zr, zr); // set carry flag, i.e. no borrow
5140           align(16);
5141           bind(loop); {
5142             sbcs(Rm, Rm, Rn);
5143             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5144             add(i, i, 1);
5145             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5146             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5147             sub(cnt, cnt, 1);
5148           } cbnz(cnt, loop);
5149           sbc(t0, t0, zr);
5150         } cbnz(t0, again);
5151       } bind(post);
5152     }
5153 
5154     // Move memory at s to d, reversing words.
5155     //    Increments d to end of copied memory
5156     //    Destroys tmp1, tmp2
5157     //    Preserves len
5158     //    Leaves s pointing to the address which was in d at start
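    //    Net effect: the order of the 32-bit array elements is reversed
    //    (each 64-bit word is copied from the end of s forward and rotated
    //    by 32 bits, swapping its two halves).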
5159     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
5160       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5161 
5162       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
5163       mov(tmp1, len);
5164       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
5165       sub(s, d, len, ext::uxtw, LogBytesPerWord);
5166     }
5167     // where
5168     void reverse1(Register d, Register s, Register tmp) {
5169       ldr(tmp, pre(s, -wordSize));
5170       ror(tmp, tmp, 32);
5171       str(tmp, post(d, wordSize));
5172     }
5173 
5174     void step_squaring() {
5175       // An extra ACC
5176       step();
5177       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5178     }
5179 
5180     void last_squaring(RegisterOrConstant i) {
5181       Label dont;
5182       // if ((i & 1) == 0) {
5183       tbnz(i.as_register(), 0, dont); {
5184         // MACC(Ra, Rb, t0, t1, t2);
5185         // Ra = *++Pa;
5186         // Rb = *--Pb;
5187         umulh(Rhi_ab, Ra, Rb);
5188         mul(Rlo_ab, Ra, Rb);
5189         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5190       } bind(dont);
5191     }
5192 
5193     void extra_step_squaring() {
5194       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5195 
5196       // MACC(Rm, Rn, t0, t1, t2);
5197       // Rm = *++Pm;
5198       // Rn = *--Pn;
5199       umulh(Rhi_mn, Rm, Rn);
5200       mul(Rlo_mn, Rm, Rn);
5201       ldr(Rm, pre(Pm, wordSize));
5202       ldr(Rn, pre(Pn, -wordSize));
5203     }
5204 
5205     void post1_squaring() {
5206       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5207 
5208       // *Pm = Rm = t0 * inv;
5209       mul(Rm, t0, inv);
5210       str(Rm, Address(Pm));
5211 
5212       // MACC(Rm, Rn, t0, t1, t2);
5213       // t0 = t1; t1 = t2; t2 = 0;
5214       umulh(Rhi_mn, Rm, Rn);
5215 
5216 #ifndef PRODUCT
5217       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5218       {
5219         mul(Rlo_mn, Rm, Rn);
5220         add(Rlo_mn, t0, Rlo_mn);
5221         Label ok;
5222         cbz(Rlo_mn, ok); {
5223           stop("broken Montgomery multiply");
5224         } bind(ok);
5225       }
5226 #endif
5227       // We have very carefully set things up so that
5228       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5229       // the lower half of Rm * Rn because we know the result already:
5230       // it must be -t0.  t0 + (-t0) must generate a carry iff
5231       // t0 != 0.  So, rather than do a mul and an adds we just set
5232       // the carry flag iff t0 is nonzero.
5233       //
5234       // mul(Rlo_mn, Rm, Rn);
5235       // adds(zr, t0, Rlo_mn);
5236       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5237       adcs(t0, t1, Rhi_mn);
5238       adc(t1, t2, zr);
5239       mov(t2, zr);
5240     }
5241 
5242     void acc(Register Rhi, Register Rlo,
5243              Register t0, Register t1, Register t2) {
5244       adds(t0, t0, Rlo);
5245       adcs(t1, t1, Rhi);
5246       adc(t2, t2, zr);
5247     }
5248 
5249   public:
5250     /**
5251      * Fast Montgomery multiplication.  The derivation of the
5252      * algorithm is in A Cryptographic Library for the Motorola
5253      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5254      *
5255      * Arguments:
5256      *
5257      * Inputs for multiplication:
5258      *   c_rarg0   - int array elements a
5259      *   c_rarg1   - int array elements b
5260      *   c_rarg2   - int array elements n (the modulus)
5261      *   c_rarg3   - int length
5262      *   c_rarg4   - int inv
5263      *   c_rarg5   - int array elements m (the result)
5264      *
5265      * Inputs for squaring:
5266      *   c_rarg0   - int array elements a
5267      *   c_rarg1   - int array elements n (the modulus)
5268      *   c_rarg2   - int length
5269      *   c_rarg3   - int inv
5270      *   c_rarg4   - int array elements m (the result)
5271      *
5272      */
5273     address generate_multiply() {
5274       Label argh, nothing;
5275       bind(argh);
5276       stop("MontgomeryMultiply total_allocation must be <= 8192");
5277 
5278       align(CodeEntryAlignment);
5279       address entry = pc();
5280 
5281       cbzw(Rlen, nothing);
5282 
5283       enter();
5284 
5285       // Make room.
5286       cmpw(Rlen, 512);
5287       br(Assembler::HI, argh);
5288       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5289       andr(sp, Ra, -2 * wordSize);
5290 
5291       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5292 
5293       {
5294         // Copy input args, reversing as we go.  We use Ra as a
5295         // temporary variable.
5296         reverse(Ra, Pa_base, Rlen, t0, t1);
5297         if (!_squaring)
5298           reverse(Ra, Pb_base, Rlen, t0, t1);
5299         reverse(Ra, Pn_base, Rlen, t0, t1);
5300       }
5301 
      // Push all callee-saved registers and also Pm_base, which we'll need
      // at the end.
5304       save_regs();
5305 
5306 #ifndef PRODUCT
5307       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
5308       {
5309         ldr(Rn, Address(Pn_base, 0));
5310         mul(Rlo_mn, Rn, inv);
5311         subs(zr, Rlo_mn, -1);
5312         Label ok;
5313         br(EQ, ok); {
5314           stop("broken inverse in Montgomery multiply");
5315         } bind(ok);
5316       }
5317 #endif
5318 
5319       mov(Pm_base, Ra);
5320 
5321       mov(t0, zr);
5322       mov(t1, zr);
5323       mov(t2, zr);
5324 
5325       block_comment("for (int i = 0; i < len; i++) {");
5326       mov(Ri, zr); {
5327         Label loop, end;
5328         cmpw(Ri, Rlen);
5329         br(Assembler::GE, end);
5330 
5331         bind(loop);
5332         pre1(Ri);
5333 
5334         block_comment("  for (j = i; j; j--) {"); {
5335           movw(Rj, Ri);
5336           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5337         } block_comment("  } // j");
5338 
5339         post1();
5340         addw(Ri, Ri, 1);
5341         cmpw(Ri, Rlen);
5342         br(Assembler::LT, loop);
5343         bind(end);
5344         block_comment("} // i");
5345       }
5346 
5347       block_comment("for (int i = len; i < 2*len; i++) {");
5348       mov(Ri, Rlen); {
5349         Label loop, end;
5350         cmpw(Ri, Rlen, Assembler::LSL, 1);
5351         br(Assembler::GE, end);
5352 
5353         bind(loop);
5354         pre2(Ri, Rlen);
5355 
5356         block_comment("  for (j = len*2-i-1; j; j--) {"); {
5357           lslw(Rj, Rlen, 1);
5358           subw(Rj, Rj, Ri);
5359           subw(Rj, Rj, 1);
5360           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5361         } block_comment("  } // j");
5362 
5363         post2(Ri, Rlen);
5364         addw(Ri, Ri, 1);
5365         cmpw(Ri, Rlen, Assembler::LSL, 1);
5366         br(Assembler::LT, loop);
5367         bind(end);
5368       }
5369       block_comment("} // i");
5370 
5371       normalize(Rlen);
5372 
5373       mov(Ra, Pm_base);  // Save Pm_base in Ra
5374       restore_regs();  // Restore caller's Pm_base
5375 
5376       // Copy our result into caller's Pm_base
5377       reverse(Pm_base, Ra, Rlen, t0, t1);
5378 
5379       leave();
5380       bind(nothing);
5381       ret(lr);
5382 
5383       return entry;
5384     }
5385     // In C, approximately:
5386 
5387     // void
5388     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
5389     //                     unsigned long Pn_base[], unsigned long Pm_base[],
5390     //                     unsigned long inv, int len) {
5391     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5392     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5393     //   unsigned long Ra, Rb, Rn, Rm;
5394 
5395     //   int i;
5396 
5397     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5398 
5399     //   for (i = 0; i < len; i++) {
5400     //     int j;
5401 
5402     //     Pa = Pa_base;
5403     //     Pb = Pb_base + i;
5404     //     Pm = Pm_base;
5405     //     Pn = Pn_base + i;
5406 
5407     //     Ra = *Pa;
5408     //     Rb = *Pb;
5409     //     Rm = *Pm;
5410     //     Rn = *Pn;
5411 
5412     //     int iters = i;
5413     //     for (j = 0; iters--; j++) {
5414     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5415     //       MACC(Ra, Rb, t0, t1, t2);
5416     //       Ra = *++Pa;
5417     //       Rb = *--Pb;
5418     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5419     //       MACC(Rm, Rn, t0, t1, t2);
5420     //       Rm = *++Pm;
5421     //       Rn = *--Pn;
5422     //     }
5423 
5424     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
5425     //     MACC(Ra, Rb, t0, t1, t2);
5426     //     *Pm = Rm = t0 * inv;
5427     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5428     //     MACC(Rm, Rn, t0, t1, t2);
5429 
5430     //     assert(t0 == 0, "broken Montgomery multiply");
5431 
5432     //     t0 = t1; t1 = t2; t2 = 0;
5433     //   }
5434 
5435     //   for (i = len; i < 2*len; i++) {
5436     //     int j;
5437 
5438     //     Pa = Pa_base + i-len;
5439     //     Pb = Pb_base + len;
5440     //     Pm = Pm_base + i-len;
5441     //     Pn = Pn_base + len;
5442 
5443     //     Ra = *++Pa;
5444     //     Rb = *--Pb;
5445     //     Rm = *++Pm;
5446     //     Rn = *--Pn;
5447 
5448     //     int iters = len*2-i-1;
5449     //     for (j = i-len+1; iters--; j++) {
5450     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5451     //       MACC(Ra, Rb, t0, t1, t2);
5452     //       Ra = *++Pa;
5453     //       Rb = *--Pb;
5454     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5455     //       MACC(Rm, Rn, t0, t1, t2);
5456     //       Rm = *++Pm;
5457     //       Rn = *--Pn;
5458     //     }
5459 
5460     //     Pm_base[i-len] = t0;
5461     //     t0 = t1; t1 = t2; t2 = 0;
5462     //   }
5463 
5464     //   while (t0)
5465     //     t0 = sub(Pm_base, Pn_base, t0, len);
5466     // }
5467 
5468     /**
5469      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
5470      * multiplies than Montgomery multiplication so it should be up to
5471      * 25% faster.  However, its loop control is more complex and it
5472      * may actually run slower on some machines.
5473      *
5474      * Arguments:
5475      *
5476      * Inputs:
5477      *   c_rarg0   - int array elements a
5478      *   c_rarg1   - int array elements n (the modulus)
5479      *   c_rarg2   - int length
5480      *   c_rarg3   - int inv
5481      *   c_rarg4   - int array elements m (the result)
5482      *
5483      */
5484     address generate_square() {
5485       Label argh;
5486       bind(argh);
5487       stop("MontgomeryMultiply total_allocation must be <= 8192");
5488 
5489       align(CodeEntryAlignment);
5490       address entry = pc();
5491 
5492       enter();
5493 
5494       // Make room.
5495       cmpw(Rlen, 512);
5496       br(Assembler::HI, argh);
5497       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5498       andr(sp, Ra, -2 * wordSize);
5499 
5500       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5501 
5502       {
5503         // Copy input args, reversing as we go.  We use Ra as a
5504         // temporary variable.
5505         reverse(Ra, Pa_base, Rlen, t0, t1);
5506         reverse(Ra, Pn_base, Rlen, t0, t1);
5507       }
5508 
      // Push all callee-saved registers and also Pm_base, which we'll need
      // at the end.
5511       save_regs();
5512 
5513       mov(Pm_base, Ra);
5514 
5515       mov(t0, zr);
5516       mov(t1, zr);
5517       mov(t2, zr);
5518 
5519       block_comment("for (int i = 0; i < len; i++) {");
5520       mov(Ri, zr); {
5521         Label loop, end;
5522         bind(loop);
5523         cmp(Ri, Rlen);
5524         br(Assembler::GE, end);
5525 
5526         pre1(Ri);
5527 
5528         block_comment("for (j = (i+1)/2; j; j--) {"); {
5529           add(Rj, Ri, 1);
5530           lsr(Rj, Rj, 1);
5531           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5532         } block_comment("  } // j");
5533 
5534         last_squaring(Ri);
5535 
5536         block_comment("  for (j = i/2; j; j--) {"); {
5537           lsr(Rj, Ri, 1);
5538           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5539         } block_comment("  } // j");
5540 
5541         post1_squaring();
5542         add(Ri, Ri, 1);
5543         cmp(Ri, Rlen);
5544         br(Assembler::LT, loop);
5545 
5546         bind(end);
5547         block_comment("} // i");
5548       }
5549 
5550       block_comment("for (int i = len; i < 2*len; i++) {");
5551       mov(Ri, Rlen); {
5552         Label loop, end;
5553         bind(loop);
5554         cmp(Ri, Rlen, Assembler::LSL, 1);
5555         br(Assembler::GE, end);
5556 
5557         pre2(Ri, Rlen);
5558 
5559         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
5560           lsl(Rj, Rlen, 1);
5561           sub(Rj, Rj, Ri);
5562           sub(Rj, Rj, 1);
5563           lsr(Rj, Rj, 1);
5564           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5565         } block_comment("  } // j");
5566 
5567         last_squaring(Ri);
5568 
5569         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
5570           lsl(Rj, Rlen, 1);
5571           sub(Rj, Rj, Ri);
5572           lsr(Rj, Rj, 1);
5573           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5574         } block_comment("  } // j");
5575 
5576         post2(Ri, Rlen);
5577         add(Ri, Ri, 1);
5578         cmp(Ri, Rlen, Assembler::LSL, 1);
5579 
5580         br(Assembler::LT, loop);
5581         bind(end);
5582         block_comment("} // i");
5583       }
5584 
5585       normalize(Rlen);
5586 
5587       mov(Ra, Pm_base);  // Save Pm_base in Ra
5588       restore_regs();  // Restore caller's Pm_base
5589 
5590       // Copy our result into caller's Pm_base
5591       reverse(Pm_base, Ra, Rlen, t0, t1);
5592 
5593       leave();
5594       ret(lr);
5595 
5596       return entry;
5597     }
5598     // In C, approximately:
5599 
5600     // void
5601     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
5602     //                   unsigned long Pm_base[], unsigned long inv, int len) {
5603     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5604     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5605     //   unsigned long Ra, Rb, Rn, Rm;
5606 
5607     //   int i;
5608 
5609     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5610 
5611     //   for (i = 0; i < len; i++) {
5612     //     int j;
5613 
5614     //     Pa = Pa_base;
5615     //     Pb = Pa_base + i;
5616     //     Pm = Pm_base;
5617     //     Pn = Pn_base + i;
5618 
5619     //     Ra = *Pa;
5620     //     Rb = *Pb;
5621     //     Rm = *Pm;
5622     //     Rn = *Pn;
5623 
5624     //     int iters = (i+1)/2;
5625     //     for (j = 0; iters--; j++) {
5626     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5627     //       MACC2(Ra, Rb, t0, t1, t2);
5628     //       Ra = *++Pa;
5629     //       Rb = *--Pb;
5630     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5631     //       MACC(Rm, Rn, t0, t1, t2);
5632     //       Rm = *++Pm;
5633     //       Rn = *--Pn;
5634     //     }
5635     //     if ((i & 1) == 0) {
5636     //       assert(Ra == Pa_base[j], "must be");
5637     //       MACC(Ra, Ra, t0, t1, t2);
5638     //     }
5639     //     iters = i/2;
5640     //     assert(iters == i-j, "must be");
5641     //     for (; iters--; j++) {
5642     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5643     //       MACC(Rm, Rn, t0, t1, t2);
5644     //       Rm = *++Pm;
5645     //       Rn = *--Pn;
5646     //     }
5647 
5648     //     *Pm = Rm = t0 * inv;
5649     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5650     //     MACC(Rm, Rn, t0, t1, t2);
5651 
5652     //     assert(t0 == 0, "broken Montgomery multiply");
5653 
5654     //     t0 = t1; t1 = t2; t2 = 0;
5655     //   }
5656 
5657     //   for (i = len; i < 2*len; i++) {
5658     //     int start = i-len+1;
5659     //     int end = start + (len - start)/2;
5660     //     int j;
5661 
5662     //     Pa = Pa_base + i-len;
5663     //     Pb = Pa_base + len;
5664     //     Pm = Pm_base + i-len;
5665     //     Pn = Pn_base + len;
5666 
5667     //     Ra = *++Pa;
5668     //     Rb = *--Pb;
5669     //     Rm = *++Pm;
5670     //     Rn = *--Pn;
5671 
5672     //     int iters = (2*len-i-1)/2;
5673     //     assert(iters == end-start, "must be");
5674     //     for (j = start; iters--; j++) {
5675     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5676     //       MACC2(Ra, Rb, t0, t1, t2);
5677     //       Ra = *++Pa;
5678     //       Rb = *--Pb;
5679     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5680     //       MACC(Rm, Rn, t0, t1, t2);
5681     //       Rm = *++Pm;
5682     //       Rn = *--Pn;
5683     //     }
5684     //     if ((i & 1) == 0) {
5685     //       assert(Ra == Pa_base[j], "must be");
5686     //       MACC(Ra, Ra, t0, t1, t2);
5687     //     }
5688     //     iters =  (2*len-i)/2;
5689     //     assert(iters == len-j, "must be");
5690     //     for (; iters--; j++) {
5691     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5692     //       MACC(Rm, Rn, t0, t1, t2);
5693     //       Rm = *++Pm;
5694     //       Rn = *--Pn;
5695     //     }
5696     //     Pm_base[i-len] = t0;
5697     //     t0 = t1; t1 = t2; t2 = 0;
5698     //   }
5699 
5700     //   while (t0)
5701     //     t0 = sub(Pm_base, Pn_base, t0, len);
5702     // }
5703   };
5704 
5705 
5706   // Initialization
5707   void generate_initial() {
    // Generate the initial stubs and initialize the entry points
5709 
    // Entry points that exist on all platforms. Note: this is code
    // that could be shared among different platforms; however, the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment in
    // stubRoutines.hpp.
5715 
5716     StubRoutines::_forward_exception_entry = generate_forward_exception();
5717 
5718     StubRoutines::_call_stub_entry =
5719       generate_call_stub(StubRoutines::_call_stub_return_address);
5720 
5721     // is referenced by megamorphic call
5722     StubRoutines::_catch_exception_entry = generate_catch_exception();
5723 
5724     // Build this early so it's available for the interpreter.
5725     StubRoutines::_throw_StackOverflowError_entry =
5726       generate_throw_exception("StackOverflowError throw_exception",
5727                                CAST_FROM_FN_PTR(address,
5728                                                 SharedRuntime::throw_StackOverflowError));
5729     StubRoutines::_throw_delayed_StackOverflowError_entry =
5730       generate_throw_exception("delayed StackOverflowError throw_exception",
5731                                CAST_FROM_FN_PTR(address,
5732                                                 SharedRuntime::throw_delayed_StackOverflowError));
5733     if (UseCRC32Intrinsics) {
      // set the table address before generating the stubs that use it
5735       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5736       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5737     }
5738 
5739     if (UseCRC32CIntrinsics) {
5740       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5741     }
5742 
5743     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5744       StubRoutines::_dlog = generate_dlog();
5745     }
5746 
5747     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5748       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5749     }
5750 
5751     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5752       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5753     }
5754   }
5755 
5756   void generate_all() {
5757     // support for verify_oop (must happen after universe_init)
5758     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
5759     StubRoutines::_throw_AbstractMethodError_entry =
5760       generate_throw_exception("AbstractMethodError throw_exception",
5761                                CAST_FROM_FN_PTR(address,
5762                                                 SharedRuntime::
5763                                                 throw_AbstractMethodError));
5764 
5765     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5766       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5767                                CAST_FROM_FN_PTR(address,
5768                                                 SharedRuntime::
5769                                                 throw_IncompatibleClassChangeError));
5770 
5771     StubRoutines::_throw_NullPointerException_at_call_entry =
5772       generate_throw_exception("NullPointerException at call throw_exception",
5773                                CAST_FROM_FN_PTR(address,
5774                                                 SharedRuntime::
5775                                                 throw_NullPointerException_at_call));
5776 
5777     // arraycopy stubs used by compilers
5778     generate_arraycopy_stubs();
5779 
5780     // has negatives stub for large arrays.
5781     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5782 
5783     // array equals stub for large arrays.
5784     if (!UseSimpleArrayEquals) {
5785       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5786     }
5787 
5788     generate_compare_long_strings();
5789 
5790     generate_string_indexof_stubs();
5791 
5792     // byte_array_inflate stub for large arrays.
5793     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
5794 
5795     if (UseMultiplyToLenIntrinsic) {
5796       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5797     }
5798 
5799     if (UseSquareToLenIntrinsic) {
5800       StubRoutines::_squareToLen = generate_squareToLen();
5801     }
5802 
5803     if (UseMulAddIntrinsic) {
5804       StubRoutines::_mulAdd = generate_mulAdd();
5805     }
5806 
5807     if (UseMontgomeryMultiplyIntrinsic) {
5808       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5809       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5810       StubRoutines::_montgomeryMultiply = g.generate_multiply();
5811     }
5812 
5813     if (UseMontgomerySquareIntrinsic) {
5814       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5815       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5816       // We use generate_multiply() rather than generate_square()
5817       // because it's faster for the sizes of modulus we care about.
5818       StubRoutines::_montgomerySquare = g.generate_multiply();
5819     }
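
         // Both stubs above implement Montgomery modular multiplication.  As
         // a general description of the technique (not a specification of
         // the stubs' calling convention): for an odd modulus n and R a
         // power of two larger than n, the routine computes
         // a * b * R^-1 (mod n), with the per-word reduction driven by a
         // caller-supplied precomputed inverse of the modulus (the usual
         // Montgomery n').  Squaring is simply the a == b case, which is why
         // generate_multiply() can also serve the montgomerySquare entry
         // point.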
5820 
5821 #ifndef BUILTIN_SIM
5822     // generate GHASH intrinsics code
5823     if (UseGHASHIntrinsics) {
5824       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5825     }
5826 
5827     if (UseAESIntrinsics) {
5828       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5829       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5830       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5831       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
5832     }
5833 
5834     if (UseSHA1Intrinsics) {
5835       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5836       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5837     }
5838     if (UseSHA256Intrinsics) {
5839       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5840       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5841     }
5842 
5843     // generate Adler32 intrinsics code
5844     if (UseAdler32Intrinsics) {
5845       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
5846     }
5847 
5848     // Safefetch stubs.
5849     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5850                                                        &StubRoutines::_safefetch32_fault_pc,
5851                                                        &StubRoutines::_safefetch32_continuation_pc);
5852     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5853                                                        &StubRoutines::_safefetchN_fault_pc,
5854                                                        &StubRoutines::_safefetchN_continuation_pc);
5855 #endif
5856     StubRoutines::aarch64::set_completed();
5857   }
5858 
5859  public:
5860   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5861     if (all) {
5862       generate_all();
5863     } else {
5864       generate_initial();
5865     }
5866   }
5867 }; // end class declaration
5868 
5869 void StubGenerator_generate(CodeBuffer* code, bool all) {
5870   StubGenerator g(code, all);
5871 }
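
     // For orientation only -- a sketch of how this entry point is driven by
     // the shared runtime (see stubRoutines.cpp), not an exact copy of that
     // code.  Stub generation happens in two phases:
     //
     //   // Phase 1, early in VM startup (before universe_init):
     //   // StubRoutines::initialize1() wraps a BufferBlob in a CodeBuffer
     //   // and calls
     //   StubGenerator_generate(&buffer, /*all=*/false);  // -> generate_initial()
     //
     //   // Phase 2, once the VM is further initialized:
     //   // StubRoutines::initialize2() does the same with a second blob:
     //   StubGenerator_generate(&buffer, /*all=*/true);   // -> generate_all()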