1 /*
   2  * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "memory/universe.hpp"
  33 #include "nativeInst_aarch64.hpp"
  34 #include "oops/instanceOop.hpp"
  35 #include "oops/method.hpp"
  36 #include "oops/objArrayKlass.hpp"
  37 #include "oops/oop.inline.hpp"
  38 #include "prims/methodHandles.hpp"
  39 #include "runtime/frame.inline.hpp"
  40 #include "runtime/handles.inline.hpp"
  41 #include "runtime/sharedRuntime.hpp"
  42 #include "runtime/stubCodeGenerator.hpp"
  43 #include "runtime/stubRoutines.hpp"
  44 #include "runtime/thread.inline.hpp"
  45 #include "utilities/align.hpp"
  46 #include "utilities/powerOfTwo.hpp"
  47 #ifdef COMPILER2
  48 #include "opto/runtime.hpp"
  49 #endif
  50 #if INCLUDE_ZGC
  51 #include "gc/z/zThreadLocalData.hpp"
  52 #endif
  53 
  54 // Declaration and definition of StubGenerator (no .hpp file).
  55 // For a more detailed description of the stub routine structure
  56 // see the comment in stubRoutines.hpp
  57 
  58 #undef __
  59 #define __ _masm->
  60 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  61 
  62 #ifdef PRODUCT
  63 #define BLOCK_COMMENT(str) /* nothing */
  64 #else
  65 #define BLOCK_COMMENT(str) __ block_comment(str)
  66 #endif
  67 
  68 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  69 
  70 // Stub Code definitions
  71 
  72 class StubGenerator: public StubCodeGenerator {
  73  private:
  74 
  75 #ifdef PRODUCT
  76 #define inc_counter_np(counter) ((void)0)
  77 #else
  78   void inc_counter_np_(int& counter) {
  79     __ lea(rscratch2, ExternalAddress((address)&counter));
  80     __ ldrw(rscratch1, Address(rscratch2));
  81     __ addw(rscratch1, rscratch1, 1);
  82     __ strw(rscratch1, Address(rscratch2));
  83   }
  84 #define inc_counter_np(counter) \
  85   BLOCK_COMMENT("inc_counter " #counter); \
  86   inc_counter_np_(counter);
  87 #endif
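
  // Usage sketch (illustrative only): in non-product builds a copy stub can
  // bump one of the SharedRuntime statistics counters, e.g.
  //
  //   inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr);
  //
  // which expands to a no-op in PRODUCT builds.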
  88 
  89   // Call stubs are used to call Java from C
  90   //
  91   // Arguments:
  92   //    c_rarg0:   call wrapper address                   address
  93   //    c_rarg1:   result                                 address
  94   //    c_rarg2:   result type                            BasicType
  95   //    c_rarg3:   method                                 Method*
  96   //    c_rarg4:   (interpreter) entry point              address
  97   //    c_rarg5:   parameters                             intptr_t*
  98   //    c_rarg6:   parameter size (in words)              int
  99   //    c_rarg7:   thread                                 Thread*
 100   //
 101   // There is no return from the stub itself as any Java result
 102   // is written to result
 103   //
  // we save r30 (lr) as the return PC at the base of the frame and
  // save r29 (fp) just below it, then install sp (r31) into fp so it
  // acts as the frame pointer.
 107   //
 108   // we save r0-r7, which accounts for all the c arguments.
 109   //
 110   // TODO: strictly do we need to save them all? they are treated as
 111   // volatile by C so could we omit saving the ones we are going to
 112   // place in global registers (thread? method?) or those we only use
 113   // during setup of the Java call?
 114   //
 115   // we don't need to save r8 which C uses as an indirect result location
 116   // return register.
 117   //
 118   // we don't need to save r9-r15 which both C and Java treat as
 119   // volatile
 120   //
 121   // we don't need to save r16-18 because Java does not use them
 122   //
 123   // we save r19-r28 which Java uses as scratch registers and C
 124   // expects to be callee-save
 125   //
 126   // we save the bottom 64 bits of each value stored in v8-v15; it is
 127   // the responsibility of the caller to preserve larger values.
 128   //
 129   // so the stub frame looks like this when we enter Java code
 130   //
 131   //     [ return_from_Java     ] <--- sp
 132   //     [ argument word n      ]
 133   //      ...
 134   // -27 [ argument word 1      ]
 135   // -26 [ saved v15            ] <--- sp_after_call
 136   // -25 [ saved v14            ]
 137   // -24 [ saved v13            ]
 138   // -23 [ saved v12            ]
 139   // -22 [ saved v11            ]
 140   // -21 [ saved v10            ]
 141   // -20 [ saved v9             ]
 142   // -19 [ saved v8             ]
 143   // -18 [ saved r28            ]
 144   // -17 [ saved r27            ]
 145   // -16 [ saved r26            ]
 146   // -15 [ saved r25            ]
 147   // -14 [ saved r24            ]
 148   // -13 [ saved r23            ]
 149   // -12 [ saved r22            ]
 150   // -11 [ saved r21            ]
 151   // -10 [ saved r20            ]
 152   //  -9 [ saved r19            ]
 153   //  -8 [ call wrapper    (r0) ]
 154   //  -7 [ result          (r1) ]
 155   //  -6 [ result type     (r2) ]
 156   //  -5 [ method          (r3) ]
 157   //  -4 [ entry point     (r4) ]
 158   //  -3 [ parameters      (r5) ]
 159   //  -2 [ parameter size  (r6) ]
 160   //  -1 [ thread (r7)          ]
 161   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 162   //   1 [ saved lr       (r30) ]
 163 
 164   // Call stub stack layout word offsets from fp
 165   enum call_stub_layout {
 166     sp_after_call_off = -26,
 167 
 168     d15_off            = -26,
 169     d13_off            = -24,
 170     d11_off            = -22,
 171     d9_off             = -20,
 172 
 173     r28_off            = -18,
 174     r26_off            = -16,
 175     r24_off            = -14,
 176     r22_off            = -12,
 177     r20_off            = -10,
 178     call_wrapper_off   =  -8,
 179     result_off         =  -7,
 180     result_type_off    =  -6,
 181     method_off         =  -5,
 182     entry_point_off    =  -4,
 183     parameter_size_off =  -2,
 184     thread_off         =  -1,
 185     fp_f               =   0,
 186     retaddr_off        =   1,
 187   };
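
  // For reference, C++ code enters this stub through the CallStub function
  // pointer type declared in stubRoutines.hpp, which looks roughly like the
  // sketch below (see that header for the authoritative declaration):
  //
  //   typedef void (*CallStub)(address   link,            // call wrapper
  //                            intptr_t* result,
  //                            BasicType result_type,
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            TRAPS);
  //
  // i.e. the eight values that arrive in c_rarg0..c_rarg7 above.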
 188 
 189   address generate_call_stub(address& return_address) {
 190     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 191            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 192            "adjust this code");
 193 
 194     StubCodeMark mark(this, "StubRoutines", "call_stub");
 195     address start = __ pc();
 196 
 197     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 198 
 199     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 200     const Address result        (rfp, result_off         * wordSize);
 201     const Address result_type   (rfp, result_type_off    * wordSize);
 202     const Address method        (rfp, method_off         * wordSize);
 203     const Address entry_point   (rfp, entry_point_off    * wordSize);
 204     const Address parameter_size(rfp, parameter_size_off * wordSize);
 205 
 206     const Address thread        (rfp, thread_off         * wordSize);
 207 
 208     const Address d15_save      (rfp, d15_off * wordSize);
 209     const Address d13_save      (rfp, d13_off * wordSize);
 210     const Address d11_save      (rfp, d11_off * wordSize);
 211     const Address d9_save       (rfp, d9_off * wordSize);
 212 
 213     const Address r28_save      (rfp, r28_off * wordSize);
 214     const Address r26_save      (rfp, r26_off * wordSize);
 215     const Address r24_save      (rfp, r24_off * wordSize);
 216     const Address r22_save      (rfp, r22_off * wordSize);
 217     const Address r20_save      (rfp, r20_off * wordSize);
 218 
 219     // stub code
 220 
 221     address aarch64_entry = __ pc();
 222 
 223     // set up frame and move sp to end of save area
 224     __ enter();
 225     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 226 
 227     // save register parameters and Java scratch/global registers
 228     // n.b. we save thread even though it gets installed in
 229     // rthread because we want to sanity check rthread later
 230     __ str(c_rarg7,  thread);
 231     __ strw(c_rarg6, parameter_size);
 232     __ stp(c_rarg4, c_rarg5,  entry_point);
 233     __ stp(c_rarg2, c_rarg3,  result_type);
 234     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 235 
 236     __ stp(r20, r19,   r20_save);
 237     __ stp(r22, r21,   r22_save);
 238     __ stp(r24, r23,   r24_save);
 239     __ stp(r26, r25,   r26_save);
 240     __ stp(r28, r27,   r28_save);
 241 
 242     __ stpd(v9,  v8,   d9_save);
 243     __ stpd(v11, v10,  d11_save);
 244     __ stpd(v13, v12,  d13_save);
 245     __ stpd(v15, v14,  d15_save);
 246 
 247     // install Java thread in global register now we have saved
 248     // whatever value it held
 249     __ mov(rthread, c_rarg7);
 250     // And method
 251     __ mov(rmethod, c_rarg3);
 252 
 253     // set up the heapbase register
 254     __ reinit_heapbase();
 255 
 256 #ifdef ASSERT
 257     // make sure we have no pending exceptions
 258     {
 259       Label L;
 260       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 261       __ cmp(rscratch1, (u1)NULL_WORD);
 262       __ br(Assembler::EQ, L);
 263       __ stop("StubRoutines::call_stub: entered with pending exception");
 264       __ BIND(L);
 265     }
 266 #endif
 267     // pass parameters if any
 268     __ mov(esp, sp);
 269     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 270     __ andr(sp, rscratch1, -2 * wordSize);
 271 
 272     BLOCK_COMMENT("pass parameters if any");
 273     Label parameters_done;
 274     // parameter count is still in c_rarg6
 275     // and parameter pointer identifying param 1 is in c_rarg5
 276     __ cbzw(c_rarg6, parameters_done);
 277 
 278     address loop = __ pc();
 279     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 280     __ subsw(c_rarg6, c_rarg6, 1);
 281     __ push(rscratch1);
 282     __ br(Assembler::GT, loop);
 283 
 284     __ BIND(parameters_done);
 285 
    // call Java entry -- passing Method* and current sp
 287     //      rmethod: Method*
 288     //      r13: sender sp
 289     BLOCK_COMMENT("call Java function");
 290     __ mov(r13, sp);
 291     __ blr(c_rarg4);
 292 
 293     // we do this here because the notify will already have been done
 294     // if we get to the next instruction via an exception
 295     //
 296     // n.b. adding this instruction here affects the calculation of
 297     // whether or not a routine returns to the call stub (used when
 298     // doing stack walks) since the normal test is to check the return
 299     // pc against the address saved below. so we may need to allow for
 300     // this extra instruction in the check.
 301 
 302     // save current address for use by exception handling code
 303 
 304     return_address = __ pc();
 305 
 306     // store result depending on type (everything that is not
 307     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 308     // n.b. this assumes Java returns an integral result in r0
 309     // and a floating result in j_farg0
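    //
    // roughly (sketch only):
    //
    //   switch (result_type) {
    //     case T_OBJECT: case T_LONG: *(intptr_t*)result = r0;      break;
    //     case T_FLOAT:               *(jfloat*)  result = j_farg0; break;
    //     case T_DOUBLE:              *(jdouble*) result = j_farg0; break;
    //     default:                    *(jint*)    result = r0;      break;
    //   }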
 310     __ ldr(j_rarg2, result);
 311     Label is_long, is_float, is_double, exit;
 312     __ ldr(j_rarg1, result_type);
 313     __ cmp(j_rarg1, (u1)T_OBJECT);
 314     __ br(Assembler::EQ, is_long);
 315     __ cmp(j_rarg1, (u1)T_LONG);
 316     __ br(Assembler::EQ, is_long);
 317     __ cmp(j_rarg1, (u1)T_FLOAT);
 318     __ br(Assembler::EQ, is_float);
 319     __ cmp(j_rarg1, (u1)T_DOUBLE);
 320     __ br(Assembler::EQ, is_double);
 321 
 322     // handle T_INT case
 323     __ strw(r0, Address(j_rarg2));
 324 
 325     __ BIND(exit);
 326 
 327     // pop parameters
 328     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 329 
 330 #ifdef ASSERT
 331     // verify that threads correspond
 332     {
 333       Label L, S;
 334       __ ldr(rscratch1, thread);
 335       __ cmp(rthread, rscratch1);
 336       __ br(Assembler::NE, S);
 337       __ get_thread(rscratch1);
 338       __ cmp(rthread, rscratch1);
 339       __ br(Assembler::EQ, L);
 340       __ BIND(S);
 341       __ stop("StubRoutines::call_stub: threads must correspond");
 342       __ BIND(L);
 343     }
 344 #endif
 345 
 346     // restore callee-save registers
 347     __ ldpd(v15, v14,  d15_save);
 348     __ ldpd(v13, v12,  d13_save);
 349     __ ldpd(v11, v10,  d11_save);
 350     __ ldpd(v9,  v8,   d9_save);
 351 
 352     __ ldp(r28, r27,   r28_save);
 353     __ ldp(r26, r25,   r26_save);
 354     __ ldp(r24, r23,   r24_save);
 355     __ ldp(r22, r21,   r22_save);
 356     __ ldp(r20, r19,   r20_save);
 357 
 358     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 359     __ ldrw(c_rarg2, result_type);
 360     __ ldr(c_rarg3,  method);
 361     __ ldp(c_rarg4, c_rarg5,  entry_point);
 362     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 363 
 364     // leave frame and return to caller
 365     __ leave();
 366     __ ret(lr);
 367 
 368     // handle return types different from T_INT
 369 
 370     __ BIND(is_long);
 371     __ str(r0, Address(j_rarg2, 0));
 372     __ br(Assembler::AL, exit);
 373 
 374     __ BIND(is_float);
 375     __ strs(j_farg0, Address(j_rarg2, 0));
 376     __ br(Assembler::AL, exit);
 377 
 378     __ BIND(is_double);
 379     __ strd(j_farg0, Address(j_rarg2, 0));
 380     __ br(Assembler::AL, exit);
 381 
 382     return start;
 383   }
 384 
 385   // Return point for a Java call if there's an exception thrown in
 386   // Java code.  The exception is caught and transformed into a
 387   // pending exception stored in JavaThread that can be tested from
 388   // within the VM.
 389   //
 390   // Note: Usually the parameters are removed by the callee. In case
 391   // of an exception crossing an activation frame boundary, that is
 392   // not the case if the callee is compiled code => need to setup the
 393   // rsp.
 394   //
 395   // r0: exception oop
 396 
 397   address generate_catch_exception() {
 398     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 399     address start = __ pc();
 400 
 401     // same as in generate_call_stub():
 402     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 403     const Address thread        (rfp, thread_off         * wordSize);
 404 
 405 #ifdef ASSERT
 406     // verify that threads correspond
 407     {
 408       Label L, S;
 409       __ ldr(rscratch1, thread);
 410       __ cmp(rthread, rscratch1);
 411       __ br(Assembler::NE, S);
 412       __ get_thread(rscratch1);
 413       __ cmp(rthread, rscratch1);
 414       __ br(Assembler::EQ, L);
 415       __ bind(S);
 416       __ stop("StubRoutines::catch_exception: threads must correspond");
 417       __ bind(L);
 418     }
 419 #endif
 420 
 421     // set pending exception
 422     __ verify_oop(r0);
 423 
 424     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 425     __ mov(rscratch1, (address)__FILE__);
 426     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 427     __ movw(rscratch1, (int)__LINE__);
 428     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 429 
 430     // complete return to VM
 431     assert(StubRoutines::_call_stub_return_address != NULL,
 432            "_call_stub_return_address must have been generated before");
 433     __ b(StubRoutines::_call_stub_return_address);
 434 
 435     return start;
 436   }
 437 
 438   // Continuation point for runtime calls returning with a pending
 439   // exception.  The pending exception check happened in the runtime
 440   // or native call stub.  The pending exception in Thread is
 441   // converted into a Java-level exception.
 442   //
 443   // Contract with Java-level exception handlers:
 444   // r0: exception
 445   // r3: throwing pc
 446   //
 447   // NOTE: At entry of this stub, exception-pc must be in LR !!
 448 
 449   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog
 451 
 452   address generate_forward_exception() {
 453     StubCodeMark mark(this, "StubRoutines", "forward exception");
 454     address start = __ pc();
 455 
 456     // Upon entry, LR points to the return address returning into
 457     // Java (interpreted or compiled) code; i.e., the return address
 458     // becomes the throwing pc.
 459     //
 460     // Arguments pushed before the runtime call are still on the stack
 461     // but the exception handler will reset the stack pointer ->
 462     // ignore them.  A potential result in registers can be ignored as
 463     // well.
 464 
 465 #ifdef ASSERT
 466     // make sure this code is only executed if there is a pending exception
 467     {
 468       Label L;
 469       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 470       __ cbnz(rscratch1, L);
 471       __ stop("StubRoutines::forward exception: no pending exception (1)");
 472       __ bind(L);
 473     }
 474 #endif
 475 
 476     // compute exception handler into r19
 477 
 478     // call the VM to find the handler address associated with the
 479     // caller address. pass thread in r0 and caller pc (ret address)
 480     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 481     // the stack.
 482     __ mov(c_rarg1, lr);
 483     // lr will be trashed by the VM call so we move it to R19
 484     // (callee-saved) because we also need to pass it to the handler
 485     // returned by this call.
 486     __ mov(r19, lr);
 487     BLOCK_COMMENT("call exception_handler_for_return_address");
 488     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 489                          SharedRuntime::exception_handler_for_return_address),
 490                     rthread, c_rarg1);
 491     // we should not really care that lr is no longer the callee
 492     // address. we saved the value the handler needs in r19 so we can
 493     // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
 495     // the PC for the frame above the handler belongs to a compiled
 496     // Java method. So, we restore lr here to satisfy that assert.
 497     __ mov(lr, r19);
 498     // setup r0 & r3 & clear pending exception
 499     __ mov(r3, r19);
 500     __ mov(r19, r0);
 501     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 502     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 503 
 504 #ifdef ASSERT
 505     // make sure exception is set
 506     {
 507       Label L;
 508       __ cbnz(r0, L);
 509       __ stop("StubRoutines::forward exception: no pending exception (2)");
 510       __ bind(L);
 511     }
 512 #endif
 513 
 514     // continue at exception handler
 515     // r0: exception
 516     // r3: throwing pc
 517     // r19: exception handler
 518     __ verify_oop(r0);
 519     __ br(r19);
 520 
 521     return start;
 522   }
 523 
 524   // Non-destructive plausibility checks for oops
 525   //
 526   // Arguments:
 527   //    r0: oop to verify
 528   //    rscratch1: error message
 529   //
 530   // Stack after saving c_rarg3:
 531   //    [tos + 0]: saved c_rarg3
 532   //    [tos + 1]: saved c_rarg2
 533   //    [tos + 2]: saved lr
 534   //    [tos + 3]: saved rscratch2
 535   //    [tos + 4]: saved r0
 536   //    [tos + 5]: saved rscratch1
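  //
  // The checks below amount to the following C sketch (illustrative only):
  //
  //   if (obj == NULL) return;                     // NULL oops are OK
  //   if ((obj & Universe::verify_oop_mask()) != Universe::verify_oop_bits())
  //     error();                                   // not a plausible heap address
  //   if (obj->klass() == NULL) error();           // broken or missing klass
  //
  // done without a cmp instruction because the flags register is live here.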
 537   address generate_verify_oop() {
 538 
 539     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 540     address start = __ pc();
 541 
 542     Label exit, error;
 543 
 544     // save c_rarg2 and c_rarg3
 545     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 546 
 547     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 548     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 549     __ ldr(c_rarg3, Address(c_rarg2));
 550     __ add(c_rarg3, c_rarg3, 1);
 551     __ str(c_rarg3, Address(c_rarg2));
 552 
 553     // object is in r0
 554     // make sure object is 'reasonable'
 555     __ cbz(r0, exit); // if obj is NULL it is OK
 556 
 557 #if INCLUDE_ZGC
 558     if (UseZGC) {
 559       // Check if mask is good.
 560       // verifies that ZAddressBadMask & r0 == 0
 561       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 562       __ andr(c_rarg2, r0, c_rarg3);
 563       __ cbnz(c_rarg2, error);
 564     }
 565 #endif
 566 
 567     // Check if the oop is in the right area of memory
 568     // Make sure we cast to `address` or it ends up calling the wrong `mov`
 569     // with MSVC, leading to a crash.
 570     __ mov(c_rarg3, (address) Universe::verify_oop_mask());
 571     __ andr(c_rarg2, r0, c_rarg3);
 572     __ mov(c_rarg3, (address) Universe::verify_oop_bits());
 573 
 574     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 575     // instruction here because the flags register is live.
 576     __ eor(c_rarg2, c_rarg2, c_rarg3);
 577     __ cbnz(c_rarg2, error);
 578 
 579     // make sure klass is 'reasonable', which is not zero.
 580     __ load_klass(r0, r0);  // get klass
 581     __ cbz(r0, error);      // if klass is NULL it is broken
 582 
 583     // return if everything seems ok
 584     __ bind(exit);
 585 
 586     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 587     __ ret(lr);
 588 
 589     // handle errors
 590     __ bind(error);
 591     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 592 
 593     __ push(RegSet::range(r0, r29), sp);
 594     // debug(char* msg, int64_t pc, int64_t regs[])
 595     __ mov(c_rarg0, rscratch1);      // pass address of error message
 596     __ mov(c_rarg1, lr);             // pass return address
 597     __ mov(c_rarg2, sp);             // pass address of regs on stack
 598 #ifndef PRODUCT
 599     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 600 #endif
 601     BLOCK_COMMENT("call MacroAssembler::debug");
 602     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 603     __ blr(rscratch1);
 604     __ hlt(0);
 605 
 606     return start;
 607   }
 608 
 609   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 610 
 611   // The inner part of zero_words().  This is the bulk operation,
 612   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 613   // caller is responsible for zeroing the last few words.
 614   //
 615   // Inputs:
 616   // r10: the HeapWord-aligned base address of an array to zero.
 617   // r11: the count in HeapWords, r11 > 0.
 618   //
 619   // Returns r10 and r11, adjusted for the caller to clear.
 620   // r10: the base address of the tail of words left to clear.
 621   // r11: the number of words in the tail.
 622   //      r11 < MacroAssembler::zero_words_block_size.
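  //
  // Roughly equivalent C sketch (block_words is for exposition only):
  //
  //   while (cnt >= MacroAssembler::zero_words_block_size) {
  //     // zero one block at base, using stp zr, zr or DC ZVA
  //     base += block_words;
  //     cnt  -= block_words;
  //   }
  //   // return; the caller zeroes the remaining cnt words at base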
 623 
 624   address generate_zero_blocks() {
 625     Label done;
 626     Label base_aligned;
 627 
 628     Register base = r10, cnt = r11;
 629 
 630     __ align(CodeEntryAlignment);
 631     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 632     address start = __ pc();
 633 
 634     if (UseBlockZeroing) {
 635       int zva_length = VM_Version::zva_length();
 636 
 637       // Ensure ZVA length can be divided by 16. This is required by
 638       // the subsequent operations.
 639       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 640 
 641       __ tbz(base, 3, base_aligned);
 642       __ str(zr, Address(__ post(base, 8)));
 643       __ sub(cnt, cnt, 1);
 644       __ bind(base_aligned);
 645 
 646       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 647       // alignment.
 648       Label small;
 649       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 650       __ subs(rscratch1, cnt, low_limit >> 3);
 651       __ br(Assembler::LT, small);
 652       __ zero_dcache_blocks(base, cnt);
 653       __ bind(small);
 654     }
 655 
 656     {
 657       // Number of stp instructions we'll unroll
 658       const int unroll =
 659         MacroAssembler::zero_words_block_size / 2;
 660       // Clear the remaining blocks.
 661       Label loop;
 662       __ subs(cnt, cnt, unroll * 2);
 663       __ br(Assembler::LT, done);
 664       __ bind(loop);
 665       for (int i = 0; i < unroll; i++)
 666         __ stp(zr, zr, __ post(base, 16));
 667       __ subs(cnt, cnt, unroll * 2);
 668       __ br(Assembler::GE, loop);
 669       __ bind(done);
 670       __ add(cnt, cnt, unroll * 2);
 671     }
 672 
 673     __ ret(lr);
 674 
 675     return start;
 676   }
 677 
 678 
 679   typedef enum {
 680     copy_forwards = 1,
 681     copy_backwards = -1
 682   } copy_direction;
 683 
 684   // Bulk copy of blocks of 8 words.
 685   //
 686   // count is a count of words.
 687   //
 688   // Precondition: count >= 8
 689   //
 690   // Postconditions:
 691   //
 692   // The least significant bit of count contains the remaining count
 693   // of words to copy.  The rest of count is trash.
 694   //
 695   // s and d are adjusted to point to the remaining words to copy
 696   //
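  // Ignoring the software pipelining and prefetching, the bulk loop below
  // behaves like this sketch (forward copy shown; s, d advance in words):
  //
  //   while (count >= 8) { copy 8 words from s to d; s += 8; d += 8; count -= 8; }
  //   if (count & 4)     { copy 4 words; s += 4; d += 4; }
  //   if (count & 2)     { copy 2 words; s += 2; d += 2; }
  //   // bit 0 of count, if set, is left for the caller to copy
  //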
 697   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 698                            copy_direction direction) {
 699     int unit = wordSize * direction;
 700     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 701 
 702     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 703       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 704     const Register stride = r13;
 705 
 706     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 707     assert_different_registers(s, d, count, rscratch1);
 708 
 709     Label again, drain;
 710     const char *stub_name;
 711     if (direction == copy_forwards)
 712       stub_name = "forward_copy_longs";
 713     else
 714       stub_name = "backward_copy_longs";
 715 
 716     __ align(CodeEntryAlignment);
 717 
 718     StubCodeMark mark(this, "StubRoutines", stub_name);
 719 
 720     __ bind(start);
 721 
 722     Label unaligned_copy_long;
 723     if (AvoidUnalignedAccesses) {
 724       __ tbnz(d, 3, unaligned_copy_long);
 725     }
 726 
 727     if (direction == copy_forwards) {
 728       __ sub(s, s, bias);
 729       __ sub(d, d, bias);
 730     }
 731 
 732 #ifdef ASSERT
 733     // Make sure we are never given < 8 words
 734     {
 735       Label L;
 736       __ cmp(count, (u1)8);
 737       __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
 739       __ bind(L);
 740     }
 741 #endif
 742 
 743     // Fill 8 registers
 744     if (UseSIMDForMemoryOps) {
 745       __ ldpq(v0, v1, Address(s, 4 * unit));
 746       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 747     } else {
 748       __ ldp(t0, t1, Address(s, 2 * unit));
 749       __ ldp(t2, t3, Address(s, 4 * unit));
 750       __ ldp(t4, t5, Address(s, 6 * unit));
 751       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 752     }
 753 
 754     __ subs(count, count, 16);
 755     __ br(Assembler::LO, drain);
 756 
 757     int prefetch = PrefetchCopyIntervalInBytes;
 758     bool use_stride = false;
 759     if (direction == copy_backwards) {
 760        use_stride = prefetch > 256;
 761        prefetch = -prefetch;
 762        if (use_stride) __ mov(stride, prefetch);
 763     }
 764 
 765     __ bind(again);
 766 
 767     if (PrefetchCopyIntervalInBytes > 0)
 768       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 769 
 770     if (UseSIMDForMemoryOps) {
 771       __ stpq(v0, v1, Address(d, 4 * unit));
 772       __ ldpq(v0, v1, Address(s, 4 * unit));
 773       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 774       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 775     } else {
 776       __ stp(t0, t1, Address(d, 2 * unit));
 777       __ ldp(t0, t1, Address(s, 2 * unit));
 778       __ stp(t2, t3, Address(d, 4 * unit));
 779       __ ldp(t2, t3, Address(s, 4 * unit));
 780       __ stp(t4, t5, Address(d, 6 * unit));
 781       __ ldp(t4, t5, Address(s, 6 * unit));
 782       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 783       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 784     }
 785 
 786     __ subs(count, count, 8);
 787     __ br(Assembler::HS, again);
 788 
 789     // Drain
 790     __ bind(drain);
 791     if (UseSIMDForMemoryOps) {
 792       __ stpq(v0, v1, Address(d, 4 * unit));
 793       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 794     } else {
 795       __ stp(t0, t1, Address(d, 2 * unit));
 796       __ stp(t2, t3, Address(d, 4 * unit));
 797       __ stp(t4, t5, Address(d, 6 * unit));
 798       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 799     }
 800 
 801     {
 802       Label L1, L2;
 803       __ tbz(count, exact_log2(4), L1);
 804       if (UseSIMDForMemoryOps) {
 805         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 806         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 807       } else {
 808         __ ldp(t0, t1, Address(s, 2 * unit));
 809         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 810         __ stp(t0, t1, Address(d, 2 * unit));
 811         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 812       }
 813       __ bind(L1);
 814 
 815       if (direction == copy_forwards) {
 816         __ add(s, s, bias);
 817         __ add(d, d, bias);
 818       }
 819 
 820       __ tbz(count, 1, L2);
 821       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 822       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 823       __ bind(L2);
 824     }
 825 
 826     __ ret(lr);
 827 
 828     if (AvoidUnalignedAccesses) {
 829       Label drain, again;
 830       // Register order for storing. Order is different for backward copy.
 831 
 832       __ bind(unaligned_copy_long);
 833 
 834       // source address is even aligned, target odd aligned
 835       //
 836       // when forward copying word pairs we read long pairs at offsets
 837       // {0, 2, 4, 6} (in long words). when backwards copying we read
 838       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 839       // address by -2 in the forwards case so we can compute the
 840       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 841       // or -1.
 842       //
 843       // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 847       //
      // When backwards copying we need to store 1 word, 3 pairs and
 849       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 850       // offsets {1, 3, 5, 7, 8} * unit.
 851 
 852       if (direction == copy_forwards) {
 853         __ sub(s, s, 16);
 854         __ sub(d, d, 8);
 855       }
 856 
 857       // Fill 8 registers
 858       //
 859       // for forwards copy s was offset by -16 from the original input
 860       // value of s so the register contents are at these offsets
      // relative to the 64 byte block addressed by that original input
 862       // and so on for each successive 64 byte block when s is updated
 863       //
 864       // t0 at offset 0,  t1 at offset 8
 865       // t2 at offset 16, t3 at offset 24
 866       // t4 at offset 32, t5 at offset 40
 867       // t6 at offset 48, t7 at offset 56
 868 
 869       // for backwards copy s was not offset so the register contents
 870       // are at these offsets into the preceding 64 byte block
 871       // relative to that original input and so on for each successive
 872       // preceding 64 byte block when s is updated. this explains the
 873       // slightly counter-intuitive looking pattern of register usage
 874       // in the stp instructions for backwards copy.
 875       //
 876       // t0 at offset -16, t1 at offset -8
 877       // t2 at offset -32, t3 at offset -24
 878       // t4 at offset -48, t5 at offset -40
 879       // t6 at offset -64, t7 at offset -56
 880 
 881       __ ldp(t0, t1, Address(s, 2 * unit));
 882       __ ldp(t2, t3, Address(s, 4 * unit));
 883       __ ldp(t4, t5, Address(s, 6 * unit));
 884       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 885 
 886       __ subs(count, count, 16);
 887       __ br(Assembler::LO, drain);
 888 
 889       int prefetch = PrefetchCopyIntervalInBytes;
 890       bool use_stride = false;
 891       if (direction == copy_backwards) {
 892          use_stride = prefetch > 256;
 893          prefetch = -prefetch;
 894          if (use_stride) __ mov(stride, prefetch);
 895       }
 896 
 897       __ bind(again);
 898 
 899       if (PrefetchCopyIntervalInBytes > 0)
 900         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 901 
 902       if (direction == copy_forwards) {
 903        // allowing for the offset of -8 the store instructions place
       // registers into the target 64 byte block at the following
 905        // offsets
 906        //
 907        // t0 at offset 0
 908        // t1 at offset 8,  t2 at offset 16
 909        // t3 at offset 24, t4 at offset 32
 910        // t5 at offset 40, t6 at offset 48
 911        // t7 at offset 56
 912 
 913         __ str(t0, Address(d, 1 * unit));
 914         __ stp(t1, t2, Address(d, 2 * unit));
 915         __ ldp(t0, t1, Address(s, 2 * unit));
 916         __ stp(t3, t4, Address(d, 4 * unit));
 917         __ ldp(t2, t3, Address(s, 4 * unit));
 918         __ stp(t5, t6, Address(d, 6 * unit));
 919         __ ldp(t4, t5, Address(s, 6 * unit));
 920         __ str(t7, Address(__ pre(d, 8 * unit)));
 921         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 922       } else {
 923        // d was not offset when we started so the registers are
       // written into the 64 byte block preceding d with the following
 925        // offsets
 926        //
 927        // t1 at offset -8
 928        // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
 930        // t7 at offset -56, t4 at offset -48
 931        //                   t6 at offset -64
 932        //
 933        // note that this matches the offsets previously noted for the
 934        // loads
 935 
 936         __ str(t1, Address(d, 1 * unit));
 937         __ stp(t3, t0, Address(d, 3 * unit));
 938         __ ldp(t0, t1, Address(s, 2 * unit));
 939         __ stp(t5, t2, Address(d, 5 * unit));
 940         __ ldp(t2, t3, Address(s, 4 * unit));
 941         __ stp(t7, t4, Address(d, 7 * unit));
 942         __ ldp(t4, t5, Address(s, 6 * unit));
 943         __ str(t6, Address(__ pre(d, 8 * unit)));
 944         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 945       }
 946 
 947       __ subs(count, count, 8);
 948       __ br(Assembler::HS, again);
 949 
 950       // Drain
 951       //
 952       // this uses the same pattern of offsets and register arguments
 953       // as above
 954       __ bind(drain);
 955       if (direction == copy_forwards) {
 956         __ str(t0, Address(d, 1 * unit));
 957         __ stp(t1, t2, Address(d, 2 * unit));
 958         __ stp(t3, t4, Address(d, 4 * unit));
 959         __ stp(t5, t6, Address(d, 6 * unit));
 960         __ str(t7, Address(__ pre(d, 8 * unit)));
 961       } else {
 962         __ str(t1, Address(d, 1 * unit));
 963         __ stp(t3, t0, Address(d, 3 * unit));
 964         __ stp(t5, t2, Address(d, 5 * unit));
 965         __ stp(t7, t4, Address(d, 7 * unit));
 966         __ str(t6, Address(__ pre(d, 8 * unit)));
 967       }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
 972       {
 973         Label L1, L2;
 974         __ tbz(count, exact_log2(4), L1);
 975        // this is the same as above but copying only 4 longs hence
       // with only one intervening stp between the str instructions
 977        // but note that the offsets and registers still follow the
 978        // same pattern
 979         __ ldp(t0, t1, Address(s, 2 * unit));
 980         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 981         if (direction == copy_forwards) {
 982           __ str(t0, Address(d, 1 * unit));
 983           __ stp(t1, t2, Address(d, 2 * unit));
 984           __ str(t3, Address(__ pre(d, 4 * unit)));
 985         } else {
 986           __ str(t1, Address(d, 1 * unit));
 987           __ stp(t3, t0, Address(d, 3 * unit));
 988           __ str(t2, Address(__ pre(d, 4 * unit)));
 989         }
 990         __ bind(L1);
 991 
 992         __ tbz(count, 1, L2);
 993        // this is the same as above but copying only 2 longs hence
 994        // there is no intervening stp between the str instructions
 995        // but note that the offset and register patterns are still
 996        // the same
 997         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
 998         if (direction == copy_forwards) {
 999           __ str(t0, Address(d, 1 * unit));
1000           __ str(t1, Address(__ pre(d, 2 * unit)));
1001         } else {
1002           __ str(t1, Address(d, 1 * unit));
1003           __ str(t0, Address(__ pre(d, 2 * unit)));
1004         }
1005         __ bind(L2);
1006 
1007        // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written
1009 
1010        if (direction == copy_forwards) {
1011          __ add(s, s, 16);
1012          __ add(d, d, 8);
1013        }
1014 
1015       }
1016 
1017       __ ret(lr);
1018       }
1019   }
1020 
1021   // Small copy: less than 16 bytes.
1022   //
1023   // NB: Ignores all of the bits of count which represent more than 15
1024   // bytes, so a caller doesn't have to mask them.
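  //
  // For a byte copy (step == +/-1) this is, in effect (sketch):
  //
  //   if (count & 8) copy 8 bytes;
  //   if (count & 4) copy 4 bytes;
  //   if (count & 2) copy 2 bytes;
  //   if (count & 1) copy 1 byte;
  //
  // with the bit tests scaled by the element size for wider steps, and the
  // pointers walking backwards when step is negative.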
1025 
1026   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1027     bool is_backwards = step < 0;
1028     size_t granularity = uabs(step);
1029     int direction = is_backwards ? -1 : 1;
1030     int unit = wordSize * direction;
1031 
1032     Label Lword, Lint, Lshort, Lbyte;
1033 
1034     assert(granularity
1035            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1036 
1037     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1038 
1039     // ??? I don't know if this bit-test-and-branch is the right thing
1040     // to do.  It does a lot of jumping, resulting in several
1041     // mispredicted branches.  It might make more sense to do this
1042     // with something like Duff's device with a single computed branch.
1043 
1044     __ tbz(count, 3 - exact_log2(granularity), Lword);
1045     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1046     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1047     __ bind(Lword);
1048 
1049     if (granularity <= sizeof (jint)) {
1050       __ tbz(count, 2 - exact_log2(granularity), Lint);
1051       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1052       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1053       __ bind(Lint);
1054     }
1055 
1056     if (granularity <= sizeof (jshort)) {
1057       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1058       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1059       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1060       __ bind(Lshort);
1061     }
1062 
1063     if (granularity <= sizeof (jbyte)) {
1064       __ tbz(count, 0, Lbyte);
1065       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1066       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1067       __ bind(Lbyte);
1068     }
1069   }
1070 
1071   Label copy_f, copy_b;
1072 
1073   // All-singing all-dancing memory copy.
1074   //
1075   // Copy count units of memory from s to d.  The size of a unit is
1076   // step, which can be positive or negative depending on the direction
1077   // of copy.  If is_aligned is false, we align the source address.
1078   //
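  // The overall shape is (illustrative sketch):
  //
  //   if (byte count <= 80, or 96 with SIMD)  copy inline and finish;
  //   for a backward copy, first advance s and d to the end of the regions;
  //   align s to a 16-byte boundary, copying any odd leading elements;
  //   bulk-copy whole 8-word blocks via copy_f / copy_b;
  //   copy the remaining tail with copy_memory_small().
  //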
1079 
1080   void copy_memory(bool is_aligned, Register s, Register d,
1081                    Register count, Register tmp, int step) {
1082     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1083     bool is_backwards = step < 0;
1084     int granularity = uabs(step);
1085     const Register t0 = r3, t1 = r4;
1086 
    // Copies of <= 80 bytes (96 with SIMD) are done inline. Direction doesn't matter because we always
1088     // load all the data before writing anything
1089     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1090     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1091     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1092     const Register send = r17, dend = r16;
1093 
1094     if (PrefetchCopyIntervalInBytes > 0)
1095       __ prfm(Address(s, 0), PLDL1KEEP);
1096     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1097     __ br(Assembler::HI, copy_big);
1098 
1099     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1100     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1101 
1102     __ cmp(count, u1(16/granularity));
1103     __ br(Assembler::LS, copy16);
1104 
1105     __ cmp(count, u1(64/granularity));
1106     __ br(Assembler::HI, copy80);
1107 
1108     __ cmp(count, u1(32/granularity));
1109     __ br(Assembler::LS, copy32);
1110 
1111     // 33..64 bytes
1112     if (UseSIMDForMemoryOps) {
1113       __ ldpq(v0, v1, Address(s, 0));
1114       __ ldpq(v2, v3, Address(send, -32));
1115       __ stpq(v0, v1, Address(d, 0));
1116       __ stpq(v2, v3, Address(dend, -32));
1117     } else {
1118       __ ldp(t0, t1, Address(s, 0));
1119       __ ldp(t2, t3, Address(s, 16));
1120       __ ldp(t4, t5, Address(send, -32));
1121       __ ldp(t6, t7, Address(send, -16));
1122 
1123       __ stp(t0, t1, Address(d, 0));
1124       __ stp(t2, t3, Address(d, 16));
1125       __ stp(t4, t5, Address(dend, -32));
1126       __ stp(t6, t7, Address(dend, -16));
1127     }
1128     __ b(finish);
1129 
1130     // 17..32 bytes
1131     __ bind(copy32);
1132     __ ldp(t0, t1, Address(s, 0));
1133     __ ldp(t2, t3, Address(send, -16));
1134     __ stp(t0, t1, Address(d, 0));
1135     __ stp(t2, t3, Address(dend, -16));
1136     __ b(finish);
1137 
1138     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
1140     __ bind(copy80);
1141     if (UseSIMDForMemoryOps) {
1142       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1143       __ ldpq(v4, v5, Address(send, -32));
1144       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1145       __ stpq(v4, v5, Address(dend, -32));
1146     } else {
1147       __ ldp(t0, t1, Address(s, 0));
1148       __ ldp(t2, t3, Address(s, 16));
1149       __ ldp(t4, t5, Address(s, 32));
1150       __ ldp(t6, t7, Address(s, 48));
1151       __ ldp(t8, t9, Address(send, -16));
1152 
1153       __ stp(t0, t1, Address(d, 0));
1154       __ stp(t2, t3, Address(d, 16));
1155       __ stp(t4, t5, Address(d, 32));
1156       __ stp(t6, t7, Address(d, 48));
1157       __ stp(t8, t9, Address(dend, -16));
1158     }
1159     __ b(finish);
1160 
1161     // 0..16 bytes
1162     __ bind(copy16);
1163     __ cmp(count, u1(8/granularity));
1164     __ br(Assembler::LO, copy8);
1165 
1166     // 8..16 bytes
1167     __ ldr(t0, Address(s, 0));
1168     __ ldr(t1, Address(send, -8));
1169     __ str(t0, Address(d, 0));
1170     __ str(t1, Address(dend, -8));
1171     __ b(finish);
1172 
1173     if (granularity < 8) {
1174       // 4..7 bytes
1175       __ bind(copy8);
1176       __ tbz(count, 2 - exact_log2(granularity), copy4);
1177       __ ldrw(t0, Address(s, 0));
1178       __ ldrw(t1, Address(send, -4));
1179       __ strw(t0, Address(d, 0));
1180       __ strw(t1, Address(dend, -4));
1181       __ b(finish);
1182       if (granularity < 4) {
1183         // 0..3 bytes
1184         __ bind(copy4);
1185         __ cbz(count, finish); // get rid of 0 case
1186         if (granularity == 2) {
1187           __ ldrh(t0, Address(s, 0));
1188           __ strh(t0, Address(d, 0));
1189         } else { // granularity == 1
1190           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1191           // the first and last byte.
1192           // Handle the 3 byte case by loading and storing base + count/2
1193           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This means in the 1 byte case we load/store the same
1195           // byte 3 times.
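          // Worked example for count == 3: after count >>= 1, count == 1,
          // so we copy s[0]->d[0] (first), s[2]->d[2] (last) and
          // s[1]->d[1] (base + count); every byte is covered exactly once.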
1196           __ lsr(count, count, 1);
1197           __ ldrb(t0, Address(s, 0));
1198           __ ldrb(t1, Address(send, -1));
1199           __ ldrb(t2, Address(s, count));
1200           __ strb(t0, Address(d, 0));
1201           __ strb(t1, Address(dend, -1));
1202           __ strb(t2, Address(d, count));
1203         }
1204         __ b(finish);
1205       }
1206     }
1207 
1208     __ bind(copy_big);
1209     if (is_backwards) {
1210       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1211       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1212     }
1213 
    // Now that we've got the small case out of the way, we can align
    // the source address on a 2-word boundary.
1216 
1217     Label aligned;
1218 
1219     if (is_aligned) {
1220       // We may have to adjust by 1 word to get s 2-word-aligned.
1221       __ tbz(s, exact_log2(wordSize), aligned);
1222       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1223       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1224       __ sub(count, count, wordSize/granularity);
1225     } else {
1226       if (is_backwards) {
1227         __ andr(rscratch2, s, 2 * wordSize - 1);
1228       } else {
1229         __ neg(rscratch2, s);
1230         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1231       }
1232       // rscratch2 is the byte adjustment needed to align s.
1233       __ cbz(rscratch2, aligned);
1234       int shift = exact_log2(granularity);
1235       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1236       __ sub(count, count, rscratch2);
1237 
1238 #if 0
1239       // ?? This code is only correct for a disjoint copy.  It may or
1240       // may not make sense to use it in that case.
1241 
1242       // Copy the first pair; s and d may not be aligned.
1243       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1244       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1245 
1246       // Align s and d, adjust count
1247       if (is_backwards) {
1248         __ sub(s, s, rscratch2);
1249         __ sub(d, d, rscratch2);
1250       } else {
1251         __ add(s, s, rscratch2);
1252         __ add(d, d, rscratch2);
1253       }
1254 #else
1255       copy_memory_small(s, d, rscratch2, rscratch1, step);
1256 #endif
1257     }
1258 
1259     __ bind(aligned);
1260 
1261     // s is now 2-word-aligned.
1262 
1263     // We have a count of units and some trailing bytes.  Adjust the
1264     // count and do a bulk copy of words.
1265     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1266     if (direction == copy_forwards)
1267       __ bl(copy_f);
1268     else
1269       __ bl(copy_b);
1270 
1271     // And the tail.
1272     copy_memory_small(s, d, count, tmp, step);
1273 
1274     if (granularity >= 8) __ bind(copy8);
1275     if (granularity >= 4) __ bind(copy4);
1276     __ bind(finish);
1277   }
1278 
1279 
1280   void clobber_registers() {
1281 #ifdef ASSERT
1282     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1283     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1284     for (Register r = r3; r <= NOT_WIN64(r18) WIN64_ONLY(r17); r++)
1285       if (r != rscratch1) __ mov(r, rscratch1);
1286 #endif
1287 
1288   }
1289 
1290   // Scan over array at a for count oops, verifying each one.
1291   // Preserves a and count, clobbers rscratch1 and rscratch2.
1292   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1293     Label loop, end;
1294     __ mov(rscratch1, a);
1295     __ mov(rscratch2, zr);
1296     __ bind(loop);
1297     __ cmp(rscratch2, count);
1298     __ br(Assembler::HS, end);
1299     if (size == (size_t)wordSize) {
1300       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1301       __ verify_oop(temp);
1302     } else {
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1304       __ decode_heap_oop(temp); // calls verify_oop
1305     }
1306     __ add(rscratch2, rscratch2, size);
1307     __ b(loop);
1308     __ bind(end);
1309   }
1310 
1311   // Arguments:
1312   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1313   //             ignored
1314   //   is_oop  - true => oop array, so generate store check code
1315   //   name    - stub name string
1316   //
1317   // Inputs:
1318   //   c_rarg0   - source array address
1319   //   c_rarg1   - destination array address
1320   //   c_rarg2   - element count, treated as ssize_t, can be zero
1321   //
1322   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1323   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1325   //
1326   // Side Effects:
1327   //   disjoint_int_copy_entry is set to the no-overlap entry point
1328   //   used by generate_conjoint_int_oop_copy().
1329   //
1330   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1331                                   const char *name, bool dest_uninitialized = false) {
1332     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1333     RegSet saved_reg = RegSet::of(s, d, count);
1334     __ align(CodeEntryAlignment);
1335     StubCodeMark mark(this, "StubRoutines", name);
1336     address start = __ pc();
1337     __ enter();
1338 
1339     if (entry != NULL) {
1340       *entry = __ pc();
1341       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1342       BLOCK_COMMENT("Entry:");
1343     }
1344 
1345     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1346     if (dest_uninitialized) {
1347       decorators |= IS_DEST_UNINITIALIZED;
1348     }
1349     if (aligned) {
1350       decorators |= ARRAYCOPY_ALIGNED;
1351     }
1352 
1353     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1354     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1355 
1356     if (is_oop) {
1357       // save regs before copy_memory
1358       __ push(RegSet::of(d, count), sp);
1359     }
1360     {
1361       // UnsafeCopyMemory page error: continue after ucm
1362       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1363       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1364       copy_memory(aligned, s, d, count, rscratch1, size);
1365     }
1366 
1367     if (is_oop) {
1368       __ pop(RegSet::of(d, count), sp);
1369       if (VerifyOops)
1370         verify_oop_array(size, d, count, r16);
1371     }
1372 
1373     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1374 
1375     __ leave();
1376     __ mov(r0, zr); // return 0
1377     __ ret(lr);
1378     return start;
1379   }
1380 
1381   // Arguments:
1382   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1383   //             ignored
1384   //   is_oop  - true => oop array, so generate store check code
1385   //   name    - stub name string
1386   //
1387   // Inputs:
1388   //   c_rarg0   - source array address
1389   //   c_rarg1   - destination array address
1390   //   c_rarg2   - element count, treated as ssize_t, can be zero
1391   //
1392   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1393   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1395   //
1396   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1397                                  address *entry, const char *name,
1398                                  bool dest_uninitialized = false) {
1399     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1400     RegSet saved_regs = RegSet::of(s, d, count);
1401     StubCodeMark mark(this, "StubRoutines", name);
1402     address start = __ pc();
1403     __ enter();
1404 
1405     if (entry != NULL) {
1406       *entry = __ pc();
1407       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1408       BLOCK_COMMENT("Entry:");
1409     }
1410 
1411     // use fwd copy when (d-s) above_equal (count*size)
1412     __ sub(rscratch1, d, s);
1413     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1414     __ br(Assembler::HS, nooverlap_target);
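    // In unsigned arithmetic: if d < s, d - s wraps to a huge value and the
    // branch is taken (a forward copy is safe); if d >= s + count*size the
    // regions are disjoint and the branch is also taken; only the case
    // s <= d < s + count*size falls through to the backward copy below.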
1415 
1416     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1417     if (dest_uninitialized) {
1418       decorators |= IS_DEST_UNINITIALIZED;
1419     }
1420     if (aligned) {
1421       decorators |= ARRAYCOPY_ALIGNED;
1422     }
1423 
1424     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1425     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1426 
1427     if (is_oop) {
1428       // save regs before copy_memory
1429       __ push(RegSet::of(d, count), sp);
1430     }
1431     {
1432       // UnsafeCopyMemory page error: continue after ucm
1433       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1434       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1435       copy_memory(aligned, s, d, count, rscratch1, -size);
1436     }
1437     if (is_oop) {
1438       __ pop(RegSet::of(d, count), sp);
1439       if (VerifyOops)
1440         verify_oop_array(size, d, count, r16);
1441     }
1442     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1443     __ leave();
1444     __ mov(r0, zr); // return 0
1445     __ ret(lr);
1446     return start;
  }
1448 
1449   // Arguments:
1450   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1451   //             ignored
1452   //   name    - stub name string
1453   //
1454   // Inputs:
1455   //   c_rarg0   - source array address
1456   //   c_rarg1   - destination array address
1457   //   c_rarg2   - element count, treated as ssize_t, can be zero
1458   //
1459   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1460   // we let the hardware handle it.  The one to eight bytes within words,
1461   // dwords or qwords that span cache line boundaries will still be loaded
1462   // and stored atomically.
1463   //
1471   // Side Effects:
1472   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1473   //   used by generate_conjoint_byte_copy().
1474   //
1475   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1476     const bool not_oop = false;
1477     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1478   }
1479 
1480   // Arguments:
1481   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1482   //             ignored
1483   //   name    - stub name string
1484   //
1485   // Inputs:
1486   //   c_rarg0   - source array address
1487   //   c_rarg1   - destination array address
1488   //   c_rarg2   - element count, treated as ssize_t, can be zero
1489   //
1490   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1491   // we let the hardware handle it.  The one to eight bytes within words,
1492   // dwords or qwords that span cache line boundaries will still be loaded
1493   // and stored atomically.
1494   //
1495   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1496                                       address* entry, const char *name) {
1497     const bool not_oop = false;
1498     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1499   }
1500 
1501   // Arguments:
1502   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1503   //             ignored
1504   //   name    - stub name string
1505   //
1506   // Inputs:
1507   //   c_rarg0   - source array address
1508   //   c_rarg1   - destination array address
1509   //   c_rarg2   - element count, treated as ssize_t, can be zero
1510   //
1511   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1512   // let the hardware handle it.  The two or four words within dwords
1513   // or qwords that span cache line boundaries will still be loaded
1514   // and stored atomically.
1515   //
1516   // Side Effects:
1517   //   disjoint_short_copy_entry is set to the no-overlap entry point
1518   //   used by generate_conjoint_short_copy().
1519   //
1520   address generate_disjoint_short_copy(bool aligned,
1521                                        address* entry, const char *name) {
1522     const bool not_oop = false;
1523     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1524   }
1525 
1526   // Arguments:
1527   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1528   //             ignored
1529   //   name    - stub name string
1530   //
1531   // Inputs:
1532   //   c_rarg0   - source array address
1533   //   c_rarg1   - destination array address
1534   //   c_rarg2   - element count, treated as ssize_t, can be zero
1535   //
1536   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1537   // let the hardware handle it.  The two or four words within dwords
1538   // or qwords that span cache line boundaries will still be loaded
1539   // and stored atomically.
1540   //
1541   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1542                                        address *entry, const char *name) {
1543     const bool not_oop = false;
1544     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1545   }
1546 
1547   // Arguments:
1548   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1549   //             ignored
1550   //   name    - stub name string
1551   //
1552   // Inputs:
1553   //   c_rarg0   - source array address
1554   //   c_rarg1   - destination array address
1555   //   c_rarg2   - element count, treated as ssize_t, can be zero
1556   //
1557   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1558   // the hardware handle it.  The two dwords within qwords that span
1559   // cache line boundaries will still be loaded and stored atomically.
1560   //
1561   // Side Effects:
1562   //   disjoint_int_copy_entry is set to the no-overlap entry point
1563   //   used by generate_conjoint_int_copy().
1564   //
1565   address generate_disjoint_int_copy(bool aligned, address *entry,
1566                                          const char *name, bool dest_uninitialized = false) {
1567     const bool not_oop = false;
1568     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1569   }
1570 
1571   // Arguments:
1572   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1573   //             ignored
1574   //   name    - stub name string
1575   //
1576   // Inputs:
1577   //   c_rarg0   - source array address
1578   //   c_rarg1   - destination array address
1579   //   c_rarg2   - element count, treated as ssize_t, can be zero
1580   //
1581   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1582   // the hardware handle it.  The two dwords within qwords that span
1583   // cache line boundaries will still be loaded and stored atomically.
1584   //
1585   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1586                                      address *entry, const char *name,
1587                                      bool dest_uninitialized = false) {
1588     const bool not_oop = false;
1589     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1590   }
1591 
1592 
1593   // Arguments:
1594   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1595   //             ignored
1596   //   name    - stub name string
1597   //
1598   // Inputs:
1599   //   c_rarg0   - source array address
1600   //   c_rarg1   - destination array address
1601   //   c_rarg2   - element count, treated as size_t, can be zero
1602   //
1603   // Side Effects:
1604   //   disjoint_long_copy_entry is set to the no-overlap entry point
1605   //   used by generate_conjoint_long_copy().
1606   //
1607   address generate_disjoint_long_copy(bool aligned, address *entry,
1608                                           const char *name, bool dest_uninitialized = false) {
1609     const bool not_oop = false;
1610     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1611   }
1612 
1613   // Arguments:
1614   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1615   //             ignored
1616   //   name    - stub name string
1617   //
1618   // Inputs:
1619   //   c_rarg0   - source array address
1620   //   c_rarg1   - destination array address
1621   //   c_rarg2   - element count, treated as size_t, can be zero
1622   //
1623   address generate_conjoint_long_copy(bool aligned,
1624                                       address nooverlap_target, address *entry,
1625                                       const char *name, bool dest_uninitialized = false) {
1626     const bool not_oop = false;
1627     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1628   }
1629 
1630   // Arguments:
1631   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1632   //             ignored
1633   //   name    - stub name string
1634   //
1635   // Inputs:
1636   //   c_rarg0   - source array address
1637   //   c_rarg1   - destination array address
1638   //   c_rarg2   - element count, treated as size_t, can be zero
1639   //
1640   // Side Effects:
1641   //   disjoint_oop_copy_entry is set to the no-overlap entry point
1642   //   used by generate_conjoint_oop_copy().
1643   //
1644   address generate_disjoint_oop_copy(bool aligned, address *entry,
1645                                      const char *name, bool dest_uninitialized) {
1646     const bool is_oop = true;
1647     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1648     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1649   }
1650 
1651   // Arguments:
1652   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1653   //             ignored
1654   //   name    - stub name string
1655   //
1656   // Inputs:
1657   //   c_rarg0   - source array address
1658   //   c_rarg1   - destination array address
1659   //   c_rarg2   - element count, treated as size_t, can be zero
1660   //
1661   address generate_conjoint_oop_copy(bool aligned,
1662                                      address nooverlap_target, address *entry,
1663                                      const char *name, bool dest_uninitialized) {
1664     const bool is_oop = true;
1665     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1666     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1667                                   name, dest_uninitialized);
1668   }
1669 
1670 
1671   // Helper for generating a dynamic type check.
1672   // Smashes rscratch1, rscratch2.
1673   void generate_type_check(Register sub_klass,
1674                            Register super_check_offset,
1675                            Register super_klass,
1676                            Label& L_success) {
1677     assert_different_registers(sub_klass, super_check_offset, super_klass);
1678 
1679     BLOCK_COMMENT("type_check:");
1680 
1681     Label L_miss;
1682 
1683     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1684                                      super_check_offset);
1685     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1686 
1687     // Fall through on failure!
1688     __ BIND(L_miss);
1689   }
1690 
1691   //
1692   //  Generate checkcasting array copy stub
1693   //
1694   //  Input:
1695   //    c_rarg0   - source array address
1696   //    c_rarg1   - destination array address
1697   //    c_rarg2   - element count, treated as ssize_t, can be zero
1698   //    c_rarg3   - size_t ckoff (super_check_offset)
1699   //    c_rarg4   - oop ckval (super_klass)
1700   //
1701   //  Output:
1702   //    r0 ==  0  -  success
1703   //    r0 == -1^K - failure, where K is partial transfer count
1704   //
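  //  For example: if the stub stores 2 elements and then hits a failing
  //  subtype check, it returns r0 == -1 ^ 2 == -3 (all bits of 2 inverted).
  //  A caller can recover the partial transfer count K as ~r0, roughly:
  //
  //    intptr_t r = checkcast_copy(...);   // hypothetical call site
  //    if (r != 0) {
  //      size_t copied = ~r;               // elements already transferred
  //      // handle ArrayStoreException after 'copied' elements
  //    }
  //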
1705   address generate_checkcast_copy(const char *name, address *entry,
1706                                   bool dest_uninitialized = false) {
1707 
1708     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1709 
1710     // Input registers (after setup_arg_regs)
1711     const Register from        = c_rarg0;   // source array address
1712     const Register to          = c_rarg1;   // destination array address
1713     const Register count       = c_rarg2;   // elements count
1714     const Register ckoff       = c_rarg3;   // super_check_offset
1715     const Register ckval       = c_rarg4;   // super_klass
1716 
1717     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1718     RegSet wb_post_saved_regs = RegSet::of(count);
1719 
1720     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1721     const Register copied_oop  = r22;       // actual oop copied
1722     const Register count_save  = r21;       // original elements count
1723     const Register start_to    = r20;       // destination array start address
1724     const Register r19_klass   = r19;       // oop._klass
1725 
1726     //---------------------------------------------------------------
1727     // Assembler stub will be used for this call to arraycopy
1728     // if the two arrays are subtypes of Object[] but the
1729     // destination array type is not equal to or a supertype
1730     // of the source type.  Each element must be separately
1731     // checked.
1732 
1733     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1734                                copied_oop, r19_klass, count_save);
1735 
1736     __ align(CodeEntryAlignment);
1737     StubCodeMark mark(this, "StubRoutines", name);
1738     address start = __ pc();
1739 
1740     __ enter(); // required for proper stackwalking of RuntimeStub frame
1741 
1742 #ifdef ASSERT
1743     // caller guarantees that the arrays really are different
1744     // otherwise, we would have to make conjoint checks
1745     { Label L;
1746       array_overlap_test(L, TIMES_OOP);
1747       __ stop("checkcast_copy within a single array");
1748       __ bind(L);
1749     }
1750 #endif //ASSERT
1751 
1752     // Caller of this entry point must set up the argument registers.
1753     if (entry != NULL) {
1754       *entry = __ pc();
1755       BLOCK_COMMENT("Entry:");
1756     }
1757 
1758     // Empty array:  Nothing to do.
1759     __ cbz(count, L_done);
1760     __ push(RegSet::of(r19, r20, r21, r22), sp);
1761 
1762 #ifdef ASSERT
1763     BLOCK_COMMENT("assert consistent ckoff/ckval");
1764     // The ckoff and ckval must be mutually consistent,
1765     // even though caller generates both.
1766     { Label L;
1767       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1768       __ ldrw(start_to, Address(ckval, sco_offset));
1769       __ cmpw(ckoff, start_to);
1770       __ br(Assembler::EQ, L);
1771       __ stop("super_check_offset inconsistent");
1772       __ bind(L);
1773     }
1774 #endif //ASSERT
1775 
1776     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1777     bool is_oop = true;
1778     if (dest_uninitialized) {
1779       decorators |= IS_DEST_UNINITIALIZED;
1780     }
1781 
1782     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1783     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1784 
1785     // save the original count
1786     __ mov(count_save, count);
1787 
1788     // Copy from low to high addresses
1789     __ mov(start_to, to);              // Save destination array start address
1790     __ b(L_load_element);
1791 
1792     // ======== begin loop ========
1793     // (Loop is rotated; its entry is L_load_element.)
1794     // Loop control:
1795     //   for (; count != 0; count--) {
1796     //     copied_oop = load_heap_oop(from++);
1797     //     ... generate_type_check ...;
1798     //     store_heap_oop(to++, copied_oop);
1799     //   }
1800     __ align(OptoLoopAlignment);
1801 
1802     __ BIND(L_store_element);
1803     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1804     __ sub(count, count, 1);
1805     __ cbz(count, L_do_card_marks);
1806 
1807     // ======== loop entry is here ========
1808     __ BIND(L_load_element);
1809     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1810     __ cbz(copied_oop, L_store_element);
1811 
1812     __ load_klass(r19_klass, copied_oop);// query the object klass
1813     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1814     // ======== end loop ========
1815 
1816     // It was a real error; we must depend on the caller to finish the job.
1817     // Register count = remaining oops, count_orig = total oops.
1818     // Emit GC store barriers for the oops we have copied and report
1819     // their number to the caller.
1820 
1821     __ subs(count, count_save, count);     // K = partially copied oop count
1822     __ eon(count, count, zr);              // report (-1^K) to caller; eon with zr is a bitwise NOT
1823     __ br(Assembler::EQ, L_done_pop);
1824 
1825     __ BIND(L_do_card_marks);
1826     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1827 
1828     __ bind(L_done_pop);
1829     __ pop(RegSet::of(r19, r20, r21, r22), sp);
1830     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1831 
1832     __ bind(L_done);
1833     __ mov(r0, count);
1834     __ leave();
1835     __ ret(lr);
1836 
1837     return start;
1838   }
1839 
1840   // Perform range checks on the proposed arraycopy.
1841   // Kills temp, but nothing else.
1842   // Also, clean the sign bits of src_pos and dst_pos.
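  // For example: with src->length() == 10, src_pos == 7 and length == 4,
  // the sum 11 exceeds 10, the unsigned-higher branch is taken and the
  // copy is rejected.  The trailing 32-bit self-moves rely on the fact
  // that writing a w-register zero-extends into the upper 32 bits.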
1843   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1844                               Register src_pos, // source position (c_rarg1)
1845                               Register dst,     // destination array oop (c_rarg2)
1846                               Register dst_pos, // destination position (c_rarg3)
1847                               Register length,
1848                               Register temp,
1849                               Label& L_failed) {
1850     BLOCK_COMMENT("arraycopy_range_checks:");
1851 
1852     assert_different_registers(rscratch1, temp);
1853 
1854     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1855     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1856     __ addw(temp, length, src_pos);
1857     __ cmpw(temp, rscratch1);
1858     __ br(Assembler::HI, L_failed);
1859 
1860     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1861     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1862     __ addw(temp, length, dst_pos);
1863     __ cmpw(temp, rscratch1);
1864     __ br(Assembler::HI, L_failed);
1865 
1866     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1867     __ movw(src_pos, src_pos);
1868     __ movw(dst_pos, dst_pos);
1869 
1870     BLOCK_COMMENT("arraycopy_range_checks done");
1871   }
1872 
1873   // These stubs get called from some dumb test routine.
1874   // I'll write them properly when they're called from
1875   // something that's actually doing something.
1876   static void fake_arraycopy_stub(address src, address dst, int count) {
1877     assert(count == 0, "huh?");
1878   }
1879 
1880 
1881   //
1882   //  Generate 'unsafe' array copy stub
1883   //  Though just as safe as the other stubs, it takes an unscaled
1884   //  size_t argument instead of an element count.
1885   //
1886   //  Input:
1887   //    c_rarg0   - source array address
1888   //    c_rarg1   - destination array address
1889   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1890   //
1891   // Examines the alignment of the operands and dispatches
1892   // to a long, int, short, or byte copy loop.
1893   //
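  // For example: with s == 0x1004, d == 0x2008 and a byte count of 20,
  // (s | d | count) has bit 2 set, so the copy is not 8-byte aligned;
  // the low two bits are clear, so it dispatches to the int copy loop
  // with count >> 2 == 5 elements.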
1894   address generate_unsafe_copy(const char *name,
1895                                address byte_copy_entry,
1896                                address short_copy_entry,
1897                                address int_copy_entry,
1898                                address long_copy_entry) {
1899     Label L_long_aligned, L_int_aligned, L_short_aligned;
1900     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1901 
1902     __ align(CodeEntryAlignment);
1903     StubCodeMark mark(this, "StubRoutines", name);
1904     address start = __ pc();
1905     __ enter(); // required for proper stackwalking of RuntimeStub frame
1906 
1907     // bump this on entry, not on exit:
1908     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1909 
1910     __ orr(rscratch1, s, d);
1911     __ orr(rscratch1, rscratch1, count);
1912 
1913     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1914     __ cbz(rscratch1, L_long_aligned);
1915     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1916     __ cbz(rscratch1, L_int_aligned);
1917     __ tbz(rscratch1, 0, L_short_aligned);
1918     __ b(RuntimeAddress(byte_copy_entry));
1919 
1920     __ BIND(L_short_aligned);
1921     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1922     __ b(RuntimeAddress(short_copy_entry));
1923     __ BIND(L_int_aligned);
1924     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1925     __ b(RuntimeAddress(int_copy_entry));
1926     __ BIND(L_long_aligned);
1927     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1928     __ b(RuntimeAddress(long_copy_entry));
1929 
1930     return start;
1931   }
1932 
1933   //
1934   //  Generate generic array copy stubs
1935   //
1936   //  Input:
1937   //    c_rarg0    -  src oop
1938   //    c_rarg1    -  src_pos (32-bits)
1939   //    c_rarg2    -  dst oop
1940   //    c_rarg3    -  dst_pos (32-bits)
1941   //    c_rarg4    -  element count (32-bits)
1942   //
1943   //  Output:
1944   //    r0 ==  0  -  success
1945   //    r0 == -1^K - failure, where K is partial transfer count
1946   //
1947   address generate_generic_copy(const char *name,
1948                                 address byte_copy_entry, address short_copy_entry,
1949                                 address int_copy_entry, address oop_copy_entry,
1950                                 address long_copy_entry, address checkcast_copy_entry) {
1951 
1952     Label L_failed, L_objArray;
1953     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1954 
1955     // Input registers
1956     const Register src        = c_rarg0;  // source array oop
1957     const Register src_pos    = c_rarg1;  // source position
1958     const Register dst        = c_rarg2;  // destination array oop
1959     const Register dst_pos    = c_rarg3;  // destination position
1960     const Register length     = c_rarg4;
1961 
1962 
1963     // Registers used as temps
1964     const Register dst_klass  = c_rarg5;
1965 
1966     __ align(CodeEntryAlignment);
1967 
1968     StubCodeMark mark(this, "StubRoutines", name);
1969 
1970     address start = __ pc();
1971 
1972     __ enter(); // required for proper stackwalking of RuntimeStub frame
1973 
1974     // bump this on entry, not on exit:
1975     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1976 
1977     //-----------------------------------------------------------------------
1978     // Assembler stub will be used for this call to arraycopy
1979     // if the following conditions are met:
1980     //
1981     // (1) src and dst must not be null.
1982     // (2) src_pos must not be negative.
1983     // (3) dst_pos must not be negative.
1984     // (4) length  must not be negative.
1985     // (5) src klass and dst klass should be the same and not NULL.
1986     // (6) src and dst should be arrays.
1987     // (7) src_pos + length must not exceed length of src.
1988     // (8) dst_pos + length must not exceed length of dst.
1989     //
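    // Roughly the same checks a Java-level System.arraycopy performs
    // before committing to a copy, e.g.:
    //
    //   if (src == NULL || dst == NULL
    //       || src_pos < 0 || dst_pos < 0 || length < 0
    //       || src_pos + length > src->length()
    //       || dst_pos + length > dst->length())
    //     return -1;          // let the caller fall back to the slow path
    //
    // (Object arrays with mismatched element types are not rejected here;
    // they take the checkcast copy path further down.)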
1990 
1991     //  if (src == NULL) return -1;
1992     __ cbz(src, L_failed);
1993 
1994     //  if (src_pos < 0) return -1;
1995     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
1996 
1997     //  if (dst == NULL) return -1;
1998     __ cbz(dst, L_failed);
1999 
2000     //  if (dst_pos < 0) return -1;
2001     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2002 
2003     // registers used as temp
2004     const Register scratch_length    = r16; // elements count to copy
2005     const Register scratch_src_klass = r17; // array klass
2006     const Register lh                = r15; // layout helper
2007 
2008     //  if (length < 0) return -1;
2009     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2010     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2011 
2012     __ load_klass(scratch_src_klass, src);
2013 #ifdef ASSERT
2014     //  assert(src->klass() != NULL);
2015     {
2016       BLOCK_COMMENT("assert klasses not null {");
2017       Label L1, L2;
2018       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2019       __ bind(L1);
2020       __ stop("broken null klass");
2021       __ bind(L2);
2022       __ load_klass(rscratch1, dst);
2023       __ cbz(rscratch1, L1);     // this would be broken also
2024       BLOCK_COMMENT("} assert klasses not null done");
2025     }
2026 #endif
2027 
2028     // Load layout helper (32-bits)
2029     //
2030     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2031     // 32        30    24            16              8     2                 0
2032     //
2033     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2034     //
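    // For example (assuming a 16-byte array header): an int[] has
    //   lh == (0x3 << 30) | (16 << 16) | (T_INT << 8) | 2 == 0xC0100A02
    // so the header size and log2 element size extracted below would be
    // 16 and 2 respectively.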
2035 
2036     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2037 
2038     // Handle objArrays completely differently...
2039     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2040     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2041     __ movw(rscratch1, objArray_lh);
2042     __ eorw(rscratch2, lh, rscratch1);
2043     __ cbzw(rscratch2, L_objArray);
2044 
2045     //  if (src->klass() != dst->klass()) return -1;
2046     __ load_klass(rscratch2, dst);
2047     __ eor(rscratch2, rscratch2, scratch_src_klass);
2048     __ cbnz(rscratch2, L_failed);
2049 
2050     //  if (!src->is_Array()) return -1;
2051     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2052 
2053     // At this point, it is known to be a typeArray (array_tag 0x3).
2054 #ifdef ASSERT
2055     {
2056       BLOCK_COMMENT("assert primitive array {");
2057       Label L;
2058       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2059       __ cmpw(lh, rscratch2);
2060       __ br(Assembler::GE, L);
2061       __ stop("must be a primitive array");
2062       __ bind(L);
2063       BLOCK_COMMENT("} assert primitive array done");
2064     }
2065 #endif
2066 
2067     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2068                            rscratch2, L_failed);
2069 
2070     // TypeArrayKlass
2071     //
2072     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2073     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2074     //
2075 
2076     const Register rscratch1_offset = rscratch1;    // array offset
2077     const Register r15_elsize = lh; // element size
2078 
2079     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2080            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2081     __ add(src, src, rscratch1_offset);           // src array offset
2082     __ add(dst, dst, rscratch1_offset);           // dst array offset
2083     BLOCK_COMMENT("choose copy loop based on element size");
2084 
2085     // next registers should be set before the jump to corresponding stub
2086     const Register from     = c_rarg0;  // source array address
2087     const Register to       = c_rarg1;  // destination array address
2088     const Register count    = c_rarg2;  // elements count
2089 
2090     // The 'from', 'to' and 'count' registers must be set in this order,
2091     // since they alias 'src', 'src_pos' and 'dst' respectively.
2092 
2093     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2094 
2095     // The possible values of elsize are 0-3, i.e. exact_log2(element
2096     // size in bytes).  We do a simple bitwise binary search.
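    // For example: elsize == 0 (bytes) and 1 (shorts) both have bit 1
    // clear, so they stay on the byte/short path and are separated on
    // bit 0; elsize == 2 (ints) and 3 (longs) have bit 1 set, branch to
    // L_copy_ints, and are separated there on bit 0.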
2097   __ BIND(L_copy_bytes);
2098     __ tbnz(r15_elsize, 1, L_copy_ints);
2099     __ tbnz(r15_elsize, 0, L_copy_shorts);
2100     __ lea(from, Address(src, src_pos));// src_addr
2101     __ lea(to,   Address(dst, dst_pos));// dst_addr
2102     __ movw(count, scratch_length); // length
2103     __ b(RuntimeAddress(byte_copy_entry));
2104 
2105   __ BIND(L_copy_shorts);
2106     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2107     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2108     __ movw(count, scratch_length); // length
2109     __ b(RuntimeAddress(short_copy_entry));
2110 
2111   __ BIND(L_copy_ints);
2112     __ tbnz(r15_elsize, 0, L_copy_longs);
2113     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2114     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2115     __ movw(count, scratch_length); // length
2116     __ b(RuntimeAddress(int_copy_entry));
2117 
2118   __ BIND(L_copy_longs);
2119 #ifdef ASSERT
2120     {
2121       BLOCK_COMMENT("assert long copy {");
2122       Label L;
2123       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2124       __ cmpw(r15_elsize, LogBytesPerLong);
2125       __ br(Assembler::EQ, L);
2126       __ stop("must be long copy, but elsize is wrong");
2127       __ bind(L);
2128       BLOCK_COMMENT("} assert long copy done");
2129     }
2130 #endif
2131     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2132     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2133     __ movw(count, scratch_length); // length
2134     __ b(RuntimeAddress(long_copy_entry));
2135 
2136     // ObjArrayKlass
2137   __ BIND(L_objArray);
2138     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2139 
2140     Label L_plain_copy, L_checkcast_copy;
2141     //  test array classes for subtyping
2142     __ load_klass(r15, dst);
2143     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2144     __ br(Assembler::NE, L_checkcast_copy);
2145 
2146     // Identically typed arrays can be copied without element-wise checks.
2147     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2148                            rscratch2, L_failed);
2149 
2150     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2151     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2152     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2153     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2154     __ movw(count, scratch_length); // length
2155   __ BIND(L_plain_copy);
2156     __ b(RuntimeAddress(oop_copy_entry));
2157 
2158   __ BIND(L_checkcast_copy);
2159     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2160     {
2161       // Before looking at dst.length, make sure dst is also an objArray.
2162       __ ldrw(rscratch1, Address(r15, lh_offset));
2163       __ movw(rscratch2, objArray_lh);
2164       __ eorw(rscratch1, rscratch1, rscratch2);
2165       __ cbnzw(rscratch1, L_failed);
2166 
2167       // It is safe to examine both src.length and dst.length.
2168       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2169                              r15, L_failed);
2170 
2171       __ load_klass(dst_klass, dst); // reload
2172 
2173       // Marshal the base address arguments now, freeing registers.
2174       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2175       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2176       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2177       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2178       __ movw(count, length);           // length (reloaded)
2179       Register sco_temp = c_rarg3;      // this register is free now
2180       assert_different_registers(from, to, count, sco_temp,
2181                                  dst_klass, scratch_src_klass);
2182       // assert_clean_int(count, sco_temp);
2183 
2184       // Generate the type check.
2185       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2186       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2187 
2188       // Smashes rscratch1, rscratch2
2189       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2190 
2191       // Fetch destination element klass from the ObjArrayKlass header.
2192       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2193       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2194       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2195 
2196       // the checkcast_copy loop needs two extra arguments:
2197       assert(c_rarg3 == sco_temp, "#3 already in place");
2198       // Set up arguments for checkcast_copy_entry.
2199       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2200       __ b(RuntimeAddress(checkcast_copy_entry));
2201     }
2202 
2203   __ BIND(L_failed);
2204     __ mov(r0, -1);
2205     __ leave();   // required for proper stackwalking of RuntimeStub frame
2206     __ ret(lr);
2207 
2208     return start;
2209   }
2210 
2211   //
2212   // Generate stub for array fill. If "aligned" is true, the
2213   // "to" address is assumed to be heapword aligned.
2214   //
2215   // Arguments for generated stub:
2216   //   to:    c_rarg0
2217   //   value: c_rarg1
2218   //   count: c_rarg2 treated as signed
2219   //
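  // For example, a T_BYTE fill with value 0xAB first replicates the byte
  // across the register (0xAB -> 0xABAB -> 0xABABABAB -> 0xABABABABABABABAB)
  // so that the bulk of the fill can proceed in whole 64-bit words; short
  // arrays (under 8 bytes) are filled element by element instead.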
2220   address generate_fill(BasicType t, bool aligned, const char *name) {
2221     __ align(CodeEntryAlignment);
2222     StubCodeMark mark(this, "StubRoutines", name);
2223     address start = __ pc();
2224 
2225     BLOCK_COMMENT("Entry:");
2226 
2227     const Register to        = c_rarg0;  // destination array address
2228     const Register value     = c_rarg1;  // value
2229     const Register count     = c_rarg2;  // elements count
2230 
2231     const Register bz_base = r10;        // base for block_zero routine
2232     const Register cnt_words = r11;      // temp register
2233 
2234     __ enter();
2235 
2236     Label L_fill_elements, L_exit1;
2237 
2238     int shift = -1;
2239     switch (t) {
2240       case T_BYTE:
2241         shift = 0;
2242         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2243         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2244         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2245         __ br(Assembler::LO, L_fill_elements);
2246         break;
2247       case T_SHORT:
2248         shift = 1;
2249         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2250         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2251         __ br(Assembler::LO, L_fill_elements);
2252         break;
2253       case T_INT:
2254         shift = 2;
2255         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2256         __ br(Assembler::LO, L_fill_elements);
2257         break;
2258       default: ShouldNotReachHere();
2259     }
2260 
2261     // Align source address at 8 bytes address boundary.
2262     Label L_skip_align1, L_skip_align2, L_skip_align4;
2263     if (!aligned) {
2264       switch (t) {
2265         case T_BYTE:
2266           // One byte misalignment happens only for byte arrays.
2267           __ tbz(to, 0, L_skip_align1);
2268           __ strb(value, Address(__ post(to, 1)));
2269           __ subw(count, count, 1);
2270           __ bind(L_skip_align1);
2271           // Fallthrough
2272         case T_SHORT:
2273           // Two bytes misalignment happens only for byte and short (char) arrays.
2274           __ tbz(to, 1, L_skip_align2);
2275           __ strh(value, Address(__ post(to, 2)));
2276           __ subw(count, count, 2 >> shift);
2277           __ bind(L_skip_align2);
2278           // Fallthrough
2279         case T_INT:
2280           // Align to 8 bytes, we know we are 4 byte aligned to start.
2281           __ tbz(to, 2, L_skip_align4);
2282           __ strw(value, Address(__ post(to, 4)));
2283           __ subw(count, count, 4 >> shift);
2284           __ bind(L_skip_align4);
2285           break;
2286         default: ShouldNotReachHere();
2287       }
2288     }
2289 
2290     //
2291     //  Fill large chunks
2292     //
2293     __ lsrw(cnt_words, count, 3 - shift); // number of words
2294     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2295     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2296     if (UseBlockZeroing) {
2297       Label non_block_zeroing, rest;
2298       // If the fill value is zero we can use the fast zero_words().
2299       __ cbnz(value, non_block_zeroing);
2300       __ mov(bz_base, to);
2301       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2302       __ zero_words(bz_base, cnt_words);
2303       __ b(rest);
2304       __ bind(non_block_zeroing);
2305       __ fill_words(to, cnt_words, value);
2306       __ bind(rest);
2307     } else {
2308       __ fill_words(to, cnt_words, value);
2309     }
2310 
2311     // Remaining count is less than 8 bytes. Fill it by a single store.
2312     // Note that the total length is no less than 8 bytes.
2313     if (t == T_BYTE || t == T_SHORT) {
2314       Label L_exit1;
2315       __ cbzw(count, L_exit1);
2316       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2317       __ str(value, Address(to, -8));    // overwrite some elements
2318       __ bind(L_exit1);
2319       __ leave();
2320       __ ret(lr);
2321     }
2322 
2323     // Handle copies less than 8 bytes.
2324     Label L_fill_2, L_fill_4, L_exit2;
2325     __ bind(L_fill_elements);
2326     switch (t) {
2327       case T_BYTE:
2328         __ tbz(count, 0, L_fill_2);
2329         __ strb(value, Address(__ post(to, 1)));
2330         __ bind(L_fill_2);
2331         __ tbz(count, 1, L_fill_4);
2332         __ strh(value, Address(__ post(to, 2)));
2333         __ bind(L_fill_4);
2334         __ tbz(count, 2, L_exit2);
2335         __ strw(value, Address(to));
2336         break;
2337       case T_SHORT:
2338         __ tbz(count, 0, L_fill_4);
2339         __ strh(value, Address(__ post(to, 2)));
2340         __ bind(L_fill_4);
2341         __ tbz(count, 1, L_exit2);
2342         __ strw(value, Address(to));
2343         break;
2344       case T_INT:
2345         __ cbzw(count, L_exit2);
2346         __ strw(value, Address(to));
2347         break;
2348       default: ShouldNotReachHere();
2349     }
2350     __ bind(L_exit2);
2351     __ leave();
2352     __ ret(lr);
2353     return start;
2354   }
2355 
2356   address generate_data_cache_writeback() {
2357     const Register line        = c_rarg0;  // address of line to write back
2358 
2359     __ align(CodeEntryAlignment);
2360 
2361     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2362 
2363     address start = __ pc();
2364     __ enter();
2365     __ cache_wb(Address(line, 0));
2366     __ leave();
2367     __ ret(lr);
2368 
2369     return start;
2370   }
2371 
2372   address generate_data_cache_writeback_sync() {
2373     const Register is_pre     = c_rarg0;  // pre or post sync
2374 
2375     __ align(CodeEntryAlignment);
2376 
2377     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2378 
2379     // pre wbsync is a no-op
2380     // post wbsync issues a memory barrier
2381 
2382     Label skip;
2383     address start = __ pc();
2384     __ enter();
2385     __ cbnz(is_pre, skip);
2386     __ cache_wbsync(false);
2387     __ bind(skip);
2388     __ leave();
2389     __ ret(lr);
2390 
2391     return start;
2392   }
2393 
2394   void generate_arraycopy_stubs() {
2395     address entry;
2396     address entry_jbyte_arraycopy;
2397     address entry_jshort_arraycopy;
2398     address entry_jint_arraycopy;
2399     address entry_oop_arraycopy;
2400     address entry_jlong_arraycopy;
2401     address entry_checkcast_arraycopy;
2402 
2403     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2404     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2405 
2406     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2407 
2408     //*** jbyte
2409     // Always need aligned and unaligned versions
2410     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2411                                                                                   "jbyte_disjoint_arraycopy");
2412     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2413                                                                                   &entry_jbyte_arraycopy,
2414                                                                                   "jbyte_arraycopy");
2415     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2416                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2417     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2418                                                                                   "arrayof_jbyte_arraycopy");
2419 
2420     //*** jshort
2421     // Always need aligned and unaligned versions
2422     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2423                                                                                     "jshort_disjoint_arraycopy");
2424     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2425                                                                                     &entry_jshort_arraycopy,
2426                                                                                     "jshort_arraycopy");
2427     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2428                                                                                     "arrayof_jshort_disjoint_arraycopy");
2429     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2430                                                                                     "arrayof_jshort_arraycopy");
2431 
2432     //*** jint
2433     // Aligned versions
2434     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2435                                                                                 "arrayof_jint_disjoint_arraycopy");
2436     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2437                                                                                 "arrayof_jint_arraycopy");
2438     // On 64-bit we need both aligned and unaligned versions of jint arraycopy.
2439     // entry_jint_arraycopy always points to the unaligned version
2440     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2441                                                                                 "jint_disjoint_arraycopy");
2442     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2443                                                                                 &entry_jint_arraycopy,
2444                                                                                 "jint_arraycopy");
2445 
2446     //*** jlong
2447     // It is always aligned
2448     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2449                                                                                   "arrayof_jlong_disjoint_arraycopy");
2450     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2451                                                                                   "arrayof_jlong_arraycopy");
2452     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2453     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2454 
2455     //*** oops
2456     {
2457       // With compressed oops we need unaligned versions; notice that
2458       // we overwrite entry_oop_arraycopy.
2459       bool aligned = !UseCompressedOops;
2460 
2461       StubRoutines::_arrayof_oop_disjoint_arraycopy
2462         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2463                                      /*dest_uninitialized*/false);
2464       StubRoutines::_arrayof_oop_arraycopy
2465         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2466                                      /*dest_uninitialized*/false);
2467       // Aligned versions without pre-barriers
2468       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2469         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2470                                      /*dest_uninitialized*/true);
2471       StubRoutines::_arrayof_oop_arraycopy_uninit
2472         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2473                                      /*dest_uninitialized*/true);
2474     }
2475 
2476     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2477     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2478     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2479     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2480 
2481     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2482     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2483                                                                         /*dest_uninitialized*/true);
2484 
2485     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2486                                                               entry_jbyte_arraycopy,
2487                                                               entry_jshort_arraycopy,
2488                                                               entry_jint_arraycopy,
2489                                                               entry_jlong_arraycopy);
2490 
2491     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2492                                                                entry_jbyte_arraycopy,
2493                                                                entry_jshort_arraycopy,
2494                                                                entry_jint_arraycopy,
2495                                                                entry_oop_arraycopy,
2496                                                                entry_jlong_arraycopy,
2497                                                                entry_checkcast_arraycopy);
2498 
2499     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2500     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2501     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2502     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2503     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2504     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2505   }
2506 
2507   void generate_math_stubs() { Unimplemented(); }
2508 
2509   // Arguments:
2510   //
2511   // Inputs:
2512   //   c_rarg0   - source byte array address
2513   //   c_rarg1   - destination byte array address
2514   //   c_rarg2   - K (key) in little endian int array
2515   //
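  // keylen below is the length of the expanded key array in 32-bit words:
  // 4 * (rounds + 1), i.e. 44, 52 or 60 for AES-128, AES-192 and AES-256,
  // which is why the round count is selected by comparing against 44 and 52.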
2516   address generate_aescrypt_encryptBlock() {
2517     __ align(CodeEntryAlignment);
2518     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2519 
2520     Label L_doLast;
2521 
2522     const Register from        = c_rarg0;  // source array address
2523     const Register to          = c_rarg1;  // destination array address
2524     const Register key         = c_rarg2;  // key array address
2525     const Register keylen      = rscratch1;
2526 
2527     address start = __ pc();
2528     __ enter();
2529 
2530     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2531 
2532     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2533 
2534     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2535     __ rev32(v1, __ T16B, v1);
2536     __ rev32(v2, __ T16B, v2);
2537     __ rev32(v3, __ T16B, v3);
2538     __ rev32(v4, __ T16B, v4);
2539     __ aese(v0, v1);
2540     __ aesmc(v0, v0);
2541     __ aese(v0, v2);
2542     __ aesmc(v0, v0);
2543     __ aese(v0, v3);
2544     __ aesmc(v0, v0);
2545     __ aese(v0, v4);
2546     __ aesmc(v0, v0);
2547 
2548     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2549     __ rev32(v1, __ T16B, v1);
2550     __ rev32(v2, __ T16B, v2);
2551     __ rev32(v3, __ T16B, v3);
2552     __ rev32(v4, __ T16B, v4);
2553     __ aese(v0, v1);
2554     __ aesmc(v0, v0);
2555     __ aese(v0, v2);
2556     __ aesmc(v0, v0);
2557     __ aese(v0, v3);
2558     __ aesmc(v0, v0);
2559     __ aese(v0, v4);
2560     __ aesmc(v0, v0);
2561 
2562     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2563     __ rev32(v1, __ T16B, v1);
2564     __ rev32(v2, __ T16B, v2);
2565 
2566     __ cmpw(keylen, 44);
2567     __ br(Assembler::EQ, L_doLast);
2568 
2569     __ aese(v0, v1);
2570     __ aesmc(v0, v0);
2571     __ aese(v0, v2);
2572     __ aesmc(v0, v0);
2573 
2574     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2575     __ rev32(v1, __ T16B, v1);
2576     __ rev32(v2, __ T16B, v2);
2577 
2578     __ cmpw(keylen, 52);
2579     __ br(Assembler::EQ, L_doLast);
2580 
2581     __ aese(v0, v1);
2582     __ aesmc(v0, v0);
2583     __ aese(v0, v2);
2584     __ aesmc(v0, v0);
2585 
2586     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2587     __ rev32(v1, __ T16B, v1);
2588     __ rev32(v2, __ T16B, v2);
2589 
2590     __ BIND(L_doLast);
2591 
2592     __ aese(v0, v1);
2593     __ aesmc(v0, v0);
2594     __ aese(v0, v2);
2595 
2596     __ ld1(v1, __ T16B, key);
2597     __ rev32(v1, __ T16B, v1);
2598     __ eor(v0, __ T16B, v0, v1);
2599 
2600     __ st1(v0, __ T16B, to);
2601 
2602     __ mov(r0, 0);
2603 
2604     __ leave();
2605     __ ret(lr);
2606 
2607     return start;
2608   }
2609 
2610   // Arguments:
2611   //
2612   // Inputs:
2613   //   c_rarg0   - source byte array address
2614   //   c_rarg1   - destination byte array address
2615   //   c_rarg2   - K (key) in little endian int array
2616   //
2617   address generate_aescrypt_decryptBlock() {
2618     assert(UseAES, "need AES cryptographic extension support");
2619     __ align(CodeEntryAlignment);
2620     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2621     Label L_doLast;
2622 
2623     const Register from        = c_rarg0;  // source array address
2624     const Register to          = c_rarg1;  // destination array address
2625     const Register key         = c_rarg2;  // key array address
2626     const Register keylen      = rscratch1;
2627 
2628     address start = __ pc();
2629     __ enter(); // required for proper stackwalking of RuntimeStub frame
2630 
2631     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2632 
2633     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2634 
2635     __ ld1(v5, __ T16B, __ post(key, 16));
2636     __ rev32(v5, __ T16B, v5);
2637 
2638     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2639     __ rev32(v1, __ T16B, v1);
2640     __ rev32(v2, __ T16B, v2);
2641     __ rev32(v3, __ T16B, v3);
2642     __ rev32(v4, __ T16B, v4);
2643     __ aesd(v0, v1);
2644     __ aesimc(v0, v0);
2645     __ aesd(v0, v2);
2646     __ aesimc(v0, v0);
2647     __ aesd(v0, v3);
2648     __ aesimc(v0, v0);
2649     __ aesd(v0, v4);
2650     __ aesimc(v0, v0);
2651 
2652     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2653     __ rev32(v1, __ T16B, v1);
2654     __ rev32(v2, __ T16B, v2);
2655     __ rev32(v3, __ T16B, v3);
2656     __ rev32(v4, __ T16B, v4);
2657     __ aesd(v0, v1);
2658     __ aesimc(v0, v0);
2659     __ aesd(v0, v2);
2660     __ aesimc(v0, v0);
2661     __ aesd(v0, v3);
2662     __ aesimc(v0, v0);
2663     __ aesd(v0, v4);
2664     __ aesimc(v0, v0);
2665 
2666     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2667     __ rev32(v1, __ T16B, v1);
2668     __ rev32(v2, __ T16B, v2);
2669 
2670     __ cmpw(keylen, 44);
2671     __ br(Assembler::EQ, L_doLast);
2672 
2673     __ aesd(v0, v1);
2674     __ aesimc(v0, v0);
2675     __ aesd(v0, v2);
2676     __ aesimc(v0, v0);
2677 
2678     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2679     __ rev32(v1, __ T16B, v1);
2680     __ rev32(v2, __ T16B, v2);
2681 
2682     __ cmpw(keylen, 52);
2683     __ br(Assembler::EQ, L_doLast);
2684 
2685     __ aesd(v0, v1);
2686     __ aesimc(v0, v0);
2687     __ aesd(v0, v2);
2688     __ aesimc(v0, v0);
2689 
2690     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2691     __ rev32(v1, __ T16B, v1);
2692     __ rev32(v2, __ T16B, v2);
2693 
2694     __ BIND(L_doLast);
2695 
2696     __ aesd(v0, v1);
2697     __ aesimc(v0, v0);
2698     __ aesd(v0, v2);
2699 
2700     __ eor(v0, __ T16B, v0, v5);
2701 
2702     __ st1(v0, __ T16B, to);
2703 
2704     __ mov(r0, 0);
2705 
2706     __ leave();
2707     __ ret(lr);
2708 
2709     return start;
2710   }
2711 
2712   // Arguments:
2713   //
2714   // Inputs:
2715   //   c_rarg0   - source byte array address
2716   //   c_rarg1   - destination byte array address
2717   //   c_rarg2   - K (key) in little endian int array
2718   //   c_rarg3   - r vector byte array address
2719   //   c_rarg4   - input length
2720   //
2721   // Output:
2722   //   x0        - input length
2723   //
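  // The loop below implements the usual CBC recurrence, roughly
  //   C[i] = E_K(P[i] ^ C[i-1])  with C[0] = IV (the r vector),
  // keeping the running ciphertext block in v0 so each plaintext block is
  // xor-ed with it before the AES rounds; on exit the r vector is updated
  // with the last ciphertext block.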
2724   address generate_cipherBlockChaining_encryptAESCrypt() {
2725     assert(UseAES, "need AES cryptographic extension support");
2726     __ align(CodeEntryAlignment);
2727     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2728 
2729     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2730 
2731     const Register from        = c_rarg0;  // source array address
2732     const Register to          = c_rarg1;  // destination array address
2733     const Register key         = c_rarg2;  // key array address
2734     const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector array address
2735                                            // and left holding the last ciphertext block on exit
2736     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2737     const Register keylen      = rscratch1;
2738 
2739     address start = __ pc();
2740 
2741       __ enter();
2742 
2743       __ movw(rscratch2, len_reg);
2744 
2745       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2746 
2747       __ ld1(v0, __ T16B, rvec);
2748 
2749       __ cmpw(keylen, 52);
2750       __ br(Assembler::CC, L_loadkeys_44);
2751       __ br(Assembler::EQ, L_loadkeys_52);
2752 
2753       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2754       __ rev32(v17, __ T16B, v17);
2755       __ rev32(v18, __ T16B, v18);
2756     __ BIND(L_loadkeys_52);
2757       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2758       __ rev32(v19, __ T16B, v19);
2759       __ rev32(v20, __ T16B, v20);
2760     __ BIND(L_loadkeys_44);
2761       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2762       __ rev32(v21, __ T16B, v21);
2763       __ rev32(v22, __ T16B, v22);
2764       __ rev32(v23, __ T16B, v23);
2765       __ rev32(v24, __ T16B, v24);
2766       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2767       __ rev32(v25, __ T16B, v25);
2768       __ rev32(v26, __ T16B, v26);
2769       __ rev32(v27, __ T16B, v27);
2770       __ rev32(v28, __ T16B, v28);
2771       __ ld1(v29, v30, v31, __ T16B, key);
2772       __ rev32(v29, __ T16B, v29);
2773       __ rev32(v30, __ T16B, v30);
2774       __ rev32(v31, __ T16B, v31);
2775 
2776     __ BIND(L_aes_loop);
2777       __ ld1(v1, __ T16B, __ post(from, 16));
2778       __ eor(v0, __ T16B, v0, v1);
2779 
2780       __ br(Assembler::CC, L_rounds_44);
2781       __ br(Assembler::EQ, L_rounds_52);
2782 
2783       __ aese(v0, v17); __ aesmc(v0, v0);
2784       __ aese(v0, v18); __ aesmc(v0, v0);
2785     __ BIND(L_rounds_52);
2786       __ aese(v0, v19); __ aesmc(v0, v0);
2787       __ aese(v0, v20); __ aesmc(v0, v0);
2788     __ BIND(L_rounds_44);
2789       __ aese(v0, v21); __ aesmc(v0, v0);
2790       __ aese(v0, v22); __ aesmc(v0, v0);
2791       __ aese(v0, v23); __ aesmc(v0, v0);
2792       __ aese(v0, v24); __ aesmc(v0, v0);
2793       __ aese(v0, v25); __ aesmc(v0, v0);
2794       __ aese(v0, v26); __ aesmc(v0, v0);
2795       __ aese(v0, v27); __ aesmc(v0, v0);
2796       __ aese(v0, v28); __ aesmc(v0, v0);
2797       __ aese(v0, v29); __ aesmc(v0, v0);
2798       __ aese(v0, v30);
2799       __ eor(v0, __ T16B, v0, v31);
2800 
2801       __ st1(v0, __ T16B, __ post(to, 16));
2802 
2803       __ subw(len_reg, len_reg, 16);
2804       __ cbnzw(len_reg, L_aes_loop);
2805 
2806       __ st1(v0, __ T16B, rvec);
2807 
2808       __ mov(r0, rscratch2);
2809 
2810       __ leave();
2811       __ ret(lr);
2812 
2813       return start;
2814   }
2815 
2816   // Arguments:
2817   //
2818   // Inputs:
2819   //   c_rarg0   - source byte array address
2820   //   c_rarg1   - destination byte array address
2821   //   c_rarg2   - K (key) in little endian int array
2822   //   c_rarg3   - r vector byte array address
2823   //   c_rarg4   - input length
2824   //
2825   // Output:
2826   //   r0        - input length
2827   //
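       // For orientation: the loop below implements standard CBC decryption,
       // P[i] = D_K(C[i]) ^ C[i-1]. v2 holds the previous ciphertext block
       // (initially the IV from rvec); each iteration saves the freshly loaded
       // ciphertext in v1 before decrypting, so it can become the chaining
       // value for the next block, and the last ciphertext block is written
       // back to rvec on exit.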
2828   address generate_cipherBlockChaining_decryptAESCrypt() {
2829     assert(UseAES, "need AES cryptographic extension support");
2830     __ align(CodeEntryAlignment);
2831     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2832 
2833     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2834 
2835     const Register from        = c_rarg0;  // source array address
2836     const Register to          = c_rarg1;  // destination array address
2837     const Register key         = c_rarg2;  // key array address
2838     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2839                                            // and left with the results of the last encryption block
2840     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2841     const Register keylen      = rscratch1;
2842 
2843     address start = __ pc();
2844 
2845       __ enter();
2846 
2847       __ movw(rscratch2, len_reg);
2848 
2849       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2850 
2851       __ ld1(v2, __ T16B, rvec);
2852 
2853       __ ld1(v31, __ T16B, __ post(key, 16));
2854       __ rev32(v31, __ T16B, v31);
2855 
2856       __ cmpw(keylen, 52);
2857       __ br(Assembler::CC, L_loadkeys_44);
2858       __ br(Assembler::EQ, L_loadkeys_52);
2859 
2860       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2861       __ rev32(v17, __ T16B, v17);
2862       __ rev32(v18, __ T16B, v18);
2863     __ BIND(L_loadkeys_52);
2864       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2865       __ rev32(v19, __ T16B, v19);
2866       __ rev32(v20, __ T16B, v20);
2867     __ BIND(L_loadkeys_44);
2868       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2869       __ rev32(v21, __ T16B, v21);
2870       __ rev32(v22, __ T16B, v22);
2871       __ rev32(v23, __ T16B, v23);
2872       __ rev32(v24, __ T16B, v24);
2873       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2874       __ rev32(v25, __ T16B, v25);
2875       __ rev32(v26, __ T16B, v26);
2876       __ rev32(v27, __ T16B, v27);
2877       __ rev32(v28, __ T16B, v28);
2878       __ ld1(v29, v30, __ T16B, key);
2879       __ rev32(v29, __ T16B, v29);
2880       __ rev32(v30, __ T16B, v30);
2881 
2882     __ BIND(L_aes_loop);
2883       __ ld1(v0, __ T16B, __ post(from, 16));
2884       __ orr(v1, __ T16B, v0, v0);
2885 
2886       __ br(Assembler::CC, L_rounds_44);
2887       __ br(Assembler::EQ, L_rounds_52);
2888 
2889       __ aesd(v0, v17); __ aesimc(v0, v0);
2890       __ aesd(v0, v18); __ aesimc(v0, v0);
2891     __ BIND(L_rounds_52);
2892       __ aesd(v0, v19); __ aesimc(v0, v0);
2893       __ aesd(v0, v20); __ aesimc(v0, v0);
2894     __ BIND(L_rounds_44);
2895       __ aesd(v0, v21); __ aesimc(v0, v0);
2896       __ aesd(v0, v22); __ aesimc(v0, v0);
2897       __ aesd(v0, v23); __ aesimc(v0, v0);
2898       __ aesd(v0, v24); __ aesimc(v0, v0);
2899       __ aesd(v0, v25); __ aesimc(v0, v0);
2900       __ aesd(v0, v26); __ aesimc(v0, v0);
2901       __ aesd(v0, v27); __ aesimc(v0, v0);
2902       __ aesd(v0, v28); __ aesimc(v0, v0);
2903       __ aesd(v0, v29); __ aesimc(v0, v0);
2904       __ aesd(v0, v30);
2905       __ eor(v0, __ T16B, v0, v31);
2906       __ eor(v0, __ T16B, v0, v2);
2907 
2908       __ st1(v0, __ T16B, __ post(to, 16));
2909       __ orr(v2, __ T16B, v1, v1);
2910 
2911       __ subw(len_reg, len_reg, 16);
2912       __ cbnzw(len_reg, L_aes_loop);
2913 
2914       __ st1(v2, __ T16B, rvec);
2915 
2916       __ mov(r0, rscratch2);
2917 
2918       __ leave();
2919       __ ret(lr);
2920 
2921     return start;
2922   }
2923 
2924   // Arguments:
2925   //
2926   // Inputs:
2927   //   c_rarg0   - byte[]  source+offset
2928   //   c_rarg1   - int[]   SHA.state
2929   //   c_rarg2   - int     offset
2930   //   c_rarg3   - int     limit
2931   //
2932   address generate_sha1_implCompress(bool multi_block, const char *name) {
2933     __ align(CodeEntryAlignment);
2934     StubCodeMark mark(this, "StubRoutines", name);
2935     address start = __ pc();
2936 
2937     Register buf   = c_rarg0;
2938     Register state = c_rarg1;
2939     Register ofs   = c_rarg2;
2940     Register limit = c_rarg3;
2941 
2942     Label keys;
2943     Label sha1_loop;
2944 
2945     // load the keys into v0..v3
2946     __ adr(rscratch1, keys);
2947     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2948     // load the 5-word state into v6, v7
2949     __ ldrq(v6, Address(state, 0));
2950     __ ldrs(v7, Address(state, 16));
2951 
2952 
2953     __ BIND(sha1_loop);
2954     // load 64 bytes of data into v16..v19
2955     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2956     __ rev32(v16, __ T16B, v16);
2957     __ rev32(v17, __ T16B, v17);
2958     __ rev32(v18, __ T16B, v18);
2959     __ rev32(v19, __ T16B, v19);
2960 
2961     // do the sha1
2962     __ addv(v4, __ T4S, v16, v0);
2963     __ orr(v20, __ T16B, v6, v6);
2964 
2965     FloatRegister d0 = v16;
2966     FloatRegister d1 = v17;
2967     FloatRegister d2 = v18;
2968     FloatRegister d3 = v19;
2969 
2970     for (int round = 0; round < 20; round++) {
2971       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2972       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2973       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2974       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2975       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2976 
2977       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2978       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2979       __ sha1h(tmp2, __ T4S, v20);
2980       if (round < 5)
2981         __ sha1c(v20, __ T4S, tmp3, tmp4);
2982       else if (round < 10 || round >= 15)
2983         __ sha1p(v20, __ T4S, tmp3, tmp4);
2984       else
2985         __ sha1m(v20, __ T4S, tmp3, tmp4);
2986       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2987 
2988       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2989     }
2990 
2991     __ addv(v7, __ T2S, v7, v21);
2992     __ addv(v6, __ T4S, v6, v20);
2993 
2994     if (multi_block) {
2995       __ add(ofs, ofs, 64);
2996       __ cmp(ofs, limit);
2997       __ br(Assembler::LE, sha1_loop);
2998       __ mov(c_rarg0, ofs); // return ofs
2999     }
3000 
3001     __ strq(v6, Address(state, 0));
3002     __ strs(v7, Address(state, 16));
3003 
3004     __ ret(lr);
3005 
3006     __ bind(keys);
3007     __ emit_int32(0x5a827999);
3008     __ emit_int32(0x6ed9eba1);
3009     __ emit_int32(0x8f1bbcdc);
3010     __ emit_int32(0xca62c1d6);
3011 
3012     return start;
3013   }
3014 
3015 
3016   // Arguments:
3017   //
3018   // Inputs:
3019   //   c_rarg0   - byte[]  source+offset
3020   //   c_rarg1   - int[]   SHA.state
3021   //   c_rarg2   - int     offset
3022   //   c_rarg3   - int     limit
3023   //
3024   address generate_sha256_implCompress(bool multi_block, const char *name) {
3025     static const uint32_t round_consts[64] = {
3026       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3027       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3028       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3029       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3030       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3031       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3032       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3033       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3034       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3035       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3036       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3037       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3038       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3039       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3040       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3041       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3042     };
3043     __ align(CodeEntryAlignment);
3044     StubCodeMark mark(this, "StubRoutines", name);
3045     address start = __ pc();
3046 
3047     Register buf   = c_rarg0;
3048     Register state = c_rarg1;
3049     Register ofs   = c_rarg2;
3050     Register limit = c_rarg3;
3051 
3052     Label sha1_loop;
3053 
3054     __ stpd(v8, v9, __ pre(sp, -32));
3055     __ stpd(v10, v11, Address(sp, 16));
3056 
3057 // dga == v0
3058 // dgb == v1
3059 // dg0 == v2
3060 // dg1 == v3
3061 // dg2 == v4
3062 // t0 == v6
3063 // t1 == v7
3064 
3065     // load 16 keys to v16..v31
3066     __ lea(rscratch1, ExternalAddress((address)round_consts));
3067     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3068     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3069     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3070     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3071 
3072     // load the 8-word (256-bit) state
3073     __ ldpq(v0, v1, state);
3074 
3075     __ BIND(sha1_loop);
3076     // load 64 bytes of data into v8..v11
3077     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3078     __ rev32(v8, __ T16B, v8);
3079     __ rev32(v9, __ T16B, v9);
3080     __ rev32(v10, __ T16B, v10);
3081     __ rev32(v11, __ T16B, v11);
3082 
3083     __ addv(v6, __ T4S, v8, v16);
3084     __ orr(v2, __ T16B, v0, v0);
3085     __ orr(v3, __ T16B, v1, v1);
3086 
3087     FloatRegister d0 = v8;
3088     FloatRegister d1 = v9;
3089     FloatRegister d2 = v10;
3090     FloatRegister d3 = v11;
3091 
3092 
3093     for (int round = 0; round < 16; round++) {
3094       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3095       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3096       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3097       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3098 
3099       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3100        __ orr(v4, __ T16B, v2, v2);
3101       if (round < 15)
3102         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3103       __ sha256h(v2, __ T4S, v3, tmp2);
3104       __ sha256h2(v3, __ T4S, v4, tmp2);
3105       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3106 
3107       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3108     }
3109 
3110     __ addv(v0, __ T4S, v0, v2);
3111     __ addv(v1, __ T4S, v1, v3);
3112 
3113     if (multi_block) {
3114       __ add(ofs, ofs, 64);
3115       __ cmp(ofs, limit);
3116       __ br(Assembler::LE, sha1_loop);
3117       __ mov(c_rarg0, ofs); // return ofs
3118     }
3119 
3120     __ ldpd(v10, v11, Address(sp, 16));
3121     __ ldpd(v8, v9, __ post(sp, 32));
3122 
3123     __ stpq(v0, v1, state);
3124 
3125     __ ret(lr);
3126 
3127     return start;
3128   }
3129 
3130   // Safefetch stubs.
3131   void generate_safefetch(const char* name, int size, address* entry,
3132                           address* fault_pc, address* continuation_pc) {
3133     // safefetch signatures:
3134     //   int      SafeFetch32(int*      adr, int      errValue);
3135     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3136     //
3137     // arguments:
3138     //   c_rarg0 = adr
3139     //   c_rarg1 = errValue
3140     //
3141     // result:
3142     //   r0       = *adr or errValue
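         //
         // Minimal usage sketch (illustrative only; the real callers live in
         // shared VM code): if the load below faults, the signal handler
         // resumes execution at continuation_pc and the caller simply sees
         // errValue (callers pick an errValue they can distinguish):
         //
         //   int v = SafeFetch32((int*) addr, -1);  // -1 if addr is unreadable
         //   if (v == -1) { /* treat the location as unreadable */ }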
3143 
3144     StubCodeMark mark(this, "StubRoutines", name);
3145 
3146     // Entry point, pc or function descriptor.
3147     *entry = __ pc();
3148 
3149     // Load *adr into c_rarg1, may fault.
3150     *fault_pc = __ pc();
3151     switch (size) {
3152       case 4:
3153         // int32_t
3154         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3155         break;
3156       case 8:
3157         // int64_t
3158         __ ldr(c_rarg1, Address(c_rarg0, 0));
3159         break;
3160       default:
3161         ShouldNotReachHere();
3162     }
3163 
3164     // return errValue or *adr
3165     *continuation_pc = __ pc();
3166     __ mov(r0, c_rarg1);
3167     __ ret(lr);
3168   }
3169 
3170   /**
3171    *  Arguments:
3172    *
3173    * Inputs:
3174    *   c_rarg0   - int crc
3175    *   c_rarg1   - byte* buf
3176    *   c_rarg2   - int length
3177    *
3178    * Output:
3179    *       r0    - int crc result
3180    */
3181   address generate_updateBytesCRC32() {
3182     assert(UseCRC32Intrinsics, "what are we doing here?");
3183 
3184     __ align(CodeEntryAlignment);
3185     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3186 
3187     address start = __ pc();
3188 
3189     const Register crc   = c_rarg0;  // crc
3190     const Register buf   = c_rarg1;  // source java byte array address
3191     const Register len   = c_rarg2;  // length
3192     const Register table0 = c_rarg3; // crc_table address
3193     const Register table1 = c_rarg4;
3194     const Register table2 = c_rarg5;
3195     const Register table3 = c_rarg6;
3196     const Register tmp3 = c_rarg7;
3197 
3198     BLOCK_COMMENT("Entry:");
3199     __ enter(); // required for proper stackwalking of RuntimeStub frame
3200 
3201     __ kernel_crc32(crc, buf, len,
3202               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3203 
3204     __ leave(); // required for proper stackwalking of RuntimeStub frame
3205     __ ret(lr);
3206 
3207     return start;
3208   }
3209 
3210   /**
3211    *  Arguments:
3212    *
3213    * Inputs:
3214    *   c_rarg0   - int crc
3215    *   c_rarg1   - byte* buf
3216    *   c_rarg2   - int length
3217    *   c_rarg3   - int* table
3218    *
3219    * Output:
3220    *       r0   - int crc result
3221    */
3222   address generate_updateBytesCRC32C() {
3223     assert(UseCRC32CIntrinsics, "what are we doing here?");
3224 
3225     __ align(CodeEntryAlignment);
3226     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3227 
3228     address start = __ pc();
3229 
3230     const Register crc   = c_rarg0;  // crc
3231     const Register buf   = c_rarg1;  // source java byte array address
3232     const Register len   = c_rarg2;  // length
3233     const Register table0 = c_rarg3; // crc_table address
3234     const Register table1 = c_rarg4;
3235     const Register table2 = c_rarg5;
3236     const Register table3 = c_rarg6;
3237     const Register tmp3 = c_rarg7;
3238 
3239     BLOCK_COMMENT("Entry:");
3240     __ enter(); // required for proper stackwalking of RuntimeStub frame
3241 
3242     __ kernel_crc32c(crc, buf, len,
3243               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3244 
3245     __ leave(); // required for proper stackwalking of RuntimeStub frame
3246     __ ret(lr);
3247 
3248     return start;
3249   }
3250 
3251   /***
3252    *  Arguments:
3253    *
3254    *  Inputs:
3255    *   c_rarg0   - int   adler
3256    *   c_rarg1   - byte* buff
3257    *   c_rarg2   - int   len
3258    *
3259    * Output:
3260    *   c_rarg0   - int adler result
3261    */
3262   address generate_updateBytesAdler32() {
3263     __ align(CodeEntryAlignment);
3264     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3265     address start = __ pc();
3266 
3267     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3268 
3269     // Aliases
3270     Register adler  = c_rarg0;
3271     Register s1     = c_rarg0;
3272     Register s2     = c_rarg3;
3273     Register buff   = c_rarg1;
3274     Register len    = c_rarg2;
3275     Register nmax  = r4;
3276     Register base  = r5;
3277     Register count = r6;
3278     Register temp0 = rscratch1;
3279     Register temp1 = rscratch2;
3280     FloatRegister vbytes = v0;
3281     FloatRegister vs1acc = v1;
3282     FloatRegister vs2acc = v2;
3283     FloatRegister vtable = v3;
3284 
3285     // Max number of bytes we can process before having to take the mod
3286     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3287     uint64_t BASE = 0xfff1;
3288     uint64_t NMAX = 0x15B0;
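         // Sanity check of that bound, worked out here for clarity: for n = 5552,
         //   255*n*(n+1)/2 + (n+1)*(BASE-1) = 3930857640 + 363832560 = 4294690200
         // which is <= 2^32-1 = 4294967295, while n = 5553 already gives
         // 4296171735, so 5552 is indeed the largest n for which the 32-bit
         // accumulators cannot wrap before the modulo step.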
3289 
3290     __ mov(base, BASE);
3291     __ mov(nmax, NMAX);
3292 
3293     // Load accumulation coefficients for the upper 16 bits
3294     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3295     __ ld1(vtable, __ T16B, Address(temp0));
3296 
3297     // s1 is initialized to the lower 16 bits of adler
3298     // s2 is initialized to the upper 16 bits of adler
3299     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3300     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3301 
3302     // The pipelined loop needs at least 16 elements for one iteration. It checks
3303     // this itself, but it is cheaper to branch straight to the cleanup loop here.
3304     __ cmp(len, (u1)16);
3305     __ br(Assembler::HS, L_nmax);
3306     __ cbz(len, L_combine);
3307 
3308     __ bind(L_simple_by1_loop);
3309     __ ldrb(temp0, Address(__ post(buff, 1)));
3310     __ add(s1, s1, temp0);
3311     __ add(s2, s2, s1);
3312     __ subs(len, len, 1);
3313     __ br(Assembler::HI, L_simple_by1_loop);
3314 
3315     // s1 = s1 % BASE
3316     __ subs(temp0, s1, base);
3317     __ csel(s1, temp0, s1, Assembler::HS);
3318 
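         // The reductions below use 2^16 mod BASE == 65536 - 65521 == 15, i.e.
         //   x mod BASE == ((x >> 16) * 15 + (x & 0xffff)) mod BASE.
         // Each lsr/lsl/sub/add group computes (x >> 16) * 15 + (x & 0xffff)
         // (shift left by 4 then subtract the original == multiply by 15); one
         // or two such folds plus a conditional subtract of BASE finish the mod.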
3319     // s2 = s2 % BASE
3320     __ lsr(temp0, s2, 16);
3321     __ lsl(temp1, temp0, 4);
3322     __ sub(temp1, temp1, temp0);
3323     __ add(s2, temp1, s2, ext::uxth);
3324 
3325     __ subs(temp0, s2, base);
3326     __ csel(s2, temp0, s2, Assembler::HS);
3327 
3328     __ b(L_combine);
3329 
3330     __ bind(L_nmax);
3331     __ subs(len, len, nmax);
3332     __ sub(count, nmax, 16);
3333     __ br(Assembler::LO, L_by16);
3334 
3335     __ bind(L_nmax_loop);
3336 
3337     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3338                                       vbytes, vs1acc, vs2acc, vtable);
3339 
3340     __ subs(count, count, 16);
3341     __ br(Assembler::HS, L_nmax_loop);
3342 
3343     // s1 = s1 % BASE
3344     __ lsr(temp0, s1, 16);
3345     __ lsl(temp1, temp0, 4);
3346     __ sub(temp1, temp1, temp0);
3347     __ add(temp1, temp1, s1, ext::uxth);
3348 
3349     __ lsr(temp0, temp1, 16);
3350     __ lsl(s1, temp0, 4);
3351     __ sub(s1, s1, temp0);
3352     __ add(s1, s1, temp1, ext::uxth);
3353 
3354     __ subs(temp0, s1, base);
3355     __ csel(s1, temp0, s1, Assembler::HS);
3356 
3357     // s2 = s2 % BASE
3358     __ lsr(temp0, s2, 16);
3359     __ lsl(temp1, temp0, 4);
3360     __ sub(temp1, temp1, temp0);
3361     __ add(temp1, temp1, s2, ext::uxth);
3362 
3363     __ lsr(temp0, temp1, 16);
3364     __ lsl(s2, temp0, 4);
3365     __ sub(s2, s2, temp0);
3366     __ add(s2, s2, temp1, ext::uxth);
3367 
3368     __ subs(temp0, s2, base);
3369     __ csel(s2, temp0, s2, Assembler::HS);
3370 
3371     __ subs(len, len, nmax);
3372     __ sub(count, nmax, 16);
3373     __ br(Assembler::HS, L_nmax_loop);
3374 
3375     __ bind(L_by16);
3376     __ adds(len, len, count);
3377     __ br(Assembler::LO, L_by1);
3378 
3379     __ bind(L_by16_loop);
3380 
3381     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3382                                       vbytes, vs1acc, vs2acc, vtable);
3383 
3384     __ subs(len, len, 16);
3385     __ br(Assembler::HS, L_by16_loop);
3386 
3387     __ bind(L_by1);
3388     __ adds(len, len, 15);
3389     __ br(Assembler::LO, L_do_mod);
3390 
3391     __ bind(L_by1_loop);
3392     __ ldrb(temp0, Address(__ post(buff, 1)));
3393     __ add(s1, temp0, s1);
3394     __ add(s2, s2, s1);
3395     __ subs(len, len, 1);
3396     __ br(Assembler::HS, L_by1_loop);
3397 
3398     __ bind(L_do_mod);
3399     // s1 = s1 % BASE
3400     __ lsr(temp0, s1, 16);
3401     __ lsl(temp1, temp0, 4);
3402     __ sub(temp1, temp1, temp0);
3403     __ add(temp1, temp1, s1, ext::uxth);
3404 
3405     __ lsr(temp0, temp1, 16);
3406     __ lsl(s1, temp0, 4);
3407     __ sub(s1, s1, temp0);
3408     __ add(s1, s1, temp1, ext::uxth);
3409 
3410     __ subs(temp0, s1, base);
3411     __ csel(s1, temp0, s1, Assembler::HS);
3412 
3413     // s2 = s2 % BASE
3414     __ lsr(temp0, s2, 16);
3415     __ lsl(temp1, temp0, 4);
3416     __ sub(temp1, temp1, temp0);
3417     __ add(temp1, temp1, s2, ext::uxth);
3418 
3419     __ lsr(temp0, temp1, 16);
3420     __ lsl(s2, temp0, 4);
3421     __ sub(s2, s2, temp0);
3422     __ add(s2, s2, temp1, ext::uxth);
3423 
3424     __ subs(temp0, s2, base);
3425     __ csel(s2, temp0, s2, Assembler::HS);
3426 
3427     // Combine lower bits and higher bits
3428     __ bind(L_combine);
3429     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3430 
3431     __ ret(lr);
3432 
3433     return start;
3434   }
3435 
3436   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
3437           Register temp0, Register temp1, FloatRegister vbytes,
3438           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
3439     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
3440     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
3441     // In non-vectorized code, we update s1 and s2 as:
3442     //   s1 <- s1 + b1
3443     //   s2 <- s2 + s1
3444     //   s1 <- s1 + b2
3445     //   s2 <- s2 + s1
3446     //   ...
3447     //   s1 <- s1 + b16
3448     //   s2 <- s2 + s1
3449     // Putting above assignments together, we have:
3450     //   s1_new = s1 + b1 + b2 + ... + b16
3451     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
3452     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
3453     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
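         //
         // For reference, a scalar equivalent of this helper (an illustrative
         // sketch only, not compiled; the names are ad hoc):
         //
         //   void adler_accum16(uint32_t& s1, uint32_t& s2, const uint8_t* b) {
         //     uint32_t sum = 0, weighted = 0;
         //     for (int i = 0; i < 16; i++) {
         //       sum      += b[i];
         //       weighted += b[i] * (16 - i);  // dot product with (16, 15, ..., 1)
         //     }
         //     s2 += s1 * 16 + weighted;       // must use the old s1
         //     s1 += sum;
         //   }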
3454     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
3455 
3456     // s2 = s2 + s1 * 16
3457     __ add(s2, s2, s1, Assembler::LSL, 4);
3458 
3459     // vs1acc = b1 + b2 + b3 + ... + b16
3460     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
3461     __ umullv(vs2acc, __ T8B, vtable, vbytes);
3462     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
3463     __ uaddlv(vs1acc, __ T16B, vbytes);
3464     __ uaddlv(vs2acc, __ T8H, vs2acc);
3465 
3466     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
3467     __ fmovd(temp0, vs1acc);
3468     __ fmovd(temp1, vs2acc);
3469     __ add(s1, s1, temp0);
3470     __ add(s2, s2, temp1);
3471   }
3472 
3473   /**
3474    *  Arguments:
3475    *
3476    *  Input:
3477    *    c_rarg0   - x address
3478    *    c_rarg1   - x length
3479    *    c_rarg2   - y address
3480    *   c_rarg3   - y length
3481    *    c_rarg4   - z address
3482    *    c_rarg5   - z length
3483    */
3484   address generate_multiplyToLen() {
3485     __ align(CodeEntryAlignment);
3486     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3487 
3488     address start = __ pc();
3489     const Register x     = r0;
3490     const Register xlen  = r1;
3491     const Register y     = r2;
3492     const Register ylen  = r3;
3493     const Register z     = r4;
3494     const Register zlen  = r5;
3495 
3496     const Register tmp1  = r10;
3497     const Register tmp2  = r11;
3498     const Register tmp3  = r12;
3499     const Register tmp4  = r13;
3500     const Register tmp5  = r14;
3501     const Register tmp6  = r15;
3502     const Register tmp7  = r16;
3503 
3504     BLOCK_COMMENT("Entry:");
3505     __ enter(); // required for proper stackwalking of RuntimeStub frame
3506     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3507     __ leave(); // required for proper stackwalking of RuntimeStub frame
3508     __ ret(lr);
3509 
3510     return start;
3511   }
3512 
3513   address generate_squareToLen() {
3514     // The squareToLen algorithm for sizes 1..127 described in the Java code is
3515     // faster than multiply_to_len on some CPUs and slower on others, but
3516     // multiply_to_len gives slightly better results overall.
3517     __ align(CodeEntryAlignment);
3518     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3519     address start = __ pc();
3520 
3521     const Register x     = r0;
3522     const Register xlen  = r1;
3523     const Register z     = r2;
3524     const Register zlen  = r3;
3525     const Register y     = r4; // == x
3526     const Register ylen  = r5; // == xlen
3527 
3528     const Register tmp1  = r10;
3529     const Register tmp2  = r11;
3530     const Register tmp3  = r12;
3531     const Register tmp4  = r13;
3532     const Register tmp5  = r14;
3533     const Register tmp6  = r15;
3534     const Register tmp7  = r16;
3535 
3536     RegSet spilled_regs = RegSet::of(y, ylen);
3537     BLOCK_COMMENT("Entry:");
3538     __ enter();
3539     __ push(spilled_regs, sp);
3540     __ mov(y, x);
3541     __ mov(ylen, xlen);
3542     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3543     __ pop(spilled_regs, sp);
3544     __ leave();
3545     __ ret(lr);
3546     return start;
3547   }
3548 
3549   address generate_mulAdd() {
3550     __ align(CodeEntryAlignment);
3551     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3552 
3553     address start = __ pc();
3554 
3555     const Register out     = r0;
3556     const Register in      = r1;
3557     const Register offset  = r2;
3558     const Register len     = r3;
3559     const Register k       = r4;
3560 
3561     BLOCK_COMMENT("Entry:");
3562     __ enter();
3563     __ mul_add(out, in, offset, len, k);
3564     __ leave();
3565     __ ret(lr);
3566 
3567     return start;
3568   }
3569 
3570   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3571                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3572                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3573     // Karatsuba multiplication performs a 128*128 -> 256-bit
3574     // multiplication in three 128-bit multiplications and a few
3575     // additions.
3576     //
3577     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3578     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3579     //
3580     // Inputs:
3581     //
3582     // A0 in a.d[0]     (subkey)
3583     // A1 in a.d[1]
3584     // (A1+A0) in a1_xor_a0.d[0]
3585     //
3586     // B0 in b.d[0]     (state)
3587     // B1 in b.d[1]
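         //
         // Derivation, for reference: writing A = A1*z^64 + A0 and
         // B = B1*z^64 + B0 over GF(2) (so '+' is XOR and there are no carries),
         //   A*B = A1*B1*z^128 + (A1*B0 + A0*B1)*z^64 + A0*B0
         // and since (A1+A0)*(B1+B0) = A1*B1 + A1*B0 + A0*B1 + A0*B0, the middle
         // term equals (A1+A0)*(B1+B0) + A1*B1 + A0*B0, which is why three
         // pmull/pmull2 products plus a few eor/ext suffice.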
3588 
3589     __ ext(tmp1, __ T16B, b, b, 0x08);
3590     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3591     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3592     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3593     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3594 
3595     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3596     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3597     __ eor(tmp2, __ T16B, tmp2, tmp4);
3598     __ eor(tmp2, __ T16B, tmp2, tmp3);
3599 
3600     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3601     __ ins(result_hi, __ D, tmp2, 0, 1);
3602     __ ins(result_lo, __ D, tmp2, 1, 0);
3603   }
3604 
3605   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3606                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3607     const FloatRegister t0 = result;
3608 
3609     // The GCM field polynomial f is z^128 + p(z), where p =
3610     // z^7+z^2+z+1.
3611     //
3612     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3613     //
3614     // so, given that the product we're reducing is
3615     //    a == lo + hi * z^128
3616     // substituting,
3617     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3618     //
3619     // we reduce by multiplying hi by p(z) and subtracting the result
3620     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3621     // bits we can do this with two 64-bit multiplications, lo*p and
3622     // hi*p.
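         //
         // In two folding steps (noted here for readability): pmull2 forms the
         // product of the top 64 bits of hi with p, and the two ext/eor pairs
         // XOR its high and low halves into hi.d[0] and lo.d[1] respectively;
         // the final pmull then multiplies the remaining (updated) low 64 bits
         // of hi by p, and the last eor folds that into lo to give the result.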
3623 
3624     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3625     __ ext(t1, __ T16B, t0, z, 8);
3626     __ eor(hi, __ T16B, hi, t1);
3627     __ ext(t1, __ T16B, z, t0, 8);
3628     __ eor(lo, __ T16B, lo, t1);
3629     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3630     __ eor(result, __ T16B, lo, t0);
3631   }
3632 
3633   address generate_has_negatives(address &has_negatives_long) {
3634     const u1 large_loop_size = 64;
3635     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3636     int dcache_line = VM_Version::dcache_line_size();
3637 
3638     Register ary1 = r1, len = r2, result = r0;
3639 
3640     __ align(CodeEntryAlignment);
3641 
3642     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3643 
3644     address entry = __ pc();
3645 
3646     __ enter();
3647 
3648   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3649         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3650 
3651   __ cmp(len, (u1)15);
3652   __ br(Assembler::GT, LEN_OVER_15);
3653   // Execution only reaches this code when the pointer is near the end of a
3654   // memory page and we have to avoid reading past it onto the next page.
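       // (Spelling the trick out: we load the 8 bytes that END at ary1 + len, so
       // any over-read bytes precede the data we were asked to scan and belong to
       // the same, already-mapped array object; on little-endian they occupy the
       // low-order byte lanes, and the lsrv below shifts them out by
       // (8 - len) * 8 bits.)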
3655   __ add(ary1, ary1, len);
3656   __ subs(len, len, 8);
3657   __ br(Assembler::GT, LEN_OVER_8);
3658   __ ldr(rscratch2, Address(ary1, -8));
3659   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3660   __ lsrv(rscratch2, rscratch2, rscratch1);
3661   __ tst(rscratch2, UPPER_BIT_MASK);
3662   __ cset(result, Assembler::NE);
3663   __ leave();
3664   __ ret(lr);
3665   __ bind(LEN_OVER_8);
3666   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3667   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3668   __ tst(rscratch2, UPPER_BIT_MASK);
3669   __ br(Assembler::NE, RET_TRUE_NO_POP);
3670   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3671   __ lsrv(rscratch1, rscratch1, rscratch2);
3672   __ tst(rscratch1, UPPER_BIT_MASK);
3673   __ cset(result, Assembler::NE);
3674   __ leave();
3675   __ ret(lr);
3676 
3677   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3678   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3679 
3680   has_negatives_long = __ pc(); // 2nd entry point
3681 
3682   __ enter();
3683 
3684   __ bind(LEN_OVER_15);
3685     __ push(spilled_regs, sp);
3686     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3687     __ cbz(rscratch2, ALIGNED);
3688     __ ldp(tmp6, tmp1, Address(ary1));
3689     __ mov(tmp5, 16);
3690     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3691     __ add(ary1, ary1, rscratch1);
3692     __ sub(len, len, rscratch1);
3693     __ orr(tmp6, tmp6, tmp1);
3694     __ tst(tmp6, UPPER_BIT_MASK);
3695     __ br(Assembler::NE, RET_TRUE);
3696 
3697   __ bind(ALIGNED);
3698     __ cmp(len, large_loop_size);
3699     __ br(Assembler::LT, CHECK_16);
3700     // Perform a 16-byte load in the pre-loop so we can return early when an
3701     // initially aligned large array has negative values in its first bytes;
3702     // otherwise LARGE_LOOP would do up to 4 reads instead of 1 in the worst
3703     // case, which is slower. Cases with negative bytes further ahead are barely
3704     // affected; in fact they become faster thanks to the early loads and the
3705     // fewer instructions and branches in LARGE_LOOP.
3706     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3707     __ sub(len, len, 16);
3708     __ orr(tmp6, tmp6, tmp1);
3709     __ tst(tmp6, UPPER_BIT_MASK);
3710     __ br(Assembler::NE, RET_TRUE);
3711     __ cmp(len, large_loop_size);
3712     __ br(Assembler::LT, CHECK_16);
3713 
3714     if (SoftwarePrefetchHintDistance >= 0
3715         && SoftwarePrefetchHintDistance >= dcache_line) {
3716       // initial prefetch
3717       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3718     }
3719   __ bind(LARGE_LOOP);
3720     if (SoftwarePrefetchHintDistance >= 0) {
3721       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3722     }
3723     // Issue the load instructions first, since that can save a few CPU/memory
3724     // cycles. Also, instead of 4 "orr(...); andr(...); cbnz(...)" triples (one
3725     // per ldp), it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...),
3726     // which saves 3 instructions and has fewer branches; the trade-off is that
3727     // early return is disabled, so all 64 bytes are loaded and checked every time.
3728     __ ldp(tmp2, tmp3, Address(ary1));
3729     __ ldp(tmp4, tmp5, Address(ary1, 16));
3730     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3731     __ ldp(tmp6, tmp1, Address(ary1, 48));
3732     __ add(ary1, ary1, large_loop_size);
3733     __ sub(len, len, large_loop_size);
3734     __ orr(tmp2, tmp2, tmp3);
3735     __ orr(tmp4, tmp4, tmp5);
3736     __ orr(rscratch1, rscratch1, rscratch2);
3737     __ orr(tmp6, tmp6, tmp1);
3738     __ orr(tmp2, tmp2, tmp4);
3739     __ orr(rscratch1, rscratch1, tmp6);
3740     __ orr(tmp2, tmp2, rscratch1);
3741     __ tst(tmp2, UPPER_BIT_MASK);
3742     __ br(Assembler::NE, RET_TRUE);
3743     __ cmp(len, large_loop_size);
3744     __ br(Assembler::GE, LARGE_LOOP);
3745 
3746   __ bind(CHECK_16); // small 16-byte load pre-loop
3747     __ cmp(len, (u1)16);
3748     __ br(Assembler::LT, POST_LOOP16);
3749 
3750   __ bind(LOOP16); // small 16-byte load loop
3751     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3752     __ sub(len, len, 16);
3753     __ orr(tmp2, tmp2, tmp3);
3754     __ tst(tmp2, UPPER_BIT_MASK);
3755     __ br(Assembler::NE, RET_TRUE);
3756     __ cmp(len, (u1)16);
3757     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3758 
3759   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3760     __ cmp(len, (u1)8);
3761     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3762     __ ldr(tmp3, Address(__ post(ary1, 8)));
3763     __ sub(len, len, 8);
3764     __ tst(tmp3, UPPER_BIT_MASK);
3765     __ br(Assembler::NE, RET_TRUE);
3766 
3767   __ bind(POST_LOOP16_LOAD_TAIL);
3768     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3769     __ ldr(tmp1, Address(ary1));
3770     __ mov(tmp2, 64);
3771     __ sub(tmp4, tmp2, len, __ LSL, 3);
3772     __ lslv(tmp1, tmp1, tmp4);
3773     __ tst(tmp1, UPPER_BIT_MASK);
3774     __ br(Assembler::NE, RET_TRUE);
3775     // Fallthrough
3776 
3777   __ bind(RET_FALSE);
3778     __ pop(spilled_regs, sp);
3779     __ leave();
3780     __ mov(result, zr);
3781     __ ret(lr);
3782 
3783   __ bind(RET_TRUE);
3784     __ pop(spilled_regs, sp);
3785   __ bind(RET_TRUE_NO_POP);
3786     __ leave();
3787     __ mov(result, 1);
3788     __ ret(lr);
3789 
3790   __ bind(DONE);
3791     __ pop(spilled_regs, sp);
3792     __ leave();
3793     __ ret(lr);
3794     return entry;
3795   }
3796 
3797   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3798         bool usePrefetch, Label &NOT_EQUAL) {
3799     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3800         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3801         tmp7 = r12, tmp8 = r13;
3802     Label LOOP;
3803 
3804     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3805     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3806     __ bind(LOOP);
3807     if (usePrefetch) {
3808       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3809       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3810     }
3811     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3812     __ eor(tmp1, tmp1, tmp2);
3813     __ eor(tmp3, tmp3, tmp4);
3814     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3815     __ orr(tmp1, tmp1, tmp3);
3816     __ cbnz(tmp1, NOT_EQUAL);
3817     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3818     __ eor(tmp5, tmp5, tmp6);
3819     __ eor(tmp7, tmp7, tmp8);
3820     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3821     __ orr(tmp5, tmp5, tmp7);
3822     __ cbnz(tmp5, NOT_EQUAL);
3823     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3824     __ eor(tmp1, tmp1, tmp2);
3825     __ eor(tmp3, tmp3, tmp4);
3826     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3827     __ orr(tmp1, tmp1, tmp3);
3828     __ cbnz(tmp1, NOT_EQUAL);
3829     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3830     __ eor(tmp5, tmp5, tmp6);
3831     __ sub(cnt1, cnt1, 8 * wordSize);
3832     __ eor(tmp7, tmp7, tmp8);
3833     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3834     // tmp6 is not used. MacroAssembler::subs is used here (rather than
3835     // cmp) because subs allows an unlimited range of immediate operand.
3836     __ subs(tmp6, cnt1, loopThreshold);
3837     __ orr(tmp5, tmp5, tmp7);
3838     __ cbnz(tmp5, NOT_EQUAL);
3839     __ br(__ GE, LOOP);
3840     // post-loop
3841     __ eor(tmp1, tmp1, tmp2);
3842     __ eor(tmp3, tmp3, tmp4);
3843     __ orr(tmp1, tmp1, tmp3);
3844     __ sub(cnt1, cnt1, 2 * wordSize);
3845     __ cbnz(tmp1, NOT_EQUAL);
3846   }
3847 
3848   void generate_large_array_equals_loop_simd(int loopThreshold,
3849         bool usePrefetch, Label &NOT_EQUAL) {
3850     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3851         tmp2 = rscratch2;
3852     Label LOOP;
3853 
3854     __ bind(LOOP);
3855     if (usePrefetch) {
3856       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3857       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3858     }
3859     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3860     __ sub(cnt1, cnt1, 8 * wordSize);
3861     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3862     __ subs(tmp1, cnt1, loopThreshold);
3863     __ eor(v0, __ T16B, v0, v4);
3864     __ eor(v1, __ T16B, v1, v5);
3865     __ eor(v2, __ T16B, v2, v6);
3866     __ eor(v3, __ T16B, v3, v7);
3867     __ orr(v0, __ T16B, v0, v1);
3868     __ orr(v1, __ T16B, v2, v3);
3869     __ orr(v0, __ T16B, v0, v1);
3870     __ umov(tmp1, v0, __ D, 0);
3871     __ umov(tmp2, v0, __ D, 1);
3872     __ orr(tmp1, tmp1, tmp2);
3873     __ cbnz(tmp1, NOT_EQUAL);
3874     __ br(__ GE, LOOP);
3875   }
3876 
3877   // a1 = r1 - array1 address
3878   // a2 = r2 - array2 address
3879   // result = r0 - return value. Already contains "false"
3880   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
3881   // r3-r5 are reserved temporary registers
3882   address generate_large_array_equals() {
3883     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3884         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3885         tmp7 = r12, tmp8 = r13;
3886     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3887         SMALL_LOOP, POST_LOOP;
3888     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3889     // calculate if at least 32 prefetched bytes are used
3890     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3891     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3892     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3893     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3894         tmp5, tmp6, tmp7, tmp8);
3895 
3896     __ align(CodeEntryAlignment);
3897 
3898     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3899 
3900     address entry = __ pc();
3901     __ enter();
3902     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
3903     // also advance pointers to use post-increment instead of pre-increment
3904     __ add(a1, a1, wordSize);
3905     __ add(a2, a2, wordSize);
3906     if (AvoidUnalignedAccesses) {
3907       // Both implementations (SIMD and non-SIMD) use relatively large load
3908       // instructions (ld1/ldp), which carry a big penalty (up to 2x execution
3909       // time) on some CPUs when the address is not at least 16-byte aligned.
3910       // Arrays are currently 8-byte aligned, so if necessary we do one extra
3911       // 8-byte load to make at least the first address 16-byte aligned.
3912       Label ALIGNED16;
3913       __ tbz(a1, 3, ALIGNED16);
3914       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3915       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3916       __ sub(cnt1, cnt1, wordSize);
3917       __ eor(tmp1, tmp1, tmp2);
3918       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3919       __ bind(ALIGNED16);
3920     }
3921     if (UseSIMDForArrayEquals) {
3922       if (SoftwarePrefetchHintDistance >= 0) {
3923         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3924         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3925         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3926             /* prfm = */ true, NOT_EQUAL);
3927         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3928         __ br(__ LT, TAIL);
3929       }
3930       __ bind(NO_PREFETCH_LARGE_LOOP);
3931       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3932           /* prfm = */ false, NOT_EQUAL);
3933     } else {
3934       __ push(spilled_regs, sp);
3935       if (SoftwarePrefetchHintDistance >= 0) {
3936         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3937         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3938         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3939             /* prfm = */ true, NOT_EQUAL);
3940         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3941         __ br(__ LT, TAIL);
3942       }
3943       __ bind(NO_PREFETCH_LARGE_LOOP);
3944       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3945           /* prfm = */ false, NOT_EQUAL);
3946     }
3947     __ bind(TAIL);
3948       __ cbz(cnt1, EQUAL);
3949       __ subs(cnt1, cnt1, wordSize);
3950       __ br(__ LE, POST_LOOP);
3951     __ bind(SMALL_LOOP);
3952       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3953       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3954       __ subs(cnt1, cnt1, wordSize);
3955       __ eor(tmp1, tmp1, tmp2);
3956       __ cbnz(tmp1, NOT_EQUAL);
3957       __ br(__ GT, SMALL_LOOP);
3958     __ bind(POST_LOOP);
3959       __ ldr(tmp1, Address(a1, cnt1));
3960       __ ldr(tmp2, Address(a2, cnt1));
3961       __ eor(tmp1, tmp1, tmp2);
3962       __ cbnz(tmp1, NOT_EQUAL);
3963     __ bind(EQUAL);
3964       __ mov(result, true);
3965     __ bind(NOT_EQUAL);
3966       if (!UseSIMDForArrayEquals) {
3967         __ pop(spilled_regs, sp);
3968       }
3969     __ bind(NOT_EQUAL_NO_POP);
3970     __ leave();
3971     __ ret(lr);
3972     return entry;
3973   }
3974 
3975   address generate_dsin_dcos(bool isCos) {
3976     __ align(CodeEntryAlignment);
3977     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
3978     address start = __ pc();
3979     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
3980         (address)StubRoutines::aarch64::_two_over_pi,
3981         (address)StubRoutines::aarch64::_pio2,
3982         (address)StubRoutines::aarch64::_dsin_coef,
3983         (address)StubRoutines::aarch64::_dcos_coef);
3984     return start;
3985   }
3986 
3987   address generate_dlog() {
3988     __ align(CodeEntryAlignment);
3989     StubCodeMark mark(this, "StubRoutines", "dlog");
3990     address entry = __ pc();
3991     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
3992         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
3993     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
3994     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
3995         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
3996     return entry;
3997   }
3998 
3999   // code for comparing 16 bytes of strings with same encoding
4000   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
4001     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
4002     __ ldr(rscratch1, Address(__ post(str1, 8)));
4003     __ eor(rscratch2, tmp1, tmp2);
4004     __ ldr(cnt1, Address(__ post(str2, 8)));
4005     __ cbnz(rscratch2, DIFF1);
4006     __ ldr(tmp1, Address(__ post(str1, 8)));
4007     __ eor(rscratch2, rscratch1, cnt1);
4008     __ ldr(tmp2, Address(__ post(str2, 8)));
4009     __ cbnz(rscratch2, DIFF2);
4010   }
4011 
4012   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
4013   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4014       Label &DIFF2) {
4015     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
4016     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
4017 
4018     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4019     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4020     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4021     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4022 
4023     __ fmovd(tmpL, vtmp3);
4024     __ eor(rscratch2, tmp3, tmpL);
4025     __ cbnz(rscratch2, DIFF2);
4026 
4027     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4028     __ umov(tmpL, vtmp3, __ D, 1);
4029     __ eor(rscratch2, tmpU, tmpL);
4030     __ cbnz(rscratch2, DIFF1);
4031 
4032     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4033     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4034     __ fmovd(tmpL, vtmp);
4035     __ eor(rscratch2, tmp3, tmpL);
4036     __ cbnz(rscratch2, DIFF2);
4037 
4038     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4039     __ umov(tmpL, vtmp, __ D, 1);
4040     __ eor(rscratch2, tmpU, tmpL);
4041     __ cbnz(rscratch2, DIFF1);
4042   }
4043 
4044   // r0  = result
4045   // r1  = str1
4046   // r2  = cnt1
4047   // r3  = str2
4048   // r4  = cnt2
4049   // r10 = tmp1
4050   // r11 = tmp2
4051   address generate_compare_long_string_different_encoding(bool isLU) {
4052     __ align(CodeEntryAlignment);
4053     StubCodeMark mark(this, "StubRoutines", isLU
4054         ? "compare_long_string_different_encoding LU"
4055         : "compare_long_string_different_encoding UL");
4056     address entry = __ pc();
4057     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4058         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
4059         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4060     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4061         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4062     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4063     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4064 
4065     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
4066 
4067     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4068     // cnt2 == number of characters left to compare
4069     // Check the already loaded first 4 symbols (vtmp and tmp2(LU)/tmp1(UL))
4070     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4071     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4072     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4073     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4074     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4075     __ eor(rscratch2, tmp1, tmp2);
4076     __ mov(rscratch1, tmp2);
4077     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4078     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4079              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4080     __ push(spilled_regs, sp);
4081     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
4082     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
4083 
4084     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4085 
4086     if (SoftwarePrefetchHintDistance >= 0) {
4087       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4088       __ br(__ LT, NO_PREFETCH);
4089       __ bind(LARGE_LOOP_PREFETCH);
4090         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4091         __ mov(tmp4, 2);
4092         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4093         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4094           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4095           __ subs(tmp4, tmp4, 1);
4096           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4097           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4098           __ mov(tmp4, 2);
4099         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4100           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4101           __ subs(tmp4, tmp4, 1);
4102           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4103           __ sub(cnt2, cnt2, 64);
4104           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4105           __ br(__ GE, LARGE_LOOP_PREFETCH);
4106     }
4107     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4108     __ bind(NO_PREFETCH);
4109     __ subs(cnt2, cnt2, 16);
4110     __ br(__ LT, TAIL);
4111     __ align(OptoLoopAlignment);
4112     __ bind(SMALL_LOOP); // smaller loop
4113       __ subs(cnt2, cnt2, 16);
4114       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4115       __ br(__ GE, SMALL_LOOP);
4116       __ cmn(cnt2, (u1)16);
4117       __ br(__ EQ, LOAD_LAST);
4118     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
4119       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
4120       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
4121       __ ldr(tmp3, Address(cnt1, -8));
4122       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
4123       __ b(LOAD_LAST);
4124     __ bind(DIFF2);
4125       __ mov(tmpU, tmp3);
4126     __ bind(DIFF1);
4127       __ pop(spilled_regs, sp);
4128       __ b(CALCULATE_DIFFERENCE);
4129     __ bind(LOAD_LAST);
4130       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
4131       // No need to load them again.
4132       __ mov(tmpU, tmp3);
4133       __ pop(spilled_regs, sp);
4134 
4135       // tmp2 points to the address of the last 4 Latin1 characters right now
4136       __ ldrs(vtmp, Address(tmp2));
4137       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4138       __ fmovd(tmpL, vtmp);
4139 
4140       __ eor(rscratch2, tmpU, tmpL);
4141       __ cbz(rscratch2, DONE);
4142 
4143     // Find the first different characters in the longwords and
4144     // compute their difference.
4145     __ bind(CALCULATE_DIFFERENCE);
4146       __ rev(rscratch2, rscratch2);
4147       __ clz(rscratch2, rscratch2);
4148       __ andr(rscratch2, rscratch2, -16);
4149       __ lsrv(tmp1, tmp1, rscratch2);
4150       __ uxthw(tmp1, tmp1);
4151       __ lsrv(rscratch1, rscratch1, rscratch2);
4152       __ uxthw(rscratch1, rscratch1);
4153       __ subw(result, tmp1, rscratch1);
4154     __ bind(DONE);
4155       __ ret(lr);
4156     return entry;
4157   }
4158 
4159   address generate_method_entry_barrier() {
4160     __ align(CodeEntryAlignment);
4161     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
4162 
4163     Label deoptimize_label;
4164 
4165     address start = __ pc();
4166 
4167     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
4168 
4169     __ enter();
4170     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
4171 
4172     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
4173 
4174     __ push_call_clobbered_registers();
4175 
4176     __ mov(c_rarg0, rscratch2);
4177     __ call_VM_leaf
4178          (CAST_FROM_FN_PTR
4179           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
4180 
4181     __ reset_last_Java_frame(true);
4182 
4183     __ mov(rscratch1, r0);
4184 
4185     __ pop_call_clobbered_registers();
4186 
4187     __ cbnz(rscratch1, deoptimize_label);
4188 
4189     __ leave();
4190     __ ret(lr);
4191 
4192     __ BIND(deoptimize_label);
4193 
4194     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
4195     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
4196 
4197     __ mov(sp, rscratch1);
4198     __ br(rscratch2);
4199 
4200     return start;
4201   }
4202 
4203   // r0  = result
4204   // r1  = str1
4205   // r2  = cnt1
4206   // r3  = str2
4207   // r4  = cnt2
4208   // r10 = tmp1
4209   // r11 = tmp2
4210   address generate_compare_long_string_same_encoding(bool isLL) {
4211     __ align(CodeEntryAlignment);
4212     StubCodeMark mark(this, "StubRoutines", isLL
4213         ? "compare_long_string_same_encoding LL"
4214         : "compare_long_string_same_encoding UU");
4215     address entry = __ pc();
4216     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4217         tmp1 = r10, tmp2 = r11;
4218     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4219         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4220         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4221     // exit the large loop when fewer than 64 bytes are left to read, or we would
4222     // prefetch memory beyond the array boundary
4223     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4224     // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used
4225     // update cnt2 counter with already loaded 8 bytes
4226     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4227     // update pointers, because of previous read
4228     __ add(str1, str1, wordSize);
4229     __ add(str2, str2, wordSize);
4230     if (SoftwarePrefetchHintDistance >= 0) {
4231       __ bind(LARGE_LOOP_PREFETCH);
4232         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4233         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4234         compare_string_16_bytes_same(DIFF, DIFF2);
4235         compare_string_16_bytes_same(DIFF, DIFF2);
4236         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4237         compare_string_16_bytes_same(DIFF, DIFF2);
4238         __ subs(rscratch2, cnt2, largeLoopExitCondition);
4239         compare_string_16_bytes_same(DIFF, DIFF2);
4240         __ br(__ GT, LARGE_LOOP_PREFETCH);
4241         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4242     }
4243     // less than 16 bytes left?
4244     __ subs(cnt2, cnt2, isLL ? 16 : 8);
4245     __ br(__ LT, TAIL);
4246     __ align(OptoLoopAlignment);
4247     __ bind(SMALL_LOOP);
4248       compare_string_16_bytes_same(DIFF, DIFF2);
4249       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4250       __ br(__ GE, SMALL_LOOP);
4251     __ bind(TAIL);
4252       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4253       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4254       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4255       __ br(__ LE, CHECK_LAST);
4256       __ eor(rscratch2, tmp1, tmp2);
4257       __ cbnz(rscratch2, DIFF);
4258       __ ldr(tmp1, Address(__ post(str1, 8)));
4259       __ ldr(tmp2, Address(__ post(str2, 8)));
4260       __ sub(cnt2, cnt2, isLL ? 8 : 4);
4261     __ bind(CHECK_LAST);
4262       if (!isLL) {
4263         __ add(cnt2, cnt2, cnt2); // now in bytes
4264       }
4265       __ eor(rscratch2, tmp1, tmp2);
4266       __ cbnz(rscratch2, DIFF);
4267       __ ldr(rscratch1, Address(str1, cnt2));
4268       __ ldr(cnt1, Address(str2, cnt2));
4269       __ eor(rscratch2, rscratch1, cnt1);
4270       __ cbz(rscratch2, LENGTH_DIFF);
4271       // Find the first different characters in the longwords and
4272       // compute their difference.
4273     __ bind(DIFF2);
4274       __ rev(rscratch2, rscratch2);
4275       __ clz(rscratch2, rscratch2);
4276       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4277       __ lsrv(rscratch1, rscratch1, rscratch2);
4278       if (isLL) {
4279         __ lsrv(cnt1, cnt1, rscratch2);
4280         __ uxtbw(rscratch1, rscratch1);
4281         __ uxtbw(cnt1, cnt1);
4282       } else {
4283         __ lsrv(cnt1, cnt1, rscratch2);
4284         __ uxthw(rscratch1, rscratch1);
4285         __ uxthw(cnt1, cnt1);
4286       }
4287       __ subw(result, rscratch1, cnt1);
4288       __ b(LENGTH_DIFF);
4289     __ bind(DIFF);
4290       __ rev(rscratch2, rscratch2);
4291       __ clz(rscratch2, rscratch2);
4292       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4293       __ lsrv(tmp1, tmp1, rscratch2);
4294       if (isLL) {
4295         __ lsrv(tmp2, tmp2, rscratch2);
4296         __ uxtbw(tmp1, tmp1);
4297         __ uxtbw(tmp2, tmp2);
4298       } else {
4299         __ lsrv(tmp2, tmp2, rscratch2);
4300         __ uxthw(tmp1, tmp1);
4301         __ uxthw(tmp2, tmp2);
4302       }
4303       __ subw(result, tmp1, tmp2);
4304       __ b(LENGTH_DIFF);
4305     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4306       __ eor(rscratch2, tmp1, tmp2);
4307       __ cbnz(rscratch2, DIFF);
4308     __ bind(LENGTH_DIFF);
4309       __ ret(lr);
4310     return entry;
4311   }
4312 
4313   void generate_compare_long_strings() {
4314       StubRoutines::aarch64::_compare_long_string_LL
4315           = generate_compare_long_string_same_encoding(true);
4316       StubRoutines::aarch64::_compare_long_string_UU
4317           = generate_compare_long_string_same_encoding(false);
4318       StubRoutines::aarch64::_compare_long_string_LU
4319           = generate_compare_long_string_different_encoding(true);
4320       StubRoutines::aarch64::_compare_long_string_UL
4321           = generate_compare_long_string_different_encoding(false);
4322   }
4323 
4324   // R0 = result
4325   // R1 = str2
4326   // R2 = cnt1
4327   // R3 = str1
4328   // R4 = cnt2
4329   // This generic linear code uses a few additional ideas that make it faster:
4330   // 1) we can safely keep at least the first register of the pattern (its length
4331   // is >= 8) and so skip the initial load (helps on systems with one load pipeline)
4332   // 2) we can use the "fast" single-character search algorithm to find the first
4333   // symbol with fewer branches (one branch per loaded register instead of one per
4334   // symbol); this is where constants like 0x0101...01, 0x00010001...0001,
4335   // 0x7f7f...7f and 0x7fff7fff...7fff come from (see the commented C sketch below)
4336   // 3) after loading and analyzing the first register of the source string, it can
4337   // be reused to search for every occurrence of the first character, saving a few
4338   // loads compared with a simpler-but-slower implementation
4339   // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
4340   // re-initializes and compresses register values, which makes the code larger
4341   // and a bit less readable; however, most of the extra operations are issued
4342   // during loads or branches, so the penalty is minimal
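       //
       // In C, approximately, the "fast" first-character search of idea 2) is the
       // classic SWAR zero-byte test, shown here for the LL case (the UU/UL cases
       // use 16-bit lanes with the 0x0001...0001 and 0x7fff...7fff constants);
       // load8 is only a stand-in for an unaligned 8-byte load:
       //
       //   uint64_t pattern = first * 0x0101010101010101UL; // first char in every lane
       //   uint64_t x       = load8(str2) ^ pattern;        // matching lanes become 0
       //   uint64_t match   = (x - 0x0101010101010101UL) & ~(x | 0x7f7f7f7f7f7f7f7fUL);
       //
       // The top bit of a byte of 'match' is set where that lane matched 'first';
       // the lowest set bit marks the first candidate (higher lanes may contain
       // false positives, which the comparison loops below filter out).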
4343   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4344     const char* stubName = str1_isL
4345         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4346         : "indexof_linear_uu";
4347     __ align(CodeEntryAlignment);
4348     StubCodeMark mark(this, "StubRoutines", stubName);
4349     address entry = __ pc();
4350 
4351     int str1_chr_size = str1_isL ? 1 : 2;
4352     int str2_chr_size = str2_isL ? 1 : 2;
4353     int str1_chr_shift = str1_isL ? 0 : 1;
4354     int str2_chr_shift = str2_isL ? 0 : 1;
4355     bool isL = str1_isL && str2_isL;
4356     // parameters
4357     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4358     // temporary registers
4359     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4360     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4361     // redefinitions
4362     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4363 
4364     __ push(spilled_regs, sp);
4365     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
4366         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4367         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4368         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4369         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4370         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
4371     // Read a whole register from str1. It is safe because length >= 8 here
4372     __ ldr(ch1, Address(str1));
4373     // Read a whole register from str2. It is safe because length >= 8 here
4374     __ ldr(ch2, Address(str2));
4375     __ sub(cnt2, cnt2, cnt1);
4376     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4377     if (str1_isL != str2_isL) {
4378       __ eor(v0, __ T16B, v0, v0);
4379     }
4380     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4381     __ mul(first, first, tmp1);
4382     // check whether we have less than one register's worth of characters to check
4383     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4384     if (str1_isL != str2_isL) {
4385       __ fmovd(v1, ch1);
4386     }
4387     __ br(__ LE, L_SMALL);
4388     __ eor(ch2, first, ch2);
4389     if (str1_isL != str2_isL) {
4390       __ zip1(v1, __ T16B, v1, v0);
4391     }
4392     __ sub(tmp2, ch2, tmp1);
4393     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4394     __ bics(tmp2, tmp2, ch2);
4395     if (str1_isL != str2_isL) {
4396       __ fmovd(ch1, v1);
4397     }
4398     __ br(__ NE, L_HAS_ZERO);
4399     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4400     __ add(result, result, wordSize/str2_chr_size);
4401     __ add(str2, str2, wordSize);
4402     __ br(__ LT, L_POST_LOOP);
4403     __ BIND(L_LOOP);
4404       __ ldr(ch2, Address(str2));
4405       __ eor(ch2, first, ch2);
4406       __ sub(tmp2, ch2, tmp1);
4407       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4408       __ bics(tmp2, tmp2, ch2);
4409       __ br(__ NE, L_HAS_ZERO);
4410     __ BIND(L_LOOP_PROCEED);
4411       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4412       __ add(str2, str2, wordSize);
4413       __ add(result, result, wordSize/str2_chr_size);
4414       __ br(__ GE, L_LOOP);
4415     __ BIND(L_POST_LOOP);
4416       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
4417       __ br(__ LE, NOMATCH);
4418       __ ldr(ch2, Address(str2));
4419       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4420       __ eor(ch2, first, ch2);
4421       __ sub(tmp2, ch2, tmp1);
4422       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4423       __ mov(tmp4, -1); // all bits set
4424       __ b(L_SMALL_PROCEED);
4425     __ align(OptoLoopAlignment);
4426     __ BIND(L_SMALL);
4427       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4428       __ eor(ch2, first, ch2);
4429       if (str1_isL != str2_isL) {
4430         __ zip1(v1, __ T16B, v1, v0);
4431       }
4432       __ sub(tmp2, ch2, tmp1);
4433       __ mov(tmp4, -1); // all bits set
4434       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4435       if (str1_isL != str2_isL) {
4436         __ fmovd(ch1, v1); // move converted 4 symbols
4437       }
4438     __ BIND(L_SMALL_PROCEED);
4439       __ lsrv(tmp4, tmp4, cnt2); // mask: zeroes in the useless bits
4440       __ bic(tmp2, tmp2, ch2);
4441       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
4442       __ rbit(tmp2, tmp2);
4443       __ br(__ EQ, NOMATCH);
4444     __ BIND(L_SMALL_HAS_ZERO_LOOP);
4445       __ clz(tmp4, tmp2); // potentially slow: up to 4 cycles on some CPUs
4446       __ cmp(cnt1, u1(wordSize/str2_chr_size));
4447       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4448       if (str2_isL) { // LL
4449         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4450         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4451         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4452         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4453         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4454       } else {
4455         __ mov(ch2, 0xE); // all bits in byte set except last one
4456         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4457         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4458         __ lslv(tmp2, tmp2, tmp4);
4459         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4460         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4461         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4462         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4463       }
4464       __ cmp(ch1, ch2);
4465       __ mov(tmp4, wordSize/str2_chr_size);
4466       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4467     __ BIND(L_SMALL_CMP_LOOP);
4468       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4469                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4470       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4471                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4472       __ add(tmp4, tmp4, 1);
4473       __ cmp(tmp4, cnt1);
4474       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4475       __ cmp(first, ch2);
4476       __ br(__ EQ, L_SMALL_CMP_LOOP);
4477     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4478       __ cbz(tmp2, NOMATCH); // no more matches. exit
4479       __ clz(tmp4, tmp2);
4480       __ add(result, result, 1); // advance index
4481       __ add(str2, str2, str2_chr_size); // advance pointer
4482       __ b(L_SMALL_HAS_ZERO_LOOP);
4483     __ align(OptoLoopAlignment);
4484     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4485       __ cmp(first, ch2);
4486       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4487       __ b(DONE);
4488     __ align(OptoLoopAlignment);
4489     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4490       if (str2_isL) { // LL
4491         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4492         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4493         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4494         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4495         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4496       } else {
4497         __ mov(ch2, 0xE); // all bits in byte set except last one
4498         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4499         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4500         __ lslv(tmp2, tmp2, tmp4);
4501         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4502         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4503         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4504         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4505       }
4506       __ cmp(ch1, ch2);
4507       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4508       __ b(DONE);
4509     __ align(OptoLoopAlignment);
4510     __ BIND(L_HAS_ZERO);
4511       __ rbit(tmp2, tmp2);
4512       __ clz(tmp4, tmp2); // potentially slow: up to 4 cycles on some CPUs
4513       // Now compress the counters (cnt2 and cnt1) into one register: this is fine
4514       // because both counters are 32-bit and are not changed in this loop; just
4515       // restore them on exit. So, cnt1 can be re-used in this loop.
4516       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
4517       __ sub(result, result, 1);
4518     __ BIND(L_HAS_ZERO_LOOP);
4519       __ mov(cnt1, wordSize/str2_chr_size);
4520       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4521       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4522       if (str2_isL) {
4523         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4524         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4525         __ lslv(tmp2, tmp2, tmp4);
4526         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4527         __ add(tmp4, tmp4, 1);
4528         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4529         __ lsl(tmp2, tmp2, 1);
4530         __ mov(tmp4, wordSize/str2_chr_size);
4531       } else {
4532         __ mov(ch2, 0xE);
4533         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4534         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4535         __ lslv(tmp2, tmp2, tmp4);
4536         __ add(tmp4, tmp4, 1);
4537         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4538         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4539         __ lsl(tmp2, tmp2, 1);
4540         __ mov(tmp4, wordSize/str2_chr_size);
4541         __ sub(str2, str2, str2_chr_size);
4542       }
4543       __ cmp(ch1, ch2);
4544       __ mov(tmp4, wordSize/str2_chr_size);
4545       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4546     __ BIND(L_CMP_LOOP);
4547       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4548                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4549       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4550                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4551       __ add(tmp4, tmp4, 1);
4552       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4553       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4554       __ cmp(cnt1, ch2);
4555       __ br(__ EQ, L_CMP_LOOP);
4556     __ BIND(L_CMP_LOOP_NOMATCH);
4557       // no match at this position
4558       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4559       __ clz(tmp4, tmp2);
4560       __ add(str2, str2, str2_chr_size); // advance pointer
4561       __ b(L_HAS_ZERO_LOOP);
4562     __ align(OptoLoopAlignment);
4563     __ BIND(L_CMP_LOOP_LAST_CMP);
4564       __ cmp(cnt1, ch2);
4565       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4566       __ b(DONE);
4567     __ align(OptoLoopAlignment);
4568     __ BIND(L_CMP_LOOP_LAST_CMP2);
4569       if (str2_isL) {
4570         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4571         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4572         __ lslv(tmp2, tmp2, tmp4);
4573         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4574         __ add(tmp4, tmp4, 1);
4575         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4576         __ lsl(tmp2, tmp2, 1);
4577       } else {
4578         __ mov(ch2, 0xE);
4579         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4580         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4581         __ lslv(tmp2, tmp2, tmp4);
4582         __ add(tmp4, tmp4, 1);
4583         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4584         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4585         __ lsl(tmp2, tmp2, 1);
4586         __ sub(str2, str2, str2_chr_size);
4587       }
4588       __ cmp(ch1, ch2);
4589       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4590       __ b(DONE);
4591     __ align(OptoLoopAlignment);
4592     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
4593       // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
4594       // until the L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP,
4595       // so result was increased by at most wordSize/str2_chr_size - 1 and the
4596       // higher bits were not changed. L_LOOP_PROCEED will increase result by the
4597       // number of analyzed characters, so we can just reset the lower bits of
4598       // result here: clear the 2 lower bits for UU/UL and 3 bits for LL.
4599       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
4600       // 3) Advance str2 to the next str2 octet. result & 7 (LL) or & 3 (UU/UL) is
4601       // the index of the last analyzed substring inside the current octet, so str2
4602       // is at the respective start address; we need to advance it to the next octet.
4603       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4604       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4605       __ bfm(result, zr, 0, 2 - str2_chr_shift);
4606       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4607       __ movw(cnt2, cnt2);
4608       __ b(L_LOOP_PROCEED);
4609     __ align(OptoLoopAlignment);
4610     __ BIND(NOMATCH);
4611       __ mov(result, -1);
4612     __ BIND(DONE);
4613       __ pop(spilled_regs, sp);
4614       __ ret(lr);
4615     return entry;
4616   }
4617 
4618   void generate_string_indexof_stubs() {
4619     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
4620     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
4621     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
4622   }
4623 
4624   void inflate_and_store_2_fp_registers(bool generatePrfm,
4625       FloatRegister src1, FloatRegister src2) {
4626     Register dst = r1;
4627     __ zip1(v1, __ T16B, src1, v0);
4628     __ zip2(v2, __ T16B, src1, v0);
4629     if (generatePrfm) {
4630       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4631     }
4632     __ zip1(v3, __ T16B, src2, v0);
4633     __ zip2(v4, __ T16B, src2, v0);
4634     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4635   }
4636 
4637   // R0 = src
4638   // R1 = dst
4639   // R2 = len
4640   // R3 = len >> 3
4641   // v0 = 0
4642   // v1 = loaded 8 bytes
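       //
       // In C, approximately (dst being the jchar destination, src the jbyte source):
       //
       //   for (size_t i = 0; i < len; i++) {
       //     dst[i] = (jchar)(src[i] & 0xff);   // zero-extend each Latin-1 byte
       //   }
       //
       // The zip1/zip2 instructions below implement this by interleaving the loaded
       // bytes with the zero register v0, turning each byte into a 16-bit char.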
4643   address generate_large_byte_array_inflate() {
4644     __ align(CodeEntryAlignment);
4645     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4646     address entry = __ pc();
4647     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4648     Register src = r0, dst = r1, len = r2, octetCounter = r3;
4649     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
4650 
4651     // do one more 8-byte read so that the address is 16-byte aligned in most
4652     // cases; this also lets us use a single store instruction
4653     __ ldrd(v2, __ post(src, 8));
4654     __ sub(octetCounter, octetCounter, 2);
4655     __ zip1(v1, __ T16B, v1, v0);
4656     __ zip1(v2, __ T16B, v2, v0);
4657     __ st1(v1, v2, __ T16B, __ post(dst, 32));
4658     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4659     __ subs(rscratch1, octetCounter, large_loop_threshold);
4660     __ br(__ LE, LOOP_START);
4661     __ b(LOOP_PRFM_START);
4662     __ bind(LOOP_PRFM);
4663       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4664     __ bind(LOOP_PRFM_START);
4665       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4666       __ sub(octetCounter, octetCounter, 8);
4667       __ subs(rscratch1, octetCounter, large_loop_threshold);
4668       inflate_and_store_2_fp_registers(true, v3, v4);
4669       inflate_and_store_2_fp_registers(true, v5, v6);
4670       __ br(__ GT, LOOP_PRFM);
4671       __ cmp(octetCounter, (u1)8);
4672       __ br(__ LT, DONE);
4673     __ bind(LOOP);
4674       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4675       __ bind(LOOP_START);
4676       __ sub(octetCounter, octetCounter, 8);
4677       __ cmp(octetCounter, (u1)8);
4678       inflate_and_store_2_fp_registers(false, v3, v4);
4679       inflate_and_store_2_fp_registers(false, v5, v6);
4680       __ br(__ GE, LOOP);
4681     __ bind(DONE);
4682       __ ret(lr);
4683     return entry;
4684   }
4685 
4686   /**
4687    *  Arguments:
4688    *
4689    *  Input:
4690    *  c_rarg0   - current state address
4691    *  c_rarg1   - H key address
4692    *  c_rarg2   - data address
4693    *  c_rarg3   - number of blocks
4694    *
4695    *  Output:
4696    *  Updated state at c_rarg0
4697    */
4698   address generate_ghash_processBlocks() {
4699     // Bafflingly, GCM uses little-endian for the byte order, but
4700     // big-endian for the bit order.  For example, the polynomial 1 is
4701     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4702     //
4703     // So, we must either reverse the bytes in each word and do
4704     // everything big-endian or reverse the bits in each byte and do
4705     // it little-endian.  On AArch64 it's more idiomatic to reverse
4706     // the bits in each byte (we have an instruction, RBIT, to do
4707     // that) and keep the data in little-endian bit order throughout the
4708     // calculation, bit-reversing the inputs and outputs.
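         //
         // In GF(2^128) terms the per-block update generated below is, approximately
         // (gf128_mul is only a stand-in for the ghash_multiply/ghash_reduce pair
         // used in the loop):
         //
         //   for (int i = 0; i < blocks; i++) {
         //     state = gf128_mul(state ^ block[i], H);  // carry-less multiply, then
         //   }                                          // reduce mod x^128+x^7+x^2+x+1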
4709 
4710     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4711     __ align(wordSize * 2);
4712     address p = __ pc();
4713     __ emit_int64(0x87);  // The low-order bits of the field
4714                           // polynomial (i.e. p = z^7+z^2+z+1)
4715                           // repeated in the low and high parts of a
4716                           // 128-bit vector
4717     __ emit_int64(0x87);
4718 
4719     __ align(CodeEntryAlignment);
4720     address start = __ pc();
4721 
4722     Register state   = c_rarg0;
4723     Register subkeyH = c_rarg1;
4724     Register data    = c_rarg2;
4725     Register blocks  = c_rarg3;
4726 
4727     FloatRegister vzr = v30;
4728     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4729 
4730     __ ldrq(v0, Address(state));
4731     __ ldrq(v1, Address(subkeyH));
4732 
4733     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4734     __ rbit(v0, __ T16B, v0);
4735     __ rev64(v1, __ T16B, v1);
4736     __ rbit(v1, __ T16B, v1);
4737 
4738     __ ldrq(v26, p);
4739 
4740     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4741     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
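         // ghash_multiply below uses one level of Karatsuba (hence the (A1+A0) term
         // above): with A = A1:A0 and B = B1:B0, the 128x128 -> 256-bit carry-less
         // product is
         //   (A1*B1 << 128) ^ (((A1^A0)*(B1^B0) ^ A1*B1 ^ A0*B0) << 64) ^ (A0*B0),
         // which needs three 64x64-bit multiplies instead of four (addition and
         // subtraction are both XOR in GF(2)).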
4742 
4743     {
4744       Label L_ghash_loop;
4745       __ bind(L_ghash_loop);
4746 
4747       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4748                                                  // reversing each byte
4749       __ rbit(v2, __ T16B, v2);
4750       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4751 
4752       // Multiply state in v2 by subkey in v1
4753       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4754                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4755                      /*temps*/v6, v20, v18, v21);
4756       // Reduce v7:v5 by the field polynomial
4757       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4758 
4759       __ sub(blocks, blocks, 1);
4760       __ cbnz(blocks, L_ghash_loop);
4761     }
4762 
4763     // The bit-reversed result is at this point in v0
4764     __ rev64(v1, __ T16B, v0);
4765     __ rbit(v1, __ T16B, v1);
4766 
4767     __ st1(v1, __ T16B, state);
4768     __ ret(lr);
4769 
4770     return start;
4771   }
4772 
4773   // Continuation point for throwing of implicit exceptions that are
4774   // not handled in the current activation. Fabricates an exception
4775   // oop and initiates normal exception dispatching in this
4776   // frame. Since we need to preserve callee-saved values (currently
4777   // only for C2, but done for C1 as well) we need a callee-saved oop
4778   // map and therefore have to make these stubs into RuntimeStubs
4779   // rather than BufferBlobs.  If the compiler needs all registers to
4780   // be preserved between the fault point and the exception handler
4781   // then it must assume responsibility for that in
4782   // AbstractCompiler::continuation_for_implicit_null_exception or
4783   // continuation_for_implicit_division_by_zero_exception. All other
4784   // implicit exceptions (e.g., NullPointerException or
4785   // AbstractMethodError on entry) are either at call sites or
4786   // otherwise assume that stack unwinding will be initiated, so
4787   // caller-saved registers were assumed volatile in the compiler.
4788 
4789 #undef __
4790 #define __ masm->
4791 
4792   address generate_throw_exception(const char* name,
4793                                    address runtime_entry,
4794                                    Register arg1 = noreg,
4795                                    Register arg2 = noreg) {
4796     // Information about frame layout at time of blocking runtime call.
4797     // Note that we only have to preserve callee-saved registers since
4798     // the compilers are responsible for supplying a continuation point
4799     // if they expect all registers to be preserved.
4800     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4801     enum layout {
4802       rfp_off = 0,
4803       rfp_off2,
4804       return_off,
4805       return_off2,
4806       framesize // inclusive of return address
4807     };
4808 
4809     int insts_size = 512;
4810     int locs_size  = 64;
4811 
4812     CodeBuffer code(name, insts_size, locs_size);
4813     OopMapSet* oop_maps  = new OopMapSet();
4814     MacroAssembler* masm = new MacroAssembler(&code);
4815 
4816     address start = __ pc();
4817 
4818     // This is an inlined and slightly modified version of call_VM
4819     // which has the ability to fetch the return PC out of
4820     // thread-local storage and also sets up last_Java_sp slightly
4821     // differently than the real call_VM
4822 
4823     __ enter(); // Save FP and LR before call
4824 
4825     assert(is_even(framesize/2), "sp not 16-byte aligned");
4826 
4827     // lr and fp are already in place
4828     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4829 
4830     int frame_complete = __ pc() - start;
4831 
4832     // Set up last_Java_sp and last_Java_fp
4833     address the_pc = __ pc();
4834     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
4835 
4836     // Call runtime
4837     if (arg1 != noreg) {
4838       assert(arg2 != c_rarg1, "clobbered");
4839       __ mov(c_rarg1, arg1);
4840     }
4841     if (arg2 != noreg) {
4842       __ mov(c_rarg2, arg2);
4843     }
4844     __ mov(c_rarg0, rthread);
4845     BLOCK_COMMENT("call runtime_entry");
4846     __ mov(rscratch1, runtime_entry);
4847     __ blr(rscratch1);
4848 
4849     // Generate oop map
4850     OopMap* map = new OopMap(framesize, 0);
4851 
4852     oop_maps->add_gc_map(the_pc - start, map);
4853 
4854     __ reset_last_Java_frame(true);
4855     __ maybe_isb();
4856 
4857     __ leave();
4858 
4859     // check for pending exceptions
4860 #ifdef ASSERT
4861     Label L;
4862     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4863     __ cbnz(rscratch1, L);
4864     __ should_not_reach_here();
4865     __ bind(L);
4866 #endif // ASSERT
4867     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4868 
4869 
4870     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4871     RuntimeStub* stub =
4872       RuntimeStub::new_runtime_stub(name,
4873                                     &code,
4874                                     frame_complete,
4875                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4876                                     oop_maps, false);
4877     return stub->entry_point();
4878   }
4879 
4880   class MontgomeryMultiplyGenerator : public MacroAssembler {
4881 
4882     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4883       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4884 
4885     RegSet _toSave;
4886     bool _squaring;
4887 
4888   public:
4889     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4890       : MacroAssembler(as->code()), _squaring(squaring) {
4891 
4892       // Register allocation
4893 
4894       Register reg = c_rarg0;
4895       Pa_base = reg;       // Argument registers
4896       if (squaring)
4897         Pb_base = Pa_base;
4898       else
4899         Pb_base = next_reg(reg);
4900       Pn_base = next_reg(reg);
4901       Rlen= next_reg(reg);
4902       inv = next_reg(reg);
4903       Pm_base = next_reg(reg);
4904 
4905                           // Working registers:
4906       Ra =  next_reg(reg); // The current digit of a, b, n, and m.
4907       Rb =  next_reg(reg);
4908       Rm =  next_reg(reg);
4909       Rn =  next_reg(reg);
4910 
4911       Pa =  next_reg(reg); // Pointers to the current/next digit of a, b, n, and m.
4912       Pb =  next_reg(reg);
4913       Pm =  next_reg(reg);
4914       Pn =  next_reg(reg);
4915 
4916       t0 =  next_reg(reg); // Three registers which form a
4917       t1 =  next_reg(reg); // triple-precision accumulator.
4918       t2 =  next_reg(reg);
4919 
4920       Ri =  next_reg(reg); // Inner and outer loop indexes.
4921       Rj =  next_reg(reg);
4922 
4923       Rhi_ab = next_reg(reg); // Product registers: low and high parts
4924       Rlo_ab = next_reg(reg); // of a*b and m*n.
4925       Rhi_mn = next_reg(reg);
4926       Rlo_mn = next_reg(reg);
4927 
4928       // r19 and up are callee-saved.
4929       _toSave = RegSet::range(r19, reg) + Pm_base;
4930     }
4931 
4932   private:
4933     Register next_reg(Register &reg) {
4934 #ifdef _WIN64
4935       // skip r18 on Windows, it's used by native TLS
4936       return ++reg == r18 ? ++reg : reg;
4937 #else
4938       return ++reg;
4939 #endif
4940     }
4941 
4942     void save_regs() {
4943       push(_toSave, sp);
4944     }
4945 
4946     void restore_regs() {
4947       pop(_toSave, sp);
4948     }
4949 
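         // unroll_2 emits 'block' twice inside a runtime loop so that the block's code
         // executes 'count' times in total; an odd count enters the loop body at 'odd',
         // and the loop then decrements by two.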
4950     template <typename T>
4951     void unroll_2(Register count, T block) {
4952       Label loop, end, odd;
4953       tbnz(count, 0, odd);
4954       cbz(count, end);
4955       align(16);
4956       bind(loop);
4957       (this->*block)();
4958       bind(odd);
4959       (this->*block)();
4960       subs(count, count, 2);
4961       br(Assembler::GT, loop);
4962       bind(end);
4963     }
4964 
4965     template <typename T>
4966     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4967       Label loop, end, odd;
4968       tbnz(count, 0, odd);
4969       cbz(count, end);
4970       align(16);
4971       bind(loop);
4972       (this->*block)(d, s, tmp);
4973       bind(odd);
4974       (this->*block)(d, s, tmp);
4975       subs(count, count, 2);
4976       br(Assembler::GT, loop);
4977       bind(end);
4978     }
4979 
4980     void pre1(RegisterOrConstant i) {
4981       block_comment("pre1");
4982       // Pa = Pa_base;
4983       // Pb = Pb_base + i;
4984       // Pm = Pm_base;
4985       // Pn = Pn_base + i;
4986       // Ra = *Pa;
4987       // Rb = *Pb;
4988       // Rm = *Pm;
4989       // Rn = *Pn;
4990       ldr(Ra, Address(Pa_base));
4991       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4992       ldr(Rm, Address(Pm_base));
4993       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4994       lea(Pa, Address(Pa_base));
4995       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4996       lea(Pm, Address(Pm_base));
4997       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4998 
4999       // Zero the m*n result.
5000       mov(Rhi_mn, zr);
5001       mov(Rlo_mn, zr);
5002     }
5003 
5004     // The core multiply-accumulate step of a Montgomery
5005     // multiplication.  The idea is to schedule operations as a
5006     // pipeline so that instructions with long latencies (loads and
5007     // multiplies) have time to complete before their results are
5008     // used.  This benefits in-order implementations of the architecture
5009     // most, but out-of-order ones also benefit.
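         //
         // In C, the MACC(A, B, T0, T1, T2) used in the commented pseudocode below is,
         // approximately, a 64x64 -> 128-bit multiply accumulated into the
         // triple-precision value T2:T1:T0:
         //
         //   unsigned __int128 p  = (unsigned __int128)(A) * (B);
         //   unsigned __int128 lo = (unsigned __int128)T0 + (uint64_t)p;
         //   T0 = (uint64_t)lo;
         //   unsigned __int128 hi = (unsigned __int128)T1 + (uint64_t)(p >> 64)
         //                          + (uint64_t)(lo >> 64);
         //   T1 = (uint64_t)hi;
         //   T2 += (uint64_t)(hi >> 64);
         //
         // MACC2, used in the squaring code, accumulates the product twice.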
5010     void step() {
5011       block_comment("step");
5012       // MACC(Ra, Rb, t0, t1, t2);
5013       // Ra = *++Pa;
5014       // Rb = *--Pb;
5015       umulh(Rhi_ab, Ra, Rb);
5016       mul(Rlo_ab, Ra, Rb);
5017       ldr(Ra, pre(Pa, wordSize));
5018       ldr(Rb, pre(Pb, -wordSize));
5019       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
5020                                        // previous iteration.
5021       // MACC(Rm, Rn, t0, t1, t2);
5022       // Rm = *++Pm;
5023       // Rn = *--Pn;
5024       umulh(Rhi_mn, Rm, Rn);
5025       mul(Rlo_mn, Rm, Rn);
5026       ldr(Rm, pre(Pm, wordSize));
5027       ldr(Rn, pre(Pn, -wordSize));
5028       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5029     }
5030 
5031     void post1() {
5032       block_comment("post1");
5033 
5034       // MACC(Ra, Rb, t0, t1, t2);
5035       // Ra = *++Pa;
5036       // Rb = *--Pb;
5037       umulh(Rhi_ab, Ra, Rb);
5038       mul(Rlo_ab, Ra, Rb);
5039       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5040       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5041 
5042       // *Pm = Rm = t0 * inv;
5043       mul(Rm, t0, inv);
5044       str(Rm, Address(Pm));
5045 
5046       // MACC(Rm, Rn, t0, t1, t2);
5047       // t0 = t1; t1 = t2; t2 = 0;
5048       umulh(Rhi_mn, Rm, Rn);
5049 
5050 #ifndef PRODUCT
5051       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5052       {
5053         mul(Rlo_mn, Rm, Rn);
5054         add(Rlo_mn, t0, Rlo_mn);
5055         Label ok;
5056         cbz(Rlo_mn, ok); {
5057           stop("broken Montgomery multiply");
5058         } bind(ok);
5059       }
5060 #endif
5061       // We have very carefully set things up so that
5062       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5063       // the lower half of Rm * Rn because we know the result already:
5064       // it must be -t0.  t0 + (-t0) must generate a carry iff
5065       // t0 != 0.  So, rather than do a mul and an adds we just set
5066       // the carry flag iff t0 is nonzero.
5067       //
5068       // mul(Rlo_mn, Rm, Rn);
5069       // adds(zr, t0, Rlo_mn);
5070       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5071       adcs(t0, t1, Rhi_mn);
5072       adc(t1, t2, zr);
5073       mov(t2, zr);
5074     }
5075 
5076     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
5077       block_comment("pre2");
5078       // Pa = Pa_base + i-len;
5079       // Pb = Pb_base + len;
5080       // Pm = Pm_base + i-len;
5081       // Pn = Pn_base + len;
5082 
5083       if (i.is_register()) {
5084         sub(Rj, i.as_register(), len);
5085       } else {
5086         mov(Rj, i.as_constant());
5087         sub(Rj, Rj, len);
5088       }
5089       // Rj == i-len
5090 
5091       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5092       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5093       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5094       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5095 
5096       // Ra = *++Pa;
5097       // Rb = *--Pb;
5098       // Rm = *++Pm;
5099       // Rn = *--Pn;
5100       ldr(Ra, pre(Pa, wordSize));
5101       ldr(Rb, pre(Pb, -wordSize));
5102       ldr(Rm, pre(Pm, wordSize));
5103       ldr(Rn, pre(Pn, -wordSize));
5104 
5105       mov(Rhi_mn, zr);
5106       mov(Rlo_mn, zr);
5107     }
5108 
5109     void post2(RegisterOrConstant i, RegisterOrConstant len) {
5110       block_comment("post2");
5111       if (i.is_constant()) {
5112         mov(Rj, i.as_constant()-len.as_constant());
5113       } else {
5114         sub(Rj, i.as_register(), len);
5115       }
5116 
5117       adds(t0, t0, Rlo_mn); // The pending m*n, low part
5118 
5119       // As soon as we know the least significant digit of our result,
5120       // store it.
5121       // Pm_base[i-len] = t0;
5122       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5123 
5124       // t0 = t1; t1 = t2; t2 = 0;
5125       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5126       adc(t1, t2, zr);
5127       mov(t2, zr);
5128     }
5129 
5130     // A carry in t0 after Montgomery multiplication means that we
5131     // should subtract multiples of n from our result in m.  We'll
5132     // keep doing that until there is no carry.
5133     void normalize(RegisterOrConstant len) {
5134       block_comment("normalize");
5135       // while (t0)
5136       //   t0 = sub(Pm_base, Pn_base, t0, len);
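           // where sub() is, approximately (unsigned __int128 is used only to make
           // the borrow visible):
           //
           //   uint64_t sub(uint64_t Pm_base[], uint64_t Pn_base[], uint64_t t0, int len) {
           //     uint64_t borrow = 0;
           //     for (int i = 0; i < len; i++) {      // Pm -= Pn, with borrow
           //       unsigned __int128 d = (unsigned __int128)Pm_base[i] - Pn_base[i] - borrow;
           //       Pm_base[i] = (uint64_t)d;
           //       borrow = (uint64_t)(d >> 64) & 1;  // 1 iff the subtraction borrowed
           //     }
           //     return t0 - borrow;
           //   }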
5137       Label loop, post, again;
5138       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5139       cbz(t0, post); {
5140         bind(again); {
5141           mov(i, zr);
5142           mov(cnt, len);
5143           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5144           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5145           subs(zr, zr, zr); // set carry flag, i.e. no borrow
5146           align(16);
5147           bind(loop); {
5148             sbcs(Rm, Rm, Rn);
5149             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5150             add(i, i, 1);
5151             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5152             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5153             sub(cnt, cnt, 1);
5154           } cbnz(cnt, loop);
5155           sbc(t0, t0, zr);
5156         } cbnz(t0, again);
5157       } bind(post);
5158     }
5159 
5160     // Move memory at s to d, reversing words.
5161     //    Increments d to end of copied memory
5162     //    Destroys tmp1, tmp2
5163     //    Preserves len
5164     //    Leaves s pointing to the address which was in d at start
5165     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
5166       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5167 
5168       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
5169       mov(tmp1, len);
5170       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
5171       sub(s, d, len, ext::uxtw, LogBytesPerWord);
5172     }
5173     // where
5174     void reverse1(Register d, Register s, Register tmp) {
5175       ldr(tmp, pre(s, -wordSize));
5176       ror(tmp, tmp, 32);
5177       str(tmp, post(d, wordSize));
5178     }
5179 
5180     void step_squaring() {
5181       // An extra ACC
5182       step();
5183       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5184     }
5185 
5186     void last_squaring(RegisterOrConstant i) {
5187       Label dont;
5188       // if ((i & 1) == 0) {
5189       tbnz(i.as_register(), 0, dont); {
5190         // MACC(Ra, Rb, t0, t1, t2);
5191         // Ra = *++Pa;
5192         // Rb = *--Pb;
5193         umulh(Rhi_ab, Ra, Rb);
5194         mul(Rlo_ab, Ra, Rb);
5195         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5196       } bind(dont);
5197     }
5198 
5199     void extra_step_squaring() {
5200       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5201 
5202       // MACC(Rm, Rn, t0, t1, t2);
5203       // Rm = *++Pm;
5204       // Rn = *--Pn;
5205       umulh(Rhi_mn, Rm, Rn);
5206       mul(Rlo_mn, Rm, Rn);
5207       ldr(Rm, pre(Pm, wordSize));
5208       ldr(Rn, pre(Pn, -wordSize));
5209     }
5210 
5211     void post1_squaring() {
5212       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5213 
5214       // *Pm = Rm = t0 * inv;
5215       mul(Rm, t0, inv);
5216       str(Rm, Address(Pm));
5217 
5218       // MACC(Rm, Rn, t0, t1, t2);
5219       // t0 = t1; t1 = t2; t2 = 0;
5220       umulh(Rhi_mn, Rm, Rn);
5221 
5222 #ifndef PRODUCT
5223       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5224       {
5225         mul(Rlo_mn, Rm, Rn);
5226         add(Rlo_mn, t0, Rlo_mn);
5227         Label ok;
5228         cbz(Rlo_mn, ok); {
5229           stop("broken Montgomery multiply");
5230         } bind(ok);
5231       }
5232 #endif
5233       // We have very carefully set things up so that
5234       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5235       // the lower half of Rm * Rn because we know the result already:
5236       // it must be -t0.  t0 + (-t0) must generate a carry iff
5237       // t0 != 0.  So, rather than do a mul and an adds we just set
5238       // the carry flag iff t0 is nonzero.
5239       //
5240       // mul(Rlo_mn, Rm, Rn);
5241       // adds(zr, t0, Rlo_mn);
5242       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5243       adcs(t0, t1, Rhi_mn);
5244       adc(t1, t2, zr);
5245       mov(t2, zr);
5246     }
5247 
5248     void acc(Register Rhi, Register Rlo,
5249              Register t0, Register t1, Register t2) {
5250       adds(t0, t0, Rlo);
5251       adcs(t1, t1, Rhi);
5252       adc(t2, t2, zr);
5253     }
5254 
5255   public:
5256     /**
5257      * Fast Montgomery multiplication.  The derivation of the
5258      * algorithm is in A Cryptographic Library for the Motorola
5259      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5260      *
5261      * Arguments:
5262      *
5263      * Inputs for multiplication:
5264      *   c_rarg0   - int array elements a
5265      *   c_rarg1   - int array elements b
5266      *   c_rarg2   - int array elements n (the modulus)
5267      *   c_rarg3   - int length
5268      *   c_rarg4   - int inv
5269      *   c_rarg5   - int array elements m (the result)
5270      *
5271      * Inputs for squaring:
5272      *   c_rarg0   - int array elements a
5273      *   c_rarg1   - int array elements n (the modulus)
5274      *   c_rarg2   - int length
5275      *   c_rarg3   - int inv
5276      *   c_rarg4   - int array elements m (the result)
5277      *
5278      */
5279     address generate_multiply() {
5280       Label argh, nothing;
5281       bind(argh);
5282       stop("MontgomeryMultiply total_allocation must be <= 8192");
5283 
5284       align(CodeEntryAlignment);
5285       address entry = pc();
5286 
5287       cbzw(Rlen, nothing);
5288 
5289       enter();
5290 
5291       // Make room.
5292       cmpw(Rlen, 512);
5293       br(Assembler::HI, argh);
5294       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5295       andr(sp, Ra, -2 * wordSize);
5296 
5297       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5298 
5299       {
5300         // Copy input args, reversing as we go.  We use Ra as a
5301         // temporary variable.
5302         reverse(Ra, Pa_base, Rlen, t0, t1);
5303         if (!_squaring)
5304           reverse(Ra, Pb_base, Rlen, t0, t1);
5305         reverse(Ra, Pn_base, Rlen, t0, t1);
5306       }
5307 
5308       // Push all call-saved registers and also Pm_base which we'll need
5309       // at the end.
5310       save_regs();
5311 
5312 #ifndef PRODUCT
5313       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
5314       {
5315         ldr(Rn, Address(Pn_base, 0));
5316         mul(Rlo_mn, Rn, inv);
5317         subs(zr, Rlo_mn, -1);
5318         Label ok;
5319         br(EQ, ok); {
5320           stop("broken inverse in Montgomery multiply");
5321         } bind(ok);
5322       }
5323 #endif
5324 
5325       mov(Pm_base, Ra);
5326 
5327       mov(t0, zr);
5328       mov(t1, zr);
5329       mov(t2, zr);
5330 
5331       block_comment("for (int i = 0; i < len; i++) {");
5332       mov(Ri, zr); {
5333         Label loop, end;
5334         cmpw(Ri, Rlen);
5335         br(Assembler::GE, end);
5336 
5337         bind(loop);
5338         pre1(Ri);
5339 
5340         block_comment("  for (j = i; j; j--) {"); {
5341           movw(Rj, Ri);
5342           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5343         } block_comment("  } // j");
5344 
5345         post1();
5346         addw(Ri, Ri, 1);
5347         cmpw(Ri, Rlen);
5348         br(Assembler::LT, loop);
5349         bind(end);
5350         block_comment("} // i");
5351       }
5352 
5353       block_comment("for (int i = len; i < 2*len; i++) {");
5354       mov(Ri, Rlen); {
5355         Label loop, end;
5356         cmpw(Ri, Rlen, Assembler::LSL, 1);
5357         br(Assembler::GE, end);
5358 
5359         bind(loop);
5360         pre2(Ri, Rlen);
5361 
5362         block_comment("  for (j = len*2-i-1; j; j--) {"); {
5363           lslw(Rj, Rlen, 1);
5364           subw(Rj, Rj, Ri);
5365           subw(Rj, Rj, 1);
5366           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5367         } block_comment("  } // j");
5368 
5369         post2(Ri, Rlen);
5370         addw(Ri, Ri, 1);
5371         cmpw(Ri, Rlen, Assembler::LSL, 1);
5372         br(Assembler::LT, loop);
5373         bind(end);
5374       }
5375       block_comment("} // i");
5376 
5377       normalize(Rlen);
5378 
5379       mov(Ra, Pm_base);  // Save Pm_base in Ra
5380       restore_regs();  // Restore caller's Pm_base
5381 
5382       // Copy our result into caller's Pm_base
5383       reverse(Pm_base, Ra, Rlen, t0, t1);
5384 
5385       leave();
5386       bind(nothing);
5387       ret(lr);
5388 
5389       return entry;
5390     }
5391     // In C, approximately:
5392 
5393     // void
5394     // montgomery_multiply(uint64_t Pa_base[], uint64_t Pb_base[],
5395     //                     uint64_t Pn_base[], uint64_t Pm_base[],
5396     //                     uint64_t inv, int len) {
5397     //   uint64_t t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5398     //   uint64_t *Pa, *Pb, *Pn, *Pm;
5399     //   uint64_t Ra, Rb, Rn, Rm;
5400 
5401     //   int i;
5402 
5403     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5404 
5405     //   for (i = 0; i < len; i++) {
5406     //     int j;
5407 
5408     //     Pa = Pa_base;
5409     //     Pb = Pb_base + i;
5410     //     Pm = Pm_base;
5411     //     Pn = Pn_base + i;
5412 
5413     //     Ra = *Pa;
5414     //     Rb = *Pb;
5415     //     Rm = *Pm;
5416     //     Rn = *Pn;
5417 
5418     //     int iters = i;
5419     //     for (j = 0; iters--; j++) {
5420     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5421     //       MACC(Ra, Rb, t0, t1, t2);
5422     //       Ra = *++Pa;
5423     //       Rb = *--Pb;
5424     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5425     //       MACC(Rm, Rn, t0, t1, t2);
5426     //       Rm = *++Pm;
5427     //       Rn = *--Pn;
5428     //     }
5429 
5430     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
5431     //     MACC(Ra, Rb, t0, t1, t2);
5432     //     *Pm = Rm = t0 * inv;
5433     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5434     //     MACC(Rm, Rn, t0, t1, t2);
5435 
5436     //     assert(t0 == 0, "broken Montgomery multiply");
5437 
5438     //     t0 = t1; t1 = t2; t2 = 0;
5439     //   }
5440 
5441     //   for (i = len; i < 2*len; i++) {
5442     //     int j;
5443 
5444     //     Pa = Pa_base + i-len;
5445     //     Pb = Pb_base + len;
5446     //     Pm = Pm_base + i-len;
5447     //     Pn = Pn_base + len;
5448 
5449     //     Ra = *++Pa;
5450     //     Rb = *--Pb;
5451     //     Rm = *++Pm;
5452     //     Rn = *--Pn;
5453 
5454     //     int iters = len*2-i-1;
5455     //     for (j = i-len+1; iters--; j++) {
5456     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5457     //       MACC(Ra, Rb, t0, t1, t2);
5458     //       Ra = *++Pa;
5459     //       Rb = *--Pb;
5460     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5461     //       MACC(Rm, Rn, t0, t1, t2);
5462     //       Rm = *++Pm;
5463     //       Rn = *--Pn;
5464     //     }
5465 
5466     //     Pm_base[i-len] = t0;
5467     //     t0 = t1; t1 = t2; t2 = 0;
5468     //   }
5469 
5470     //   while (t0)
5471     //     t0 = sub(Pm_base, Pn_base, t0, len);
5472     // }
5473 
5474     /**
5475      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
5476      * multiplies than Montgomery multiplication so it should be up to
5477      * 25% faster.  However, its loop control is more complex and it
5478      * may actually run slower on some machines.
5479      *
5480      * Arguments:
5481      *
5482      * Inputs:
5483      *   c_rarg0   - int array elements a
5484      *   c_rarg1   - int array elements n (the modulus)
5485      *   c_rarg2   - int length
5486      *   c_rarg3   - int inv
5487      *   c_rarg4   - int array elements m (the result)
5488      *
5489      */
5490     address generate_square() {
5491       Label argh;
5492       bind(argh);
5493       stop("MontgomeryMultiply total_allocation must be <= 8192");
5494 
5495       align(CodeEntryAlignment);
5496       address entry = pc();
5497 
5498       enter();
5499 
5500       // Make room.
5501       cmpw(Rlen, 512);
5502       br(Assembler::HI, argh);
5503       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5504       andr(sp, Ra, -2 * wordSize);
5505 
5506       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5507 
5508       {
5509         // Copy input args, reversing as we go.  We use Ra as a
5510         // temporary variable.
5511         reverse(Ra, Pa_base, Rlen, t0, t1);
5512         reverse(Ra, Pn_base, Rlen, t0, t1);
5513       }
5514 
5515       // Push all call-saved registers and also Pm_base which we'll need
5516       // at the end.
5517       save_regs();
5518 
5519       mov(Pm_base, Ra);
5520 
5521       mov(t0, zr);
5522       mov(t1, zr);
5523       mov(t2, zr);
5524 
5525       block_comment("for (int i = 0; i < len; i++) {");
5526       mov(Ri, zr); {
5527         Label loop, end;
5528         bind(loop);
5529         cmp(Ri, Rlen);
5530         br(Assembler::GE, end);
5531 
5532         pre1(Ri);
5533 
5534         block_comment("for (j = (i+1)/2; j; j--) {"); {
5535           add(Rj, Ri, 1);
5536           lsr(Rj, Rj, 1);
5537           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5538         } block_comment("  } // j");
5539 
5540         last_squaring(Ri);
5541 
5542         block_comment("  for (j = i/2; j; j--) {"); {
5543           lsr(Rj, Ri, 1);
5544           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5545         } block_comment("  } // j");
5546 
5547         post1_squaring();
5548         add(Ri, Ri, 1);
5549         cmp(Ri, Rlen);
5550         br(Assembler::LT, loop);
5551 
5552         bind(end);
5553         block_comment("} // i");
5554       }
5555 
5556       block_comment("for (int i = len; i < 2*len; i++) {");
5557       mov(Ri, Rlen); {
5558         Label loop, end;
5559         bind(loop);
5560         cmp(Ri, Rlen, Assembler::LSL, 1);
5561         br(Assembler::GE, end);
5562 
5563         pre2(Ri, Rlen);
5564 
5565         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
5566           lsl(Rj, Rlen, 1);
5567           sub(Rj, Rj, Ri);
5568           sub(Rj, Rj, 1);
5569           lsr(Rj, Rj, 1);
5570           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5571         } block_comment("  } // j");
5572 
5573         last_squaring(Ri);
5574 
5575         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
5576           lsl(Rj, Rlen, 1);
5577           sub(Rj, Rj, Ri);
5578           lsr(Rj, Rj, 1);
5579           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5580         } block_comment("  } // j");
5581 
5582         post2(Ri, Rlen);
5583         add(Ri, Ri, 1);
5584         cmp(Ri, Rlen, Assembler::LSL, 1);
5585 
5586         br(Assembler::LT, loop);
5587         bind(end);
5588         block_comment("} // i");
5589       }
5590 
5591       normalize(Rlen);
5592 
5593       mov(Ra, Pm_base);  // Save Pm_base in Ra
5594       restore_regs();  // Restore caller's Pm_base
5595 
5596       // Copy our result into caller's Pm_base
5597       reverse(Pm_base, Ra, Rlen, t0, t1);
5598 
5599       leave();
5600       ret(lr);
5601 
5602       return entry;
5603     }
5604     // In C, approximately:
5605 
5606     // void
5607     // montgomery_square(uint64_t Pa_base[], uint64_t Pn_base[],
5608     //                   uint64_t Pm_base[], uint64_t inv, int len) {
5609     //   uint64_t t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5610     //   uint64_t *Pa, *Pb, *Pn, *Pm;
5611     //   uint64_t Ra, Rb, Rn, Rm;
5612 
5613     //   int i;
5614 
5615     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5616 
5617     //   for (i = 0; i < len; i++) {
5618     //     int j;
5619 
5620     //     Pa = Pa_base;
5621     //     Pb = Pa_base + i;
5622     //     Pm = Pm_base;
5623     //     Pn = Pn_base + i;
5624 
5625     //     Ra = *Pa;
5626     //     Rb = *Pb;
5627     //     Rm = *Pm;
5628     //     Rn = *Pn;
5629 
5630     //     int iters = (i+1)/2;
5631     //     for (j = 0; iters--; j++) {
5632     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5633     //       MACC2(Ra, Rb, t0, t1, t2);
5634     //       Ra = *++Pa;
5635     //       Rb = *--Pb;
5636     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5637     //       MACC(Rm, Rn, t0, t1, t2);
5638     //       Rm = *++Pm;
5639     //       Rn = *--Pn;
5640     //     }
5641     //     if ((i & 1) == 0) {
5642     //       assert(Ra == Pa_base[j], "must be");
5643     //       MACC(Ra, Ra, t0, t1, t2);
5644     //     }
5645     //     iters = i/2;
5646     //     assert(iters == i-j, "must be");
5647     //     for (; iters--; j++) {
5648     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5649     //       MACC(Rm, Rn, t0, t1, t2);
5650     //       Rm = *++Pm;
5651     //       Rn = *--Pn;
5652     //     }
5653 
5654     //     *Pm = Rm = t0 * inv;
5655     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5656     //     MACC(Rm, Rn, t0, t1, t2);
5657 
5658     //     assert(t0 == 0, "broken Montgomery multiply");
5659 
5660     //     t0 = t1; t1 = t2; t2 = 0;
5661     //   }
5662 
5663     //   for (i = len; i < 2*len; i++) {
5664     //     int start = i-len+1;
5665     //     int end = start + (len - start)/2;
5666     //     int j;
5667 
5668     //     Pa = Pa_base + i-len;
5669     //     Pb = Pa_base + len;
5670     //     Pm = Pm_base + i-len;
5671     //     Pn = Pn_base + len;
5672 
5673     //     Ra = *++Pa;
5674     //     Rb = *--Pb;
5675     //     Rm = *++Pm;
5676     //     Rn = *--Pn;
5677 
5678     //     int iters = (2*len-i-1)/2;
5679     //     assert(iters == end-start, "must be");
5680     //     for (j = start; iters--; j++) {
5681     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5682     //       MACC2(Ra, Rb, t0, t1, t2);
5683     //       Ra = *++Pa;
5684     //       Rb = *--Pb;
5685     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5686     //       MACC(Rm, Rn, t0, t1, t2);
5687     //       Rm = *++Pm;
5688     //       Rn = *--Pn;
5689     //     }
5690     //     if ((i & 1) == 0) {
5691     //       assert(Ra == Pa_base[j], "must be");
5692     //       MACC(Ra, Ra, t0, t1, t2);
5693     //     }
5694     //     iters =  (2*len-i)/2;
5695     //     assert(iters == len-j, "must be");
5696     //     for (; iters--; j++) {
5697     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5698     //       MACC(Rm, Rn, t0, t1, t2);
5699     //       Rm = *++Pm;
5700     //       Rn = *--Pn;
5701     //     }
5702     //     Pm_base[i-len] = t0;
5703     //     t0 = t1; t1 = t2; t2 = 0;
5704     //   }
5705 
5706     //   while (t0)
5707     //     t0 = sub(Pm_base, Pn_base, t0, len);
5708     // }
5709   };
5710 
5711 
5712   // Initialization
5713   void generate_initial() {
5714     // Generate initial stubs and initialize the entry points
5715 
5716     // entry points that exist on all platforms. Note: this is code that
5717     // could be shared among different platforms - however the benefit
5718     // seems to be smaller than the disadvantage of having a much more
5719     // complicated generator structure. See also the comment in
5720     // stubRoutines.hpp.
5721 
5722     StubRoutines::_forward_exception_entry = generate_forward_exception();
5723 
5724     StubRoutines::_call_stub_entry =
5725       generate_call_stub(StubRoutines::_call_stub_return_address);
5726 
5727     // referenced by megamorphic calls
5728     StubRoutines::_catch_exception_entry = generate_catch_exception();
5729 
5730     // Build this early so it's available for the interpreter.
5731     StubRoutines::_throw_StackOverflowError_entry =
5732       generate_throw_exception("StackOverflowError throw_exception",
5733                                CAST_FROM_FN_PTR(address,
5734                                                 SharedRuntime::throw_StackOverflowError));
5735     StubRoutines::_throw_delayed_StackOverflowError_entry =
5736       generate_throw_exception("delayed StackOverflowError throw_exception",
5737                                CAST_FROM_FN_PTR(address,
5738                                                 SharedRuntime::throw_delayed_StackOverflowError));
5739     if (UseCRC32Intrinsics) {
5740       // set the table address before generating the stubs that use it
5741       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5742       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5743     }
5744 
5745     if (UseCRC32CIntrinsics) {
5746       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5747     }
5748 
5749     // Disabled until JDK-8210858 is fixed
5750     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5751     //   StubRoutines::_dlog = generate_dlog();
5752     // }
5753 
5754     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5755       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5756     }
5757 
5758     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5759       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5760     }
5761 
5762     // Safefetch stubs.
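    // SafeFetch32/SafeFetchN read a word from an address that may be
    // invalid.  If the load faults, the signal handler recognizes the
    // recorded fault pc and resumes execution at the continuation pc,
    // so the stub returns the caller-supplied default value instead of
    // crashing the VM.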
5763     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5764                                                        &StubRoutines::_safefetch32_fault_pc,
5765                                                        &StubRoutines::_safefetch32_continuation_pc);
5766     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5767                                                        &StubRoutines::_safefetchN_fault_pc,
5768                                                        &StubRoutines::_safefetchN_continuation_pc);
5769   }
5770 
5771   void generate_all() {
5772     // support for verify_oop (must happen after universe_init)
5773     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
5774     StubRoutines::_throw_AbstractMethodError_entry =
5775       generate_throw_exception("AbstractMethodError throw_exception",
5776                                CAST_FROM_FN_PTR(address,
5777                                                 SharedRuntime::
5778                                                 throw_AbstractMethodError));
5779 
5780     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5781       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5782                                CAST_FROM_FN_PTR(address,
5783                                                 SharedRuntime::
5784                                                 throw_IncompatibleClassChangeError));
5785 
5786     StubRoutines::_throw_NullPointerException_at_call_entry =
5787       generate_throw_exception("NullPointerException at call throw_exception",
5788                                CAST_FROM_FN_PTR(address,
5789                                                 SharedRuntime::
5790                                                 throw_NullPointerException_at_call));
5791 
5792     // arraycopy stubs used by compilers
5793     generate_arraycopy_stubs();
5794 
5795     // has negatives stub for large arrays.
5796     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5797 
5798     // array equals stub for large arrays.
5799     if (!UseSimpleArrayEquals) {
5800       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5801     }
5802 
5803     generate_compare_long_strings();
5804 
5805     generate_string_indexof_stubs();
5806 
5807     // byte_array_inflate stub for large arrays.
5808     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
5809 
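    // The nmethod entry barrier stub is only needed when the selected
    // GC provides a BarrierSetNMethod (ZGC, for example).  It is the
    // slow path taken when an armed barrier is hit on entry to a
    // compiled method.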
5810     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
5811     if (bs_nm != NULL) {
5812       StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
5813     }
5814 #ifdef COMPILER2
5815     if (UseMultiplyToLenIntrinsic) {
5816       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5817     }
5818 
5819     if (UseSquareToLenIntrinsic) {
5820       StubRoutines::_squareToLen = generate_squareToLen();
5821     }
5822 
5823     if (UseMulAddIntrinsic) {
5824       StubRoutines::_mulAdd = generate_mulAdd();
5825     }
5826 
5827     if (UseMontgomeryMultiplyIntrinsic) {
5828       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5829       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5830       StubRoutines::_montgomeryMultiply = g.generate_multiply();
5831     }
5832 
5833     if (UseMontgomerySquareIntrinsic) {
5834       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5835       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5836       // We use generate_multiply() rather than generate_square()
5837       // because it's faster for the sizes of modulus we care about.
5838       StubRoutines::_montgomerySquare = g.generate_multiply();
5839     }
5840 #endif // COMPILER2
5841 
5842     // generate GHASH intrinsics code
5843     if (UseGHASHIntrinsics) {
5844       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5845     }
5846 
5847     // data cache line writeback
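    // These stubs flush cache lines for the Unsafe writeback
    // intrinsics, used by MappedByteBuffer::force() on persistent
    // memory mappings (JEP 352).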
5848     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
5849     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
5850 
5851     if (UseAESIntrinsics) {
5852       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5853       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5854       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5855       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
5856     }
5857 
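    // For the SHA stubs below, the boolean argument selects the
    // multi-block (MB) variant, which hashes a range of consecutive
    // blocks per call.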
5858     if (UseSHA1Intrinsics) {
5859       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5860       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5861     }
5862     if (UseSHA256Intrinsics) {
5863       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5864       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5865     }
5866 
5867     // generate Adler32 intrinsics code
5868     if (UseAdler32Intrinsics) {
5869       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
5870     }
5871 
5872     StubRoutines::aarch64::set_completed();
5873   }
5874 
5875  public:
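  // Stubs are generated in two passes: the "initial" stubs are built
  // early in VM startup because the interpreter depends on them, while
  // the remaining stubs can only be generated later (generate_all runs
  // after universe_init; see the verify_oop comment above).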
5876   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5877     if (all) {
5878       generate_all();
5879     } else {
5880       generate_initial();
5881     }
5882   }
5883 }; // end class declaration
5884 
5885 #define UCM_TABLE_MAX_ENTRIES 8
5886 void StubGenerator_generate(CodeBuffer* code, bool all) {
5887   if (UnsafeCopyMemory::_table == NULL) {
5888     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
5889   }
5890   StubGenerator g(code, all);
5891 }
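
// StubGenerator_generate is called twice during VM startup (see
// stubRoutines.cpp): first with all == false for the initial stubs the
// interpreter needs, then with all == true once the rest of the runtime
// is initialized.  The UnsafeCopyMemory table created above records the
// pc ranges of the unsafe copy stubs so that faults raised inside them
// can be redirected to their error exits rather than crashing the VM.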