Old src/hotspot/cpu/aarch64/stubGenerator

   1 /*
   2  * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "memory/universe.hpp"
  33 #include "nativeInst_aarch64.hpp"
  34 #include "oops/instanceOop.hpp"
  35 #include "oops/method.hpp"
  36 #include "oops/objArrayKlass.hpp"
  37 #include "oops/oop.inline.hpp"
  38 #include "prims/methodHandles.hpp"
  39 #include "runtime/frame.inline.hpp"
  40 #include "runtime/handles.inline.hpp"
  41 #include "runtime/sharedRuntime.hpp"
  42 #include "runtime/stubCodeGenerator.hpp"
  43 #include "runtime/stubRoutines.hpp"
  44 #include "runtime/thread.inline.hpp"
  45 #include "utilities/align.hpp"
  46 #include "utilities/powerOfTwo.hpp"
  47 #ifdef COMPILER2
  48 #include "opto/runtime.hpp"
  49 #endif
  50 #if INCLUDE_ZGC
  51 #include "gc/z/zThreadLocalData.hpp"
  52 #endif
  53 
  54 // Declaration and definition of StubGenerator (no .hpp file).
  55 // For a more detailed description of the stub routine structure
  56 // see the comment in stubRoutines.hpp
  57 
  58 #undef __
     // Assembler shorthand used throughout this file: "__ foo(...)" expands to
     // "_masm->foo(...)".
  59 #define __ _masm->
     // Sign-extended (sxtw) index extension whose shift is log2 of 4 when
     // UseCompressedOops is set and log2 of 8 otherwise.
  60 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  61 
     // BLOCK_COMMENT(str) expands to nothing in PRODUCT builds; otherwise it
     // forwards str to the assembler's block_comment().
  62 #ifdef PRODUCT
  63 #define BLOCK_COMMENT(str) /* nothing */
  64 #else
  65 #define BLOCK_COMMENT(str) __ block_comment(str)
  66 #endif
  67 
     // Intended to be used as "__ BIND(label);": binds the label and then emits
     // a block comment made of the label's name followed by ":".
     // NOTE(review): this expands to two statements, so it must not be used as
     // the sole body of an unbraced if/else/loop.
  68 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  69 
  70 // Stub Code definitions
  71 
  72 class StubGenerator: public StubCodeGenerator {
  73  private:
  74 
  75 #ifdef PRODUCT
     // In PRODUCT builds inc_counter_np(counter) is a no-op expression; the use
     // site supplies the terminating semicolon.
  76 #define inc_counter_np(counter) ((void)0)
  77 #else
       // Emits code that increments the 32-bit counter at &counter using a
       // plain load / add / store sequence. Clobbers rscratch1 and rscratch2.
  78   void inc_counter_np_(int& counter) {
  79     __ lea(rscratch2, ExternalAddress((address)&counter));
  80     __ ldrw(rscratch1, Address(rscratch2));
  81     __ addw(rscratch1, rscratch1, 1);
  82     __ strw(rscratch1, Address(rscratch2));
  83   }
     // Non-PRODUCT form: emits a block comment naming the counter, then the
     // increment sequence above.
     // NOTE(review): this expands to more than one statement (and already ends
     // in ';'), so it must not be used as the sole body of an unbraced
     // if/else/loop.
  84 #define inc_counter_np(counter) \
  85   BLOCK_COMMENT("inc_counter " #counter); \
  86   inc_counter_np_(counter);
  87 #endif
  88 
  89   // Call stubs are used to call Java from C
  90   //
  91   // Arguments:
  92   //    c_rarg0:   call wrapper address                   address
  93   //    c_rarg1:   result                                 address
  94   //    c_rarg2:   result type                            BasicType
  95   //    c_rarg3:   method                                 Method*
  96   //    c_rarg4:   (interpreter) entry point              address
  97   //    c_rarg5:   parameters                             intptr_t*
  98   //    c_rarg6:   parameter size (in words)              int
  99   //    c_rarg7:   thread                                 Thread*
 100   //
 101   // There is no return from the stub itself as any Java result
 102   // is written to result
 103   //
 104   // we save r30 (lr) as the return PC at the base of the frame and
 105   // link r29 (fp) below it as the frame pointer installing sp (r31)
 106   // into fp.
 107   //
 108   // we save r0-r7, which accounts for all the c arguments.
 109   //
 110   // TODO: strictly do we need to save them all? they are treated as
 111   // volatile by C so could we omit saving the ones we are going to
 112   // place in global registers (thread? method?) or those we only use
 113   // during setup of the Java call?
 114   //
 115   // we don't need to save r8 which C uses as an indirect result location
 116   // return register.
 117   //
 118   // we don't need to save r9-r15 which both C and Java treat as
 119   // volatile
 120   //
 121   // we don't need to save r16-18 because Java does not use them
 122   //
 123   // we save r19-r28 which Java uses as scratch registers and C
 124   // expects to be callee-save
 125   //
 126   // we save the bottom 64 bits of each value stored in v8-v15; it is
 127   // the responsibility of the caller to preserve larger values.
 128   //
 129   // so the stub frame looks like this when we enter Java code
 130   //
 131   //     [ return_from_Java     ] <--- sp
 132   //     [ argument word n      ]
 133   //      ...
 134   // -27 [ argument word 1      ]
 135   // -26 [ saved v15            ] <--- sp_after_call
 136   // -25 [ saved v14            ]
 137   // -24 [ saved v13            ]
 138   // -23 [ saved v12            ]
 139   // -22 [ saved v11            ]
 140   // -21 [ saved v10            ]
 141   // -20 [ saved v9             ]
 142   // -19 [ saved v8             ]
 143   // -18 [ saved r28            ]
 144   // -17 [ saved r27            ]
 145   // -16 [ saved r26            ]
 146   // -15 [ saved r25            ]
 147   // -14 [ saved r24            ]
 148   // -13 [ saved r23            ]
 149   // -12 [ saved r22            ]
 150   // -11 [ saved r21            ]
 151   // -10 [ saved r20            ]
 152   //  -9 [ saved r19            ]
 153   //  -8 [ call wrapper    (r0) ]
 154   //  -7 [ result          (r1) ]
 155   //  -6 [ result type     (r2) ]
 156   //  -5 [ method          (r3) ]
 157   //  -4 [ entry point     (r4) ]
 158   //  -3 [ parameters      (r5) ]
 159   //  -2 [ parameter size  (r6) ]
 160   //  -1 [ thread (r7)          ]
 161   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 162   //   1 [ saved lr       (r30) ]
 163 
 164   // Call stub stack layout word offsets from fp
 165   enum call_stub_layout {
         // All values are word offsets relative to rfp; generate_call_stub()
         // multiplies them by wordSize to form the frame slot addresses.
         // Lowest slot of the register save area; sp is set here on entry.
 166     sp_after_call_off = -26,
 167 
         // Floating-point callee-saved registers. Only every other slot is
         // named because the registers are stored and reloaded in pairs
         // (stpd/ldpd), e.g. d15_off covers the v15 and v14 slots.
 168     d15_off            = -26,
 169     d13_off            = -24,
 170     d11_off            = -22,
 171     d9_off             = -20,
 172 
         // General-purpose callee-saved registers, likewise saved in pairs
         // (stp/ldp), e.g. r28_off covers the r28 and r27 slots.
 173     r28_off            = -18,
 174     r26_off            = -16,
 175     r24_off            = -14,
 176     r22_off            = -12,
 177     r20_off            = -10,
         // Saved C arguments of the call stub (c_rarg0..c_rarg7).
 178     call_wrapper_off   =  -8,
 179     result_off         =  -7,
 180     result_type_off    =  -6,
 181     method_off         =  -5,
         // The parameters pointer (slot -3) has no name of its own: it is
         // written by the pair store that starts at entry_point_off.
 182     entry_point_off    =  -4,
 183     parameter_size_off =  -2,
 184     thread_off         =  -1,
         // Saved frame pointer and return address written by enter().
 185     fp_f               =   0,
 186     retaddr_off        =   1,
 187   };
 188 
 189   address generate_call_stub(address& return_address) {
         // Emits the call stub and returns the address of its first
         // instruction. return_address is set to the pc immediately after the
         // blr that calls the Java entry point.
 190     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 191            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 192            "adjust this code");
 193 
 194     StubCodeMark mark(this, "StubRoutines", "call_stub");
 195     address start = __ pc();
 196 
         // Frame slots, addressed relative to rfp using the word offsets in
         // call_stub_layout.
 197     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 198 
 199     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 200     const Address result        (rfp, result_off         * wordSize);
 201     const Address result_type   (rfp, result_type_off    * wordSize);
 202     const Address method        (rfp, method_off         * wordSize);
 203     const Address entry_point   (rfp, entry_point_off    * wordSize);
 204     const Address parameter_size(rfp, parameter_size_off * wordSize);
 205 
 206     const Address thread        (rfp, thread_off         * wordSize);
 207 
 208     const Address d15_save      (rfp, d15_off * wordSize);
 209     const Address d13_save      (rfp, d13_off * wordSize);
 210     const Address d11_save      (rfp, d11_off * wordSize);
 211     const Address d9_save       (rfp, d9_off * wordSize);
 212 
 213     const Address r28_save      (rfp, r28_off * wordSize);
 214     const Address r26_save      (rfp, r26_off * wordSize);
 215     const Address r24_save      (rfp, r24_off * wordSize);
 216     const Address r22_save      (rfp, r22_off * wordSize);
 217     const Address r20_save      (rfp, r20_off * wordSize);
 218 
 219     // stub code
 220 
         // NOTE(review): aarch64_entry is not referenced again in this function.
 221     address aarch64_entry = __ pc();
 222 
 223     // set up frame and move sp to end of save area
 224     __ enter();
 225     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 226 
 227     // save register parameters and Java scratch/global registers
 228     // n.b. we save thread even though it gets installed in
 229     // rthread because we want to sanity check rthread later
 230     __ str(c_rarg7,  thread);
         // parameter size is an int: only the low 32 bits are stored
 231     __ strw(c_rarg6, parameter_size);
         // each stp fills the named slot and the next higher one
 232     __ stp(c_rarg4, c_rarg5,  entry_point);
 233     __ stp(c_rarg2, c_rarg3,  result_type);
 234     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 235 
 236     __ stp(r20, r19,   r20_save);
 237     __ stp(r22, r21,   r22_save);
 238     __ stp(r24, r23,   r24_save);
 239     __ stp(r26, r25,   r26_save);
 240     __ stp(r28, r27,   r28_save);
 241 
 242     __ stpd(v9,  v8,   d9_save);
 243     __ stpd(v11, v10,  d11_save);
 244     __ stpd(v13, v12,  d13_save);
 245     __ stpd(v15, v14,  d15_save);
 246 
 247     // install Java thread in global register now we have saved
 248     // whatever value it held
 249     __ mov(rthread, c_rarg7);
 250     // And method
 251     __ mov(rmethod, c_rarg3);
 252 
 253     // set up the heapbase register
 254     __ reinit_heapbase();
 255 
 256 #ifdef ASSERT
 257     // make sure we have no pending exceptions
 258     {
 259       Label L;
 260       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 261       __ cmp(rscratch1, (u1)NULL_WORD);
 262       __ br(Assembler::EQ, L);
 263       __ stop("StubRoutines::call_stub: entered with pending exception");
 264       __ BIND(L);
 265     }
 266 #endif
 267     // pass parameters if any
 268     __ mov(esp, sp);
         // reserve parameter_size words below sp, then clear the low bits so
         // that sp is a multiple of 2 * wordSize
 269     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 270     __ andr(sp, rscratch1, -2 * wordSize);
 271 
 272     BLOCK_COMMENT("pass parameters if any");
 273     Label parameters_done;
 274     // parameter count is still in c_rarg6
 275     // and parameter pointer identifying param 1 is in c_rarg5
 276     __ cbzw(c_rarg6, parameters_done);
 277 
         // copy loop: load one word from the parameter array (post-indexed, so
         // c_rarg5 advances by wordSize), count down, push the word, and
         // repeat while the remaining count is > 0
 278     address loop = __ pc();
 279     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 280     __ subsw(c_rarg6, c_rarg6, 1);
 281     __ push(rscratch1);
 282     __ br(Assembler::GT, loop);
 283 
 284     __ BIND(parameters_done);
 285 
 286     // call Java entry -- passing Method*, and current sp
 287     //      rmethod: Method*
 288     //      r13: sender sp
 289     BLOCK_COMMENT("call Java function");
 290     __ mov(r13, sp);
 291     __ blr(c_rarg4);
 292 
 293     // we do this here because the notify will already have been done
 294     // if we get to the next instruction via an exception
 295     //
 296     // n.b. adding this instruction here affects the calculation of
 297     // whether or not a routine returns to the call stub (used when
 298     // doing stack walks) since the normal test is to check the return
 299     // pc against the address saved below. so we may need to allow for
 300     // this extra instruction in the check.
 301 
 302     // save current address for use by exception handling code
 303 
 304     return_address = __ pc();
 305 
 306     // store result depending on type (everything that is not
 307     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 308     // n.b. this assumes Java returns an integral result in r0
 309     // and a floating result in j_farg0
 310     __ ldr(j_rarg2, result);
 311     Label is_long, is_float, is_double, exit;
 312     __ ldr(j_rarg1, result_type);
         // T_OBJECT shares the T_LONG path: both store r0 with a full-width str
 313     __ cmp(j_rarg1, (u1)T_OBJECT);
 314     __ br(Assembler::EQ, is_long);
 315     __ cmp(j_rarg1, (u1)T_LONG);
 316     __ br(Assembler::EQ, is_long);
 317     __ cmp(j_rarg1, (u1)T_FLOAT);
 318     __ br(Assembler::EQ, is_float);
 319     __ cmp(j_rarg1, (u1)T_DOUBLE);
 320     __ br(Assembler::EQ, is_double);
 321 
 322     // handle T_INT case
 323     __ strw(r0, Address(j_rarg2));
 324 
         // the out-of-line is_long/is_float/is_double handlers emitted after
         // the ret below branch back to this label
 325     __ BIND(exit);
 326 
 327     // pop parameters
 328     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 329 
 330 #ifdef ASSERT
 331     // verify that threads correspond
 332     {
 333       Label L, S;
 334       __ ldr(rscratch1, thread);
 335       __ cmp(rthread, rscratch1);
 336       __ br(Assembler::NE, S);
 337       __ get_thread(rscratch1);
 338       __ cmp(rthread, rscratch1);
 339       __ br(Assembler::EQ, L);
 340       __ BIND(S);
 341       __ stop("StubRoutines::call_stub: threads must correspond");
 342       __ BIND(L);
 343     }
 344 #endif
 345 
 346     // restore callee-save registers
 347     __ ldpd(v15, v14,  d15_save);
 348     __ ldpd(v13, v12,  d13_save);
 349     __ ldpd(v11, v10,  d11_save);
 350     __ ldpd(v9,  v8,   d9_save);
 351 
 352     __ ldp(r28, r27,   r28_save);
 353     __ ldp(r26, r25,   r26_save);
 354     __ ldp(r24, r23,   r24_save);
 355     __ ldp(r22, r21,   r22_save);
 356     __ ldp(r20, r19,   r20_save);
 357 
         // reload the C argument registers from their save slots
 358     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 359     __ ldrw(c_rarg2, result_type);
 360     __ ldr(c_rarg3,  method);
 361     __ ldp(c_rarg4, c_rarg5,  entry_point);
         // NOTE(review): parameter_size was written with strw above but is
         // reloaded here as part of a ldp, so only the low 32 bits of the
         // value loaded into c_rarg6 were stored by this stub.
 362     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 363 
 364     // leave frame and return to caller
 365     __ leave();
 366     __ ret(lr);
 367 
 368     // handle return types different from T_INT
 369 
 370     __ BIND(is_long);
 371     __ str(r0, Address(j_rarg2, 0));
 372     __ br(Assembler::AL, exit);
 373 
 374     __ BIND(is_float);
 375     __ strs(j_farg0, Address(j_rarg2, 0));
 376     __ br(Assembler::AL, exit);
 377 
 378     __ BIND(is_double);
 379     __ strd(j_farg0, Address(j_rarg2, 0));
 380     __ br(Assembler::AL, exit);
 381 
 382     return start;
 383   }
 384 
 385   // Return point for a Java call if there's an exception thrown in
 386   // Java code.  The exception is caught and transformed into a
 387   // pending exception stored in JavaThread that can be tested from
 388   // within the VM.
 389   //
 390   // Note: Usually the parameters are removed by the callee. In case
 391   // of an exception crossing an activation frame boundary, that is
 392   // not the case if the callee is compiled code => need to setup the
 393   // rsp.
 394   //
 395   // r0: exception oop
 396 
 397   address generate_catch_exception() {
         // Emits the catch_exception stub and returns its start address. The
         // stub stores the exception oop in r0 as the thread's pending
         // exception and then branches to the call stub's return address.
 398     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 399     address start = __ pc();
 400 
 401     // same as in generate_call_stub():
         // NOTE(review): sp_after_call is declared but not used in this function.
 402     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 403     const Address thread        (rfp, thread_off         * wordSize);
 404 
 405 #ifdef ASSERT
 406     // verify that threads correspond
 407     {
 408       Label L, S;
 409       __ ldr(rscratch1, thread);
 410       __ cmp(rthread, rscratch1);
 411       __ br(Assembler::NE, S);
 412       __ get_thread(rscratch1);
 413       __ cmp(rthread, rscratch1);
 414       __ br(Assembler::EQ, L);
 415       __ bind(S);
 416       __ stop("StubRoutines::catch_exception: threads must correspond");
 417       __ bind(L);
 418     }
 419 #endif
 420 
 421     // set pending exception
 422     __ verify_oop(r0);
 423 
 424     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
         // record this source file and line as the exception file/line
 425     __ mov(rscratch1, (address)__FILE__);
 426     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 427     __ movw(rscratch1, (int)__LINE__);
 428     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 429 
 430     // complete return to VM
         // the branch target must already exist, so the call stub has to be
         // generated before this stub
 431     assert(StubRoutines::_call_stub_return_address != NULL,
 432            "_call_stub_return_address must have been generated before");
 433     __ b(StubRoutines::_call_stub_return_address);
 434 
 435     return start;
 436   }
 437 
 438   // Continuation point for runtime calls returning with a pending
 439   // exception.  The pending exception check happened in the runtime
 440   // or native call stub.  The pending exception in Thread is
 441   // converted into a Java-level exception.
 442   //
 443   // Contract with Java-level exception handlers:
 444   // r0: exception
 445   // r3: throwing pc
 446   //
 447   // NOTE: At entry of this stub, exception-pc must be in LR !!
 448 
 449   // NOTE: this is always used as a jump target within generated code
 450   // so it just needs to be generated code with no x86 prolog
 451 
 452   address generate_forward_exception() {
         // Emits the forward-exception stub and returns its start address. The
         // stub looks up the handler for the return address in lr, moves the
         // thread's pending exception into r0 (clearing it in the thread), and
         // jumps to the handler with the throwing pc in r3.
 453     StubCodeMark mark(this, "StubRoutines", "forward exception");
 454     address start = __ pc();
 455 
 456     // Upon entry, LR points to the return address returning into
 457     // Java (interpreted or compiled) code; i.e., the return address
 458     // becomes the throwing pc.
 459     //
 460     // Arguments pushed before the runtime call are still on the stack
 461     // but the exception handler will reset the stack pointer ->
 462     // ignore them.  A potential result in registers can be ignored as
 463     // well.
 464 
 465 #ifdef ASSERT
 466     // make sure this code is only executed if there is a pending exception
 467     {
 468       Label L;
 469       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 470       __ cbnz(rscratch1, L);
 471       __ stop("StubRoutines::forward exception: no pending exception (1)");
 472       __ bind(L);
 473     }
 474 #endif
 475 
 476     // compute exception handler into r19
 477 
 478     // call the VM to find the handler address associated with the
 479     // caller address. pass thread in r0 and caller pc (ret address)
 480     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 481     // the stack.
 482     __ mov(c_rarg1, lr);
 483     // lr will be trashed by the VM call so we move it to R19
 484     // (callee-saved) because we also need to pass it to the handler
 485     // returned by this call.
 486     __ mov(r19, lr);
 487     BLOCK_COMMENT("call exception_handler_for_return_address");
 488     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 489                          SharedRuntime::exception_handler_for_return_address),
 490                     rthread, c_rarg1);
 491     // we should not really care that lr is no longer the callee
 492     // address. we saved the value the handler needs in r19 so we can
 493     // just copy it to r3. however, the C2 handler will push its own
 494     // frame and then calls into the VM and the VM code asserts that
 495     // the PC for the frame above the handler belongs to a compiled
 496     // Java method. So, we restore lr here to satisfy that assert.
 497     __ mov(lr, r19);
 498     // setup r0 & r3 & clear pending exception
         // r19 still holds the throwing pc here; it is copied to r3 before
         // r19 is overwritten with the handler address returned in r0
 499     __ mov(r3, r19);
 500     __ mov(r19, r0);
 501     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 502     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 503 
 504 #ifdef ASSERT
 505     // make sure exception is set
 506     {
 507       Label L;
 508       __ cbnz(r0, L);
 509       __ stop("StubRoutines::forward exception: no pending exception (2)");
 510       __ bind(L);
 511     }
 512 #endif
 513 
 514     // continue at exception handler
 515     // r0: exception
 516     // r3: throwing pc
 517     // r19: exception handler
 518     __ verify_oop(r0);
 519     __ br(r19);
 520 
 521     return start;
 522   }
 523 
 524   // Non-destructive plausibility checks for oops
 525   //
 526   // Arguments:
 527   //    r0: oop to verify
 528   //    rscratch1: error message
 529   //
 530   // Stack after saving c_rarg3:
 531   //    [tos + 0]: saved c_rarg3
 532   //    [tos + 1]: saved c_rarg2
 533   //    [tos + 2]: saved lr
 534   //    [tos + 3]: saved rscratch2
 535   //    [tos + 4]: saved r0
 536   //    [tos + 5]: saved rscratch1
 537   address generate_verify_oop() {
         // Emits the verify_oop stub and returns its start address. The stub
         // bumps the verify_oop counter, accepts a NULL oop, and otherwise
         // checks the oop's address bits and klass; on failure it calls
         // MacroAssembler::debug64 and then halts.
 538 
 539     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 540     address start = __ pc();
 541 
 542     Label exit, error;
 543 
 544     // save c_rarg2 and c_rarg3
 545     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 546 
 547     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
         // increment the verify_oop counter with a plain load / add / store
 548     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 549     __ ldr(c_rarg3, Address(c_rarg2));
 550     __ add(c_rarg3, c_rarg3, 1);
 551     __ str(c_rarg3, Address(c_rarg2));
 552 
 553     // object is in r0
 554     // make sure object is 'reasonable'
 555     __ cbz(r0, exit); // if obj is NULL it is OK
 556 
 557 #if INCLUDE_ZGC
 558     if (UseZGC) {
 559       // Check if mask is good.
 560       // verifies that ZAddressBadMask & r0 == 0
 561       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 562       __ andr(c_rarg2, r0, c_rarg3);
 563       __ cbnz(c_rarg2, error);
 564     }
 565 #endif
 566 
 567     // Check if the oop is in the right area of memory
 568     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 569     __ andr(c_rarg2, r0, c_rarg3);
 570     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 571 
 572     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 573     // instruction here because the flags register is live.
         // eor yields zero only when the masked oop bits equal verify_oop_bits
 574     __ eor(c_rarg2, c_rarg2, c_rarg3);
 575     __ cbnz(c_rarg2, error);
 576 
 577     // make sure klass is 'reasonable', which is not zero.
 578     __ load_klass(r0, r0);  // get klass
 579     __ cbz(r0, error);      // if klass is NULL it is broken
 580 
 581     // return if everything seems ok
 582     __ bind(exit);
 583 
         // restore c_rarg2/c_rarg3 saved on entry and pop their 16-byte slot
 584     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 585     __ ret(lr);
 586 
 587     // handle errors
 588     __ bind(error);
 589     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 590 
         // push r0..r29 so that sp can be passed below as the regs[] argument
 591     __ push(RegSet::range(r0, r29), sp);
 592     // debug(char* msg, int64_t pc, int64_t regs[])
 593     __ mov(c_rarg0, rscratch1);      // pass address of error message
 594     __ mov(c_rarg1, lr);             // pass return address
 595     __ mov(c_rarg2, sp);             // pass address of regs on stack
 596 #ifndef PRODUCT
 597     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 598 #endif
 599     BLOCK_COMMENT("call MacroAssembler::debug");
 600     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 601     __ blr(rscratch1);
         // halt if the debug64 call returns
 602     __ hlt(0);
 603 
 604     return start;
 605   }
 606 
 607   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); } // emits an unconditional branch to L_no_overlap; sf is not used
 608 
 609   // The inner part of zero_words().  This is the bulk operation,
 610   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 611   // caller is responsible for zeroing the last few words.
 612   //
 613   // Inputs:
 614   // r10: the HeapWord-aligned base address of an array to zero.
 615   // r11: the count in HeapWords, r11 > 0.
 616   //
 617   // Returns r10 and r11, adjusted for the caller to clear.
 618   // r10: the base address of the tail of words left to clear.
 619   // r11: the number of words in the tail.
 620   //      r11 < MacroAssembler::zero_words_block_size.
 621 
 622   address generate_zero_blocks() {
         // Emits the zero_blocks stub and returns its entry address. The stub
         // zeroes whole blocks of words starting at base (r10) and returns
         // with base and cnt (r11) describing the tail left for the caller.
 623     Label done;
 624     Label base_aligned;
 625 
 626     Register base = r10, cnt = r11;
 627 
 628     __ align(CodeEntryAlignment);
 629     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 630     address start = __ pc();
 631 
 632     if (UseBlockZeroing) {
 633       int zva_length = VM_Version::zva_length();
 634 
 635       // Ensure ZVA length can be divided by 16. This is required by
 636       // the subsequent operations.
 637       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 638 
           // If bit 3 of base is set, zero one word and advance base by 8 so
           // that bit 3 is clear afterwards; the count drops by one word.
 639       __ tbz(base, 3, base_aligned);
 640       __ str(zr, Address(__ post(base, 8)));
 641       __ sub(cnt, cnt, 1);
 642       __ bind(base_aligned);
 643 
 644       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 645       // alignment.
 646       Label small;
 647       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
           // cnt is in words; low_limit >> 3 presumably converts a byte limit
           // to words (TODO confirm units of BlockZeroingLowLimit). Only the
           // flags set by subs are used; rscratch1 is a throwaway result.
 648       __ subs(rscratch1, cnt, low_limit >> 3);
 649       __ br(Assembler::LT, small);
 650       __ zero_dcache_blocks(base, cnt);
 651       __ bind(small);
 652     }
 653 
 654     {
 655       // Number of stp instructions we'll unroll
 656       const int unroll =
 657         MacroAssembler::zero_words_block_size / 2;
 658       // Clear the remaining blocks.
 659       Label loop;
           // Pre-subtract one block from cnt; if that goes negative there is
           // no whole block left and the loop is skipped.
 660       __ subs(cnt, cnt, unroll * 2);
 661       __ br(Assembler::LT, done);
 662       __ bind(loop);
           // each stp zeroes two words and advances base by 16 bytes
 663       for (int i = 0; i < unroll; i++)
 664         __ stp(zr, zr, __ post(base, 16));
 665       __ subs(cnt, cnt, unroll * 2);
 666       __ br(Assembler::GE, loop);
 667       __ bind(done);
           // undo the final subtraction so cnt holds the number of tail words
 668       __ add(cnt, cnt, unroll * 2);
 669     }
 670 
 671     __ ret(lr);
 672 
 673     return start;
 674   }
 675 
 676 
 677   typedef enum {
         // Copy direction. The values are used as a sign: generate_copy_longs()
         // multiplies wordSize by the direction to get a signed step.
 678     copy_forwards = 1,
 679     copy_backwards = -1
 680   } copy_direction;
 681 
 682   // Bulk copy of blocks of 8 words.
 683   //
 684   // count is a count of words.
 685   //
 686   // Precondition: count >= 8
 687   //
 688   // Postconditions:
 689   //
 690   // The least significant bit of count contains the remaining count
 691   // of words to copy.  The rest of count is trash.
 692   //
 693   // s and d are adjusted to point to the remaining words to copy
 694   //
 695   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 696                            copy_direction direction) {
 697     int unit = wordSize * direction;
 698     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 699 
 700     int offset;
 701     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 702       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 703     const Register stride = r13;
 704 
 705     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 706     assert_different_registers(s, d, count, rscratch1);
 707 
 708     Label again, drain;
 709     const char *stub_name;
 710     if (direction == copy_forwards)
 711       stub_name = "forward_copy_longs";
 712     else
 713       stub_name = "backward_copy_longs";
 714 
 715     __ align(CodeEntryAlignment);
 716 
 717     StubCodeMark mark(this, "StubRoutines", stub_name);
 718 
 719     __ bind(start);
 720 
 721     Label unaligned_copy_long;
 722     if (AvoidUnalignedAccesses) {
 723       __ tbnz(d, 3, unaligned_copy_long);
 724     }
 725 
 726     if (direction == copy_forwards) {
 727       __ sub(s, s, bias);
 728       __ sub(d, d, bias);
 729     }
 730 
 731 #ifdef ASSERT
 732     // Make sure we are never given < 8 words
 733     {
 734       Label L;
 735       __ cmp(count, (u1)8);
 736       __ br(Assembler::GE, L);
 737       __ stop("genrate_copy_longs called with < 8 words");
 738       __ bind(L);
 739     }
 740 #endif
 741 
 742     // Fill 8 registers
 743     if (UseSIMDForMemoryOps) {
 744       __ ldpq(v0, v1, Address(s, 4 * unit));
 745       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 746     } else {
 747       __ ldp(t0, t1, Address(s, 2 * unit));
 748       __ ldp(t2, t3, Address(s, 4 * unit));
 749       __ ldp(t4, t5, Address(s, 6 * unit));
 750       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 751     }
 752 
 753     __ subs(count, count, 16);
 754     __ br(Assembler::LO, drain);
 755 
 756     int prefetch = PrefetchCopyIntervalInBytes;
 757     bool use_stride = false;
 758     if (direction == copy_backwards) {
 759        use_stride = prefetch > 256;
 760        prefetch = -prefetch;
 761        if (use_stride) __ mov(stride, prefetch);
 762     }
 763 
 764     __ bind(again);
 765 
 766     if (PrefetchCopyIntervalInBytes > 0)
 767       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 768 
 769     if (UseSIMDForMemoryOps) {
 770       __ stpq(v0, v1, Address(d, 4 * unit));
 771       __ ldpq(v0, v1, Address(s, 4 * unit));
 772       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 773       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 774     } else {
 775       __ stp(t0, t1, Address(d, 2 * unit));
 776       __ ldp(t0, t1, Address(s, 2 * unit));
 777       __ stp(t2, t3, Address(d, 4 * unit));
 778       __ ldp(t2, t3, Address(s, 4 * unit));
 779       __ stp(t4, t5, Address(d, 6 * unit));
 780       __ ldp(t4, t5, Address(s, 6 * unit));
 781       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 782       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 783     }
 784 
 785     __ subs(count, count, 8);
 786     __ br(Assembler::HS, again);
 787 
 788     // Drain
 789     __ bind(drain);
 790     if (UseSIMDForMemoryOps) {
 791       __ stpq(v0, v1, Address(d, 4 * unit));
 792       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 793     } else {
 794       __ stp(t0, t1, Address(d, 2 * unit));
 795       __ stp(t2, t3, Address(d, 4 * unit));
 796       __ stp(t4, t5, Address(d, 6 * unit));
 797       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 798     }
 799 
 800     {
 801       Label L1, L2;
 802       __ tbz(count, exact_log2(4), L1);
 803       if (UseSIMDForMemoryOps) {
 804         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 805         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 806       } else {
 807         __ ldp(t0, t1, Address(s, 2 * unit));
 808         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 809         __ stp(t0, t1, Address(d, 2 * unit));
 810         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 811       }
 812       __ bind(L1);
 813 
 814       if (direction == copy_forwards) {
 815         __ add(s, s, bias);
 816         __ add(d, d, bias);
 817       }
 818 
 819       __ tbz(count, 1, L2);
 820       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 821       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 822       __ bind(L2);
 823     }
 824 
 825     __ ret(lr);
 826 
 827     if (AvoidUnalignedAccesses) {
 828       Label drain, again;
 829       // Register order for storing. Order is different for backward copy.
 830 
 831       __ bind(unaligned_copy_long);
 832 
 833       // source address is even aligned, target odd aligned
 834       //
 835       // when forward copying word pairs we read long pairs at offsets
 836       // {0, 2, 4, 6} (in long words). when backwards copying we read
 837       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 838       // address by -2 in the forwards case so we can compute the
 839       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 840       // or -1.
 841       //
 842       // when forward copying we need to store 1 word, 3 pairs and
 843       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 844       // zero offset we adjust the destination by -1 which means we
 845       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 846       //
 847       // When backwards copying we need to store 1 word, 3 pairs and
 848       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 849       // offsets {1, 3, 5, 7, 8} * unit.
 850 
 851       if (direction == copy_forwards) {
 852         __ sub(s, s, 16);
 853         __ sub(d, d, 8);
 854       }
 855 
 856       // Fill 8 registers
 857       //
 858       // for forwards copy s was offset by -16 from the original input
 859       // value of s so the register contents are at these offsets
 860       // relative to the 64 bit block addressed by that original input
 861       // and so on for each successive 64 byte block when s is updated
 862       //
 863       // t0 at offset 0,  t1 at offset 8
 864       // t2 at offset 16, t3 at offset 24
 865       // t4 at offset 32, t5 at offset 40
 866       // t6 at offset 48, t7 at offset 56
 867 
 868       // for backwards copy s was not offset so the register contents
 869       // are at these offsets into the preceding 64 byte block
 870       // relative to that original input and so on for each successive
 871       // preceding 64 byte block when s is updated. this explains the
 872       // slightly counter-intuitive looking pattern of register usage
 873       // in the stp instructions for backwards copy.
 874       //
 875       // t0 at offset -16, t1 at offset -8
 876       // t2 at offset -32, t3 at offset -24
 877       // t4 at offset -48, t5 at offset -40
 878       // t6 at offset -64, t7 at offset -56
 879 
 880       __ ldp(t0, t1, Address(s, 2 * unit));
 881       __ ldp(t2, t3, Address(s, 4 * unit));
 882       __ ldp(t4, t5, Address(s, 6 * unit));
 883       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 884 
 885       __ subs(count, count, 16);
 886       __ br(Assembler::LO, drain);
 887 
 888       int prefetch = PrefetchCopyIntervalInBytes;
 889       bool use_stride = false;
 890       if (direction == copy_backwards) {
 891          use_stride = prefetch > 256;
 892          prefetch = -prefetch;
 893          if (use_stride) __ mov(stride, prefetch);
 894       }
 895 
 896       __ bind(again);
 897 
 898       if (PrefetchCopyIntervalInBytes > 0)
 899         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 900 
 901       if (direction == copy_forwards) {
 902        // allowing for the offset of -8 the store instructions place
 903        // registers into the target 64 bit block at the following
 904        // offsets
 905        //
 906        // t0 at offset 0
 907        // t1 at offset 8,  t2 at offset 16
 908        // t3 at offset 24, t4 at offset 32
 909        // t5 at offset 40, t6 at offset 48
 910        // t7 at offset 56
 911 
 912         __ str(t0, Address(d, 1 * unit));
 913         __ stp(t1, t2, Address(d, 2 * unit));
 914         __ ldp(t0, t1, Address(s, 2 * unit));
 915         __ stp(t3, t4, Address(d, 4 * unit));
 916         __ ldp(t2, t3, Address(s, 4 * unit));
 917         __ stp(t5, t6, Address(d, 6 * unit));
 918         __ ldp(t4, t5, Address(s, 6 * unit));
 919         __ str(t7, Address(__ pre(d, 8 * unit)));
 920         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 921       } else {
 922        // d was not offset when we started so the registers are
 923        // written into the 64 bit block preceding d with the following
 924        // offsets
 925        //
 926        // t1 at offset -8
 927        // t3 at offset -24, t0 at offset -16
 928        // t5 at offset -48, t2 at offset -32
 929        // t7 at offset -56, t4 at offset -48
 930        //                   t6 at offset -64
 931        //
 932        // note that this matches the offsets previously noted for the
 933        // loads
 934 
 935         __ str(t1, Address(d, 1 * unit));
 936         __ stp(t3, t0, Address(d, 3 * unit));
 937         __ ldp(t0, t1, Address(s, 2 * unit));
 938         __ stp(t5, t2, Address(d, 5 * unit));
 939         __ ldp(t2, t3, Address(s, 4 * unit));
 940         __ stp(t7, t4, Address(d, 7 * unit));
 941         __ ldp(t4, t5, Address(s, 6 * unit));
 942         __ str(t6, Address(__ pre(d, 8 * unit)));
 943         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 944       }
 945 
 946       __ subs(count, count, 8);
 947       __ br(Assembler::HS, again);
 948 
 949       // Drain
 950       //
 951       // this uses the same pattern of offsets and register arguments
 952       // as above
 953       __ bind(drain);
 954       if (direction == copy_forwards) {
 955         __ str(t0, Address(d, 1 * unit));
 956         __ stp(t1, t2, Address(d, 2 * unit));
 957         __ stp(t3, t4, Address(d, 4 * unit));
 958         __ stp(t5, t6, Address(d, 6 * unit));
 959         __ str(t7, Address(__ pre(d, 8 * unit)));
 960       } else {
 961         __ str(t1, Address(d, 1 * unit));
 962         __ stp(t3, t0, Address(d, 3 * unit));
 963         __ stp(t5, t2, Address(d, 5 * unit));
 964         __ stp(t7, t4, Address(d, 7 * unit));
 965         __ str(t6, Address(__ pre(d, 8 * unit)));
 966       }
 967       // now we need to copy any remaining part block which may
 968       // include a 4 word block subblock and/or a 2 word subblock.
 969       // bits 2 and 1 in the count are the tell-tale for whetehr we
 970       // have each such subblock
 971       {
 972         Label L1, L2;
 973         __ tbz(count, exact_log2(4), L1);
 974        // this is the same as above but copying only 4 longs hence
 975        // with ony one intervening stp between the str instructions
 976        // but note that the offsets and registers still follow the
 977        // same pattern
 978         __ ldp(t0, t1, Address(s, 2 * unit));
 979         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 980         if (direction == copy_forwards) {
 981           __ str(t0, Address(d, 1 * unit));
 982           __ stp(t1, t2, Address(d, 2 * unit));
 983           __ str(t3, Address(__ pre(d, 4 * unit)));
 984         } else {
 985           __ str(t1, Address(d, 1 * unit));
 986           __ stp(t3, t0, Address(d, 3 * unit));
 987           __ str(t2, Address(__ pre(d, 4 * unit)));
 988         }
 989         __ bind(L1);
 990 
 991         __ tbz(count, 1, L2);
 992        // this is the same as above but copying only 2 longs hence
 993        // there is no intervening stp between the str instructions
 994        // but note that the offset and register patterns are still
 995        // the same
 996         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
 997         if (direction == copy_forwards) {
 998           __ str(t0, Address(d, 1 * unit));
 999           __ str(t1, Address(__ pre(d, 2 * unit)));
1000         } else {
1001           __ str(t1, Address(d, 1 * unit));
1002           __ str(t0, Address(__ pre(d, 2 * unit)));
1003         }
1004         __ bind(L2);
1005 
1006        // for forwards copy we need to re-adjust the offsets we
1007        // applied so that s and d are follow the last words written
1008 
1009        if (direction == copy_forwards) {
1010          __ add(s, s, 16);
1011          __ add(d, d, 8);
1012        }
1013 
1014       }
1015 
1016       __ ret(lr);
1017       }
1018   }
1019 
1020   // Small copy: emit code that copies fewer than 16 bytes.
1021   //
1022   // NB: Ignores all of the bits of count which represent more than 15
1023   // bytes, so a caller doesn't have to mask them.
       //
       // Arguments:
       //   s, d   - source / destination address registers; every piece that is
       //            copied goes through the "adjust" addressing helper, which
       //            presumably advances them (pre-indexed for a backwards copy,
       //            post-indexed otherwise) -- confirm against the assembler
       //   count  - number of elements, in units of |step| bytes; only the bits
       //            that select an 8-, 4-, 2- or 1-byte piece are tested
       //   tmp    - scratch register that carries each piece from s to d
       //   step   - element size in bytes; negative for a backwards copy
1024 
1025   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1026     bool is_backwards = step < 0;
1027     size_t granularity = uabs(step);        // element size in bytes
1028     int direction = is_backwards ? -1 : 1;
1029     int unit = wordSize * direction;        // signed size of one word-sized move
1030 
1031     Label Lword, Lint, Lshort, Lbyte;
1032 
1033     assert(granularity
1034            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1035 
1036     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;  // NOTE: not referenced in this function
1037 
1038     // ??? I don't know if this bit-test-and-branch is the right thing
1039     // to do.  It does a lot of jumping, resulting in several
1040     // mispredicted branches.  It might make more sense to do this
1041     // with something like Duff's device with a single computed branch.
1042 
         // Since count is in elements, bit (3 - log2(granularity)) is set exactly
         // when an 8-byte piece is pending; skip the word move when it is clear.
1043     __ tbz(count, 3 - exact_log2(granularity), Lword);
1044     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1045     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1046     __ bind(Lword);
1047 
         // The narrower pieces are only generated when an element can be that
         // small; each one tests the count bit worth 4, 2 or 1 bytes.
1048     if (granularity <= sizeof (jint)) {
1049       __ tbz(count, 2 - exact_log2(granularity), Lint);
1050       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1051       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1052       __ bind(Lint);
1053     }
1054 
1055     if (granularity <= sizeof (jshort)) {
1056       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1057       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1058       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1059       __ bind(Lshort);
1060     }
1061 
1062     if (granularity <= sizeof (jbyte)) {
1063       __ tbz(count, 0, Lbyte);
1064       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1065       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1066       __ bind(Lbyte);
1067     }
1068   }
1069 
1070   Label copy_f, copy_b;  // bl targets used by copy_memory() for the forwards / backwards bulk word copy
1071 
1072   // All-singing all-dancing memory copy.
1073   //
1074   // Copy count units of memory from s to d.  The size of a unit is
1075   // step, which can be positive or negative depending on the direction
1076   // of copy.  If is_aligned is true at most one word has to be moved to
       // get s 2-word aligned; otherwise the byte adjustment is computed and
       // copied with copy_memory_small().
1077   //
       // Layout of the generated code:
       //   - counts of at most 80 bytes (96 with UseSIMDForMemoryOps) are copied
       //     inline, using head/tail accesses whose ranges may overlap;
       //   - larger counts go to copy_big: align s, bl to copy_f / copy_b for
       //     the bulk of the words, then copy_memory_small() for the tail.
       //
       // Arguments:
       //   is_aligned - selects the cheaper alignment fix-up described above
       //   s, d       - source / destination address registers
       //   count      - element count in units of |step| bytes (clobbered)
       //   tmp        - scratch register
       //   step       - element size in bytes; negative for a backwards copy
1078 
1079   void copy_memory(bool is_aligned, Register s, Register d,
1080                    Register count, Register tmp, int step) {
1081     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1082     bool is_backwards = step < 0;
1083     int granularity = uabs(step);
1084     const Register t0 = r3, t1 = r4;
1085 
1086     // <= 96 bytes do inline. Direction doesn't matter because we always
1087     // load all the data before writing anything
1088     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1089     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1090     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1091     const Register send = r17, dend = r18;
1092 
1093     if (PrefetchCopyIntervalInBytes > 0)
1094       __ prfm(Address(s, 0), PLDL1KEEP);
         // The byte thresholds are divided by granularity because count is in elements.
1095     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1096     __ br(Assembler::HI, copy_big);
1097 
         // send / dend = one past the last source / destination byte.
1098     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1099     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1100 
1101     __ cmp(count, u1(16/granularity));
1102     __ br(Assembler::LS, copy16);
1103 
1104     __ cmp(count, u1(64/granularity));
1105     __ br(Assembler::HI, copy80);
1106 
1107     __ cmp(count, u1(32/granularity));
1108     __ br(Assembler::LS, copy32);
1109 
1110     // 33..64 bytes
         // First 32 bytes from s plus last 32 bytes ending at send; the two
         // ranges overlap when the length is below 64.
1111     if (UseSIMDForMemoryOps) {
1112       __ ldpq(v0, v1, Address(s, 0));
1113       __ ldpq(v2, v3, Address(send, -32));
1114       __ stpq(v0, v1, Address(d, 0));
1115       __ stpq(v2, v3, Address(dend, -32));
1116     } else {
1117       __ ldp(t0, t1, Address(s, 0));
1118       __ ldp(t2, t3, Address(s, 16));
1119       __ ldp(t4, t5, Address(send, -32));
1120       __ ldp(t6, t7, Address(send, -16));
1121 
1122       __ stp(t0, t1, Address(d, 0));
1123       __ stp(t2, t3, Address(d, 16));
1124       __ stp(t4, t5, Address(dend, -32));
1125       __ stp(t6, t7, Address(dend, -16));
1126     }
1127     __ b(finish);
1128 
1129     // 17..32 bytes
1130     __ bind(copy32);
1131     __ ldp(t0, t1, Address(s, 0));
1132     __ ldp(t2, t3, Address(send, -16));
1133     __ stp(t0, t1, Address(d, 0));
1134     __ stp(t2, t3, Address(dend, -16));
1135     __ b(finish);
1136 
1137     // 65..80/96 bytes
1138     // (96 bytes if SIMD because we do 32 bytes per instruction)
1139     __ bind(copy80);
1140     if (UseSIMDForMemoryOps) {
1141       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1142       __ ldpq(v4, v5, Address(send, -32));
1143       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1144       __ stpq(v4, v5, Address(dend, -32));
1145     } else {
1146       __ ldp(t0, t1, Address(s, 0));
1147       __ ldp(t2, t3, Address(s, 16));
1148       __ ldp(t4, t5, Address(s, 32));
1149       __ ldp(t6, t7, Address(s, 48));
1150       __ ldp(t8, t9, Address(send, -16));
1151 
1152       __ stp(t0, t1, Address(d, 0));
1153       __ stp(t2, t3, Address(d, 16));
1154       __ stp(t4, t5, Address(d, 32));
1155       __ stp(t6, t7, Address(d, 48));
1156       __ stp(t8, t9, Address(dend, -16));
1157     }
1158     __ b(finish);
1159 
1160     // 0..16 bytes
1161     __ bind(copy16);
1162     __ cmp(count, u1(8/granularity));
1163     __ br(Assembler::LO, copy8);
1164 
1165     // 8..16 bytes
         // First and last 8 bytes; the two accesses overlap below 16 bytes.
1166     __ ldr(t0, Address(s, 0));
1167     __ ldr(t1, Address(send, -8));
1168     __ str(t0, Address(d, 0));
1169     __ str(t1, Address(dend, -8));
1170     __ b(finish);
1171 
         // The sub-8-byte cases only exist for elements narrower than 8 bytes;
         // for wider elements copy8 / copy4 are bound next to finish at the end.
1172     if (granularity < 8) {
1173       // 4..7 bytes
1174       __ bind(copy8);
1175       __ tbz(count, 2 - exact_log2(granularity), copy4);
1176       __ ldrw(t0, Address(s, 0));
1177       __ ldrw(t1, Address(send, -4));
1178       __ strw(t0, Address(d, 0));
1179       __ strw(t1, Address(dend, -4));
1180       __ b(finish);
1181       if (granularity < 4) {
1182         // 0..3 bytes
1183         __ bind(copy4);
1184         __ cbz(count, finish); // get rid of 0 case
1185         if (granularity == 2) {
1186           __ ldrh(t0, Address(s, 0));
1187           __ strh(t0, Address(d, 0));
1188         } else { // granularity == 1
1189           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1190           // the first and last byte.
1191           // Handle the 3 byte case by loading and storing base + count/2
1192           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1193           // This does mean in the 1 byte case we load/store the same
1194           // byte 3 times.
1195           __ lsr(count, count, 1);
1196           __ ldrb(t0, Address(s, 0));
1197           __ ldrb(t1, Address(send, -1));
1198           __ ldrb(t2, Address(s, count));
1199           __ strb(t0, Address(d, 0));
1200           __ strb(t1, Address(dend, -1));
1201           __ strb(t2, Address(d, count));
1202         }
1203         __ b(finish);
1204       }
1205     }
1206 
1207     __ bind(copy_big);
         // A backwards copy starts from the end: move s and d one past the last element.
1208     if (is_backwards) {
1209       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1210       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1211     }
1212 
1213     // Now we've got the small case out of the way we can align the
1214     // source address on a 2-word boundary.
1215 
1216     Label aligned;
1217 
1218     if (is_aligned) {
1219       // We may have to adjust by 1 word to get s 2-word-aligned.
1220       __ tbz(s, exact_log2(wordSize), aligned);
1221       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1222       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1223       __ sub(count, count, wordSize/granularity);
1224     } else {
           // Backwards: the misalignment of s itself is the number of bytes to
           // peel off; forwards: the distance up to the next 2-word boundary.
1225       if (is_backwards) {
1226         __ andr(rscratch2, s, 2 * wordSize - 1);
1227       } else {
1228         __ neg(rscratch2, s);
1229         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1230       }
1231       // rscratch2 is the byte adjustment needed to align s.
1232       __ cbz(rscratch2, aligned);
           // Convert the byte adjustment into elements before subtracting it from count.
1233       int shift = exact_log2(granularity);
1234       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1235       __ sub(count, count, rscratch2);
1236 
1237 #if 0
1238       // ?? This code is only correct for a disjoint copy.  It may or
1239       // may not make sense to use it in that case.
1240 
1241       // Copy the first pair; s and d may not be aligned.
1242       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1243       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1244 
1245       // Align s and d, adjust count
1246       if (is_backwards) {
1247         __ sub(s, s, rscratch2);
1248         __ sub(d, d, rscratch2);
1249       } else {
1250         __ add(s, s, rscratch2);
1251         __ add(d, d, rscratch2);
1252       }
1253 #else
1254       copy_memory_small(s, d, rscratch2, rscratch1, step);
1255 #endif
1256     }
1257 
1258     __ bind(aligned);
1259 
1260     // s is now 2-word-aligned.
1261 
1262     // We have a count of units and some trailing bytes.  Adjust the
1263     // count and do a bulk copy of words.
         // rscratch2 = number of whole words; presumably this is how the word
         // count is handed to the copy_f / copy_b code -- confirm at their definition.
1264     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1265     if (direction == copy_forwards)
1266       __ bl(copy_f);
1267     else
1268       __ bl(copy_b);
1269 
1270     // And the tail.
1271     copy_memory_small(s, d, count, tmp, step);
1272 
         // Labels whose dedicated small-case code was not generated above still
         // have to be bound; they are only branched to with nothing left to copy.
1273     if (granularity >= 8) __ bind(copy8);
1274     if (granularity >= 4) __ bind(copy4);
1275     __ bind(finish);
1276   }
1277 
1278 
1279   void clobber_registers() {
1280 #ifdef ASSERT
         // Only emitted when ASSERT is defined: build 0xdeadbeefdeadbeef in
         // rscratch1, then copy that pattern into each of r3..r18.
1281     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1282     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1283     for (Register reg = r3; reg <= r18; reg++) {
1284       if (reg == rscratch1) continue;  // rscratch1 already holds the pattern
           __ mov(reg, rscratch1);
         }
1285 #endif
1286   }
1287 
1288   // Scan over array at a for count oops, verifying each one.
1289   // Preserves a and count, clobbers rscratch1, rscratch2 and temp.
       //
       // Arguments:
       //   size  - element size in bytes; wordSize selects the full-width oop
       //           path, any other value the 32-bit narrow-oop path
       //   a     - array base address
       //   count - number of elements to verify
       //   temp  - scratch register that receives each loaded element; must
       //           differ from a and count for those to be preserved
1290   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1291     Label loop, end;
1292     __ mov(rscratch1, a);
1293     __ mov(rscratch2, zr);   // rscratch2 = element index
1294     __ bind(loop);
1295     __ cmp(rscratch2, count);
1296     __ br(Assembler::HS, end);
1297     if (size == (size_t)wordSize) {
1298       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1299       __ verify_oop(temp);
1300     } else {
           // Load into temp (not a hard-coded register) so that the value being
           // decoded and verified is the one just loaded, whatever the caller
           // passes as temp.
1301       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1302       __ decode_heap_oop(temp); // calls verify_oop
1303     }
         // The index is scaled by the element size in the address computation and
         // compared against the element count, so it advances by one element;
         // stepping by size would skip all but every size-th element.
1304     __ add(rscratch2, rscratch2, 1);
1305     __ b(loop);
1306     __ bind(end);
1307   }
1308 
1309   // Generate a stub that copies a disjoint (non-overlapping) array forwards.
       //
       // Arguments:
1310   //   size    - element size in bytes
       //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary;
1311   //             adds ARRAYCOPY_ALIGNED and is passed on to copy_memory()
1312   //   is_oop  - true => oop array, so generate store check code
       //   entry   - if non-NULL, receives the pc that follows the frame setup
1313   //   name    - stub name string
       //   dest_uninitialized - true => adds IS_DEST_UNINITIALIZED to the decorators
1314   //
1315   // Inputs:
1316   //   c_rarg0   - source array address
1317   //   c_rarg1   - destination array address
1318   //   c_rarg2   - element count, treated as ssize_t, can be zero
1319   //
1320   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1321   // the hardware handle it.  The two dwords within qwords that span
1322   // cache line boundaries will still be loaded and stored atomically.
1323   //
1324   // Side Effects:
1325   //   *entry (when entry is non-NULL) is set to the no-overlap entry point,
1326   //   for use as the nooverlap_target of the matching conjoint stub.
1327   //
       // Returns the address of the first instruction of the stub; the stub
       // itself returns 0 in r0.
1328   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1329                                   const char *name, bool dest_uninitialized = false) {
1330     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1331     RegSet saved_reg = RegSet::of(s, d, count);
1332     __ align(CodeEntryAlignment);
1333     StubCodeMark mark(this, "StubRoutines", name);
1334     address start = __ pc();
1335     __ enter();
1336 
         // The secondary entry point is recorded after enter(), i.e. it expects
         // the frame to have been set up by whoever branches to it.
1337     if (entry != NULL) {
1338       *entry = __ pc();
1339       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1340       BLOCK_COMMENT("Entry:");
1341     }
1342 
1343     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1344     if (dest_uninitialized) {
1345       decorators |= IS_DEST_UNINITIALIZED;
1346     }
1347     if (aligned) {
1348       decorators |= ARRAYCOPY_ALIGNED;
1349     }
1350 
1351     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1352     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1353 
1354     if (is_oop) {
1355       // save regs before copy_memory
           // (d and count are needed again for oop verification and the epilogue)
1356       __ push(RegSet::of(d, count), sp);
1357     }
1358     {
1359       // UnsafeCopyMemory page error: continue after ucm
1360       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1361       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
           // A positive step requests a forwards copy.
1362       copy_memory(aligned, s, d, count, rscratch1, size);
1363     }
1364 
1365     if (is_oop) {
1366       __ pop(RegSet::of(d, count), sp);
1367       if (VerifyOops)
1368         verify_oop_array(size, d, count, r16);
1369     }
1370 
1371     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1372 
1373     __ leave();
1374     __ mov(r0, zr); // return 0
1375     __ ret(lr);
1376     return start;
1377   }
1378 
1379   // Generate a stub that copies a possibly overlapping array.  When the
       // destination does not start inside the source range the stub branches to
       // nooverlap_target; otherwise it copies backwards.
       //
       // Arguments:
1380   //   size    - element size in bytes
       //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary;
1381   //             adds ARRAYCOPY_ALIGNED and is passed on to copy_memory()
1382   //   is_oop  - true => oop array, so generate store check code
       //   nooverlap_target - address branched to when a forwards copy is safe
       //   entry   - if non-NULL, receives the pc that follows the frame setup
1383   //   name    - stub name string
       //   dest_uninitialized - true => adds IS_DEST_UNINITIALIZED to the decorators
1384   //
1385   // Inputs:
1386   //   c_rarg0   - source array address
1387   //   c_rarg1   - destination array address
1388   //   c_rarg2   - element count, treated as ssize_t, can be zero
1389   //
1390   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1391   // the hardware handle it.  The two dwords within qwords that span
1392   // cache line boundaries will still be loaded and stored atomically.
1393   //
       // Returns the address of the first instruction of the stub; the stub
       // itself returns 0 in r0.
1394   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1395                                  address *entry, const char *name,
1396                                  bool dest_uninitialized = false) {
1397     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1398     RegSet saved_regs = RegSet::of(s, d, count);
1399     StubCodeMark mark(this, "StubRoutines", name);
1400     address start = __ pc();
1401     __ enter();
1402 
1403     if (entry != NULL) {
1404       *entry = __ pc();
1405       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1406       BLOCK_COMMENT("Entry:");
1407     }
1408 
1409     // use fwd copy when (d-s) above_equal (count*size)
         // The comparison is unsigned (HS), so d below s wraps to a large value
         // and also takes the forward path.
1410     __ sub(rscratch1, d, s);
1411     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1412     __ br(Assembler::HS, nooverlap_target);
1413 
1414     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1415     if (dest_uninitialized) {
1416       decorators |= IS_DEST_UNINITIALIZED;
1417     }
1418     if (aligned) {
1419       decorators |= ARRAYCOPY_ALIGNED;
1420     }
1421 
1422     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1423     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1424 
1425     if (is_oop) {
1426       // save regs before copy_memory
           // (d and count are needed again for oop verification and the epilogue)
1427       __ push(RegSet::of(d, count), sp);
1428     }
1429     {
1430       // UnsafeCopyMemory page error: continue after ucm
1431       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1432       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
           // A negative step requests a backwards copy.  size is a size_t, so
           // -size wraps and relies on the conversion to the int step parameter.
1433       copy_memory(aligned, s, d, count, rscratch1, -size);
1434     }
1435     if (is_oop) {
1436       __ pop(RegSet::of(d, count), sp);
1437       if (VerifyOops)
1438         verify_oop_array(size, d, count, r16);
1439     }
1440     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1441     __ leave();
1442     __ mov(r0, zr); // return 0
1443     __ ret(lr);
1444     return start;
1445 }
1446 
1447   // Generate the disjoint (non-overlapping) jbyte array copy stub.
1448   //
1449   // Arguments:
1450   //   aligned - true => Input and output aligned on a HeapWord == 8-byte
1451   //             boundary; forwarded to generate_disjoint_copy()
1452   //   entry   - forwarded to generate_disjoint_copy()
1453   //   name    - stub name string
1454   //
1455   // Inputs of the generated stub:
1456   //   c_rarg0   - source array address
1457   //   c_rarg1   - destination array address
1458   //   c_rarg2   - element count, treated as ssize_t, can be zero
1459   //
1460   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1461   // we let the hardware handle it.  The one to eight bytes within words,
1462   // dwords or qwords that span cache line boundaries will still be loaded
1463   // and stored atomically.
1468   //
1469   // Side Effects:
1470   //   *entry (when non-NULL) is set to the no-overlap entry point
1471   //   used by the conjoint byte copy stub.
1472   //
1473   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1474     const size_t element_size = sizeof (jbyte);
1475     return generate_disjoint_copy(element_size, aligned, /* is_oop */ false, entry, name);
1476   }
1477 
1478   // Generate the conjoint (possibly overlapping) jbyte array copy stub.
1479   //
1480   // Arguments:
1481   //   aligned          - true => Input and output aligned on a HeapWord ==
1482   //                      8-byte boundary; forwarded to generate_conjoint_copy()
1483   //   nooverlap_target - forward-copy entry used when the arrays do not overlap
1484   //   entry, name      - forwarded to generate_conjoint_copy()
1485   //
1486   // Inputs of the generated stub:
1487   //   c_rarg0   - source array address
1488   //   c_rarg1   - destination array address
1489   //   c_rarg2   - element count, treated as ssize_t, can be zero
1490   //
1491   // Misaligned 'from'/'to' addresses are left to the hardware; bytes within
1492   // words, dwords or qwords are still loaded and stored atomically.
1493   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1494                                       address* entry, const char *name) {
1495     const size_t element_size = sizeof (jbyte);
1496     return generate_conjoint_copy(element_size, aligned, /* is_oop */ false, nooverlap_target, entry, name);
1497   }
1498 
1499   // Generate the disjoint (non-overlapping) jshort array copy stub.
1500   //
1501   // Arguments:
1502   //   aligned - true => Input and output aligned on a HeapWord == 8-byte
1503   //             boundary; forwarded to generate_disjoint_copy()
1504   //   entry   - forwarded to generate_disjoint_copy()
1505   //   name    - stub name string
1506   //
1507   // Inputs of the generated stub:
1508   //   c_rarg0   - source array address
1509   //   c_rarg1   - destination array address
1510   //   c_rarg2   - element count, treated as ssize_t, can be zero
1511   //
1512   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1513   // let the hardware handle it.  The two or four words within dwords
1514   // or qwords that span cache line boundaries will still be loaded
1515   // and stored atomically.
1516   //
1517   // Side Effects:
1518   //   *entry (when non-NULL) is set to the no-overlap entry point
1519   //   used by the conjoint short copy stub.
1520   //
1521   address generate_disjoint_short_copy(bool aligned,
1522                                        address* entry, const char *name) {
1523     const size_t element_size = sizeof (jshort);
1524     return generate_disjoint_copy(element_size, aligned, /* is_oop */ false, entry, name);
1525   }
1523 
1524   // Generate the conjoint (possibly overlapping) jshort array copy stub.
1525   //
1526   // Arguments:
1527   //   aligned          - true => Input and output aligned on a HeapWord ==
1528   //                      8-byte boundary; forwarded to generate_conjoint_copy()
1529   //   nooverlap_target - forward-copy entry used when the arrays do not overlap
1530   //   entry, name      - forwarded to generate_conjoint_copy()
1531   //
1532   // Inputs of the generated stub:
1533   //   c_rarg0   - source array address
1534   //   c_rarg1   - destination array address
1535   //   c_rarg2   - element count, treated as ssize_t, can be zero
1536   //
1537   // Addresses aligned only on 4- or 2-byte boundaries are left to the
1538   // hardware; the words within dwords or qwords are still copied atomically.
1539   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1540                                        address *entry, const char *name) {
1541     const size_t element_size = sizeof (jshort);
1542     return generate_conjoint_copy(element_size, aligned, /* is_oop */ false, nooverlap_target, entry, name);
1544   }
1545   // Generate the disjoint (non-overlapping) jint array copy stub.
1546   //
1547   // Arguments:
1548   //   aligned - true => Input and output aligned on a HeapWord == 8-byte
1549   //             boundary; forwarded to generate_disjoint_copy()
1550   //   entry   - forwarded to generate_disjoint_copy()
1551   //   name    - stub name string
1552   //   dest_uninitialized - accepted but not forwarded (this is not an oop copy)
1553   //
1554   // Inputs of the generated stub:
1555   //   c_rarg0   - source array address
1556   //   c_rarg1   - destination array address
1557   //   c_rarg2   - element count, treated as ssize_t, can be zero
1558   //
1559   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1560   // the hardware handle it.  The two dwords within qwords that span
1561   // cache line boundaries will still be loaded and stored atomically.
1562   //
       // Side Effects:
       //   *entry (when non-NULL) is set to the no-overlap entry point
       //   used by the conjoint int copy stub.
       //
1563   address generate_disjoint_int_copy(bool aligned, address *entry,
1564                                          const char *name, bool dest_uninitialized = false) {
1565     const size_t element_size = sizeof (jint);
1566     return generate_disjoint_copy(element_size, aligned, /* is_oop */ false, entry, name);
1567   }
1568 
1569   // Generate the conjoint (possibly overlapping) jint array copy stub.
1570   //
1571   // Arguments:
1572   //   aligned          - true => Input and output aligned on a HeapWord ==
1573   //                      8-byte boundary; forwarded to generate_conjoint_copy()
1574   //   nooverlap_target - forward-copy entry used when the arrays do not overlap
1575   //   entry, name      - forwarded to generate_conjoint_copy()
1576   //   dest_uninitialized - accepted but not forwarded (this is not an oop copy)
1577   //
1578   // Inputs of the generated stub:
1579   //   c_rarg0   - source array address
1580   //   c_rarg1   - destination array address
1581   //   c_rarg2   - element count, treated as ssize_t, can be zero
1582   //
       // Addresses aligned only on 4-byte boundaries are left to the hardware;
       // the two dwords within a qword are still loaded and stored atomically.
1583   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1584                                      address *entry, const char *name,
1585                                      bool dest_uninitialized = false) {
1586     const size_t element_size = sizeof (jint);
1587     return generate_conjoint_copy(element_size, aligned, /* is_oop */ false, nooverlap_target, entry, name);
1588   }
1589 
1590 
1591   // Generate the disjoint (non-overlapping) jlong array copy stub.
1592   //
1593   // Arguments:
1594   //   aligned - true => Input and output aligned on a HeapWord boundary == 8
1595   //             bytes; forwarded to generate_disjoint_copy()
1596   //   entry   - forwarded to generate_disjoint_copy()
1597   //   name    - stub name string
1598   //   dest_uninitialized - accepted but not forwarded (this is not an oop copy)
1599   //
1600   // Inputs of the generated stub:
1601   //   c_rarg0   - source array address
1602   //   c_rarg1   - destination array address
1603   //   c_rarg2   - element count, treated as size_t, can be zero
1604   //
       // Side Effects:
       //   *entry (when non-NULL) is set to the no-overlap entry point
       //   used by the conjoint long copy stub.
       //
1605   address generate_disjoint_long_copy(bool aligned, address *entry,
1606                                           const char *name, bool dest_uninitialized = false) {
1607     const size_t element_size = sizeof (jlong);
1608     return generate_disjoint_copy(element_size, aligned, /* is_oop */ false, entry, name);
1609   }
1610 
1611   // Generate the conjoint (possibly overlapping) jlong array copy stub.
1612   //
1613   // Arguments:
1614   //   aligned          - true => Input and output aligned on a HeapWord
1615   //                      boundary == 8 bytes; forwarded to generate_conjoint_copy()
1616   //   nooverlap_target - forward-copy entry used when the arrays do not overlap
1617   //   entry, name      - forwarded to generate_conjoint_copy()
1618   //   dest_uninitialized - accepted but not forwarded (this is not an oop copy)
1619   //
       // Inputs of the generated stub:
       //   c_rarg0   - source array address
       //   c_rarg1   - destination array address
       //   c_rarg2   - element count, treated as size_t, can be zero
1620   //
1621   address generate_conjoint_long_copy(bool aligned,
1622                                       address nooverlap_target, address *entry,
1623                                       const char *name, bool dest_uninitialized = false) {
1624     const size_t element_size = sizeof (jlong);
1625     return generate_conjoint_copy(element_size, aligned, /* is_oop */ false, nooverlap_target, entry, name);
1626   }
1627 
1628   // Arguments:
1629   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1630   //             ignored
1631   //   name    - stub name string
1632   //
1633   // Inputs:
1634   //   c_rarg0   - source array address
1635   //   c_rarg1   - destination array address
1636   //   c_rarg2   - element count, treated as size_t, can be zero
1637   //
1638   // Side Effects:
1639   //   *entry (e.g. disjoint_oop_copy_entry) is set to the no-overlap entry
1640   //   point, which is meant to be handed to generate_conjoint_oop_copy().
1641   //
1642   address generate_disjoint_oop_copy(bool aligned, address *entry,
1643                                      const char *name, bool dest_uninitialized) {
1644     // Element size follows the heap oop encoding: jint-sized when compressed, jlong-sized otherwise.
1645     const size_t oop_bytes = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1646     return generate_disjoint_copy(oop_bytes, aligned, /* is_oop */ true, entry, name, dest_uninitialized);
1647   }
1648 
1649   // Arguments:
1650   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1651   //             ignored
1652   //   name    - stub name string
1653   //
1654   // Inputs:
1655   //   c_rarg0   - source array address
1656   //   c_rarg1   - destination array address
1657   //   c_rarg2   - element count, treated as size_t, can be zero
1658   //
1659   address generate_conjoint_oop_copy(bool aligned,
1660                                      address nooverlap_target, address *entry,
1661                                      const char *name, bool dest_uninitialized) {
1662     // Element size follows the heap oop encoding: jint-sized when compressed, jlong-sized otherwise.
1663     const size_t oop_bytes = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1664     return generate_conjoint_copy(oop_bytes, aligned, /* is_oop */ true,
1665                                   nooverlap_target, entry, name, dest_uninitialized);
1666   }
1667 
1668 
1669   // Helper for generating a dynamic type check.
1670   // Smashes rscratch1, rscratch2.
1671   void generate_type_check(Register sub_klass,             // klass being tested (the candidate subtype)
1672                            Register super_check_offset,    // super-check offset; only the fast path receives it
1673                            Register super_klass,           // klass to test against
1674                            Label& L_success) {             // branch target when the subtype test succeeds
1675     assert_different_registers(sub_klass, super_check_offset, super_klass);
1676 
1677     BLOCK_COMMENT("type_check:");
1678 
1679     Label L_miss;   // bound as the last thing emitted, so a miss continues with whatever the caller emits next
1680     // The fast path is given both outcome labels; the slow path is given only L_success.
1681     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1682                                      super_check_offset);
1683     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1684 
1685     // Fall through on failure!
1686     __ BIND(L_miss);
1687   }
1688 
1689   //
1690   //  Generate checkcasting array copy stub
1691   //
1692   //  Input:
1693   //    c_rarg0   - source array address
1694   //    c_rarg1   - destination array address
1695   //    c_rarg2   - element count, treated as ssize_t, can be zero
1696   //    c_rarg3   - size_t ckoff (super_check_offset)
1697   //    c_rarg4   - oop ckval (super_klass)
1698   //
1699   //  Output:
1700   //    r0 ==  0  -  success
1701   //    r0 == -1^K - failure, where K is partial transfer count
1702   //
1703   address generate_checkcast_copy(const char *name, address *entry,
1704                                   bool dest_uninitialized = false) {
1705     // Emits an oop-by-oop copy loop that type-checks every non-null element before storing it.
1706     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1707 
1708     // Input registers (after setup_arg_regs)
1709     const Register from        = c_rarg0;   // source array address
1710     const Register to          = c_rarg1;   // destination array address
1711     const Register count       = c_rarg2;   // element count
1712     const Register ckoff       = c_rarg3;   // super_check_offset
1713     const Register ckval       = c_rarg4;   // super_klass
1714 
1715     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);   // all five argument registers; handed to arraycopy_prologue
1716     RegSet wb_post_saved_regs = RegSet::of(count);                // count ends up in r0 as the result; handed to arraycopy_epilogue
1717 
1718     // Registers used as temps (r18-r21 are pushed below and popped again at L_done_pop)
1719     const Register count_save  = r21;       // original element count
1720     const Register start_to    = r20;       // destination array start address
1721     const Register copied_oop  = r18;       // actual oop copied
1722     const Register r19_klass   = r19;       // oop._klass
1723 
1724     //---------------------------------------------------------------
1725     // Assembler stub will be used for this call to arraycopy
1726     // if the two arrays are subtypes of Object[] but the
1727     // destination array type is not equal to or a supertype
1728     // of the source type.  Each element must be separately
1729     // checked.
1730 
1731     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1732                                copied_oop, r19_klass, count_save);
1733 
1734     __ align(CodeEntryAlignment);
1735     StubCodeMark mark(this, "StubRoutines", name);
1736     address start = __ pc();
1737 
1738     __ enter(); // required for proper stackwalking of RuntimeStub frame
1739 
1740 #ifdef ASSERT
1741     // caller guarantees that the arrays really are different
1742     // otherwise, we would have to make conjoint checks
1743     { Label L;
1744       array_overlap_test(L, TIMES_OOP);
1745       __ stop("checkcast_copy within a single array");
1746       __ bind(L);
1747     }
1748 #endif //ASSERT
1749 
1750     // Caller of this entry point must set up the argument registers.
1751     if (entry != NULL) {
1752       *entry = __ pc();   // secondary entry: placed after enter() and after the debug-only overlap test
1753       BLOCK_COMMENT("Entry:");
1754     }
1755 
1756      // Empty array:  Nothing to do.
1757     __ cbz(count, L_done);   // L_done returns count (== 0 here) without touching the saved registers
1758 
1759     __ push(RegSet::of(r18, r19, r20, r21), sp);   // only on the non-empty path, hence separate L_done_pop / L_done exits
1760 
1761 #ifdef ASSERT
1762     BLOCK_COMMENT("assert consistent ckoff/ckval");
1763     // The ckoff and ckval must be mutually consistent,
1764     // even though caller generates both.
1765     { Label L;
1766       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1767       __ ldrw(start_to, Address(ckval, sco_offset));   // start_to is merely a scratch register here; it is set for real below
1768       __ cmpw(ckoff, start_to);
1769       __ br(Assembler::EQ, L);
1770       __ stop("super_check_offset inconsistent");
1771       __ bind(L);
1772     }
1773 #endif //ASSERT
1774 
1775     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1776     bool is_oop = true;
1777     if (dest_uninitialized) {
1778       decorators |= IS_DEST_UNINITIALIZED;
1779     }
1780 
1781     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1782     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1783 
1784     // save the original count
1785     __ mov(count_save, count);
1786 
1787     // Copy from low to high addresses
1788     __ mov(start_to, to);              // Save destination array start address
1789     __ b(L_load_element);
1790 
1791     // ======== begin loop ========
1792     // (Loop is rotated; its entry is L_load_element.)
1793     // Loop control:
1794     //   for (; count != 0; count--) {
1795     //     copied_oop = load_heap_oop(from++);
1796     //     ... generate_type_check ...;
1797     //     store_heap_oop(to++, copied_oop);
1798     //   }
1799     __ align(OptoLoopAlignment);
1800 
1801     __ BIND(L_store_element);
1802     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1803     __ sub(count, count, 1);
1804     __ cbz(count, L_do_card_marks);   // everything copied: count == 0 is also the success result
1805 
1806     // ======== loop entry is here ========
1807     __ BIND(L_load_element);
1808     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1809     __ cbz(copied_oop, L_store_element);   // a null element is stored without any type check
1810 
1811     __ load_klass(r19_klass, copied_oop);// query the object klass
1812     generate_type_check(r19_klass, ckoff, ckval, L_store_element);   // success branches back to the store; failure falls through
1813     // ======== end loop ========
1814 
1815     // It was a real error; we must depend on the caller to finish the job.
1816     // Register count = remaining oops, count_save = total oops.
1817     // Emit GC store barriers for the oops we have copied and report
1818     // their number to the caller.
1819 
1820     __ subs(count, count_save, count);     // K = partially copied oop count
1821     __ eon(count, count, zr);                   // report (-1^K) to caller
1822     __ br(Assembler::EQ, L_done_pop);   // tests the flags set by subs: K == 0, nothing was stored, so skip the epilogue
1823 
1824     __ BIND(L_do_card_marks);
1825     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);   // given start_to and the full count_save on both the success and the partial-failure path
1826 
1827     __ bind(L_done_pop);
1828     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1829     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1830 
1831     __ bind(L_done);
1832     __ mov(r0, count);   // result: 0 on success (including the empty case), otherwise -1^K
1833     __ leave();
1834     __ ret(lr);
1835 
1836     return start;
1837   }
1838 
1839   // Perform range checks on the proposed arraycopy.
1840   // Kills temp and rscratch1 (both are written here), but nothing else.
1841   // Also, clean the sign bits of src_pos and dst_pos.
1842   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1843                               Register src_pos, // source position (c_rarg1)
1844                               Register dst,     // destination array oop (c_rarg2)
1845                               Register dst_pos, // destination position (c_rarg3)
1846                               Register length,  // number of elements to copy (32-bit value)
1847                               Register temp,    // scratch for the position + length sums; must not be rscratch1
1848                               Label& L_failed) { // branch target when either range check fails
1849     BLOCK_COMMENT("arraycopy_range_checks:");
1850 
1851     assert_different_registers(rscratch1, temp);   // rscratch1 holds the array length while temp holds the sum
1852 
1853     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1854     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1855     __ addw(temp, length, src_pos);
1856     __ cmpw(temp, rscratch1);
1857     __ br(Assembler::HI, L_failed);   // HI is an unsigned 32-bit comparison
1858 
1859     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1860     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1861     __ addw(temp, length, dst_pos);
1862     __ cmpw(temp, rscratch1);
1863     __ br(Assembler::HI, L_failed);   // HI is an unsigned 32-bit comparison
1864 
1865     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1866     __ movw(src_pos, src_pos);
1867     __ movw(dst_pos, dst_pos);
1868 
1869     BLOCK_COMMENT("arraycopy_range_checks done");
1870   }
1871 
1872   // These stubs get called from some dumb test routine.
1873   // I'll write them properly when they're called from
1874   // something that's actually doing something.
1875   static void fake_arraycopy_stub(address src, address dst, int count) {   // placeholder: copies nothing
1876     assert(count == 0, "huh?");   // since nothing is copied, only a zero-length request is acceptable
1877   }
1878 
1879 
1880   //
1881   //  Generate 'unsafe' array copy stub
1882   //  Though just as safe as the other stubs, it takes an unscaled
1883   //  size_t argument instead of an element count.
1884   //
1885   //  Input:
1886   //    c_rarg0   - source array address
1887   //    c_rarg1   - destination array address
1888   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1889   //
1890   // Examines the alignment of the operands and dispatches
1891   // to a long, int, short, or byte copy loop.
1892   //
1893   address generate_unsafe_copy(const char *name,
1894                                address byte_copy_entry,
1895                                address short_copy_entry,
1896                                address int_copy_entry,
1897                                address long_copy_entry) {   // the four entries are the branch targets chosen below
1898     Label L_long_aligned, L_int_aligned, L_short_aligned;
1899     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;   // source address, destination address, byte count
1900 
1901     __ align(CodeEntryAlignment);
1902     StubCodeMark mark(this, "StubRoutines", name);
1903     address start = __ pc();
1904     __ enter(); // required for proper stackwalking of RuntimeStub frame
1905 
1906     // bump this on entry, not on exit:
1907     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1908 
1909     __ orr(rscratch1, s, d);
1910     __ orr(rscratch1, rscratch1, count);   // a low bit set in any of the three operands is set in the combination
1911     // Pick the widest element size for which source, destination and byte count are all aligned.
1912     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1913     __ cbz(rscratch1, L_long_aligned);
1914     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1915     __ cbz(rscratch1, L_int_aligned);
1916     __ tbz(rscratch1, 0, L_short_aligned);   // bit 0 clear: everything is at least 2-byte aligned
1917     __ b(RuntimeAddress(byte_copy_entry));   // byte count is already the element count
1918     // Each case below scales the byte count to an element count and branches (not calls) to the copy entry.
1919     __ BIND(L_short_aligned);
1920     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1921     __ b(RuntimeAddress(short_copy_entry));
1922     __ BIND(L_int_aligned);
1923     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1924     __ b(RuntimeAddress(int_copy_entry));
1925     __ BIND(L_long_aligned);
1926     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1927     __ b(RuntimeAddress(long_copy_entry));
1928     // NOTE(review): no leave()/ret is emitted here; presumably the branch targets return to our caller - confirm
1929     return start;
1930   }
1931 
1932   //
1933   //  Generate generic array copy stubs
1934   //
1935   //  Input:
1936   //    c_rarg0    -  src oop
1937   //    c_rarg1    -  src_pos (32-bits)
1938   //    c_rarg2    -  dst oop
1939   //    c_rarg3    -  dst_pos (32-bits)
1940   //    c_rarg4    -  element count (32-bits)
1941   //
1942   //  Output:
1943   //    r0 ==  0  -  success
1944   //    r0 == -1^K - failure, where K is partial transfer count
1945   //
1946   address generate_generic_copy(const char *name,
1947                                 address byte_copy_entry, address short_copy_entry,
1948                                 address int_copy_entry, address oop_copy_entry,
1949                                 address long_copy_entry, address checkcast_copy_entry) {
1950     // Emits the general arraycopy stub: validates the arguments, then branches to one of the given copy entries.
1951     Label L_failed, L_objArray;
1952     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1953 
1954     // Input registers
1955     const Register src        = c_rarg0;  // source array oop
1956     const Register src_pos    = c_rarg1;  // source position
1957     const Register dst        = c_rarg2;  // destination array oop
1958     const Register dst_pos    = c_rarg3;  // destination position
1959     const Register length     = c_rarg4;  // element count
1960 
1961 
1962     // Registers used as temps
1963     const Register dst_klass  = c_rarg5;  // only used on the checkcast path below
1964 
1965     __ align(CodeEntryAlignment);
1966 
1967     StubCodeMark mark(this, "StubRoutines", name);
1968 
1969     address start = __ pc();
1970 
1971     __ enter(); // required for proper stackwalking of RuntimeStub frame
1972 
1973     // bump this on entry, not on exit:
1974     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1975 
1976     //-----------------------------------------------------------------------
1977     // Assembler stub will be used for this call to arraycopy
1978     // if the following conditions are met:
1979     //
1980     // (1) src and dst must not be null.
1981     // (2) src_pos must not be negative.
1982     // (3) dst_pos must not be negative.
1983     // (4) length  must not be negative.
1984     // (5) src klass and dst klass should be the same and not NULL.
1985     // (6) src and dst should be arrays.
1986     // (7) src_pos + length must not exceed length of src.
1987     // (8) dst_pos + length must not exceed length of dst.
1988     //
1989 
1990     //  if (src == NULL) return -1;
1991     __ cbz(src, L_failed);
1992 
1993     //  if (src_pos < 0) return -1;
1994     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
1995 
1996     //  if (dst == NULL) return -1;
1997     __ cbz(dst, L_failed);
1998 
1999     //  if (dst_pos < 0) return -1;
2000     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2001 
2002     // registers used as temp
2003     const Register scratch_length    = r16; // elements count to copy
2004     const Register scratch_src_klass = r17; // array klass
2005     const Register lh                = r18; // layout helper
2006 
2007     //  if (length < 0) return -1;
2008     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2009     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2010 
2011     __ load_klass(scratch_src_klass, src);
2012 #ifdef ASSERT
2013     //  assert(src->klass() != NULL);
2014     {
2015       BLOCK_COMMENT("assert klasses not null {");
2016       Label L1, L2;
2017       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2018       __ bind(L1);
2019       __ stop("broken null klass");
2020       __ bind(L2);
2021       __ load_klass(rscratch1, dst);
2022       __ cbz(rscratch1, L1);     // this would be broken also
2023       BLOCK_COMMENT("} assert klasses not null done");
2024     }
2025 #endif
2026 
2027     // Load layout helper (32-bits)
2028     //
2029     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2030     // 32        30    24            16              8     2                 0
2031     //
2032     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2033     //
2034 
2035     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2036 
2037     // Handle objArrays completely differently...
2038     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2039     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2040     __ movw(rscratch1, objArray_lh);
2041     __ eorw(rscratch2, lh, rscratch1);
2042     __ cbzw(rscratch2, L_objArray);   // src layout helper equals the objArray layout helper exactly
2043 
2044     //  if (src->klass() != dst->klass()) return -1;
2045     __ load_klass(rscratch2, dst);
2046     __ eor(rscratch2, rscratch2, scratch_src_klass);
2047     __ cbnz(rscratch2, L_failed);
2048 
2049     //  if (!src->is_Array()) return -1;
2050     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2051 
2052     // At this point, it is known to be a typeArray (array_tag 0x3).
2053 #ifdef ASSERT
2054     {
2055       BLOCK_COMMENT("assert primitive array {");
2056       Label L;
2057       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2058       __ cmpw(lh, rscratch2);
2059       __ br(Assembler::GE, L);
2060       __ stop("must be a primitive array");
2061       __ bind(L);
2062       BLOCK_COMMENT("} assert primitive array done");
2063     }
2064 #endif
2065 
2066     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2067                            rscratch2, L_failed);   // temp is rscratch2; the helper also writes rscratch1
2068 
2069     // TypeArrayKlass
2070     //
2071     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2072     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2073     //
2074 
2075     const Register rscratch1_offset = rscratch1;    // array offset
2076     const Register r18_elsize = lh; // element size
2077 
2078     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2079            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2080     __ add(src, src, rscratch1_offset);           // src array offset
2081     __ add(dst, dst, rscratch1_offset);           // dst array offset
2082     BLOCK_COMMENT("choose copy loop based on element size");
2083 
2084     // next registers should be set before the jump to corresponding stub
2085     const Register from     = c_rarg0;  // source array address
2086     const Register to       = c_rarg1;  // destination array address
2087     const Register count    = c_rarg2;  // elements count
2088 
2089     // 'from', 'to', 'count' registers should be set in such order
2090     // since they are the same as 'src', 'src_pos', 'dst'.
2091 
2092     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2093 
2094     // The possible values of elsize are 0-3, i.e. exact_log2(element
2095     // size in bytes).  We do a simple bitwise binary search.
2096   __ BIND(L_copy_bytes);
2097     __ tbnz(r18_elsize, 1, L_copy_ints);     // bit 1 set: elsize is 2 or 3
2098     __ tbnz(r18_elsize, 0, L_copy_shorts);   // bit 0 set: elsize is 1
2099     __ lea(from, Address(src, src_pos));// src_addr
2100     __ lea(to,   Address(dst, dst_pos));// dst_addr
2101     __ movw(count, scratch_length); // length
2102     __ b(RuntimeAddress(byte_copy_entry));
2103 
2104   __ BIND(L_copy_shorts);
2105     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2106     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2107     __ movw(count, scratch_length); // length
2108     __ b(RuntimeAddress(short_copy_entry));
2109 
2110   __ BIND(L_copy_ints);
2111     __ tbnz(r18_elsize, 0, L_copy_longs);    // bits 1 and 0 both set: elsize is 3
2112     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2113     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2114     __ movw(count, scratch_length); // length
2115     __ b(RuntimeAddress(int_copy_entry));
2116 
2117   __ BIND(L_copy_longs);
2118 #ifdef ASSERT
2119     {
2120       BLOCK_COMMENT("assert long copy {");
2121       Label L;
2122       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2123       __ cmpw(r18_elsize, LogBytesPerLong);
2124       __ br(Assembler::EQ, L);
2125       __ stop("must be long copy, but elsize is wrong");
2126       __ bind(L);
2127       BLOCK_COMMENT("} assert long copy done");
2128     }
2129 #endif
2130     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2131     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2132     __ movw(count, scratch_length); // length
2133     __ b(RuntimeAddress(long_copy_entry));
2134 
2135     // ObjArrayKlass
2136   __ BIND(L_objArray);
2137     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2138 
2139     Label L_plain_copy, L_checkcast_copy;
2140     //  test array classes for subtyping
2141     __ load_klass(r18, dst);   // r18 held the layout helper until here; from now on it holds the dst klass
2142     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2143     __ br(Assembler::NE, L_checkcast_copy);
2144 
2145     // Identically typed arrays can be copied without element-wise checks.
2146     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2147                            rscratch2, L_failed);
2148 
2149     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2150     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2151     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2152     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2153     __ movw(count, scratch_length); // length
2154   __ BIND(L_plain_copy);   // also the success target of generate_type_check on the checkcast path below
2155     __ b(RuntimeAddress(oop_copy_entry));
2156 
2157   __ BIND(L_checkcast_copy);
2158     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2159     {
2160       // Before looking at dst.length, make sure dst is also an objArray.
2161       __ ldrw(rscratch1, Address(r18, lh_offset));
2162       __ movw(rscratch2, objArray_lh);
2163       __ eorw(rscratch1, rscratch1, rscratch2);
2164       __ cbnzw(rscratch1, L_failed);
2165 
2166       // It is safe to examine both src.length and dst.length.
2167       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2168                              r18, L_failed);   // r18 is the temp here, so the dst klass it held is overwritten
2169 
2170       __ load_klass(dst_klass, dst); // reload (this time into dst_klass, i.e. c_rarg5)
2171 
2172       // Marshal the base address arguments now, freeing registers.
2173       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2174       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2175       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2176       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2177       __ movw(count, length);           // length (reloaded)
2178       Register sco_temp = c_rarg3;      // this register is free now
2179       assert_different_registers(from, to, count, sco_temp,
2180                                  dst_klass, scratch_src_klass);
2181       // assert_clean_int(count, sco_temp);
2182 
2183       // Generate the type check.
2184       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2185       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2186 
2187       // Smashes rscratch1, rscratch2
2188       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);   // src klass is a subtype of dst klass: plain oop copy suffices
2189 
2190       // Fetch destination element klass from the ObjArrayKlass header.
2191       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2192       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2193       __ ldrw(sco_temp, Address(dst_klass, sco_offset));   // super_check_offset of the element klass
2194 
2195       // the checkcast_copy loop needs two extra arguments:
2196       assert(c_rarg3 == sco_temp, "#3 already in place");
2197       // Set up arguments for checkcast_copy_entry.
2198       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2199       __ b(RuntimeAddress(checkcast_copy_entry));
2200     }
2201 
2202   __ BIND(L_failed);
2203     __ mov(r0, -1);   // -1 == -1^0: failure with nothing copied
2204     __ leave();   // required for proper stackwalking of RuntimeStub frame
2205     __ ret(lr);
2206 
2207     return start;
2208   }
2209 
2210   //
2211   // Generate stub for array fill. If "aligned" is true, the
2212   // "to" address is assumed to be heapword aligned.
2213   //
2214   // Arguments for generated stub:
2215   //   to:    c_rarg0
2216   //   value: c_rarg1
2217   //   count: c_rarg2 treated as signed
2218   //
2219   address generate_fill(BasicType t, bool aligned, const char *name) {
2220     __ align(CodeEntryAlignment);
2221     StubCodeMark mark(this, "StubRoutines", name);
2222     address start = __ pc();
2223     // Emits a fill stub for T_BYTE, T_SHORT or T_INT elements; any other type hits ShouldNotReachHere().
2224     BLOCK_COMMENT("Entry:");
2225 
2226     const Register to        = c_rarg0;  // destination array address
2227     const Register value     = c_rarg1;  // value
2228     const Register count     = c_rarg2;  // elements count
2229 
2230     const Register bz_base = r10;        // base for block_zero routine
2231     const Register cnt_words = r11;      // temp register
2232 
2233     __ enter();
2234 
2235     Label L_fill_elements, L_exit1;   // note: this L_exit1 is never used; the tail code below declares its own
2236 
2237     int shift = -1;   // log2 of the element size in bytes; set by the switch below
2238     switch (t) {
2239       case T_BYTE:
2240         shift = 0;
2241         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2242         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2243         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2244         __ br(Assembler::LO, L_fill_elements);   // tests the flags from the cmpw above; the bfi in between does not touch them
2245         break;
2246       case T_SHORT:
2247         shift = 1;
2248         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2249         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2250         __ br(Assembler::LO, L_fill_elements);   // tests the flags from the cmpw above
2251         break;
2252       case T_INT:
2253         shift = 2;
2254         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2255         __ br(Assembler::LO, L_fill_elements);
2256         break;
2257       default: ShouldNotReachHere();
2258     }
2259 
2260     // Align the destination ('to') address to an 8 byte boundary.
2261     Label L_skip_align1, L_skip_align2, L_skip_align4;
2262     if (!aligned) {
2263       switch (t) {
2264         case T_BYTE:
2265           // One byte misalignment happens only for byte arrays.
2266           __ tbz(to, 0, L_skip_align1);
2267           __ strb(value, Address(__ post(to, 1)));
2268           __ subw(count, count, 1);
2269           __ bind(L_skip_align1);
2270           // Fallthrough
2271         case T_SHORT:
2272           // Two bytes misalignment happens only for byte and short (char) arrays.
2273           __ tbz(to, 1, L_skip_align2);
2274           __ strh(value, Address(__ post(to, 2)));
2275           __ subw(count, count, 2 >> shift);
2276           __ bind(L_skip_align2);
2277           // Fallthrough
2278         case T_INT:
2279           // Align to 8 bytes, we know we are 4 byte aligned to start.
2280           __ tbz(to, 2, L_skip_align4);
2281           __ strw(value, Address(__ post(to, 4)));
2282           __ subw(count, count, 4 >> shift);
2283           __ bind(L_skip_align4);
2284           break;
2285         default: ShouldNotReachHere();
2286       }
2287     }
2288 
2289     //
2290     //  Fill large chunks
2291     //
2292     __ lsrw(cnt_words, count, 3 - shift); // number of words
2293     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2294     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);   // count = elements left over after the whole words
2295     if (UseBlockZeroing) {
2296       Label non_block_zeroing, rest;
2297       // If the fill value is zero we can use the fast zero_words().
2298       __ cbnz(value, non_block_zeroing);
2299       __ mov(bz_base, to);
2300       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);   // 'to' is advanced past the words here; zero_words is given bz_base instead
2301       __ zero_words(bz_base, cnt_words);
2302       __ b(rest);
2303       __ bind(non_block_zeroing);
2304       __ fill_words(to, cnt_words, value);   // NOTE(review): the tail code needs 'to' past the filled words; presumably fill_words advances it - confirm
2305       __ bind(rest);
2306     } else {
2307       __ fill_words(to, cnt_words, value);
2308     }
2309 
2310     // Remaining count is less than 8 bytes. Fill it by a single store.
2311     // Note that the total length is no less than 8 bytes.
2312     if (t == T_BYTE || t == T_SHORT) {
2313       Label L_exit1;   // shadows the unused outer L_exit1
2314       __ cbzw(count, L_exit1);
2315       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2316       __ str(value, Address(to, -8));    // overwrite some elements
2317       __ bind(L_exit1);
2318       __ leave();
2319       __ ret(lr);   // T_BYTE / T_SHORT large fills return here; T_INT continues into the tail code below
2320     }
2321 
2322     // Handle fills of less than 8 bytes; also reached by T_INT after the word fill above.
2323     Label L_fill_2, L_fill_4, L_exit2;
2324     __ bind(L_fill_elements);
2325     switch (t) {
2326       case T_BYTE:
2327         __ tbz(count, 0, L_fill_2);
2328         __ strb(value, Address(__ post(to, 1)));
2329         __ bind(L_fill_2);
2330         __ tbz(count, 1, L_fill_4);
2331         __ strh(value, Address(__ post(to, 2)));
2332         __ bind(L_fill_4);
2333         __ tbz(count, 2, L_exit2);
2334         __ strw(value, Address(to));
2335         break;
2336       case T_SHORT:
2337         __ tbz(count, 0, L_fill_4);
2338         __ strh(value, Address(__ post(to, 2)));
2339         __ bind(L_fill_4);
2340         __ tbz(count, 1, L_exit2);
2341         __ strw(value, Address(to));
2342         break;
2343       case T_INT:
2344         __ cbzw(count, L_exit2);
2345         __ strw(value, Address(to));   // a single store suffices: count is 0 or 1 on both paths that reach here
2346         break;
2347       default: ShouldNotReachHere();
2348     }
2349     __ bind(L_exit2);
2350     __ leave();
2351     __ ret(lr);
2352     return start;
2353   }
2354 
2355   address generate_data_cache_writeback() {
2356     // Emits a stub that passes the address held in c_rarg0 to cache_wb().
2357     __ align(CodeEntryAlignment);
2358     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2359     address stub_entry = __ pc();
2360 
2361     const Register line_addr = c_rarg0;  // address of line to write back
2362 
2363     __ enter();
2364     __ cache_wb(Address(line_addr, 0));
2365     __ leave();
2366     __ ret(lr);
2367 
2368     return stub_entry;
2369   }
2370 
2371   address generate_data_cache_writeback_sync() {
2372     // c_rarg0 selects the flavour of sync: non-zero means pre, zero means post.
2373     const Register pre_flag = c_rarg0;
2374 
2375     __ align(CodeEntryAlignment);
2376     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2377     address stub_entry = __ pc();
2378 
2379     // A pre sync skips straight to the exit (no-op); only a post sync
2380     // reaches cache_wbsync(false).
2381     Label L_pre_sync;
2382 
2383     __ enter();
2384     __ cbnz(pre_flag, L_pre_sync);
2385     __ cache_wbsync(false);
2386     __ bind(L_pre_sync);
2387     __ leave();
2388     __ ret(lr);
2389 
2390     return stub_entry;
2391   }
2392 
2393   void generate_arraycopy_stubs() {
         // Generates the arraycopy and fill stubs and publishes their entry
         // addresses in the StubRoutines tables.
         //
         // 'entry' is filled in by each disjoint generator (passed as &entry) and
         // handed by value to the conjoint generator that follows it. The
         // entry_<type>_arraycopy addresses are filled in by the conjoint (and
         // checkcast) generators and consumed by generate_unsafe_copy and
         // generate_generic_copy near the end of this function.
2394     address entry;
2395     address entry_jbyte_arraycopy;
2396     address entry_jshort_arraycopy;
2397     address entry_jint_arraycopy;
2398     address entry_oop_arraycopy;
2399     address entry_jlong_arraycopy;
2400     address entry_checkcast_arraycopy;
2401
         // Forward and backward bulk copy helpers, emitted at the copy_f / copy_b labels.
2402     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2403     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2404
2405     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2406
2407     //*** jbyte
2408     // Always need aligned and unaligned versions
         // (the first argument is the 'aligned' flag; only the unaligned conjoint
         // stub records entry_jbyte_arraycopy, the arrayof variant passes NULL)
2409     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2410                                                                                   "jbyte_disjoint_arraycopy");
2411     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2412                                                                                   &entry_jbyte_arraycopy,
2413                                                                                   "jbyte_arraycopy");
2414     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2415                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2416     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2417                                                                                   "arrayof_jbyte_arraycopy");
2418
2419     //*** jshort
2420     // Always need aligned and unaligned versions
2421     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2422                                                                                     "jshort_disjoint_arraycopy");
2423     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2424                                                                                     &entry_jshort_arraycopy,
2425                                                                                     "jshort_arraycopy");
2426     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2427                                                                                     "arrayof_jshort_disjoint_arraycopy");
2428     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2429                                                                                     "arrayof_jshort_arraycopy");
2430
2431     //*** jint
2432     // Aligned versions
         // (entry_jint_arraycopy is written here first and then overwritten by the
         // unaligned conjoint stub generated below)
2433     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2434                                                                                 "arrayof_jint_disjoint_arraycopy");
2435     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2436                                                                                 "arrayof_jint_arraycopy");
2437     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2438     // entry_jint_arraycopy always points to the unaligned version
2439     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2440                                                                                 "jint_disjoint_arraycopy");
2441     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2442                                                                                 &entry_jint_arraycopy,
2443                                                                                 "jint_arraycopy");
2444
2445     //*** jlong
2446     // It is always aligned
         // so only the arrayof stubs are generated and the plain jlong entries alias them
2447     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2448                                                                                   "arrayof_jlong_disjoint_arraycopy");
2449     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2450                                                                                   "arrayof_jlong_arraycopy");
2451     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2452     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2453
2454     //*** oops
2455     {
2456       // With compressed oops we need unaligned versions; notice that
2457       // we overwrite entry_oop_arraycopy.
           // NOTE(review): in the code below only the arrayof_oop_arraycopy call
           // receives &entry_oop_arraycopy (the _uninit variant passes NULL), so no
           // overwrite is visible here - the remark above may be stale.
2458       bool aligned = !UseCompressedOops;
2459
2460       StubRoutines::_arrayof_oop_disjoint_arraycopy
2461         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2462                                      /*dest_uninitialized*/false);
2463       StubRoutines::_arrayof_oop_arraycopy
2464         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2465                                      /*dest_uninitialized*/false);
2466       // Aligned versions without pre-barriers
2467       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2468         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2469                                      /*dest_uninitialized*/true);
2470       StubRoutines::_arrayof_oop_arraycopy_uninit
2471         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2472                                      /*dest_uninitialized*/true);
2473     }
2474
         // The plain oop entries share the stubs generated for the arrayof variants.
2475     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2476     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2477     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2478     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2479
2480     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2481     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2482                                                                         /*dest_uninitialized*/true);
2483
         // The unsafe and generic copy stubs are given the per-type entry addresses
         // collected above.
2484     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2485                                                               entry_jbyte_arraycopy,
2486                                                               entry_jshort_arraycopy,
2487                                                               entry_jint_arraycopy,
2488                                                               entry_jlong_arraycopy);
2489
2490     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2491                                                                entry_jbyte_arraycopy,
2492                                                                entry_jshort_arraycopy,
2493                                                                entry_jint_arraycopy,
2494                                                                entry_oop_arraycopy,
2495                                                                entry_jlong_arraycopy,
2496                                                                entry_checkcast_arraycopy);
2497
         // Fill stubs: element type, 'aligned' flag, stub name.
2498     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2499     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2500     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2501     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2502     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2503     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2504   }
2505 
2506   void generate_math_stubs() { Unimplemented(); }  // placeholder: generates nothing, just calls Unimplemented()
2507 
2508   // Arguments:
2509   //
2510   // Inputs:
2511   //   c_rarg0   - source byte array address
2512   //   c_rarg1   - destination byte array address
2513   //   c_rarg2   - K (key) in little endian int array
2514   //
2515   address generate_aescrypt_encryptBlock() {
         // Emits the "aescrypt_encryptBlock" stub and returns its entry address.
         // The generated code encrypts one 16-byte block read from c_rarg0 with the
         // key schedule at c_rarg2, stores the result at c_rarg1 and returns 0 in r0.
         // The key length (in ints) selects how many rounds run: 44, 52, or more.
2516     __ align(CodeEntryAlignment);
2517     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2518
2519     Label L_doLast;
2520
2521     const Register from        = c_rarg0;  // source array address
2522     const Register to          = c_rarg1;  // destination array address
2523     const Register key         = c_rarg2;  // key array address
2524     const Register keylen      = rscratch1;
2525
2526     address start = __ pc();
2527     __ enter();
2528
         // 'key' addresses the first int element, so (length offset - base offset)
         // reaches back to the array's length field: keylen = number of ints in the key.
2529     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2530
2531     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2532
         // Round keys 1-4: load 64 bytes, byte-swap every 32-bit word (rev32), then
         // run four aese/aesmc rounds on the state in v0.
2533     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2534     __ rev32(v1, __ T16B, v1);
2535     __ rev32(v2, __ T16B, v2);
2536     __ rev32(v3, __ T16B, v3);
2537     __ rev32(v4, __ T16B, v4);
2538     __ aese(v0, v1);
2539     __ aesmc(v0, v0);
2540     __ aese(v0, v2);
2541     __ aesmc(v0, v0);
2542     __ aese(v0, v3);
2543     __ aesmc(v0, v0);
2544     __ aese(v0, v4);
2545     __ aesmc(v0, v0);
2546
         // Round keys 5-8, same pattern.
2547     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2548     __ rev32(v1, __ T16B, v1);
2549     __ rev32(v2, __ T16B, v2);
2550     __ rev32(v3, __ T16B, v3);
2551     __ rev32(v4, __ T16B, v4);
2552     __ aese(v0, v1);
2553     __ aesmc(v0, v0);
2554     __ aese(v0, v2);
2555     __ aesmc(v0, v0);
2556     __ aese(v0, v3);
2557     __ aesmc(v0, v0);
2558     __ aese(v0, v4);
2559     __ aesmc(v0, v0);
2560
         // Next two round keys; they are consumed either right below or at L_doLast.
2561     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2562     __ rev32(v1, __ T16B, v1);
2563     __ rev32(v2, __ T16B, v2);
2564
         // A 44-int key (176 bytes = 11 round keys) needs no extra rounds.
2565     __ cmpw(keylen, 44);
2566     __ br(Assembler::EQ, L_doLast);
2567
2568     __ aese(v0, v1);
2569     __ aesmc(v0, v0);
2570     __ aese(v0, v2);
2571     __ aesmc(v0, v0);
2572
2573     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2574     __ rev32(v1, __ T16B, v1);
2575     __ rev32(v2, __ T16B, v2);
2576
         // A 52-int key (208 bytes = 13 round keys) stops after one extra pair.
2577     __ cmpw(keylen, 52);
2578     __ br(Assembler::EQ, L_doLast);
2579
         // Any other key length runs a second extra pair (15 round keys in total).
2580     __ aese(v0, v1);
2581     __ aesmc(v0, v0);
2582     __ aese(v0, v2);
2583     __ aesmc(v0, v0);
2584
2585     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2586     __ rev32(v1, __ T16B, v1);
2587     __ rev32(v2, __ T16B, v2);
2588
2589     __ BIND(L_doLast);
2590
         // Last two rounds: the final aese is not followed by aesmc.
2591     __ aese(v0, v1);
2592     __ aesmc(v0, v0);
2593     __ aese(v0, v2);
2594
         // Final round key is XORed into the state.
2595     __ ld1(v1, __ T16B, key);
2596     __ rev32(v1, __ T16B, v1);
2597     __ eor(v0, __ T16B, v0, v1);
2598
2599     __ st1(v0, __ T16B, to);  // write the 16-byte result
2600
2601     __ mov(r0, 0);  // stub returns 0
2602
2603     __ leave();
2604     __ ret(lr);
2605
2606     return start;
2607   }
2608 
2609   // Arguments:
2610   //
2611   // Inputs:
2612   //   c_rarg0   - source byte array address
2613   //   c_rarg1   - destination byte array address
2614   //   c_rarg2   - K (key) in little endian int array
2615   //
2616   address generate_aescrypt_decryptBlock() {
         // Emits the "aescrypt_decryptBlock" stub and returns its entry address.
         // The generated code decrypts one 16-byte block read from c_rarg0 with the
         // key schedule at c_rarg2, stores the result at c_rarg1 and returns 0 in r0.
2617     assert(UseAES, "need AES instructions and misaligned SSE support");
2618     __ align(CodeEntryAlignment);
2619     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2620     Label L_doLast;
2621
2622     const Register from        = c_rarg0;  // source array address
2623     const Register to          = c_rarg1;  // destination array address
2624     const Register key         = c_rarg2;  // key array address
2625     const Register keylen      = rscratch1;
2626
2627     address start = __ pc();
2628     __ enter(); // required for proper stackwalking of RuntimeStub frame
2629
         // 'key' addresses the first int element, so (length offset - base offset)
         // reaches back to the array's length field: keylen = number of ints in the key.
2630     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2631
2632     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2633
         // The first 16 bytes of the key array are kept in v5 and only used for the
         // final XOR at the end of the stub.
2634     __ ld1(v5, __ T16B, __ post(key, 16));
2635     __ rev32(v5, __ T16B, v5);
2636
         // Four aesd/aesimc rounds with the next four round keys (byte-swapped per
         // 32-bit word by rev32).
2637     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2638     __ rev32(v1, __ T16B, v1);
2639     __ rev32(v2, __ T16B, v2);
2640     __ rev32(v3, __ T16B, v3);
2641     __ rev32(v4, __ T16B, v4);
2642     __ aesd(v0, v1);
2643     __ aesimc(v0, v0);
2644     __ aesd(v0, v2);
2645     __ aesimc(v0, v0);
2646     __ aesd(v0, v3);
2647     __ aesimc(v0, v0);
2648     __ aesd(v0, v4);
2649     __ aesimc(v0, v0);
2650
         // Four more rounds, same pattern.
2651     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2652     __ rev32(v1, __ T16B, v1);
2653     __ rev32(v2, __ T16B, v2);
2654     __ rev32(v3, __ T16B, v3);
2655     __ rev32(v4, __ T16B, v4);
2656     __ aesd(v0, v1);
2657     __ aesimc(v0, v0);
2658     __ aesd(v0, v2);
2659     __ aesimc(v0, v0);
2660     __ aesd(v0, v3);
2661     __ aesimc(v0, v0);
2662     __ aesd(v0, v4);
2663     __ aesimc(v0, v0);
2664
         // Next two round keys; they are consumed either right below or at L_doLast.
2665     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2666     __ rev32(v1, __ T16B, v1);
2667     __ rev32(v2, __ T16B, v2);
2668
         // A 44-int key needs no extra rounds.
2669     __ cmpw(keylen, 44);
2670     __ br(Assembler::EQ, L_doLast);
2671
2672     __ aesd(v0, v1);
2673     __ aesimc(v0, v0);
2674     __ aesd(v0, v2);
2675     __ aesimc(v0, v0);
2676
2677     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2678     __ rev32(v1, __ T16B, v1);
2679     __ rev32(v2, __ T16B, v2);
2680
         // A 52-int key stops after one extra pair of rounds.
2681     __ cmpw(keylen, 52);
2682     __ br(Assembler::EQ, L_doLast);
2683
         // Any other key length runs a second extra pair.
2684     __ aesd(v0, v1);
2685     __ aesimc(v0, v0);
2686     __ aesd(v0, v2);
2687     __ aesimc(v0, v0);
2688
2689     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2690     __ rev32(v1, __ T16B, v1);
2691     __ rev32(v2, __ T16B, v2);
2692
2693     __ BIND(L_doLast);
2694
         // Last two rounds: the final aesd is not followed by aesimc.
2695     __ aesd(v0, v1);
2696     __ aesimc(v0, v0);
2697     __ aesd(v0, v2);
2698
2699     __ eor(v0, __ T16B, v0, v5);  // XOR with the key words loaded first (v5)
2700
2701     __ st1(v0, __ T16B, to);  // write the 16-byte result
2702
2703     __ mov(r0, 0);  // stub returns 0
2704
2705     __ leave();
2706     __ ret(lr);
2707
2708     return start;
2709   }
2710 
2711   // Arguments:
2712   //
2713   // Inputs:
2714   //   c_rarg0   - source byte array address
2715   //   c_rarg1   - destination byte array address
2716   //   c_rarg2   - K (key) in little endian int array
2717   //   c_rarg3   - r vector byte array address
2718   //   c_rarg4   - input length
2719   //
2720   // Output:
2721   //   r0        - input length
2722   //
2723   address generate_cipherBlockChaining_encryptAESCrypt() {
         // Emits the "cipherBlockChaining_encryptAESCrypt" stub and returns its entry
         // address. The generated code loads all round keys into v17..v31 once, then
         // loops over the input 16 bytes at a time: each block is XORed with the
         // previous result (initially the r vector), encrypted, and stored. The last
         // result is written back to the r vector and the original length is returned.
2724     assert(UseAES, "need AES instructions and misaligned SSE support");
2725     __ align(CodeEntryAlignment);
2726     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2727
2728     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2729
2730     const Register from        = c_rarg0;  // source array address
2731     const Register to          = c_rarg1;  // destination array address
2732     const Register key         = c_rarg2;  // key array address
2733     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2734                                            // and left with the results of the last encryption block
2735     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2736     const Register keylen      = rscratch1;
2737
2738     address start = __ pc();
2739
2740       __ enter();
2741
2742       __ movw(rscratch2, len_reg);  // keep the original length for the return value
2743
           // keylen = number of ints in the key array (read from its length field)
2744       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2745
2746       __ ld1(v0, __ T16B, rvec);  // v0 = chaining value
2747
           // keylen < 52 skips loading v17..v20, keylen == 52 skips v17/v18 only.
           // The flags set here are tested again inside L_aes_loop.
2748       __ cmpw(keylen, 52);
2749       __ br(Assembler::CC, L_loadkeys_44);
2750       __ br(Assembler::EQ, L_loadkeys_52);
2751
2752       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2753       __ rev32(v17, __ T16B, v17);
2754       __ rev32(v18, __ T16B, v18);
2755     __ BIND(L_loadkeys_52);
2756       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2757       __ rev32(v19, __ T16B, v19);
2758       __ rev32(v20, __ T16B, v20);
2759     __ BIND(L_loadkeys_44);
2760       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2761       __ rev32(v21, __ T16B, v21);
2762       __ rev32(v22, __ T16B, v22);
2763       __ rev32(v23, __ T16B, v23);
2764       __ rev32(v24, __ T16B, v24);
2765       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2766       __ rev32(v25, __ T16B, v25);
2767       __ rev32(v26, __ T16B, v26);
2768       __ rev32(v27, __ T16B, v27);
2769       __ rev32(v28, __ T16B, v28);
2770       __ ld1(v29, v30, v31, __ T16B, key);
2771       __ rev32(v29, __ T16B, v29);
2772       __ rev32(v30, __ T16B, v30);
2773       __ rev32(v31, __ T16B, v31);
2774
           // Per-block loop. The body runs before the remaining length is tested.
2775     __ BIND(L_aes_loop);
2776       __ ld1(v1, __ T16B, __ post(from, 16));
2777       __ eor(v0, __ T16B, v0, v1);  // XOR the input block into the chaining value
2778
           // These branches reuse the condition flags from cmpw(keylen, 52) above;
           // this function emits no other compare in between.
           // NOTE(review): assumes subw and cbnzw at the loop tail leave the flags untouched.
2779       __ br(Assembler::CC, L_rounds_44);
2780       __ br(Assembler::EQ, L_rounds_52);
2781
2782       __ aese(v0, v17); __ aesmc(v0, v0);
2783       __ aese(v0, v18); __ aesmc(v0, v0);
2784     __ BIND(L_rounds_52);
2785       __ aese(v0, v19); __ aesmc(v0, v0);
2786       __ aese(v0, v20); __ aesmc(v0, v0);
2787     __ BIND(L_rounds_44);
2788       __ aese(v0, v21); __ aesmc(v0, v0);
2789       __ aese(v0, v22); __ aesmc(v0, v0);
2790       __ aese(v0, v23); __ aesmc(v0, v0);
2791       __ aese(v0, v24); __ aesmc(v0, v0);
2792       __ aese(v0, v25); __ aesmc(v0, v0);
2793       __ aese(v0, v26); __ aesmc(v0, v0);
2794       __ aese(v0, v27); __ aesmc(v0, v0);
2795       __ aese(v0, v28); __ aesmc(v0, v0);
2796       __ aese(v0, v29); __ aesmc(v0, v0);
2797       __ aese(v0, v30);             // last aese has no aesmc
2798       __ eor(v0, __ T16B, v0, v31); // final round key
2799
2800       __ st1(v0, __ T16B, __ post(to, 16));  // v0 also stays live as the next chaining value
2801
2802       __ subw(len_reg, len_reg, 16);
2803       __ cbnzw(len_reg, L_aes_loop);
2804
2805       __ st1(v0, __ T16B, rvec);  // leave the last encrypted block in the r vector
2806
2807       __ mov(r0, rscratch2);  // return the length saved at entry
2808
2809       __ leave();
2810       __ ret(lr);
2811
2812       return start;
2813   }
2814 
2815   // Arguments:
2816   //
2817   // Inputs:
2818   //   c_rarg0   - source byte array address
2819   //   c_rarg1   - destination byte array address
2820   //   c_rarg2   - K (key) in little endian int array
2821   //   c_rarg3   - r vector byte array address
2822   //   c_rarg4   - input length
2823   //
2824   // Output:
2825   //   r0        - input length
2826   //
2827   address generate_cipherBlockChaining_decryptAESCrypt() {
         // Emits the "cipherBlockChaining_decryptAESCrypt" stub and returns its entry
         // address. The generated code loads all round keys once (v31 plus v17..v30),
         // then loops over the input 16 bytes at a time: each block is decrypted,
         // XORed with the previous input block (initially the r vector) and stored.
         // The last input block is written back to the r vector and the original
         // length is returned.
2828     assert(UseAES, "need AES instructions and misaligned SSE support");
2829     __ align(CodeEntryAlignment);
2830     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2831
2832     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2833
2834     const Register from        = c_rarg0;  // source array address
2835     const Register to          = c_rarg1;  // destination array address
2836     const Register key         = c_rarg2;  // key array address
2837     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2838                                            // and left with the results of the last encryption block
2839     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2840     const Register keylen      = rscratch1;
2841
2842     address start = __ pc();
2843
2844       __ enter();
2845
2846       __ movw(rscratch2, len_reg);  // keep the original length for the return value
2847
           // keylen = number of ints in the key array (read from its length field)
2848       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2849
2850       __ ld1(v2, __ T16B, rvec);  // v2 = chaining value
2851
           // The first 16 bytes of the key array go to v31 and are used for the final
           // XOR of every block.
2852       __ ld1(v31, __ T16B, __ post(key, 16));
2853       __ rev32(v31, __ T16B, v31);
2854
           // keylen < 52 skips loading v17..v20, keylen == 52 skips v17/v18 only.
           // The flags set here are tested again inside L_aes_loop.
2855       __ cmpw(keylen, 52);
2856       __ br(Assembler::CC, L_loadkeys_44);
2857       __ br(Assembler::EQ, L_loadkeys_52);
2858
2859       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2860       __ rev32(v17, __ T16B, v17);
2861       __ rev32(v18, __ T16B, v18);
2862     __ BIND(L_loadkeys_52);
2863       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2864       __ rev32(v19, __ T16B, v19);
2865       __ rev32(v20, __ T16B, v20);
2866     __ BIND(L_loadkeys_44);
2867       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2868       __ rev32(v21, __ T16B, v21);
2869       __ rev32(v22, __ T16B, v22);
2870       __ rev32(v23, __ T16B, v23);
2871       __ rev32(v24, __ T16B, v24);
2872       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2873       __ rev32(v25, __ T16B, v25);
2874       __ rev32(v26, __ T16B, v26);
2875       __ rev32(v27, __ T16B, v27);
2876       __ rev32(v28, __ T16B, v28);
2877       __ ld1(v29, v30, __ T16B, key);
2878       __ rev32(v29, __ T16B, v29);
2879       __ rev32(v30, __ T16B, v30);
2880
           // Per-block loop. The body runs before the remaining length is tested.
2881     __ BIND(L_aes_loop);
2882       __ ld1(v0, __ T16B, __ post(from, 16));
2883       __ orr(v1, __ T16B, v0, v0);  // v1 = copy of the input block (orr of a register with itself)
2884
           // These branches reuse the condition flags from cmpw(keylen, 52) above;
           // this function emits no other compare in between.
           // NOTE(review): assumes subw and cbnzw at the loop tail leave the flags untouched.
2885       __ br(Assembler::CC, L_rounds_44);
2886       __ br(Assembler::EQ, L_rounds_52);
2887
2888       __ aesd(v0, v17); __ aesimc(v0, v0);
2889       __ aesd(v0, v18); __ aesimc(v0, v0);
2890     __ BIND(L_rounds_52);
2891       __ aesd(v0, v19); __ aesimc(v0, v0);
2892       __ aesd(v0, v20); __ aesimc(v0, v0);
2893     __ BIND(L_rounds_44);
2894       __ aesd(v0, v21); __ aesimc(v0, v0);
2895       __ aesd(v0, v22); __ aesimc(v0, v0);
2896       __ aesd(v0, v23); __ aesimc(v0, v0);
2897       __ aesd(v0, v24); __ aesimc(v0, v0);
2898       __ aesd(v0, v25); __ aesimc(v0, v0);
2899       __ aesd(v0, v26); __ aesimc(v0, v0);
2900       __ aesd(v0, v27); __ aesimc(v0, v0);
2901       __ aesd(v0, v28); __ aesimc(v0, v0);
2902       __ aesd(v0, v29); __ aesimc(v0, v0);
2903       __ aesd(v0, v30);             // last aesd has no aesimc
2904       __ eor(v0, __ T16B, v0, v31); // key words loaded first
2905       __ eor(v0, __ T16B, v0, v2);  // XOR with the chaining value
2906
2907       __ st1(v0, __ T16B, __ post(to, 16));
2908       __ orr(v2, __ T16B, v1, v1);  // this input block becomes the next chaining value
2909
2910       __ subw(len_reg, len_reg, 16);
2911       __ cbnzw(len_reg, L_aes_loop);
2912
2913       __ st1(v2, __ T16B, rvec);  // leave the last input block in the r vector
2914
2915       __ mov(r0, rscratch2);  // return the length saved at entry
2916
2917       __ leave();
2918       __ ret(lr);
2919
2920     return start;
2921   }
2922 
2923   // Arguments:
2924   //
2925   // Inputs:
2926   //   c_rarg0   - byte[]  source+offset
2927   //   c_rarg1   - int[]   SHA.state
2928   //   c_rarg2   - int     offset
2929   //   c_rarg3   - int     limit
2930   //
2931   address generate_sha1_implCompress(bool multi_block, const char *name) {
         // Emits a SHA-1 compression stub named 'name' and returns its entry address.
         //
         // multi_block == false: the generated code processes one 64-byte block at buf.
         // multi_block == true:  buf is post-incremented by 64 per block, ofs is
         //                       advanced by 64 and the loop repeats while
         //                       ofs <= limit; the final ofs is returned in c_rarg0.
         // In both cases the updated state is stored back to 'state'.
         // Note that this stub emits no enter()/leave() frame.
2932     __ align(CodeEntryAlignment);
2933     StubCodeMark mark(this, "StubRoutines", name);
2934     address start = __ pc();
2935
2936     Register buf   = c_rarg0;
2937     Register state = c_rarg1;
2938     Register ofs   = c_rarg2;
2939     Register limit = c_rarg3;
2940
2941     Label keys;
2942     Label sha1_loop;
2943
2944     // load the keys into v0..v3
         // ('keys' is bound after the ret below, where the four constants are emitted)
2945     __ adr(rscratch1, keys);
2946     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2947     // load 5 words state into v6, v7
2948     __ ldrq(v6, Address(state, 0));
2949     __ ldrs(v7, Address(state, 16));
2950
2951
2952     __ BIND(sha1_loop);
2953     // load 64 bytes of data into v16..v19
2954     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2955     __ rev32(v16, __ T16B, v16);
2956     __ rev32(v17, __ T16B, v17);
2957     __ rev32(v18, __ T16B, v18);
2958     __ rev32(v19, __ T16B, v19);
2959
2960     // do the sha1
2961     __ addv(v4, __ T4S, v16, v0);   // first data vector + first key, consumed by round 0
2962     __ orr(v20, __ T16B, v6, v6);   // v20 = working copy of v6
2963
         // d0..d3 name the four data vectors; they are rotated at the end of every round.
2964     FloatRegister d0 = v16;
2965     FloatRegister d1 = v17;
2966     FloatRegister d2 = v18;
2967     FloatRegister d3 = v19;
2968
         // This C++ loop runs at stub-generation time, so the 20 rounds are emitted
         // fully unrolled. Temporaries alternate between even and odd rounds.
2969     for (int round = 0; round < 20; round++) {
2970       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2971       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2972       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2973       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2974       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2975
2976       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
           // pre-add the key to d1 for the following round (nothing to prepare after round 18)
2977       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2978       __ sha1h(tmp2, __ T4S, v20);
           // rounds 0-4 use sha1c, 5-9 and 15-19 use sha1p, 10-14 use sha1m
2979       if (round < 5)
2980         __ sha1c(v20, __ T4S, tmp3, tmp4);
2981       else if (round < 10 || round >= 15)
2982         __ sha1p(v20, __ T4S, tmp3, tmp4);
2983       else
2984         __ sha1m(v20, __ T4S, tmp3, tmp4);
2985       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2986
           // rotate the data vector names (generation-time only, emits no code)
2987       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2988     }
2989
         // Add the working values back into the state (v21 is tmp2 of the last, odd round).
2990     __ addv(v7, __ T2S, v7, v21);
2991     __ addv(v6, __ T4S, v6, v20);
2992
2993     if (multi_block) {
2994       __ add(ofs, ofs, 64);
2995       __ cmp(ofs, limit);
2996       __ br(Assembler::LE, sha1_loop);
2997       __ mov(c_rarg0, ofs); // return ofs
2998     }
2999
3000     __ strq(v6, Address(state, 0));
3001     __ strs(v7, Address(state, 16));
3002
3003     __ ret(lr);
3004
         // The four constants read by the ld4r at the top of the stub, placed after the ret.
3005     __ bind(keys);
3006     __ emit_int32(0x5a827999);
3007     __ emit_int32(0x6ed9eba1);
3008     __ emit_int32(0x8f1bbcdc);
3009     __ emit_int32(0xca62c1d6);
3010
3011     return start;
3012   }
3013 
3014 
3015   // Arguments:
3016   //
3017   // Inputs:
3018   //   c_rarg0   - byte[]  source+offset
3019   //   c_rarg1   - int[]   SHA.state
3020   //   c_rarg2   - int     offset
3021   //   c_rarg3   - int     limit
3022   //
3023   address generate_sha256_implCompress(bool multi_block, const char *name) {
         // Emits a SHA-256 compression stub named 'name' and returns its entry address.
         //
         // multi_block == false: the generated code processes one 64-byte block at buf.
         // multi_block == true:  buf is post-incremented by 64 per block, ofs is
         //                       advanced by 64 and the loop repeats while
         //                       ofs <= limit; the final ofs is returned in c_rarg0.
         // In both cases the updated 8-word state is stored back to 'state'.
         // Note that this stub emits no enter()/leave() frame.

         // The 64 round constants; the stub loads them from this table's address.
3024     static const uint32_t round_consts[64] = {
3025       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3026       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3027       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3028       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3029       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3030       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3031       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3032       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3033       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3034       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3035       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3036       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3037       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3038       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3039       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3040       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3041     };
3042     __ align(CodeEntryAlignment);
3043     StubCodeMark mark(this, "StubRoutines", name);
3044     address start = __ pc();
3045
3046     Register buf   = c_rarg0;
3047     Register state = c_rarg1;
3048     Register ofs   = c_rarg2;
3049     Register limit = c_rarg3;
3050
3051     Label sha1_loop;  // per-block loop head (the name says sha1, but this is the SHA-256 loop)
3052
         // Save the low 64 bits of v8..v11 on the stack; they are overwritten with
         // message data below and restored before returning.
         // NOTE(review): presumably because the ABI treats d8..d11 as callee-saved - confirm.
3053     __ stpd(v8, v9, __ pre(sp, -32));
3054     __ stpd(v10, v11, Address(sp, 16));
3055
3056 // dga == v0
3057 // dgb == v1
3058 // dg0 == v2
3059 // dg1 == v3
3060 // dg2 == v4
3061 // t0 == v6
3062 // t1 == v7
3063
3064     // load 16 keys to v16..v31
         // (16 vectors of four 32-bit words each, i.e. all 64 entries of round_consts)
3065     __ lea(rscratch1, ExternalAddress((address)round_consts));
3066     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3067     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3068     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3069     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3070
3071     // load 8 words (256 bits) state
3072     __ ldpq(v0, v1, state);
3073
3074     __ BIND(sha1_loop);
3075     // load 64 bytes of data into v8..v11
3076     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3077     __ rev32(v8, __ T16B, v8);
3078     __ rev32(v9, __ T16B, v9);
3079     __ rev32(v10, __ T16B, v10);
3080     __ rev32(v11, __ T16B, v11);
3081
3082     __ addv(v6, __ T4S, v8, v16);   // first data vector + first key vector, consumed by round 0
3083     __ orr(v2, __ T16B, v0, v0);    // v2 = working copy of v0
3084     __ orr(v3, __ T16B, v1, v1);    // v3 = working copy of v1
3085
         // d0..d3 name the four data vectors; they are rotated at the end of every round.
3086     FloatRegister d0 = v8;
3087     FloatRegister d1 = v9;
3088     FloatRegister d2 = v10;
3089     FloatRegister d3 = v11;
3090
3091
         // This C++ loop runs at stub-generation time, so the 16 rounds are emitted
         // fully unrolled. tmp3 and tmp4 are computed but not used in the loop body.
3092     for (int round = 0; round < 16; round++) {
3093       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3094       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3095       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3096       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3097
3098       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3099        __ orr(v4, __ T16B, v2, v2);  // keep the old v2 for sha256h2, since sha256h overwrites v2
           // pre-add the key vector for the following round: v17 in round 0 up to v31 in round 14
3100       if (round < 15)
3101         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3102       __ sha256h(v2, __ T4S, v3, tmp2);
3103       __ sha256h2(v3, __ T4S, v4, tmp2);
3104       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3105
           // rotate the data vector names (generation-time only, emits no code)
3106       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3107     }
3108
         // Add the working values back into the state.
3109     __ addv(v0, __ T4S, v0, v2);
3110     __ addv(v1, __ T4S, v1, v3);
3111
3112     if (multi_block) {
3113       __ add(ofs, ofs, 64);
3114       __ cmp(ofs, limit);
3115       __ br(Assembler::LE, sha1_loop);
3116       __ mov(c_rarg0, ofs); // return ofs
3117     }
3118
         // Restore d8..d11 and release the 32 bytes of stack taken at entry.
3119     __ ldpd(v10, v11, Address(sp, 16));
3120     __ ldpd(v8, v9, __ post(sp, 32));
3121
3122     __ stpq(v0, v1, state);  // store the updated state
3123
3124     __ ret(lr);
3125
3126     return start;
3127   }
3128 
3129   // Safefetch stubs.
3130   void generate_safefetch(const char* name, int size, address* entry,
3131                           address* fault_pc, address* continuation_pc) {
         // Emits a safefetch stub named 'name' for a 4- or 8-byte load (any other
         // 'size' hits ShouldNotReachHere) and reports three code addresses:
         //   *entry           - start of the stub
         //   *fault_pc        - address of the load instruction (same pc as *entry)
         //   *continuation_pc - address of the instruction after the load
3132     // safefetch signatures:
3133     //   int      SafeFetch32(int*      adr, int      errValue);
3134     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3135     //
3136     // arguments:
3137     //   c_rarg0 = adr
3138     //   c_rarg1 = errValue
3139     //
3140     // result:
3141     //   r0       = *adr or errValue
3142
3143     StubCodeMark mark(this, "StubRoutines", name);
3144
3145     // Entry point, pc or function descriptor.
3146     *entry = __ pc();
3147
3148     // Load *adr into c_rarg1, may fault.
         // A successful load overwrites errValue in c_rarg1.
         // NOTE(review): presumably a fault at *fault_pc is redirected to
         // *continuation_pc by code outside this function, leaving errValue in
         // c_rarg1 - confirm against the signal handler.
3149     *fault_pc = __ pc();
3150     switch (size) {
3151       case 4:
3152         // int32_t
3153         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3154         break;
3155       case 8:
3156         // int64_t
3157         __ ldr(c_rarg1, Address(c_rarg0, 0));
3158         break;
3159       default:
3160         ShouldNotReachHere();
3161     }
3162
3163     // return errValue or *adr
3164     *continuation_pc = __ pc();
3165     __ mov(r0, c_rarg1);
3166     __ ret(lr);
3167   }
3168 
3169   /**
3170    *  Arguments:
3171    *
3172    * Inputs:
3173    *   c_rarg0   - int crc
3174    *   c_rarg1   - byte* buf
3175    *   c_rarg2   - int length
3176    *
3177    * Output:
3178    *       r0   - int crc result
3179    */
3180   address generate_updateBytesCRC32() {
         // Emits the "updateBytesCRC32" stub and returns its entry address.
         // The stub body is produced by kernel_crc32, wrapped in an enter()/leave() frame.
3181     assert(UseCRC32Intrinsics, "what are we doing here?");
3182
3183     __ align(CodeEntryAlignment);
3184     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3185
3186     address start = __ pc();
3187
3188     const Register crc   = c_rarg0;  // crc
3189     const Register buf   = c_rarg1;  // source java byte array address
3190     const Register len   = c_rarg2;  // length
3191     const Register table0 = c_rarg3; // crc_table address
3192     const Register table1 = c_rarg4;
3193     const Register table2 = c_rarg5;
3194     const Register table3 = c_rarg6;
3195     const Register tmp3 = c_rarg7;
3196
3197     BLOCK_COMMENT("Entry:");
3198     __ enter(); // required for proper stackwalking of RuntimeStub frame
3199
         // table0..table3, rscratch1, rscratch2 and tmp3 are handed to kernel_crc32
         // in addition to the three stub arguments.
         // NOTE(review): presumably they are working registers that kernel_crc32 sets
         // up itself (only three inputs are documented above) - confirm.
3200     __ kernel_crc32(crc, buf, len,
3201               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3202
3203     __ leave(); // required for proper stackwalking of RuntimeStub frame
3204     __ ret(lr);
3205
3206     return start;
3207   }
3208 
3209   /**
3210    *  Arguments:
3211    *
3212    * Inputs:
3213    *   c_rarg0   - int crc
3214    *   c_rarg1   - byte* buf
3215    *   c_rarg2   - int length
3216    *   c_rarg3   - int* table
3217    *
3218    * Output:
3219    *       r0   - int crc result
3220    */
3221   address generate_updateBytesCRC32C() {
         // Emits the "updateBytesCRC32C" stub and returns its entry address.
         // The stub body is produced by kernel_crc32c, wrapped in an enter()/leave() frame.
3222     assert(UseCRC32CIntrinsics, "what are we doing here?");
3223
3224     __ align(CodeEntryAlignment);
3225     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3226
3227     address start = __ pc();
3228
3229     const Register crc   = c_rarg0;  // crc
3230     const Register buf   = c_rarg1;  // source java byte array address
3231     const Register len   = c_rarg2;  // length
3232     const Register table0 = c_rarg3; // crc_table address
3233     const Register table1 = c_rarg4;
3234     const Register table2 = c_rarg5;
3235     const Register table3 = c_rarg6;
3236     const Register tmp3 = c_rarg7;
3237
3238     BLOCK_COMMENT("Entry:");
3239     __ enter(); // required for proper stackwalking of RuntimeStub frame
3240
         // table0..table3, rscratch1, rscratch2 and tmp3 are handed to kernel_crc32c
         // in addition to crc, buf and len.
         // NOTE(review): how kernel_crc32c uses them is not visible here - confirm
         // whether c_rarg3 really carries a caller-supplied table.
3241     __ kernel_crc32c(crc, buf, len,
3242               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3243
3244     __ leave(); // required for proper stackwalking of RuntimeStub frame
3245     __ ret(lr);
3246
3247     return start;
3248   }
3249 
3250   /**
        *  Emits the "updateBytesAdler32" stub, which folds a byte buffer into a
        *  running Adler-32 checksum. The low 16 bits of the checksum hold the
        *  byte sum s1 and the high 16 bits the sum-of-sums s2, both kept
        *  modulo BASE (0xfff1).
        *
3251    *  Arguments:
3252    *
3253    *  Inputs:
3254    *   c_rarg0   - int   adler
3255    *   c_rarg1   - byte* buff
3256    *   c_rarg2   - int   len
3257    *
3258    * Output:
3259    *   c_rarg0   - int adler result
        *
        *  Writes c_rarg1..c_rarg3, r4..r6, rscratch1, rscratch2 and v0..v3.
        *  No frame is set up (there is no enter()/leave() pair).
        *
        *  Returns the code position recorded just before the stub body is
        *  emitted.
3260    */
3261   address generate_updateBytesAdler32() {
3262     __ align(CodeEntryAlignment);
3263     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3264     address start = __ pc();
3265 
3266     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3267 
3268     // Aliases
3269     Register adler  = c_rarg0;
3270     Register s1     = c_rarg0;  // shares the register with adler
3271     Register s2     = c_rarg3;
3272     Register buff   = c_rarg1;
3273     Register len    = c_rarg2;
3274     Register nmax  = r4;
3275     Register base  = r5;
3276     Register count = r6;
3277     Register temp0 = rscratch1;
3278     Register temp1 = rscratch2;
3279     FloatRegister vbytes = v0;
3280     FloatRegister vs1acc = v1;
3281     FloatRegister vs2acc = v2;
3282     FloatRegister vtable = v3;
3283 
3284     // Max number of bytes we can process before having to take the mod
3285     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3286     unsigned long BASE = 0xfff1;
3287     unsigned long NMAX = 0x15B0;
3288 
3289     __ mov(base, BASE);
3290     __ mov(nmax, NMAX);
3291 
3292     // Load accumulation coefficients for the upper 16 bits
3293     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3294     __ ld1(vtable, __ T16B, Address(temp0));
3295 
3296     // s1 is initialized to the lower 16 bits of adler
3297     // s2 is initialized to the upper 16 bits of adler
         // s2 must be extracted first because s1 and adler share a register.
3298     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3299     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3300 
3301     // The pipelined loop needs at least 16 elements for 1 iteration
3302     // It does check this, but it is more effective to skip to the cleanup loop
3303     __ cmp(len, (u1)16);
3304     __ br(Assembler::HS, L_nmax);
3305     __ cbz(len, L_combine);
3306 
         // Fewer than 16 bytes: plain byte-at-a-time accumulation.
3307     __ bind(L_simple_by1_loop);
3308     __ ldrb(temp0, Address(__ post(buff, 1)));
3309     __ add(s1, s1, temp0);
3310     __ add(s2, s2, s1);
3311     __ subs(len, len, 1);
3312     __ br(Assembler::HI, L_simple_by1_loop);
3313 
3314     // s1 = s1 % BASE
         // At most 15 bytes were added to a 16-bit value, so s1 < 2 * BASE and
         // a single conditional subtraction is enough.
3315     __ subs(temp0, s1, base);
3316     __ csel(s1, temp0, s1, Assembler::HS);
3317 
3318     // s2 = s2 % BASE
         // 2^16 == BASE + 15, hence x is congruent to 15 * (x >> 16) + (x & 0xffff)
         // modulo BASE. One such fold plus a conditional subtraction is used here.
3319     __ lsr(temp0, s2, 16);
3320     __ lsl(temp1, temp0, 4);
3321     __ sub(temp1, temp1, temp0);       // temp1 = 15 * (s2 >> 16)
3322     __ add(s2, temp1, s2, ext::uxth);
3323 
3324     __ subs(temp0, s2, base);
3325     __ csel(s2, temp0, s2, Assembler::HS);
3326 
3327     __ b(L_combine);
3328 
         // At least 16 bytes. len is pre-decremented by NMAX; while that does
         // not borrow, a full NMAX-byte chunk is processed before reducing.
3329     __ bind(L_nmax);
3330     __ subs(len, len, nmax);
3331     __ sub(count, nmax, 16);
3332     __ br(Assembler::LO, L_by16);
3333 
3334     __ bind(L_nmax_loop);
3335 
3336     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3337                                       vbytes, vs1acc, vs2acc, vtable);
3338 
         // count starts at NMAX - 16 and the loop exits on borrow, so the
         // accumulation above runs NMAX / 16 times per chunk.
3339     __ subs(count, count, 16);
3340     __ br(Assembler::HS, L_nmax_loop);
3341 
3342     // s1 = s1 % BASE
         // Two folds (x -> 15 * (x >> 16) + (x & 0xffff)) followed by one
         // conditional subtraction of BASE.
3343     __ lsr(temp0, s1, 16);
3344     __ lsl(temp1, temp0, 4);
3345     __ sub(temp1, temp1, temp0);
3346     __ add(temp1, temp1, s1, ext::uxth);
3347 
3348     __ lsr(temp0, temp1, 16);
3349     __ lsl(s1, temp0, 4);
3350     __ sub(s1, s1, temp0);
3351     __ add(s1, s1, temp1, ext:: uxth);
3352 
3353     __ subs(temp0, s1, base);
3354     __ csel(s1, temp0, s1, Assembler::HS);
3355 
3356     // s2 = s2 % BASE
3357     __ lsr(temp0, s2, 16);
3358     __ lsl(temp1, temp0, 4);
3359     __ sub(temp1, temp1, temp0);
3360     __ add(temp1, temp1, s2, ext::uxth);
3361 
3362     __ lsr(temp0, temp1, 16);
3363     __ lsl(s2, temp0, 4);
3364     __ sub(s2, s2, temp0);
3365     __ add(s2, s2, temp1, ext:: uxth);
3366 
3367     __ subs(temp0, s2, base);
3368     __ csel(s2, temp0, s2, Assembler::HS);
3369 
         // Try to take another full NMAX chunk; count is re-armed for it.
3370     __ subs(len, len, nmax);
3371     __ sub(count, nmax, 16);
3372     __ br(Assembler::HS, L_nmax_loop);
3373 
         // Fewer than NMAX bytes remain. len currently holds (remaining - NMAX)
         // and count holds NMAX - 16, so the sum is (remaining - 16); no carry
         // means fewer than 16 bytes remain.
3374     __ bind(L_by16);
3375     __ adds(len, len, count);
3376     __ br(Assembler::LO, L_by1);
3377 
3378     __ bind(L_by16_loop);
3379 
3380     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3381                                       vbytes, vs1acc, vs2acc, vtable);
3382 
3383     __ subs(len, len, 16);
3384     __ br(Assembler::HS, L_by16_loop);
3385 
         // len holds (remaining - 16) here; adding 15 gives (remaining - 1), and
         // no carry means nothing is left to process.
3386     __ bind(L_by1);
3387     __ adds(len, len, 15);
3388     __ br(Assembler::LO, L_do_mod);
3389 
3390     __ bind(L_by1_loop);
3391     __ ldrb(temp0, Address(__ post(buff, 1)));
3392     __ add(s1, temp0, s1);
3393     __ add(s2, s2, s1);
3394     __ subs(len, len, 1);
3395     __ br(Assembler::HS, L_by1_loop);
3396 
3397     __ bind(L_do_mod);
3398     // s1 = s1 % BASE
         // Same two-fold reduction as after a full NMAX chunk.
3399     __ lsr(temp0, s1, 16);
3400     __ lsl(temp1, temp0, 4);
3401     __ sub(temp1, temp1, temp0);
3402     __ add(temp1, temp1, s1, ext::uxth);
3403 
3404     __ lsr(temp0, temp1, 16);
3405     __ lsl(s1, temp0, 4);
3406     __ sub(s1, s1, temp0);
3407     __ add(s1, s1, temp1, ext:: uxth);
3408 
3409     __ subs(temp0, s1, base);
3410     __ csel(s1, temp0, s1, Assembler::HS);
3411 
3412     // s2 = s2 % BASE
3413     __ lsr(temp0, s2, 16);
3414     __ lsl(temp1, temp0, 4);
3415     __ sub(temp1, temp1, temp0);
3416     __ add(temp1, temp1, s2, ext::uxth);
3417 
3418     __ lsr(temp0, temp1, 16);
3419     __ lsl(s2, temp0, 4);
3420     __ sub(s2, s2, temp0);
3421     __ add(s2, s2, temp1, ext:: uxth);
3422 
3423     __ subs(temp0, s2, base);
3424     __ csel(s2, temp0, s2, Assembler::HS);
3425 
3426     // Combine lower bits and higher bits
3427     __ bind(L_combine);
3428     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3429 
3430     __ ret(lr);
3431 
3432     return start;
3433   }
3434 
3435   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
3436           Register temp0, Register temp1, FloatRegister vbytes,
3437           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
         // Emits the code that consumes 16 bytes at buff (post-incrementing buff
         // by 16) and adds their contribution to s1 and s2. No modulo reduction
         // is done here. temp0, temp1, vbytes, vs1acc and vs2acc are scratch;
         // vtable is only read and is expected to hold the weights (16, 15, ... 1)
         // -- presumably loaded from _adler_table by the caller; confirm there.
         //
3438     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
3439     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
3440     // In non-vectorized code, we update s1 and s2 as:
3441     //   s1 <- s1 + b1
3442     //   s2 <- s2 + s1
3443     //   s1 <- s1 + b2
3444     //   s2 <- s2 + s1
3445     //   ...
3446     //   s1 <- s1 + b16
3447     //   s2 <- s2 + s1
3448     // Putting above assignments together, we have:
3449     //   s1_new = s1 + b1 + b2 + ... + b16
3450     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
3451     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
3452     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
3453     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
3454 
3455     // s2 = s2 + s1 * 16
         // This must use the old s1, so it is emitted before s1 is updated below.
3456     __ add(s2, s2, s1, Assembler::LSL, 4);
3457 
3458     // vs1acc = b1 + b2 + b3 + ... + b16
3459     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
         // NOTE(review): presumably umullv/T8B multiplies the low 8 byte lanes
         // into 16-bit lanes and umlalv/T16B multiply-accumulates the high 8
         // byte lanes; confirm against the assembler's definitions.
3460     __ umullv(vs2acc, __ T8B, vtable, vbytes);
3461     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
3462     __ uaddlv(vs1acc, __ T16B, vbytes);
3463     __ uaddlv(vs2acc, __ T8H, vs2acc);
3464 
3465     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
3466     __ fmovd(temp0, vs1acc);
3467     __ fmovd(temp1, vs2acc);
3468     __ add(s1, s1, temp0);
3469     __ add(s2, s2, temp1);
3470   }
3471 
3472   /**
        *  Emits the "multiplyToLen" stub: a frame-building wrapper around the
        *  assembler's multiply_to_len sequence.
        *
3473    *  Arguments:
3474    *
3475    *  Input:
3476    *    c_rarg0   - x address
3477    *    c_rarg1   - x length
3478    *    c_rarg2   - y address
3479    *    c_rarg3   - y length
3480    *    c_rarg4   - z address
3481    *    c_rarg5   - z length
        *
        *  r10..r16 are handed to multiply_to_len as temporaries.
        *
        *  Returns the code position recorded just before the stub body is
        *  emitted.
3482    */
3483   address generate_multiplyToLen() {
3484     __ align(CodeEntryAlignment);
3485     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3486 
3487     address start = __ pc();
3488     const Register x     = r0;
3489     const Register xlen  = r1;
3490     const Register y     = r2;
3491     const Register ylen  = r3;
3492     const Register z     = r4;
3493     const Register zlen  = r5;
3494 
3495     const Register tmp1  = r10;
3496     const Register tmp2  = r11;
3497     const Register tmp3  = r12;
3498     const Register tmp4  = r13;
3499     const Register tmp5  = r14;
3500     const Register tmp6  = r15;
3501     const Register tmp7  = r16;
3502 
3503     BLOCK_COMMENT("Entry:");
3504     __ enter(); // required for proper stackwalking of RuntimeStub frame
3505     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3506     __ leave(); // required for proper stackwalking of RuntimeStub frame
3507     __ ret(lr);
3508 
3509     return start;
3510   }
3511 
3512   address generate_squareToLen() {
         // Emits the "squareToLen" stub. Squaring is implemented as a call to
         // multiply_to_len with both operands set to x.
         // Register inputs as used below: r0 = x, r1 = xlen, r2 = z, r3 = zlen.
         // Returns the code position recorded before the stub body is emitted.
         //
3513     // squareToLen algorithm for sizes 1..127 described in java code works
3514     // faster than multiply_to_len on some CPUs and slower on others, but
3515     // multiply_to_len shows a bit better overall results
3516     __ align(CodeEntryAlignment);
3517     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3518     address start = __ pc();
3519 
3520     const Register x     = r0;
3521     const Register xlen  = r1;
3522     const Register z     = r2;
3523     const Register zlen  = r3;
3524     const Register y     = r4; // == x
3525     const Register ylen  = r5; // == xlen
3526 
3527     const Register tmp1  = r10;
3528     const Register tmp2  = r11;
3529     const Register tmp3  = r12;
3530     const Register tmp4  = r13;
3531     const Register tmp5  = r14;
3532     const Register tmp6  = r15;
3533     const Register tmp7  = r16;
3534 
         // r4/r5 are not arguments of this stub; they are overwritten with copies
         // of x/xlen below, so their incoming values are saved and restored.
3535     RegSet spilled_regs = RegSet::of(y, ylen);
3536     BLOCK_COMMENT("Entry:");
3537     __ enter();
3538     __ push(spilled_regs, sp);
3539     __ mov(y, x);
3540     __ mov(ylen, xlen);
3541     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3542     __ pop(spilled_regs, sp);
3543     __ leave();
3544     __ ret(lr);
3545     return start;
3546   }
3547 
3548   address generate_mulAdd() {
         // Emits the "mulAdd" stub: a frame-building wrapper around the
         // assembler's mul_add sequence.
         // Register inputs as used below: r0 = out, r1 = in, r2 = offset,
         // r3 = len, r4 = k.
         // Returns the code position recorded before the stub body is emitted.
3549     __ align(CodeEntryAlignment);
3550     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3551 
3552     address start = __ pc();
3553 
3554     const Register out     = r0;
3555     const Register in      = r1;
3556     const Register offset  = r2;
3557     const Register len     = r3;
3558     const Register k       = r4;
3559 
3560     BLOCK_COMMENT("Entry:");
3561     __ enter();
3562     __ mul_add(out, in, offset, len, k);
3563     __ leave();
3564     __ ret(lr);
3565 
3566     return start;
3567   }
3568 
3569   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3570                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3571                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
         // Emits a 128 x 128 -> 256-bit carry-less multiplication of a and b.
         // The product is left in <result_hi:result_lo>; tmp1..tmp4 are scratch.
         // a, b and a1_xor_a0 are only read. "+" in the comments below denotes
         // XOR (the code combines terms with eor).
         //
3572     // Karatsuba multiplication performs a 128*128 -> 256-bit
3573     // multiplication in three 128-bit multiplications and a few
3574     // additions.
3575     //
3576     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3577     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3578     //
3579     // Inputs:
3580     //
3581     // A0 in a.d[0]     (subkey)
3582     // A1 in a.d[1]
3583     // (A1+A0) in a1_xor_a0.d[0]
3584     //
3585     // B0 in b.d[0]     (state)
3586     // B1 in b.d[1]
3587 
3588     __ ext(tmp1, __ T16B, b, b, 0x08);           // swap the two halves of b
3589     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3590     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3591     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3592     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3593 
         // Fold the middle Karatsuba term: tmp2 becomes the middle 128 bits of
         // the 256-bit product.
3594     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3595     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3596     __ eor(tmp2, __ T16B, tmp2, tmp4);
3597     __ eor(tmp2, __ T16B, tmp2, tmp3);
3598 
3599     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
         // NOTE(review): presumably ins(dst, D, src, i, j) copies src.d[j] into
         // dst.d[i], i.e. result_hi.d[0] = tmp2.d[1] and result_lo.d[1] = tmp2.d[0];
         // confirm against the assembler's ins definition.
3600     __ ins(result_hi, __ D, tmp2, 0, 1);
3601     __ ins(result_lo, __ D, tmp2, 1, 0);
3602   }
3603 
3604   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3605                     FloatRegister p, FloatRegister z, FloatRegister t1) {
         // Emits the reduction of the 256-bit value <hi:lo> modulo the GCM field
         // polynomial, leaving the 128-bit remainder in result.
         // lo and hi are modified in place, t1 is scratch, and result doubles as
         // the temporary t0, so it is overwritten before the final value is
         // stored. p and z are only read.
         // NOTE(review): z is presumably an all-zero register (it is used with
         // ext to shift in zero bytes) -- confirm at the call sites.
3606     const FloatRegister t0 = result;
3607 
3608     // The GCM field polynomial f is z^128 + p(z), where p =
3609     // z^7+z^2+z+1.
3610     //
3611     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3612     //
3613     // so, given that the product we're reducing is
3614     //    a == lo + hi * z^128
3615     // substituting,
3616     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3617     //
3618     // we reduce by multiplying hi by p(z) and subtracting the result
3619     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3620     // bits we can do this with two 64-bit multiplications, lo*p and
3621     // hi*p.
3622 
3623     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3624     __ ext(t1, __ T16B, t0, z, 8);
3625     __ eor(hi, __ T16B, hi, t1);
3626     __ ext(t1, __ T16B, z, t0, 8);
3627     __ eor(lo, __ T16B, lo, t1);
3628     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3629     __ eor(result, __ T16B, lo, t0);
3630   }
3631 
3632   address generate_has_negatives(address &has_negatives_long) {
         // Emits the "has_negatives" stub, which reports whether any byte of an
         // array has its top bit (0x80) set.
         //
         // Register inputs as used below:
         //   r1 (ary1) - address of the first byte
         //   r2 (len)  - number of bytes
         // Output:
         //   r0 (result) - 1 if a byte with the top bit set was found, else 0
         //
         // Two entry points are produced:
         //   - the returned address: handles every length; lengths above 15
         //     branch to the long path after the frame has been set up;
         //   - has_negatives_long (out-parameter): sets up its own frame and
         //     falls straight into the long path.
         //
         // The long path saves and restores r3..r7 and r10; rscratch1 and
         // rscratch2 are used freely.
3633     const u1 large_loop_size = 64;
3634     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3635     int dcache_line = VM_Version::dcache_line_size();
3636 
3637     Register ary1 = r1, len = r2, result = r0;
3638 
3639     __ align(CodeEntryAlignment);
3640 
3641     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3642 
3643     address entry = __ pc();
3644 
3645     __ enter();
3646 
3647   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16,
3648         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3649 
3650   __ cmp(len, (u1)15);
3651   __ br(Assembler::GT, LEN_OVER_15);
3652   // The only case when execution falls into this code is when pointer is near
3653   // the end of memory page and we have to avoid reading next page
       // Loads therefore end exactly at the end of the array and the bytes in
       // front of the array are shifted out (assumes little-endian loads).
3654   __ add(ary1, ary1, len);
3655   __ subs(len, len, 8);
3656   __ br(Assembler::GT, LEN_OVER_8);
       // len <= 8: one 8-byte load ending at the end of the array.
       // NOTE(review): for an original len of 0 the shift amount is 64, which a
       // variable shift takes modulo 64 -- presumably callers never pass 0 here;
       // verify against the caller.
3657   __ ldr(rscratch2, Address(ary1, -8));
3658   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3659   __ lsrv(rscratch2, rscratch2, rscratch1);
3660   __ tst(rscratch2, UPPER_BIT_MASK);
3661   __ cset(result, Assembler::NE);
3662   __ leave();
3663   __ ret(lr);
3664   __ bind(LEN_OVER_8);
       // 8 < len <= 15: two 8-byte words ending at the end of the array. The
       // upper word lies fully inside the array; only the lower one needs the
       // leading bytes shifted out.
3665   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3666   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3667   __ tst(rscratch2, UPPER_BIT_MASK);
3668   __ br(Assembler::NE, RET_TRUE_NO_POP);
3669   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3670   __ lsrv(rscratch1, rscratch1, rscratch2);
3671   __ tst(rscratch1, UPPER_BIT_MASK);
3672   __ cset(result, Assembler::NE);
3673   __ leave();
3674   __ ret(lr);
3675 
3676   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3677   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3678 
3679   has_negatives_long = __ pc(); // 2nd entry point
3680 
3681   __ enter();
3682 
       // Reached either by falling through from the 2nd entry point or by the
       // branch above, which has already executed enter() at the 1st entry.
3683   __ bind(LEN_OVER_15);
3684     __ push(spilled_regs, sp);
3685     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3686     __ cbz(rscratch2, ALIGNED);
         // Unaligned start: check the first 16 bytes, then advance ary1 to the
         // next 16-byte boundary (the skipped bytes are covered by this load).
3687     __ ldp(tmp6, tmp1, Address(ary1));
3688     __ mov(tmp5, 16);
3689     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3690     __ add(ary1, ary1, rscratch1);
3691     __ sub(len, len, rscratch1);
3692     __ orr(tmp6, tmp6, tmp1);
3693     __ tst(tmp6, UPPER_BIT_MASK);
3694     __ br(Assembler::NE, RET_TRUE);
3695 
3696   __ bind(ALIGNED);
3697     __ cmp(len, large_loop_size);
3698     __ br(Assembler::LT, CHECK_16);
3699     // Perform 16-byte load as early return in pre-loop to handle situation
3700     // when initially aligned large array has negative values at starting bytes,
3701     // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is
3702     // slower. Cases with negative bytes further ahead won't be affected that
3703     // much. In fact, it'll be faster due to early loads, less instructions and
3704     // less branches in LARGE_LOOP.
3705     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3706     __ sub(len, len, 16);
3707     __ orr(tmp6, tmp6, tmp1);
3708     __ tst(tmp6, UPPER_BIT_MASK);
3709     __ br(Assembler::NE, RET_TRUE);
3710     __ cmp(len, large_loop_size);
3711     __ br(Assembler::LT, CHECK_16);
3712 
3713     if (SoftwarePrefetchHintDistance >= 0
3714         && SoftwarePrefetchHintDistance >= dcache_line) {
3715       // initial prefetch
3716       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3717     }
3718   __ bind(LARGE_LOOP);
3719     if (SoftwarePrefetchHintDistance >= 0) {
3720       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3721     }
3722     // Issue load instructions first, since it can save few CPU/MEM cycles, also
3723     // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp)
3724     // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3
3725     // instructions per cycle and have less branches, but this approach disables
3726     // early return, thus, all 64 bytes are loaded and checked every time.
3727     __ ldp(tmp2, tmp3, Address(ary1));
3728     __ ldp(tmp4, tmp5, Address(ary1, 16));
3729     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3730     __ ldp(tmp6, tmp1, Address(ary1, 48));
3731     __ add(ary1, ary1, large_loop_size);
3732     __ sub(len, len, large_loop_size);
3733     __ orr(tmp2, tmp2, tmp3);
3734     __ orr(tmp4, tmp4, tmp5);
3735     __ orr(rscratch1, rscratch1, rscratch2);
3736     __ orr(tmp6, tmp6, tmp1);
3737     __ orr(tmp2, tmp2, tmp4);
3738     __ orr(rscratch1, rscratch1, tmp6);
3739     __ orr(tmp2, tmp2, rscratch1);
3740     __ tst(tmp2, UPPER_BIT_MASK);
3741     __ br(Assembler::NE, RET_TRUE);
3742     __ cmp(len, large_loop_size);
3743     __ br(Assembler::GE, LARGE_LOOP);
3744 
3745   __ bind(CHECK_16); // small 16-byte load pre-loop
3746     __ cmp(len, (u1)16);
3747     __ br(Assembler::LT, POST_LOOP16);
3748 
3749   __ bind(LOOP16); // small 16-byte load loop
3750     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3751     __ sub(len, len, 16);
3752     __ orr(tmp2, tmp2, tmp3);
3753     __ tst(tmp2, UPPER_BIT_MASK);
3754     __ br(Assembler::NE, RET_TRUE);
3755     __ cmp(len, (u1)16);
3756     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3757 
3758   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3759     __ cmp(len, (u1)8);
3760     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3761     __ ldr(tmp3, Address(__ post(ary1, 8)));
3762     __ sub(len, len, 8);
3763     __ tst(tmp3, UPPER_BIT_MASK);
3764     __ br(Assembler::NE, RET_TRUE);
3765 
3766   __ bind(POST_LOOP16_LOAD_TAIL);
3767     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
         // Load 8 bytes and shift left by (64 - 8 * len) bits so that only the
         // first len bytes remain to be tested.
3768     __ ldr(tmp1, Address(ary1));
3769     __ mov(tmp2, 64);
3770     __ sub(tmp4, tmp2, len, __ LSL, 3);
3771     __ lslv(tmp1, tmp1, tmp4);
3772     __ tst(tmp1, UPPER_BIT_MASK);
3773     __ br(Assembler::NE, RET_TRUE);
3774     // Fallthrough
3775 
3776   __ bind(RET_FALSE);
3777     __ pop(spilled_regs, sp);
3778     __ leave();
3779     __ mov(result, zr);
3780     __ ret(lr);
3781 
3782   __ bind(RET_TRUE);
3783     __ pop(spilled_regs, sp);
       // Entered directly from the short path, which never pushed spilled_regs.
3784   __ bind(RET_TRUE_NO_POP);
3785     __ leave();
3786     __ mov(result, 1);
3787     __ ret(lr);
3788 
3793     return entry;
3794   }
3795 
3796   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3797         bool usePrefetch, Label &NOT_EQUAL) {
         // Emits the general-purpose-register main loop of large_array_equals.
         //   loopThreshold - the loop repeats while cnt1 - loopThreshold >= 0
         //   usePrefetch   - emit prfm hints SoftwarePrefetchHintDistance bytes
         //                   ahead of both array pointers
         //   NOT_EQUAL     - branch target taken on the first mismatch
         // The loop is software-pipelined: each 16-byte pair is loaded before
         // the previously loaded pair is compared. One iteration loads 64 bytes
         // per array and decrements cnt1 by 8 * wordSize; the 16 bytes loaded
         // ahead are compared (and accounted for in cnt1) in the post-loop.
         // a1 (r1) and a2 (r2) are post-incremented past everything loaded.
3798     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3799         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3800         tmp7 = r12, tmp8 = r13;
3801     Label LOOP;
3802 
         // pre-loop: prime the pipeline with the first 16 bytes of each array
3803     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3804     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3805     __ bind(LOOP);
3806     if (usePrefetch) {
3807       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3808       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3809     }
3810     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3811     __ eor(tmp1, tmp1, tmp2);
3812     __ eor(tmp3, tmp3, tmp4);
3813     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3814     __ orr(tmp1, tmp1, tmp3);
3815     __ cbnz(tmp1, NOT_EQUAL);
3816     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3817     __ eor(tmp5, tmp5, tmp6);
3818     __ eor(tmp7, tmp7, tmp8);
3819     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3820     __ orr(tmp5, tmp5, tmp7);
3821     __ cbnz(tmp5, NOT_EQUAL);
3822     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3823     __ eor(tmp1, tmp1, tmp2);
3824     __ eor(tmp3, tmp3, tmp4);
3825     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3826     __ orr(tmp1, tmp1, tmp3);
3827     __ cbnz(tmp1, NOT_EQUAL);
3828     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3829     __ eor(tmp5, tmp5, tmp6);
3830     __ sub(cnt1, cnt1, 8 * wordSize);
3831     __ eor(tmp7, tmp7, tmp8);
3832     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3833     // tmp6 is not used. MacroAssembler::subs is used here (rather than
3834     // cmp) because subs allows an unlimited range of immediate operand.
3835     __ subs(tmp6, cnt1, loopThreshold);
3836     __ orr(tmp5, tmp5, tmp7);
3837     __ cbnz(tmp5, NOT_EQUAL);
         // The flags tested here come from the subs above; the orr and cbnz in
         // between do not change them.
3838     __ br(__ GE, LOOP);
3839     // post-loop
         // compare the 16 bytes per array that were loaded ahead
3840     __ eor(tmp1, tmp1, tmp2);
3841     __ eor(tmp3, tmp3, tmp4);
3842     __ orr(tmp1, tmp1, tmp3);
3843     __ sub(cnt1, cnt1, 2 * wordSize);
3844     __ cbnz(tmp1, NOT_EQUAL);
3845   }
3846 
3847   void generate_large_array_equals_loop_simd(int loopThreshold,
3848         bool usePrefetch, Label &NOT_EQUAL) {
         // Emits the SIMD main loop of large_array_equals.
         //   loopThreshold - the loop repeats while cnt1 - loopThreshold >= 0
         //   usePrefetch   - emit prfm hints SoftwarePrefetchHintDistance bytes
         //                   ahead of both array pointers
         //   NOT_EQUAL     - branch target taken on the first mismatch
         // One iteration loads 64 bytes from each array into v0..v3 / v4..v7,
         // post-increments a1 (r1) and a2 (r2) by 64 and decrements cnt1 by
         // 8 * wordSize. rscratch1, rscratch2 and v0..v7 are overwritten.
3849     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3850         tmp2 = rscratch2;
3851     Label LOOP;
3852 
3853     __ bind(LOOP);
3854     if (usePrefetch) {
3855       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3856       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3857     }
3858     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3859     __ sub(cnt1, cnt1, 8 * wordSize);
3860     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
         // Only the flags of this subs matter (they feed the final br); tmp1 is
         // overwritten by the umov below.
3861     __ subs(tmp1, cnt1, loopThreshold);
         // XOR corresponding vectors and OR everything together: the result is
         // non-zero iff any of the 64 bytes differ.
3862     __ eor(v0, __ T16B, v0, v4);
3863     __ eor(v1, __ T16B, v1, v5);
3864     __ eor(v2, __ T16B, v2, v6);
3865     __ eor(v3, __ T16B, v3, v7);
3866     __ orr(v0, __ T16B, v0, v1);
3867     __ orr(v1, __ T16B, v2, v3);
3868     __ orr(v0, __ T16B, v0, v1);
3869     __ umov(tmp1, v0, __ D, 0);
3870     __ umov(tmp2, v0, __ D, 1);
3871     __ orr(tmp1, tmp1, tmp2);
3872     __ cbnz(tmp1, NOT_EQUAL);
3873     __ br(__ GE, LOOP);
3874   }
3875 
       // Emits the "large_array_equals" stub and returns its entry address.
       // The stub compares the remainder of two arrays 8 bytes (or more) at a
       // time and sets result to true only when no difference is found.
       //
3876   // a1 = r1 - array1 address
3877   // a2 = r2 - array2 address
3878   // result = r0 - return value. Already contains "false"
3879   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
3880   // r3-r5 are reserved temporary registers
       // r11-r13 are saved and restored by the non-SIMD variant; rscratch1 and
       // rscratch2 are used freely.
3881   address generate_large_array_equals() {
3882     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3883         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3884         tmp7 = r12, tmp8 = r13;
3885     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3886         SMALL_LOOP, POST_LOOP;
         // The non-SIMD loop loads 16 bytes ahead of what it has compared, so
         // its threshold is 16 bytes larger.
3887     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3888     // calculate if at least 32 prefetched bytes are used
3889     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3890     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3891     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3892     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3893         tmp5, tmp6, tmp7, tmp8);
3894 
3895     __ align(CodeEntryAlignment);
3896 
3897     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3898 
3899     address entry = __ pc();
3900     __ enter();
3901     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
3902     // also advance pointers to use post-increment instead of pre-increment
3903     __ add(a1, a1, wordSize);
3904     __ add(a2, a2, wordSize);
3905     if (AvoidUnalignedAccesses) {
3906       // both implementations (SIMD/nonSIMD) are using relatively large load
3907       // instructions (ld1/ldp), which has huge penalty (up to x2 exec time)
3908       // on some CPUs in case of address is not at least 16-byte aligned.
3909       // Arrays are 8-byte aligned currently, so, we can make additional 8-byte
3910       // load if needed at least for 1st address and make it 16-byte aligned.
3911       Label ALIGNED16;
3912       __ tbz(a1, 3, ALIGNED16);
3913       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3914       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3915       __ sub(cnt1, cnt1, wordSize);
3916       __ eor(tmp1, tmp1, tmp2);
           // nothing has been pushed yet, so skip the pop on this exit
3917       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3918       __ bind(ALIGNED16);
3919     }
3920     if (UseSIMDForArrayEquals) {
3921       if (SoftwarePrefetchHintDistance >= 0) {
             // Use the prefetching loop only while more than
             // prefetchLoopThreshold bytes remain; then continue with the
             // non-prefetching loop or go straight to the tail.
3922         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3923         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3924         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3925             /* prfm = */ true, NOT_EQUAL);
3926         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3927         __ br(__ LT, TAIL);
3928       }
3929       __ bind(NO_PREFETCH_LARGE_LOOP);
3930       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3931           /* prfm = */ false, NOT_EQUAL);
3932     } else {
           // the non-SIMD loops use tmp6..tmp8 (r11..r13), so save them first
3933       __ push(spilled_regs, sp);
3934       if (SoftwarePrefetchHintDistance >= 0) {
3935         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3936         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3937         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3938             /* prfm = */ true, NOT_EQUAL);
3939         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3940         __ br(__ LT, TAIL);
3941       }
3942       __ bind(NO_PREFETCH_LARGE_LOOP);
3943       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3944           /* prfm = */ false, NOT_EQUAL);
3945     }
3946     __ bind(TAIL);
3947       __ cbz(cnt1, EQUAL);
3948       __ subs(cnt1, cnt1, wordSize);
3949       __ br(__ LE, POST_LOOP);
         // compare the remainder one 8-byte word at a time
3950     __ bind(SMALL_LOOP);
3951       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3952       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3953       __ subs(cnt1, cnt1, wordSize);
3954       __ eor(tmp1, tmp1, tmp2);
3955       __ cbnz(tmp1, NOT_EQUAL);
3956       __ br(__ GT, SMALL_LOOP);
         // cnt1 is zero or negative here, so this last 8-byte load is placed to
         // end at the end of the data and may overlap bytes already compared
3957     __ bind(POST_LOOP);
3958       __ ldr(tmp1, Address(a1, cnt1));
3959       __ ldr(tmp2, Address(a2, cnt1));
3960       __ eor(tmp1, tmp1, tmp2);
3961       __ cbnz(tmp1, NOT_EQUAL);
3962     __ bind(EQUAL);
3963       __ mov(result, true);
         // on a mismatch result is left untouched
3964     __ bind(NOT_EQUAL);
3965       if (!UseSIMDForArrayEquals) {
3966         __ pop(spilled_regs, sp);
3967       }
3968     __ bind(NOT_EQUAL_NO_POP);
3969     __ leave();
3970     __ ret(lr);
3971     return entry;
3972   }
3973 
3974   address generate_dsin_dcos(bool isCos) {
         // Emits the "libmDcos" (isCos == true) or "libmDsin" (isCos == false)
         // stub by delegating to the assembler's generate_dsin_dcos, passing the
         // addresses of the constant tables from StubRoutines::aarch64.
         // Nothing else is emitted here, so any frame handling and the return
         // instruction are presumably produced by that routine -- confirm there.
         // Returns the code position recorded before the stub body is emitted.
3975     __ align(CodeEntryAlignment);
3976     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
3977     address start = __ pc();
3978     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
3979         (address)StubRoutines::aarch64::_two_over_pi,
3980         (address)StubRoutines::aarch64::_pio2,
3981         (address)StubRoutines::aarch64::_dsin_coef,
3982         (address)StubRoutines::aarch64::_dcos_coef);
3983     return start;
3984   }
3985 
3986   address generate_dlog() {
         // Emits the "dlog" stub by delegating to the assembler's fast_log.
         // v0..v5, v16..v19 and r0..r4 are handed to fast_log as its working
         // registers. Nothing else is emitted here, so any frame handling and
         // the return instruction are presumably produced by fast_log --
         // confirm there.
         // Returns the code position recorded before the stub body is emitted.
3987     __ align(CodeEntryAlignment);
3988     StubCodeMark mark(this, "StubRoutines", "dlog");
3989     address entry = __ pc();
3990     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
3991         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
3992     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
3993     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
3994         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
3995     return entry;
3996   }
3997 
3998   // code for comparing 16 bytes of strings with same encoding
       // On entry tmp1 (r10) and tmp2 (r11) must already hold 8 bytes loaded from
       // str1 and str2; they are compared first, and on exit they hold the next
       // 8 bytes, not yet compared. str1 (r1) and str2 (r3) are post-incremented
       // by 16. cnt1 (r2) is used as a data register here, together with
       // rscratch1 and rscratch2.
       //   DIFF1 - taken when the tmp1/tmp2 pair differs
       //   DIFF2 - taken when the rscratch1/cnt1 pair differs
3999   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
4000     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
         // loads for the next pair are interleaved with the compare of the
         // current pair
4001     __ ldr(rscratch1, Address(__ post(str1, 8)));
4002     __ eor(rscratch2, tmp1, tmp2);
4003     __ ldr(cnt1, Address(__ post(str2, 8)));
4004     __ cbnz(rscratch2, DIFF1);
4005     __ ldr(tmp1, Address(__ post(str1, 8)));
4006     __ eor(rscratch2, rscratch1, cnt1);
4007     __ ldr(tmp2, Address(__ post(str2, 8)));
4008     __ cbnz(rscratch2, DIFF2);
4009   }
4010 
4011   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
       // Register roles as used below:
       //   tmp2 (r11) - pointer to the Latin1 data, advanced by 16 bytes
       //   cnt1 (r2)  - pointer to the UTF-16 data, advanced by 32 bytes
       //   tmp3 (r12) - holds 8 already loaded UTF-16 bytes on entry and the
       //                next 8, not yet compared, on exit
       //   tmpL, tmpU - scratch for the widened Latin1 / the UTF-16 word
       //   v0 (vtmpZ) - presumably all zero on entry (set up by the caller --
       //                confirm); zip1/zip2 interleave it with the Latin1 bytes
       //                to widen them to 16-bit characters
       //   v1, v2, rscratch2 - scratch
       //   DIFF2 - taken when the tmp3/tmpL pair differs
       //   DIFF1 - taken when the tmpU/tmpL pair differs
4012   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4013       Label &DIFF2) {
4014     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
4015     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
4016 
4017     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4018     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4019     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4020     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4021 
         // characters 0..3
4022     __ fmovd(tmpL, vtmp3);
4023     __ eor(rscratch2, tmp3, tmpL);
4024     __ cbnz(rscratch2, DIFF2);
4025 
         // characters 4..7
4026     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4027     __ umov(tmpL, vtmp3, __ D, 1);
4028     __ eor(rscratch2, tmpU, tmpL);
4029     __ cbnz(rscratch2, DIFF1);
4030 
         // characters 8..11 (upper 8 Latin1 bytes, widened in place)
4031     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4032     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4033     __ fmovd(tmpL, vtmp);
4034     __ eor(rscratch2, tmp3, tmpL);
4035     __ cbnz(rscratch2, DIFF2);
4036 
         // characters 12..15; the load into tmp3 is the look-ahead for the next
         // call
4037     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4038     __ umov(tmpL, vtmp, __ D, 1);
4039     __ eor(rscratch2, tmpU, tmpL);
4040     __ cbnz(rscratch2, DIFF1);
4041   }
4042 
4043   // r0  = result
4044   // r1  = str1
4045   // r2  = cnt1
4046   // r3  = str2
4047   // r4  = cnt2
4048   // r10 = tmp1
4049   // r11 = tmp2
       //
       // Generates the stub that compares a Latin1 string with a UTF-16 string.
       // isLU == true : str1 is Latin1 and str2 is UTF-16;
       // isLU == false: str1 is UTF-16 and str2 is Latin1.
       //
       // State expected on entry (see the "already loaded" comments in the body):
       // the first 4 characters of both strings are already loaded - the Latin1
       // ones in v1 (vtmp), the UTF-16 ones in tmp2 (LU) or tmp1 (UL) - and cnt2
       // is the number of characters left to compare.
       //
       // On a mismatch, result is set to (str1 character - str2 character) for the
       // first differing pair. When no mismatch is found the stub returns through
       // DONE without writing result.
       // NOTE(review): presumably the caller pre-loads result with the value to
       // return for equal prefixes - confirm against the call site.
       //
       // r12 (tmp3) and r14 (tmp4) are saved on the stack around the loops and
       // restored on every path that leaves them. Returns the stub entry address.
4050   address generate_compare_long_string_different_encoding(bool isLU) {
4051     __ align(CodeEntryAlignment);
4052     StubCodeMark mark(this, "StubRoutines", isLU
4053         ? "compare_long_string_different_encoding LU"
4054         : "compare_long_string_different_encoding UL");
4055     address entry = __ pc();
4056     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4057         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
4058         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4059     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4060         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4061     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4062     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4063 
         // The prefetching loop runs only while cnt2 >= this value; one iteration
         // of that loop consumes 64 characters.
4064     int prefetchLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance/2);
4065 
         // vtmpZ supplies the zero bytes interleaved with Latin1 bytes by zip1/zip2.
4066     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4067     // cnt2 == amount of characters left to compare
4068     // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL))
4069     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
         // Step both pointers past the 4 characters handled here: wordSize/2 bytes
         // in the Latin1 string, wordSize bytes in the UTF-16 string.
4070     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4071     __ add(str2, str2, isLU ? wordSize : wordSize/2);
         // The inflated Latin1 characters go to whichever of tmp1/tmp2 does not
         // hold the UTF-16 characters.
4072     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4073     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4074     __ eor(rscratch2, tmp1, tmp2);
         // CALCULATE_DIFFERENCE works on the pair (tmp1, rscratch1).
4075     __ mov(rscratch1, tmp2);
4076     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4077     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4078              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4079     __ push(spilled_regs, sp);
4080     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
4081     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
4082 
         // compare_string_16_x_LU expects the first 4 UTF-16 characters in tmp3.
4083     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4084 
4085     if (SoftwarePrefetchHintDistance >= 0) {
4086       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4087       __ br(__ LT, NO_PREFETCH);
4088       __ bind(LARGE_LOOP_PREFETCH);
             // 64 characters per iteration: each of the two inner loops below runs
             // its 16-character body twice (tmp4 counts 2 -> 0).
4089         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4090         __ mov(tmp4, 2);
4091         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4092         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4093           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4094           __ subs(tmp4, tmp4, 1);
4095           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4096           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4097           __ mov(tmp4, 2);
4098         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4099           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4100           __ subs(tmp4, tmp4, 1);
4101           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4102           __ sub(cnt2, cnt2, 64);
4103           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4104           __ br(__ GE, LARGE_LOOP_PREFETCH);
4105     }
4106     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4107     __ bind(NO_PREFETCH);
4108     __ subs(cnt2, cnt2, 16);
4109     __ br(__ LT, TAIL);
4110     __ align(OptoLoopAlignment);
4111     __ bind(SMALL_LOOP); // smaller loop
           // The flags set by this subs are consumed by the br(GE) after the
           // helper; the helper emits no flag-setting instructions.
4112       __ subs(cnt2, cnt2, 16);
4113       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4114       __ br(__ GE, SMALL_LOOP);
           // cnt2 == -16 here means the loop consumed exactly everything that was
           // left before the last 4 characters.
4115       __ cmn(cnt2, (u1)16);
4116       __ br(__ EQ, LOAD_LAST);
4117     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
           // cnt2 is negative here, so both pointers move backwards and the final
           // 16-character comparison overlaps characters that were compared before.
4118       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
4119       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
           // Re-establish the helper's precondition: tmp3 = the 4 UTF-16 characters
           // just before the new cnt1.
4120       __ ldr(tmp3, Address(cnt1, -8));
4121       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
4122       __ b(LOAD_LAST);
4123     __ bind(DIFF2);
           // The mismatching UTF-16 word is in tmp3; move it to tmpU so that the
           // common code below can work on the (tmpL, tmpU) pair.
4124       __ mov(tmpU, tmp3);
4125     __ bind(DIFF1);
4126       __ pop(spilled_regs, sp);
4127       __ b(CALCULATE_DIFFERENCE);
4128     __ bind(LOAD_LAST);
4129       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
4130       // No need to load it again
4131       __ mov(tmpU, tmp3);
4132       __ pop(spilled_regs, sp);
4133 
4134       // tmp2 points to the address of the last 4 Latin1 characters right now
4135       __ ldrs(vtmp, Address(tmp2));
4136       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4137       __ fmovd(tmpL, vtmp);
4138 
4139       __ eor(rscratch2, tmpU, tmpL);
4140       __ cbz(rscratch2, DONE);
4141 
4142     // Find the first different characters in the longwords and
4143     // compute their difference.
4144     __ bind(CALCULATE_DIFFERENCE);
           // rscratch2 holds the XOR of the two words. rev + clz yields the bit
           // offset of the lowest-addressed differing byte; masking with -16 rounds
           // it down to the start of the containing 16-bit character.
4145       __ rev(rscratch2, rscratch2);
4146       __ clz(rscratch2, rscratch2);
4147       __ andr(rscratch2, rscratch2, -16);
           // Shift that character to the bottom of both words and isolate it.
4148       __ lsrv(tmp1, tmp1, rscratch2);
4149       __ uxthw(tmp1, tmp1);
4150       __ lsrv(rscratch1, rscratch1, rscratch2);
4151       __ uxthw(rscratch1, rscratch1);
           // tmp1 carries the str1 character and rscratch1 the str2 character for
           // both values of isLU (see the tmpL/tmpU assignment above).
4152       __ subw(result, tmp1, rscratch1);
4153     __ bind(DONE);
4154       __ ret(lr);
4155     return entry;
4156   }
4157 
4158   // r0  = result
4159   // r1  = str1
4160   // r2  = cnt1
4161   // r3  = str2
4162   // r4  = cnt2
4163   // r10 = tmp1
4164   // r11 = tmp2
       //
       // Generates the stub that compares two strings of the same encoding:
       // isLL == true for Latin1 (1 byte per character), false for UTF-16
       // (2 bytes per character). cnt2 counts characters, which is why every
       // constant applied to it depends on isLL.
       //
       // The first 8 bytes of each string are treated as already loaded (see the
       // "already loaded 8 bytes" comment below); the tail code compares tmp1
       // with tmp2 before loading anything into them.
       // NOTE(review): presumably the caller loads those bytes into tmp1/tmp2 -
       // confirm against the call site.
       //
       // On a mismatch, result is set to the difference of the first differing
       // characters. When no mismatch is found the stub returns through
       // LENGTH_DIFF without writing result.
       // NOTE(review): the label name suggests result then still holds a length
       // difference prepared by the caller - confirm.
       //
       // cnt1 (r2) is reused as a data register. Returns the stub entry address.
4165   address generate_compare_long_string_same_encoding(bool isLL) {
4166     __ align(CodeEntryAlignment);
4167     StubCodeMark mark(this, "StubRoutines", isLL
4168         ? "compare_long_string_same_encoding LL"
4169         : "compare_long_string_same_encoding UU");
4170     address entry = __ pc();
4171     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4172         tmp1 = r10, tmp2 = r11;
4173     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4174         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4175         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4176     // exit from large loop when less than 64 bytes left to read or we're about
4177     // to prefetch memory behind array border
         // (expressed in characters: a byte count divided by the character size)
4178     int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4179     // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used
4180     // update cnt2 counter with already loaded 8 bytes
4181     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4182     // update pointers, because of previous read
4183     __ add(str1, str1, wordSize);
4184     __ add(str2, str2, wordSize);
4185     if (SoftwarePrefetchHintDistance >= 0) {
4186       __ bind(LARGE_LOOP_PREFETCH);
             // Four 16-byte comparisons = 64 bytes per iteration. The counter
             // update and the flag-setting subs are placed between the helper
             // calls; the br(GT) consumes the flags of that subs.
4187         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4188         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4189         compare_string_16_bytes_same(DIFF, DIFF2);
4190         compare_string_16_bytes_same(DIFF, DIFF2);
4191         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4192         compare_string_16_bytes_same(DIFF, DIFF2);
4193         __ subs(rscratch2, cnt2, largeLoopExitCondition);
4194         compare_string_16_bytes_same(DIFF, DIFF2);
4195         __ br(__ GT, LARGE_LOOP_PREFETCH);
4196         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4197     }
4198     // less than 16 bytes left?
4199     __ subs(cnt2, cnt2, isLL ? 16 : 8);
4200     __ br(__ LT, TAIL);
4201     __ align(OptoLoopAlignment);
4202     __ bind(SMALL_LOOP);
4203       compare_string_16_bytes_same(DIFF, DIFF2);
4204       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4205       __ br(__ GE, SMALL_LOOP);
4206     __ bind(TAIL);
           // Undo the last subtraction: cnt2 = characters left, fewer than 16 bytes'
           // worth.
4207       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4208       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
           // At most 8 bytes left? Then only the overlapping final load is needed.
4209       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4210       __ br(__ LE, CHECK_LAST);
           // More than 8 bytes left: check the pending words, load 8 more bytes.
4211       __ eor(rscratch2, tmp1, tmp2);
4212       __ cbnz(rscratch2, DIFF);
4213       __ ldr(tmp1, Address(__ post(str1, 8)));
4214       __ ldr(tmp2, Address(__ post(str2, 8)));
4215       __ sub(cnt2, cnt2, isLL ? 8 : 4);
4216     __ bind(CHECK_LAST);
           // cnt2 is <= 0 here. Used as a byte offset it makes the loads below read
           // the final 8 bytes of each string, overlapping bytes already loaded.
4217       if (!isLL) {
4218         __ add(cnt2, cnt2, cnt2); // now in bytes
4219       }
4220       __ eor(rscratch2, tmp1, tmp2);
4221       __ cbnz(rscratch2, DIFF);
4222       __ ldr(rscratch1, Address(str1, cnt2));
4223       __ ldr(cnt1, Address(str2, cnt2));
4224       __ eor(rscratch2, rscratch1, cnt1);
4225       __ cbz(rscratch2, LENGTH_DIFF);
4226       // Find the first different characters in the longwords and
4227       // compute their difference.
           // DIFF2: the mismatching words are in rscratch1 (str1) and cnt1 (str2).
4228     __ bind(DIFF2);
           // rev + clz gives the bit offset of the lowest-addressed differing byte;
           // the mask rounds it down to a character boundary (8 or 16 bits).
4229       __ rev(rscratch2, rscratch2);
4230       __ clz(rscratch2, rscratch2);
4231       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4232       __ lsrv(rscratch1, rscratch1, rscratch2);
4233       if (isLL) {
4234         __ lsrv(cnt1, cnt1, rscratch2);
4235         __ uxtbw(rscratch1, rscratch1);
4236         __ uxtbw(cnt1, cnt1);
4237       } else {
4238         __ lsrv(cnt1, cnt1, rscratch2);
4239         __ uxthw(rscratch1, rscratch1);
4240         __ uxthw(cnt1, cnt1);
4241       }
4242       __ subw(result, rscratch1, cnt1);
4243       __ b(LENGTH_DIFF);
           // DIFF: same computation for a mismatch between tmp1 (str1) and tmp2
           // (str2).
4244     __ bind(DIFF);
4245       __ rev(rscratch2, rscratch2);
4246       __ clz(rscratch2, rscratch2);
4247       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4248       __ lsrv(tmp1, tmp1, rscratch2);
4249       if (isLL) {
4250         __ lsrv(tmp2, tmp2, rscratch2);
4251         __ uxtbw(tmp1, tmp1);
4252         __ uxtbw(tmp2, tmp2);
4253       } else {
4254         __ lsrv(tmp2, tmp2, rscratch2);
4255         __ uxthw(tmp1, tmp1);
4256         __ uxthw(tmp2, tmp2);
4257       }
4258       __ subw(result, tmp1, tmp2);
4259       __ b(LENGTH_DIFF);
4260     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
           // Nothing left to load: only the pending tmp1/tmp2 pair needs checking.
4261       __ eor(rscratch2, tmp1, tmp2);
4262       __ cbnz(rscratch2, DIFF);
4263     __ bind(LENGTH_DIFF);
4264       __ ret(lr);
4265     return entry;
4266   }
4267 
4268   void generate_compare_long_strings() {
         // Emit the four long-string comparison stubs in the order LL, UU, LU, UL
         // (the emission order determines their placement in the code buffer) and
         // then publish their entry points in StubRoutines::aarch64.
4269     address ll_entry = generate_compare_long_string_same_encoding(/* isLL */ true);
4270     address uu_entry = generate_compare_long_string_same_encoding(/* isLL */ false);
4271     address lu_entry = generate_compare_long_string_different_encoding(/* isLU */ true);
4272     address ul_entry = generate_compare_long_string_different_encoding(/* isLU */ false);
4273     StubRoutines::aarch64::_compare_long_string_LL = ll_entry;
4274     StubRoutines::aarch64::_compare_long_string_UU = uu_entry;
4275     StubRoutines::aarch64::_compare_long_string_LU = lu_entry;
4276     StubRoutines::aarch64::_compare_long_string_UL = ul_entry;
4277   }
4278 
4279   // R0 = result
4280   // R1 = str2
4281   // R2 = cnt1
4282   // R3 = str1
4283   // R4 = cnt2
       //
       // Generates a linear-search indexOf stub. str1/cnt1 describe the pattern,
       // str2/cnt2 the string that is searched; str1_isL / str2_isL select Latin1
       // (true) or UTF-16 (false) for each of them. result is used as the running
       // index: it is only ever incremented or adjusted here, never initialised,
       // and is set to -1 when no match is found.
       // NOTE(review): presumably the caller passes result == 0 - confirm.
       // r20-r23 are saved on entry and restored at DONE. Returns the stub entry
       // address.
       //
4284   // This generic linear code uses a few additional ideas, which make it faster:
4285   // 1) we can safely keep at least 1st register of pattern(since length >= 8)
4286   // in order to skip initial loading(help in systems with 1 ld pipeline)
4287   // 2) we can use "fast" algorithm of finding single character to search for
4288   // first symbol with less branches(1 branch per each loaded register instead
4289   // of branch for each symbol), so, this is where constants like
4290   // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff comes from
4291   // 3) after loading and analyzing 1st register of source string, it can be
4292   // used to search for every 1st character entry, saving few loads in
4293   // comparison with "simpler-but-slower" implementation
4294   // 4) in order to avoid lots of push/pop operations, code below is heavily
4295   // re-using/re-initializing/compressing register values, which makes code
4296   // larger and a bit less readable, however, most of extra operations are
4297   // issued during loads or branches, so, penalty is minimal
4298   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
         // NOTE(review): the combination (str1_isL == false, str2_isL == true)
         // would also be named "indexof_linear_uu"; within the visible code only
         // the ll, uu and ul combinations are generated.
4299     const char* stubName = str1_isL
4300         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4301         : "indexof_linear_uu";
4302     __ align(CodeEntryAlignment);
4303     StubCodeMark mark(this, "StubRoutines", stubName);
4304     address entry = __ pc();
4305 
4306     int str1_chr_size = str1_isL ? 1 : 2;
4307     int str2_chr_size = str2_isL ? 1 : 2;
4308     int str1_chr_shift = str1_isL ? 0 : 1;
4309     int str2_chr_shift = str2_isL ? 0 : 1;
4310     bool isL = str1_isL && str2_isL;
4311    // parameters
4312     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4313     // temporary registers
4314     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4315     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4316     // redefinitions
4317     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4318 
4319     __ push(spilled_regs, sp);
4320     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
4321         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4322         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4323         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4324         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4325         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
4326     // Read whole register from str1. It is safe, because length >=8 here
4327     __ ldr(ch1, Address(str1));
4328     // Read whole register from str2. It is safe, because length >=8 here
4329     __ ldr(ch2, Address(str2));
         // From here on cnt2 counts candidate start positions (length difference).
4330     __ sub(cnt2, cnt2, cnt1);
         // Isolate the first pattern character ...
4331     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4332     if (str1_isL != str2_isL) {
           // Mixed encodings: v0 is the zero source used to widen the Latin1
           // pattern bytes to 16 bits.
4333       __ eor(v0, __ T16B, v0, v0);
4334     }
         // ... and replicate it into every character slot of a 64-bit register.
4335     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4336     __ mul(first, first, tmp1);
4337     // check if we have less than 1 register to check
4338     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4339     if (str1_isL != str2_isL) {
4340       __ fmovd(v1, ch1);
4341     }
4342     __ br(__ LE, L_SMALL);
         // Zero-slot detection: x = word ^ first has a zero slot wherever the
         // first pattern character occurs; (x - 0x01..01) & ~(x | 0x7f..7f) leaves
         // a set high bit in such slots. bics also sets the flags for the branch.
4343     __ eor(ch2, first, ch2);
4344     if (str1_isL != str2_isL) {
4345       __ zip1(v1, __ T16B, v1, v0);
4346     }
4347     __ sub(tmp2, ch2, tmp1);
4348     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4349     __ bics(tmp2, tmp2, ch2);
4350     if (str1_isL != str2_isL) {
           // ch1 = first 4 pattern characters, widened to 16 bits.
4351       __ fmovd(ch1, v1);
4352     }
4353     __ br(__ NE, L_HAS_ZERO);
4354     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4355     __ add(result, result, wordSize/str2_chr_size);
4356     __ add(str2, str2, wordSize);
4357     __ br(__ LT, L_POST_LOOP);
         // Main loop: examine one register of str2 per iteration.
4358     __ BIND(L_LOOP);
4359       __ ldr(ch2, Address(str2));
4360       __ eor(ch2, first, ch2);
4361       __ sub(tmp2, ch2, tmp1);
4362       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4363       __ bics(tmp2, tmp2, ch2);
4364       __ br(__ NE, L_HAS_ZERO);
4365     __ BIND(L_LOOP_PROCEED);
4366       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4367       __ add(str2, str2, wordSize);
4368       __ add(result, result, wordSize/str2_chr_size);
4369       __ br(__ GE, L_LOOP);
4370     __ BIND(L_POST_LOOP);
4371       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
4372       __ br(__ LE, NOMATCH);
4373       __ ldr(ch2, Address(str2));
           // cnt2 becomes a bit count used to build the mask in L_SMALL_PROCEED.
4374       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4375       __ eor(ch2, first, ch2);
4376       __ sub(tmp2, ch2, tmp1);
4377       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4378       __ mov(tmp4, -1); // all bits set
4379       __ b(L_SMALL_PROCEED);
4380     __ align(OptoLoopAlignment);
         // L_SMALL: less than one full register of start positions to examine.
4381     __ BIND(L_SMALL);
4382       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4383       __ eor(ch2, first, ch2);
4384       if (str1_isL != str2_isL) {
4385         __ zip1(v1, __ T16B, v1, v0);
4386       }
4387       __ sub(tmp2, ch2, tmp1);
4388       __ mov(tmp4, -1); // all bits set
4389       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4390       if (str1_isL != str2_isL) {
4391         __ fmovd(ch1, v1); // move converted 4 symbols
4392       }
4393     __ BIND(L_SMALL_PROCEED);
4394       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
4395       __ bic(tmp2, tmp2, ch2);
4396       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
           // rbit does not change the flags, so the br(EQ) still tests the ands
           // result; the reversed value lets clz locate the first candidate.
4397       __ rbit(tmp2, tmp2);
4398       __ br(__ EQ, NOMATCH);
4399     __ BIND(L_SMALL_HAS_ZERO_LOOP);
4400       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
4401       __ cmp(cnt1, u1(wordSize/str2_chr_size));
4402       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4403       if (str2_isL) { // LL
4404         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4405         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4406         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4407         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4408         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4409       } else {
4410         __ mov(ch2, 0xE); // all bits in byte set except last one
4411         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4412         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4413         __ lslv(tmp2, tmp2, tmp4);
4414         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4415         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4416         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4417         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4418       }
           // Compare the first register of the pattern with the candidate; tmp4
           // becomes the character index at which the per-character loop starts.
4419       __ cmp(ch1, ch2);
4420       __ mov(tmp4, wordSize/str2_chr_size);
4421       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4422     __ BIND(L_SMALL_CMP_LOOP);
4423       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4424                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4425       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4426                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4427       __ add(tmp4, tmp4, 1);
4428       __ cmp(tmp4, cnt1);
4429       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4430       __ cmp(first, ch2);
4431       __ br(__ EQ, L_SMALL_CMP_LOOP);
4432     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4433       __ cbz(tmp2, NOMATCH); // no more matches. exit
4434       __ clz(tmp4, tmp2);
4435       __ add(result, result, 1); // advance index
4436       __ add(str2, str2, str2_chr_size); // advance pointer
4437       __ b(L_SMALL_HAS_ZERO_LOOP);
4438     __ align(OptoLoopAlignment);
4439     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4440       __ cmp(first, ch2);
4441       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4442       __ b(DONE);
4443     __ align(OptoLoopAlignment);
         // Pattern fits in one register: a single register compare decides.
4444     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4445       if (str2_isL) { // LL
4446         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4447         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4448         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4449         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4450         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4451       } else {
4452         __ mov(ch2, 0xE); // all bits in byte set except last one
4453         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4454         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4455         __ lslv(tmp2, tmp2, tmp4);
4456         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4457         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4458         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4459         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4460       }
4461       __ cmp(ch1, ch2);
4462       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4463       __ b(DONE);
4464     __ align(OptoLoopAlignment);
4465     __ BIND(L_HAS_ZERO);
4466       __ rbit(tmp2, tmp2);
4467       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
4468       // Now, perform compression of counters(cnt2 and cnt1) into one register.
4469       // It's fine because both counters are 32bit and are not changed in this
4470       // loop. Just restore it on exit. So, cnt1 can be re-used in this loop.
4471       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
4472       __ sub(result, result, 1);
4473     __ BIND(L_HAS_ZERO_LOOP);
4474       __ mov(cnt1, wordSize/str2_chr_size);
           // The upper half of cnt2 holds the original cnt1 (pattern length).
4475       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4476       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4477       if (str2_isL) {
4478         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4479         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4480         __ lslv(tmp2, tmp2, tmp4);
4481         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4482         __ add(tmp4, tmp4, 1);
4483         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4484         __ lsl(tmp2, tmp2, 1);
4485         __ mov(tmp4, wordSize/str2_chr_size);
4486       } else {
4487         __ mov(ch2, 0xE);
4488         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4489         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4490         __ lslv(tmp2, tmp2, tmp4);
4491         __ add(tmp4, tmp4, 1);
4492         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4493         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4494         __ lsl(tmp2, tmp2, 1);
4495         __ mov(tmp4, wordSize/str2_chr_size);
4496         __ sub(str2, str2, str2_chr_size);
4497       }
4498       __ cmp(ch1, ch2);
           // NOTE(review): both branches above already end with tmp4 set to this
           // same value, so this mov (or the two above) looks redundant.
4499       __ mov(tmp4, wordSize/str2_chr_size);
4500       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4501     __ BIND(L_CMP_LOOP);
           // cnt1 is free here (its value is packed into cnt2) and serves as the
           // pattern-character register.
4502       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4503                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4504       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4505                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4506       __ add(tmp4, tmp4, 1);
4507       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4508       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4509       __ cmp(cnt1, ch2);
4510       __ br(__ EQ, L_CMP_LOOP);
4511     __ BIND(L_CMP_LOOP_NOMATCH);
4512       // here we're not matched
4513       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4514       __ clz(tmp4, tmp2);
4515       __ add(str2, str2, str2_chr_size); // advance pointer
4516       __ b(L_HAS_ZERO_LOOP);
4517     __ align(OptoLoopAlignment);
4518     __ BIND(L_CMP_LOOP_LAST_CMP);
4519       __ cmp(cnt1, ch2);
4520       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4521       __ b(DONE);
4522     __ align(OptoLoopAlignment);
4523     __ BIND(L_CMP_LOOP_LAST_CMP2);
4524       if (str2_isL) {
4525         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4526         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4527         __ lslv(tmp2, tmp2, tmp4);
4528         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4529         __ add(tmp4, tmp4, 1);
4530         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4531         __ lsl(tmp2, tmp2, 1);
4532       } else {
4533         __ mov(ch2, 0xE);
4534         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4535         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4536         __ lslv(tmp2, tmp2, tmp4);
4537         __ add(tmp4, tmp4, 1);
4538         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4539         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4540         __ lsl(tmp2, tmp2, 1);
4541         __ sub(str2, str2, str2_chr_size);
4542       }
4543       __ cmp(ch1, ch2);
4544       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4545       __ b(DONE);
4546     __ align(OptoLoopAlignment);
4547     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
4548       // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until
4549       // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
4550       // so, result was increased at max by wordSize/str2_chr_size - 1, so,
4551       // respective high bit wasn't changed. L_LOOP_PROCEED will increase
4552       // result by analyzed characters value, so, we can just reset lower bits
4553       // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
4554       // 2) restore cnt1 and cnt2 values from "compressed" cnt2
4555       // 3) advance str2 value to represent next str2 octet. result & 7/3 is
4556       // index of last analyzed substring inside current octet. So, str2 in at
4557       // respective start address. We need to advance it to next octet
4558       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4559       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4560       __ bfm(result, zr, 0, 2 - str2_chr_shift);
4561       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4562       __ movw(cnt2, cnt2);
4563       __ b(L_LOOP_PROCEED);
4564     __ align(OptoLoopAlignment);
4565     __ BIND(NOMATCH);
4566       __ mov(result, -1);
4567     __ BIND(DONE);
4568       __ pop(spilled_regs, sp);
4569       __ ret(lr);
4570     return entry;
4571   }
4572 
4573   void generate_string_indexof_stubs() {
         // Emit the linear indexOf stubs for the three supported encoding pairs,
         // in the order ll, uu, ul, and record each entry point.
4574     StubRoutines::aarch64::_string_indexof_linear_ll =
             generate_string_indexof_linear(/* str1_isL */ true, /* str2_isL */ true);
4575     StubRoutines::aarch64::_string_indexof_linear_uu =
             generate_string_indexof_linear(/* str1_isL */ false, /* str2_isL */ false);
4576     StubRoutines::aarch64::_string_indexof_linear_ul =
             generate_string_indexof_linear(/* str1_isL */ true, /* str2_isL */ false);
4577   }
4578 
4579   void inflate_and_store_2_fp_registers(bool generatePrfm, FloatRegister src1, FloatRegister src2) {
         // Widens the 2 x 16 bytes held in src1 and src2 to 16-bit elements by
         // interleaving them with the bytes of v0, then stores the resulting 64
         // bytes through r1, post-incrementing it by 64. v1-v4 are overwritten.
         // When generatePrfm is set, a PSTL1STRM prefetch for
         // r1 + SoftwarePrefetchHintDistance is emitted between the two halves.
4580     const Register out = r1;
4581     const FloatRegister zero = v0;  // callers keep v0 zeroed
4582     __ zip1(v1, __ T16B, src1, zero);
4583     __ zip2(v2, __ T16B, src1, zero);
4584     if (generatePrfm) {
4585       __ prfm(Address(out, SoftwarePrefetchHintDistance), PSTL1STRM);
4586     }
4587     __ zip1(v3, __ T16B, src2, zero);
4588     __ zip2(v4, __ T16B, src2, zero);
4589     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(out, 64)));
4590   }
4591 
4592   // R0 = src
4593   // R1 = dst
4594   // R2 = len
4595   // R3 = len >> 3
4596   // V0 = 0
4597   // v1 = loaded 8 bytes
       //
       // Generates the stub that inflates a byte array to 16-bit elements by
       // interleaving every source byte with a zero byte taken from v0.
       // octetCounter (r3) counts the 8-byte groups still to be processed and is
       // the only counter the loops use; len (r2) is declared for documentation
       // but not referenced by the emitted code. Each loop iteration loads 64
       // source bytes and stores 128 destination bytes.
       // NOTE(review): the stub unconditionally loads 64 source bytes after the
       // first 16, and it stops once fewer than 8 octets remain - presumably the
       // caller guarantees a minimum length and handles the remainder; confirm
       // against the call site.
       // Returns the stub entry address.
4598   address generate_large_byte_array_inflate() {
4599     __ align(CodeEntryAlignment);
4600     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4601     address entry = __ pc();
4602     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4603     Register src = r0, dst = r1, len = r2, octetCounter = r3;
         // Octet count above which the prefetching variant of the loop is used.
4604     const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
4605 
4606     // do one more 8-byte read to have address 16-byte aligned in most cases
4607     // also use single store instruction
4608     __ ldrd(v2, __ post(src, 8));
         // Two octets are handled here: the preloaded one in v1 and the one in v2.
4609     __ sub(octetCounter, octetCounter, 2);
4610     __ zip1(v1, __ T16B, v1, v0);
4611     __ zip1(v2, __ T16B, v2, v0);
4612     __ st1(v1, v2, __ T16B, __ post(dst, 32));
         // Load the first 64-byte batch before choosing which loop to enter; both
         // loops are entered after their own load instruction.
4613     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4614     __ subs(rscratch1, octetCounter, large_loop_threshold);
4615     __ br(__ LE, LOOP_START);
4616     __ b(LOOP_PRFM_START);
4617     __ bind(LOOP_PRFM);
4618       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4619     __ bind(LOOP_PRFM_START);
4620       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4621       __ sub(octetCounter, octetCounter, 8);
           // The flags set here are consumed by the br(GT) after the two helper
           // calls below.
4622       __ subs(rscratch1, octetCounter, large_loop_threshold);
4623       inflate_and_store_2_fp_registers(true, v3, v4);
4624       inflate_and_store_2_fp_registers(true, v5, v6);
4625       __ br(__ GT, LOOP_PRFM);
4626       __ cmp(octetCounter, (u1)8);
4627       __ br(__ LT, DONE);
4628     __ bind(LOOP);
4629       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4630       __ bind(LOOP_START);
4631       __ sub(octetCounter, octetCounter, 8);
           // As above: the comparison result is used by the br(GE) after the
           // helper calls.
4632       __ cmp(octetCounter, (u1)8);
4633       inflate_and_store_2_fp_registers(false, v3, v4);
4634       inflate_and_store_2_fp_registers(false, v5, v6);
4635       __ br(__ GE, LOOP);
4636     __ bind(DONE);
4637       __ ret(lr);
4638     return entry;
4639   }
4640 
4641   /**
4642    *  Arguments:
4643    *
4644    *  Input:
4645    *  c_rarg0   - current state address
4646    *  c_rarg1   - H key address
4647    *  c_rarg2   - data address
4648    *  c_rarg3   - number of blocks
4649    *
4650    *  Output:
4651    *  Updated state at c_rarg0
         *
         *  Each block is 16 bytes; the data pointer is advanced by 16 per block.
         *  The block loop is bottom-tested, so at least one block is always
         *  processed.
         *  NOTE(review): this relies on callers passing a block count >= 1 -
         *  confirm against the call site.
         *
         *  Returns the address of the first instruction of the stub (the
         *  constant emitted in front of it is not part of the entry point).
4652    */
4653   address generate_ghash_processBlocks() {
4654     // Bafflingly, GCM uses little-endian for the byte order, but
4655     // big-endian for the bit order.  For example, the polynomial 1 is
4656     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4657     //
4658     // So, we must either reverse the bytes in each word and do
4659     // everything big-endian or reverse the bits in each byte and do
4660     // it little-endian.  On AArch64 it's more idiomatic to reverse
4661     // the bits in each byte (we have an instruction, RBIT, to do
4662     // that) and keep the data in little-endian bit order throughout the
4663     // calculation, bit-reversing the inputs and outputs.
4664 
4665     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
         // Emit a 16-byte aligned, 128-bit constant in front of the code.
4666     __ align(wordSize * 2);
4667     address p = __ pc();
4668     __ emit_int64(0x87);  // The low-order bits of the field
4669                           // polynomial (i.e. p = z^7+z^2+z+1)
4670                           // repeated in the low and high parts of a
4671                           // 128-bit vector
4672     __ emit_int64(0x87);
4673 
4674     __ align(CodeEntryAlignment);
4675     address start = __ pc();
4676 
4677     Register state   = c_rarg0;
4678     Register subkeyH = c_rarg1;
4679     Register data    = c_rarg2;
4680     Register blocks  = c_rarg3;
4681 
4682     FloatRegister vzr = v30;
4683     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4684 
4685     __ ldrq(v0, Address(state));
4686     __ ldrq(v1, Address(subkeyH));
4687 
4688     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4689     __ rbit(v0, __ T16B, v0);
4690     __ rev64(v1, __ T16B, v1);
4691     __ rbit(v1, __ T16B, v1);
4692 
         // v26 = the field polynomial constant emitted above.
4693     __ ldrq(v26, p);
4694 
4695     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v16
4696     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4697 
4698     {
4699       Label L_ghash_loop;
4700       __ bind(L_ghash_loop);
4701 
4702       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4703                                                  // reversing each byte
4704       __ rbit(v2, __ T16B, v2);
4705       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4706 
4707       // Multiply state in v2 by subkey in v1
4708       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4709                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4710                      /*temps*/v6, v20, v18, v21);
4711       // Reduce v7:v5 by the field polynomial
           // (the result in v0 is the state for the next iteration)
4712       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4713 
4714       __ sub(blocks, blocks, 1);
4715       __ cbnz(blocks, L_ghash_loop);
4716     }
4717 
4718     // The bit-reversed result is at this point in v0
4719     __ rev64(v1, __ T16B, v0);
4720     __ rbit(v1, __ T16B, v1);
4721 
4722     __ st1(v1, __ T16B, state);
4723     __ ret(lr);
4724 
4725     return start;
4726   }
4727 
4728   // Continuation point for throwing of implicit exceptions that are
4729   // not handled in the current activation. Fabricates an exception
4730   // oop and initiates normal exception dispatching in this
4731   // frame. Since we need to preserve callee-saved values (currently
4732   // only for C2, but done for C1 as well) we need a callee-saved oop
4733   // map and therefore have to make these stubs into RuntimeStubs
4734   // rather than BufferBlobs.  If the compiler needs all registers to
4735   // be preserved between the fault point and the exception handler
4736   // then it must assume responsibility for that in
4737   // AbstractCompiler::continuation_for_implicit_null_exception or
4738   // continuation_for_implicit_division_by_zero_exception. All other
4739   // implicit exceptions (e.g., NullPointerException or
4740   // AbstractMethodError on entry) are either at call sites or
4741   // otherwise assume that stack unwinding will be initiated, so
4742   // caller saved registers were assumed volatile in the compiler.
4743 
4744 #undef __
4745 #define __ masm->
4746 
4747   address generate_throw_exception(const char* name,
4748                                    address runtime_entry,
4749                                    Register arg1 = noreg,
4750                                    Register arg2 = noreg) {
         // Builds a RuntimeStub called `name` that calls
         // runtime_entry(rthread [, arg1 [, arg2]]) and then jumps to
         // StubRoutines::forward_exception_entry().  arg1/arg2 are only
         // passed when they are not noreg.  Returns the stub's entry point.
         //
4751     // Information about frame layout at time of blocking runtime call.
4752     // Note that we only have to preserve callee-saved registers since
4753     // the compilers are responsible for supplying a continuation point
4754     // if they expect all registers to be preserved.
4755     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4756     enum layout {
4757       rfp_off = 0,
4758       rfp_off2,
4759       return_off,
4760       return_off2,
4761       framesize // inclusive of return address
4762     };
4763 
         // Fixed sizes for the code buffer backing this stub.
4764     int insts_size = 512;
4765     int locs_size  = 64;
4766 
4767     CodeBuffer code(name, insts_size, locs_size);
4768     OopMapSet* oop_maps  = new OopMapSet();
4769     MacroAssembler* masm = new MacroAssembler(&code);
4770 
4771     address start = __ pc();
4772 
4773     // This is an inlined and slightly modified version of call_VM
4774     // which has the ability to fetch the return PC out of
4775     // thread-local storage and also sets up last_Java_sp slightly
4776     // differently than the real call_VM
4777 
4778     __ enter(); // Save FP and LR before call
4779 
4780     assert(is_even(framesize/2), "sp not 16-byte aligned");
4781 
4782     // lr and fp are already in place
         // With the layout above framesize is 4, so the immediate below is 0
         // and this simply sets sp = rfp.
4783     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4784 
4785     int frame_complete = __ pc() - start;
4786 
4787     // Set up last_Java_sp and last_Java_fp
         // the_pc is recorded here and reused below as the offset at which
         // the oop map is registered.
4788     address the_pc = __ pc();
4789     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
4790 
4791     // Call runtime
         // arg1 is moved first; the assert guards against arg2 living in
         // c_rarg1, which that move would overwrite.
4792     if (arg1 != noreg) {
4793       assert(arg2 != c_rarg1, "clobbered");
4794       __ mov(c_rarg1, arg1);
4795     }
4796     if (arg2 != noreg) {
4797       __ mov(c_rarg2, arg2);
4798     }
4799     __ mov(c_rarg0, rthread);
4800     BLOCK_COMMENT("call runtime_entry");
4801     __ mov(rscratch1, runtime_entry);
4802     __ blr(rscratch1);
4803 
4804     // Generate oop map
4805     OopMap* map = new OopMap(framesize, 0);
4806 
4807     oop_maps->add_gc_map(the_pc - start, map);
4808 
4809     __ reset_last_Java_frame(true);
4810     __ maybe_isb();
4811 
4812     __ leave();
4813 
4814     // check for pending exceptions
         // Debug builds only: the runtime entry is expected to have left a
         // pending exception in the thread; stop if it did not.
4815 #ifdef ASSERT
4816     Label L;
4817     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4818     __ cbnz(rscratch1, L);
4819     __ should_not_reach_here();
4820     __ bind(L);
4821 #endif // ASSERT
4822     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4823 
4824 
4825     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4826     RuntimeStub* stub =
4827       RuntimeStub::new_runtime_stub(name,
4828                                     &code,
4829                                     frame_complete,
4830                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4831                                     oop_maps, false);
4832     return stub->entry_point();
4833   }
4834 
4835   class MontgomeryMultiplyGenerator : public MacroAssembler {
4836 
4837     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4838       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4839 
4840     RegSet _toSave;
4841     bool _squaring;
4842 
4843   public:
4844     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4845       : MacroAssembler(as->code()), _squaring(squaring) {
4846 
           // Emits into the same code buffer as `as`.  All registers used by
           // the generated stub are fixed here, once, by walking upwards from
           // c_rarg0 with ++reg; the order of the assignments below therefore
           // determines the stub's register assignment.
4847       // Register allocation
4848 
4849       Register reg = c_rarg0;
4850       Pa_base = reg;       // Argument registers
           // When squaring there is no separate b argument: Pb_base aliases
           // Pa_base and every later argument moves down by one register.
4851       if (squaring)
4852         Pb_base = Pa_base;
4853       else
4854         Pb_base = ++reg;
4855       Pn_base = ++reg;
4856       Rlen= ++reg;
4857       inv = ++reg;
4858       Pm_base = ++reg;
4859 
4860                           // Working registers:
4861       Ra =  ++reg;        // The current digit of a, b, n, and m.
4862       Rb =  ++reg;
4863       Rm =  ++reg;
4864       Rn =  ++reg;
4865 
4866       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4867       Pb =  ++reg;
4868       Pm =  ++reg;
4869       Pn =  ++reg;
4870 
4871       t0 =  ++reg;        // Three registers which form a
4872       t1 =  ++reg;        // triple-precision accumulator.
4873       t2 =  ++reg;
4874 
4875       Ri =  ++reg;        // Outer (Ri) and inner (Rj) loop indexes.
4876       Rj =  ++reg;
4877 
4878       Rhi_ab = ++reg;     // Product registers: low and high parts
4879       Rlo_ab = ++reg;     // of a*b and m*n.
4880       Rhi_mn = ++reg;
4881       Rlo_mn = ++reg;
4882 
4883       // r19 and up are callee-saved.
           // reg is now the last register handed out (Rlo_mn).  Pm_base is
           // added to the set because the stub overwrites it while it works.
4884       _toSave = RegSet::range(r19, reg) + Pm_base;
4885     }
4886 
4887   private:
4888     void save_regs() {
           // Push the registers in _toSave (set up by the constructor) on the stack.
4889       push(_toSave, sp);
4890     }
4891 
4892     void restore_regs() {
           // Pop the registers pushed by save_regs().
4893       pop(_toSave, sp);
4894     }
4895 
4896     template <typename T>
4897     void unroll_2(Register count, T block) {
4898       Label loop, end, odd;
4899       tbnz(count, 0, odd);
4900       cbz(count, end);
4901       align(16);
4902       bind(loop);
4903       (this->*block)();
4904       bind(odd);
4905       (this->*block)();
4906       subs(count, count, 2);
4907       br(Assembler::GT, loop);
4908       bind(end);
4909     }
4910 
4911     template <typename T>
4912     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4913       Label loop, end, odd;
4914       tbnz(count, 0, odd);
4915       cbz(count, end);
4916       align(16);
4917       bind(loop);
4918       (this->*block)(d, s, tmp);
4919       bind(odd);
4920       (this->*block)(d, s, tmp);
4921       subs(count, count, 2);
4922       br(Assembler::GT, loop);
4923       bind(end);
4924     }
4925 
4926     void pre1(RegisterOrConstant i) {
           // Prologue of one outer iteration of the first (i < len) loop:
           // loads the first digits, points Pa/Pm at the start of a/m and
           // Pb/Pn at element i of b/n, and clears the pending m*n product.
           // i is used as a word index (scaled by LogBytesPerWord).
4927       block_comment("pre1");
4928       // Pa = Pa_base;
4929       // Pb = Pb_base + i;
4930       // Pm = Pm_base;
4931       // Pn = Pn_base + i;
4932       // Ra = *Pa;
4933       // Rb = *Pb;
4934       // Rm = *Pm;
4935       // Rn = *Pn;
4936       ldr(Ra, Address(Pa_base));
4937       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4938       ldr(Rm, Address(Pm_base));
4939       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4940       lea(Pa, Address(Pa_base));
4941       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4942       lea(Pm, Address(Pm_base));
4943       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4944 
4945       // Zero the m*n result.
           // step()/post1() always accumulate the "pending" m*n product first,
           // so it has to start out as zero.
4946       mov(Rhi_mn, zr);
4947       mov(Rlo_mn, zr);
4948     }
4949 
4950     // The core multiply-accumulate step of a Montgomery
4951     // multiplication.  The idea is to schedule operations as a
4952     // pipeline so that instructions with long latencies (loads and
4953     // multiplies) have time to complete before their results are
4954     // used.  This most benefits in-order implementations of the
4955     // architecture but out-of-order ones also benefit.
         //
         // On exit the a*b product has been added to t2:t1:t0, while the m*n
         // product is left in Rhi_mn:Rlo_mn to be accumulated by whichever
         // step follows.
4956     void step() {
4957       block_comment("step");
4958       // MACC(Ra, Rb, t0, t1, t2);
4959       // Ra = *++Pa;
4960       // Rb = *--Pb;
4961       umulh(Rhi_ab, Ra, Rb);
4962       mul(Rlo_ab, Ra, Rb);
4963       ldr(Ra, pre(Pa, wordSize));
4964       ldr(Rb, pre(Pb, -wordSize));
4965       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
4966                                        // previous iteration.
4967       // MACC(Rm, Rn, t0, t1, t2);
4968       // Rm = *++Pm;
4969       // Rn = *--Pn;
4970       umulh(Rhi_mn, Rm, Rn);
4971       mul(Rlo_mn, Rm, Rn);
4972       ldr(Rm, pre(Pm, wordSize));
4973       ldr(Rn, pre(Pn, -wordSize));
           // The a*b product computed at the top is only consumed here, after
           // the m*n multiply and loads have been issued.
4974       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4975     }
4976 
4977     void post1() {
           // Epilogue of one outer iteration of the first loop: accumulates
           // the last a*b product and the pending m*n product, computes and
           // stores the new digit of m, adds m*n for that digit and shifts the
           // accumulator down by one digit.
4978       block_comment("post1");
4979 
4980       // MACC(Ra, Rb, t0, t1, t2);
           // (Pa and Pb are not advanced here; nothing is loaded.)
4983       umulh(Rhi_ab, Ra, Rb);
4984       mul(Rlo_ab, Ra, Rb);
4985       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4986       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4987 
4988       // *Pm = Rm = t0 * inv;
4989       mul(Rm, t0, inv);
4990       str(Rm, Address(Pm));
4991 
4992       // MACC(Rm, Rn, t0, t1, t2);
4993       // t0 = t1; t1 = t2; t2 = 0;
4994       umulh(Rhi_mn, Rm, Rn);
4995 
4996 #ifndef PRODUCT
4997       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4998       {
4999         mul(Rlo_mn, Rm, Rn);
5000         add(Rlo_mn, t0, Rlo_mn);
5001         Label ok;
5002         cbz(Rlo_mn, ok); {
5003           stop("broken Montgomery multiply");
5004         } bind(ok);
5005       }
5006 #endif
5007       // We have very carefully set things up so that
5008       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5009       // the lower half of Rm * Rn because we know the result already:
5010       // it must be -t0.  t0 + (-t0) must generate a carry iff
5011       // t0 != 0.  So, rather than do a mul and an adds we just set
5012       // the carry flag iff t0 is nonzero.
5013       //
5014       // mul(Rlo_mn, Rm, Rn);
5015       // adds(zr, t0, Rlo_mn);
5016       subs(zr, t0, 1); // Set carry iff t0 is nonzero
           // Add the high half of m*n with that carry while moving the
           // accumulator down one digit.
5017       adcs(t0, t1, Rhi_mn);
5018       adc(t1, t2, zr);
5019       mov(t2, zr);
5020     }
5021 
5022     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
           // Prologue of one outer iteration of the second (i >= len) loop:
           // positions the four digit pointers, pre-loads the first digits and
           // clears the pending m*n product.  Uses Rj as scratch for i-len.
5023       block_comment("pre2");
5024       // Pa = Pa_base + i-len;
5025       // Pb = Pb_base + len;
5026       // Pm = Pm_base + i-len;
5027       // Pn = Pn_base + len;
5028 
5029       if (i.is_register()) {
5030         sub(Rj, i.as_register(), len);
5031       } else {
5032         mov(Rj, i.as_constant());
5033         sub(Rj, Rj, len);
5034       }
5035       // Rj == i-len
5036 
5037       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5038       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5039       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5040       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5041 
5042       // Ra = *++Pa;
5043       // Rb = *--Pb;
5044       // Rm = *++Pm;
5045       // Rn = *--Pn;
           // Pre-indexed loads: each pointer is moved by one word first, so
           // the digits read are a[i-len+1], b[len-1], m[i-len+1], n[len-1].
5046       ldr(Ra, pre(Pa, wordSize));
5047       ldr(Rb, pre(Pb, -wordSize));
5048       ldr(Rm, pre(Pm, wordSize));
5049       ldr(Rn, pre(Pn, -wordSize));
5050 
           // No m*n product is pending at the start of an outer iteration.
5051       mov(Rhi_mn, zr);
5052       mov(Rlo_mn, zr);
5053     }
5054 
5055     void post2(RegisterOrConstant i, RegisterOrConstant len) {
           // Epilogue of one outer iteration of the second loop: folds in the
           // pending m*n product, stores the finished digit to Pm_base[i-len]
           // and shifts the accumulator down by one digit.
           // Note: when i is a constant, len is read with as_constant() too,
           // so both must be constants on that path.
5056       block_comment("post2");
5057       if (i.is_constant()) {
5058         mov(Rj, i.as_constant()-len.as_constant());
5059       } else {
5060         sub(Rj, i.as_register(), len);
5061       }
5062 
5063       adds(t0, t0, Rlo_mn); // The pending m*n, low part
5064 
5065       // As soon as we know the least significant digit of our result,
5066       // store it.
5067       // Pm_base[i-len] = t0;
           // The store sits between adds and adcs; the carry produced by adds
           // is still needed by the adcs below.
5068       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5069 
5070       // t0 = t1; t1 = t2; t2 = 0;
5071       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5072       adc(t1, t2, zr);
5073       mov(t2, zr);
5074     }
5075 
5076     // A carry in t0 after Montgomery multiplication means that we
5077     // should subtract multiples of n from our result in m.  We'll
5078     // keep doing that until there is no carry.
         //
         // len is the number of 64-bit digits.  Clobbers t1 and t2 (reused as
         // counter and index), Rm and Rn; t0 is decremented by the borrow of
         // each subtraction pass until it reaches zero.
5079     void normalize(RegisterOrConstant len) {
5080       block_comment("normalize");
5081       // while (t0)
5082       //   t0 = sub(Pm_base, Pn_base, t0, len);
5083       Label loop, post, again;
5084       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5085       cbz(t0, post); {
5086         bind(again); {
5087           mov(i, zr);
5088           mov(cnt, len);
5089           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5090           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5091           subs(zr, zr, zr); // set carry flag, i.e. no borrow
5092           align(16);
5093           bind(loop); {
                 // The borrow lives in the carry flag across iterations, so
                 // the index/counter updates use the non-flag-setting add/sub
                 // and the loop closes with cbnz.
5094             sbcs(Rm, Rm, Rn);
5095             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5096             add(i, i, 1);
                 // Digits for the next iteration.  On the final iteration
                 // this reads index len of both arrays; the values loaded
                 // are not used.
5097             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5098             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5099             sub(cnt, cnt, 1);
5100           } cbnz(cnt, loop);
               // Propagate the final borrow into t0.
5101           sbc(t0, t0, zr);
5102         } cbnz(t0, again);
5103       } bind(post);
5104     }
5105 
5106     // Move memory at s to d, reversing words.
5107     //    Increments d to end of copied memory
5108     //    Destroys tmp1, tmp2
5109     //    Preserves len
5110     //    Leaves s pointing to the address which was in d at start
         //
         // len counts 64-bit words.  The source is read from its end towards
         // its start (see reverse1) while d is written upwards.
5111     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
           // The temporaries must be below r19, i.e. outside the callee-saved
           // range noted in the constructor.
5112       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5113 
           // s = one past the last source word.
5114       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
           // unroll_2 consumes its counter, so count in a copy of len.
5115       mov(tmp1, len);
5116       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
           // d now points past the copy; step back len words to its start.
5117       sub(s, d, len, ext::uxtw, LogBytesPerWord);
5118     }
5119     // where
5120     void reverse1(Register d, Register s, Register tmp) {
           // One copy step for reverse(): load the word just below s
           // (pre-decrementing s), swap its two 32-bit halves with a rotate
           // by 32, and store it at d (post-incrementing d).
5121       ldr(tmp, pre(s, -wordSize));
5122       ror(tmp, tmp, 32);
5123       str(tmp, post(d, wordSize));
5124     }
5125 
5126     void step_squaring() {
           // step() already adds the a*b product once; adding it a second
           // time here counts it twice.
5127       // An extra ACC
5128       step();
5129       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5130     }
5131 
5132     void last_squaring(RegisterOrConstant i) {
5133       Label dont;
5134       // if ((i & 1) == 0) {
5135       tbnz(i.as_register(), 0, dont); {
5136         // MACC(Ra, Rb, t0, t1, t2);
5137         // Ra = *++Pa;
5138         // Rb = *--Pb;
5139         umulh(Rhi_ab, Ra, Rb);
5140         mul(Rlo_ab, Ra, Rb);
5141         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5142       } bind(dont);
5143     }
5144 
5145     void extra_step_squaring() {
           // The m*n half of step() on its own: accumulate the pending m*n
           // product, start the next one and advance Pm/Pn.  The a digits are
           // not touched.
5146       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5147 
5148       // MACC(Rm, Rn, t0, t1, t2);
5149       // Rm = *++Pm;
5150       // Rn = *--Pn;
5151       umulh(Rhi_mn, Rm, Rn);
5152       mul(Rlo_mn, Rm, Rn);
5153       ldr(Rm, pre(Pm, wordSize));
5154       ldr(Rn, pre(Pn, -wordSize));
5155     }
5156 
5157     void post1_squaring() {
           // Squaring counterpart of post1(): the a*a products have already
           // been accumulated by the caller, so only the pending m*n product
           // is folded in before the new digit of m is computed and stored.
5158       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5159 
5160       // *Pm = Rm = t0 * inv;
5161       mul(Rm, t0, inv);
5162       str(Rm, Address(Pm));
5163 
5164       // MACC(Rm, Rn, t0, t1, t2);
5165       // t0 = t1; t1 = t2; t2 = 0;
5166       umulh(Rhi_mn, Rm, Rn);
5167 
5168 #ifndef PRODUCT
5169       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5170       {
5171         mul(Rlo_mn, Rm, Rn);
5172         add(Rlo_mn, t0, Rlo_mn);
5173         Label ok;
5174         cbz(Rlo_mn, ok); {
5175           stop("broken Montgomery multiply");
5176         } bind(ok);
5177       }
5178 #endif
5179       // We have very carefully set things up so that
5180       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5181       // the lower half of Rm * Rn because we know the result already:
5182       // it must be -t0.  t0 + (-t0) must generate a carry iff
5183       // t0 != 0.  So, rather than do a mul and an adds we just set
5184       // the carry flag iff t0 is nonzero.
5185       //
5186       // mul(Rlo_mn, Rm, Rn);
5187       // adds(zr, t0, Rlo_mn);
5188       subs(zr, t0, 1); // Set carry iff t0 is nonzero
           // Add the high half of m*n with that carry while moving the
           // accumulator down one digit.
5189       adcs(t0, t1, Rhi_mn);
5190       adc(t1, t2, zr);
5191       mov(t2, zr);
5192     }
5193 
5194     void acc(Register Rhi, Register Rlo,
5195              Register t0, Register t1, Register t2) {
           // Adds the two-word value Rhi:Rlo into the three-word accumulator
           // t2:t1:t0, rippling the carry upwards.  The t0/t1/t2 parameters
           // shadow the members of the same name.
5196       adds(t0, t0, Rlo);
5197       adcs(t1, t1, Rhi);
5198       adc(t2, t2, zr);
5199     }
5200 
5201   public:
5202     /**
5203      * Fast Montgomery multiplication.  The derivation of the
5204      * algorithm is in A Cryptographic Library for the Motorola
5205      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5206      *
5207      * Arguments:
5208      *
5209      * Inputs for multiplication:
5210      *   c_rarg0   - int array elements a
5211      *   c_rarg1   - int array elements b
5212      *   c_rarg2   - int array elements n (the modulus)
5213      *   c_rarg3   - int length
5214      *   c_rarg4   - int inv
5215      *   c_rarg5   - int array elements m (the result)
5216      *
5217      * Inputs for squaring:
5218      *   c_rarg0   - int array elements a
5219      *   c_rarg1   - int array elements n (the modulus)
5220      *   c_rarg2   - int length
5221      *   c_rarg3   - int inv
5222      *   c_rarg4   - int array elements m (the result)
5223      *
          * Which of the two layouts applies follows from the `squaring`
          * flag given to the constructor.  Returns the entry address of the
          * emitted stub.
          *
          * NOTE(review): inv is multiplied with 64-bit digits below, i.e. it
          * is used as a 64-bit quantity although it is listed as "int" here
          * - confirm against the caller.
          *
5224      */
5225     address generate_multiply() {
5226       Label argh, nothing;
           // The failure path is emitted ahead of the aligned entry point and
           // is only reached by the length check below.
5227       bind(argh);
5228       stop("MontgomeryMultiply total_allocation must be <= 8192");
5229 
5230       align(CodeEntryAlignment);
5231       address entry = pc();
5232 
           // Zero length: return straight away.  This happens before enter(),
           // which is why `nothing` is bound after leave().
5233       cbzw(Rlen, nothing);
5234 
5235       enter();
5236 
5237       // Make room.
           // At most 512 ints per array.  The scratch area is
           // Rlen * 4 * sizeof(jint) bytes, carved out below sp, and sp is
           // then rounded down to a multiple of 2 * wordSize.
5238       cmpw(Rlen, 512);
5239       br(Assembler::HI, argh);
5240       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5241       andr(sp, Ra, -2 * wordSize);
5242 
5243       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5244 
5245       {
5246         // Copy input args, reversing as we go.  We use Ra as a
5247         // temporary variable.
             // Each reverse() advances Ra past the copy it wrote and leaves
             // the source pointer register pointing at that copy.
5248         reverse(Ra, Pa_base, Rlen, t0, t1);
5249         if (!_squaring)
5250           reverse(Ra, Pb_base, Rlen, t0, t1);
5251         reverse(Ra, Pn_base, Rlen, t0, t1);
5252       }
5253 
5254       // Push all callee-saved registers and also Pm_base which we'll need
5255       // at the end.
5256       save_regs();
5257 
5258 #ifndef PRODUCT
5259       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
5260       {
5261         ldr(Rn, Address(Pn_base, 0));
5262         mul(Rlo_mn, Rn, inv);
5263         subs(zr, Rlo_mn, -1);
5264         Label ok;
5265         br(EQ, ok); {
5266           stop("broken inverse in Montgomery multiply");
5267         } bind(ok);
5268       }
5269 #endif
5270 
           // Ra points just past the copied inputs; the result m is built
           // there.  The caller's Pm_base was saved by save_regs().
5271       mov(Pm_base, Ra);
5272 
           // Clear the triple-precision accumulator.
5273       mov(t0, zr);
5274       mov(t1, zr);
5275       mov(t2, zr);
5276 
5277       block_comment("for (int i = 0; i < len; i++) {");
5278       mov(Ri, zr); {
5279         Label loop, end;
             // The bounds check is done once up front and again at the
             // bottom of the loop.
5280         cmpw(Ri, Rlen);
5281         br(Assembler::GE, end);
5282 
5283         bind(loop);
5284         pre1(Ri);
5285 
5286         block_comment("  for (j = i; j; j--) {"); {
5287           movw(Rj, Ri);
5288           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5289         } block_comment("  } // j");
5290 
5291         post1();
5292         addw(Ri, Ri, 1);
5293         cmpw(Ri, Rlen);
5294         br(Assembler::LT, loop);
5295         bind(end);
5296         block_comment("} // i");
5297       }
5298 
5299       block_comment("for (int i = len; i < 2*len; i++) {");
5300       mov(Ri, Rlen); {
5301         Label loop, end;
             // Compares Ri against Rlen << 1, i.e. 2*len.
5302         cmpw(Ri, Rlen, Assembler::LSL, 1);
5303         br(Assembler::GE, end);
5304 
5305         bind(loop);
5306         pre2(Ri, Rlen);
5307 
5308         block_comment("  for (j = len*2-i-1; j; j--) {"); {
5309           lslw(Rj, Rlen, 1);
5310           subw(Rj, Rj, Ri);
5311           subw(Rj, Rj, 1);
5312           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5313         } block_comment("  } // j");
5314 
5315         post2(Ri, Rlen);
5316         addw(Ri, Ri, 1);
5317         cmpw(Ri, Rlen, Assembler::LSL, 1);
5318         br(Assembler::LT, loop);
5319         bind(end);
5320       }
5321       block_comment("} // i");
5322 
5323       normalize(Rlen);
5324 
5325       mov(Ra, Pm_base);  // Save Pm_base in Ra
5326       restore_regs();  // Restore caller's Pm_base
5327 
5328       // Copy our result into caller's Pm_base
5329       reverse(Pm_base, Ra, Rlen, t0, t1);
5330 
5331       leave();
5332       bind(nothing);
5333       ret(lr);
5334 
5335       return entry;
5336     }
5337     // In C, approximately:
5338 
5339     // void
5340     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
5341     //                     unsigned long Pn_base[], unsigned long Pm_base[],
5342     //                     unsigned long inv, int len) {
5343     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5344     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5345     //   unsigned long Ra, Rb, Rn, Rm;
5346 
5347     //   int i;
5348 
5349     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5350 
5351     //   for (i = 0; i < len; i++) {
5352     //     int j;
5353 
5354     //     Pa = Pa_base;
5355     //     Pb = Pb_base + i;
5356     //     Pm = Pm_base;
5357     //     Pn = Pn_base + i;
5358 
5359     //     Ra = *Pa;
5360     //     Rb = *Pb;
5361     //     Rm = *Pm;
5362     //     Rn = *Pn;
5363 
5364     //     int iters = i;
5365     //     for (j = 0; iters--; j++) {
5366     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5367     //       MACC(Ra, Rb, t0, t1, t2);
5368     //       Ra = *++Pa;
5369     //       Rb = *--Pb;
5370     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5371     //       MACC(Rm, Rn, t0, t1, t2);
5372     //       Rm = *++Pm;
5373     //       Rn = *--Pn;
5374     //     }
5375 
5376     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
5377     //     MACC(Ra, Rb, t0, t1, t2);
5378     //     *Pm = Rm = t0 * inv;
5379     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5380     //     MACC(Rm, Rn, t0, t1, t2);
5381 
5382     //     assert(t0 == 0, "broken Montgomery multiply");
5383 
5384     //     t0 = t1; t1 = t2; t2 = 0;
5385     //   }
5386 
5387     //   for (i = len; i < 2*len; i++) {
5388     //     int j;
5389 
5390     //     Pa = Pa_base + i-len;
5391     //     Pb = Pb_base + len;
5392     //     Pm = Pm_base + i-len;
5393     //     Pn = Pn_base + len;
5394 
5395     //     Ra = *++Pa;
5396     //     Rb = *--Pb;
5397     //     Rm = *++Pm;
5398     //     Rn = *--Pn;
5399 
5400     //     int iters = len*2-i-1;
5401     //     for (j = i-len+1; iters--; j++) {
5402     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5403     //       MACC(Ra, Rb, t0, t1, t2);
5404     //       Ra = *++Pa;
5405     //       Rb = *--Pb;
5406     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5407     //       MACC(Rm, Rn, t0, t1, t2);
5408     //       Rm = *++Pm;
5409     //       Rn = *--Pn;
5410     //     }
5411 
5412     //     Pm_base[i-len] = t0;
5413     //     t0 = t1; t1 = t2; t2 = 0;
5414     //   }
5415 
5416     //   while (t0)
5417     //     t0 = sub(Pm_base, Pn_base, t0, len);
5418     // }
5419 
5420     /**
5421      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
5422      * multiplies than Montgomery multiplication so it should be up to
5423      * 25% faster.  However, its loop control is more complex and it
5424      * may actually run slower on some machines.
5425      *
5426      * Arguments:
5427      *
5428      * Inputs:
5429      *   c_rarg0   - int array elements a
5430      *   c_rarg1   - int array elements n (the modulus)
5431      *   c_rarg2   - int length
5432      *   c_rarg3   - int inv
5433      *   c_rarg4   - int array elements m (the result)
5434      *
          * This register layout is the one the constructor sets up when its
          * `squaring` flag is true (Pb_base aliases Pa_base).  Returns the
          * entry address of the emitted stub.
          *
5435      */
5436     address generate_square() {
5437       Label argh;
           // The failure path is emitted ahead of the aligned entry point and
           // is only reached by the length check below.
5438       bind(argh);
5439       stop("MontgomeryMultiply total_allocation must be <= 8192");
5440 
5441       align(CodeEntryAlignment);
5442       address entry = pc();
5443 
           // Unlike generate_multiply() there is no early exit for a zero
           // length; both outer loops below test their bound before running
           // an iteration.
5444       enter();
5445 
5446       // Make room.
           // At most 512 ints per array.  The scratch area is
           // Rlen * 4 * sizeof(jint) bytes, carved out below sp, and sp is
           // then rounded down to a multiple of 2 * wordSize.
5447       cmpw(Rlen, 512);
5448       br(Assembler::HI, argh);
5449       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5450       andr(sp, Ra, -2 * wordSize);
5451 
5452       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5453 
5454       {
5455         // Copy input args, reversing as we go.  We use Ra as a
5456         // temporary variable.
             // Each reverse() advances Ra past the copy it wrote and leaves
             // the source pointer register pointing at that copy.
5457         reverse(Ra, Pa_base, Rlen, t0, t1);
5458         reverse(Ra, Pn_base, Rlen, t0, t1);
5459       }
5460 
5461       // Push all callee-saved registers and also Pm_base which we'll need
5462       // at the end.
5463       save_regs();
5464 
           // Ra points just past the copied inputs; the result m is built
           // there.  The caller's Pm_base was saved by save_regs().
5465       mov(Pm_base, Ra);
5466 
           // Clear the triple-precision accumulator.
5467       mov(t0, zr);
5468       mov(t1, zr);
5469       mov(t2, zr);
5470 
5471       block_comment("for (int i = 0; i < len; i++) {");
5472       mov(Ri, zr); {
5473         Label loop, end;
             // `loop` is bound ahead of the bounds check, so the check is
             // repeated after every back-edge.
5474         bind(loop);
5475         cmp(Ri, Rlen);
5476         br(Assembler::GE, end);
5477 
5478         pre1(Ri);
5479 
5480         block_comment("for (j = (i+1)/2; j; j--) {"); {
5481           add(Rj, Ri, 1);
5482           lsr(Rj, Rj, 1);
5483           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5484         } block_comment("  } // j");
5485 
             // One more single product when i is even.
5486         last_squaring(Ri);
5487 
             // The remaining i/2 m*n products, which have no a*a partner.
5488         block_comment("  for (j = i/2; j; j--) {"); {
5489           lsr(Rj, Ri, 1);
5490           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5491         } block_comment("  } // j");
5492 
5493         post1_squaring();
5494         add(Ri, Ri, 1);
5495         cmp(Ri, Rlen);
5496         br(Assembler::LT, loop);
5497 
5498         bind(end);
5499         block_comment("} // i");
5500       }
5501 
5502       block_comment("for (int i = len; i < 2*len; i++) {");
5503       mov(Ri, Rlen); {
5504         Label loop, end;
5505         bind(loop);
             // Compares Ri against Rlen << 1, i.e. 2*len.
5506         cmp(Ri, Rlen, Assembler::LSL, 1);
5507         br(Assembler::GE, end);
5508 
5509         pre2(Ri, Rlen);
5510 
5511         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
5512           lsl(Rj, Rlen, 1);
5513           sub(Rj, Rj, Ri);
5514           sub(Rj, Rj, 1);
5515           lsr(Rj, Rj, 1);
5516           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5517         } block_comment("  } // j");
5518 
5519         last_squaring(Ri);
5520 
5521         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
5522           lsl(Rj, Rlen, 1);
5523           sub(Rj, Rj, Ri);
5524           lsr(Rj, Rj, 1);
5525           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5526         } block_comment("  } // j");
5527 
5528         post2(Ri, Rlen);
5529         add(Ri, Ri, 1);
5530         cmp(Ri, Rlen, Assembler::LSL, 1);
5531 
5532         br(Assembler::LT, loop);
5533         bind(end);
5534         block_comment("} // i");
5535       }
5536 
5537       normalize(Rlen);
5538 
5539       mov(Ra, Pm_base);  // Save Pm_base in Ra
5540       restore_regs();  // Restore caller's Pm_base
5541 
5542       // Copy our result into caller's Pm_base
5543       reverse(Pm_base, Ra, Rlen, t0, t1);
5544 
5545       leave();
5546       ret(lr);
5547 
5548       return entry;
5549     }
5550     // In C, approximately:
5551 
5552     // void
5553     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
5554     //                   unsigned long Pm_base[], unsigned long inv, int len) {
5555     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5556     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5557     //   unsigned long Ra, Rb, Rn, Rm;
5558 
5559     //   int i;
5560 
5561     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5562 
5563     //   for (i = 0; i < len; i++) {
5564     //     int j;
5565 
5566     //     Pa = Pa_base;
5567     //     Pb = Pa_base + i;
5568     //     Pm = Pm_base;
5569     //     Pn = Pn_base + i;
5570 
5571     //     Ra = *Pa;
5572     //     Rb = *Pb;
5573     //     Rm = *Pm;
5574     //     Rn = *Pn;
5575 
5576     //     int iters = (i+1)/2;
5577     //     for (j = 0; iters--; j++) {
5578     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5579     //       MACC2(Ra, Rb, t0, t1, t2);
5580     //       Ra = *++Pa;
5581     //       Rb = *--Pb;
5582     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5583     //       MACC(Rm, Rn, t0, t1, t2);
5584     //       Rm = *++Pm;
5585     //       Rn = *--Pn;
5586     //     }
5587     //     if ((i & 1) == 0) {
5588     //       assert(Ra == Pa_base[j], "must be");
5589     //       MACC(Ra, Ra, t0, t1, t2);
5590     //     }
5591     //     iters = i/2;
5592     //     assert(iters == i-j, "must be");
5593     //     for (; iters--; j++) {
5594     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5595     //       MACC(Rm, Rn, t0, t1, t2);
5596     //       Rm = *++Pm;
5597     //       Rn = *--Pn;
5598     //     }
5599 
5600     //     *Pm = Rm = t0 * inv;
5601     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5602     //     MACC(Rm, Rn, t0, t1, t2);
5603 
5604     //     assert(t0 == 0, "broken Montgomery multiply");
5605 
5606     //     t0 = t1; t1 = t2; t2 = 0;
5607     //   }
5608 
5609     //   for (i = len; i < 2*len; i++) {
5610     //     int start = i-len+1;
5611     //     int end = start + (len - start)/2;
5612     //     int j;
5613 
5614     //     Pa = Pa_base + i-len;
5615     //     Pb = Pa_base + len;
5616     //     Pm = Pm_base + i-len;
5617     //     Pn = Pn_base + len;
5618 
5619     //     Ra = *++Pa;
5620     //     Rb = *--Pb;
5621     //     Rm = *++Pm;
5622     //     Rn = *--Pn;
5623 
5624     //     int iters = (2*len-i-1)/2;
5625     //     assert(iters == end-start, "must be");
5626     //     for (j = start; iters--; j++) {
5627     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5628     //       MACC2(Ra, Rb, t0, t1, t2);
5629     //       Ra = *++Pa;
5630     //       Rb = *--Pb;
5631     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5632     //       MACC(Rm, Rn, t0, t1, t2);
5633     //       Rm = *++Pm;
5634     //       Rn = *--Pn;
5635     //     }
5636     //     if ((i & 1) == 0) {
5637     //       assert(Ra == Pa_base[j], "must be");
5638     //       MACC(Ra, Ra, t0, t1, t2);
5639     //     }
5640     //     iters =  (2*len-i)/2;
5641     //     assert(iters == len-j, "must be");
5642     //     for (; iters--; j++) {
5643     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5644     //       MACC(Rm, Rn, t0, t1, t2);
5645     //       Rm = *++Pm;
5646     //       Rn = *--Pn;
5647     //     }
5648     //     Pm_base[i-len] = t0;
5649     //     t0 = t1; t1 = t2; t2 = 0;
5650     //   }
5651 
5652     //   while (t0)
5653     //     t0 = sub(Pm_base, Pn_base, t0, len);
5654     // }
5655   };
5656 
5657 
5658   // Initialization
5659   void generate_initial() {
5660     // Generate the initial stubs and initialize their entry points.
         // The StubGenerator constructor takes this path when its 'all'
         // argument is false. The result of each generate_*() call below is
         // stored in the corresponding StubRoutines field.
5661 
5662     // Entry points that exist on all platforms. Note: this is code
5663     // that could be shared among different platforms - however the
5664     // benefit seems to be smaller than the disadvantage of having a
5665     // much more complicated generator structure. See also the comment
5666     // in stubRoutines.hpp.
5667 
5668     StubRoutines::_forward_exception_entry = generate_forward_exception();
5669 
         // The call stub generator is handed _call_stub_return_address;
         // presumably it fills that field in while emitting the stub
         // (TODO confirm against generate_call_stub).
5670     StubRoutines::_call_stub_entry =
5671       generate_call_stub(StubRoutines::_call_stub_return_address);
5672 
5673     // The catch-exception entry is referenced by megamorphic calls.
5674     StubRoutines::_catch_exception_entry = generate_catch_exception();
5675 
5676     // Build this early so it's available for the interpreter.
5677     StubRoutines::_throw_StackOverflowError_entry =
5678       generate_throw_exception("StackOverflowError throw_exception",
5679                                CAST_FROM_FN_PTR(address,
5680                                                 SharedRuntime::throw_StackOverflowError));
         // Same generator, pointed at the "delayed" StackOverflowError
         // routine in SharedRuntime.
5681     StubRoutines::_throw_delayed_StackOverflowError_entry =
5682       generate_throw_exception("delayed StackOverflowError throw_exception",
5683                                CAST_FROM_FN_PTR(address,
5684                                                 SharedRuntime::throw_delayed_StackOverflowError));
5685     if (UseCRC32Intrinsics) {
5686       // Set the table address before generating the stub that uses it.
5687       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5688       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5689     }
5690 
5691     if (UseCRC32CIntrinsics) {
5692       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5693     }
5694 
5695     // Disabled until JDK-8210858 is fixed
5696     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5697     //   StubRoutines::_dlog = generate_dlog();
5698     // }
5699 
         // The sin and cos stubs come from one generator; the isCos flag
         // selects which of the two is emitted. Each is only generated when
         // vmIntrinsics reports the corresponding intrinsic as available.
5700     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5701       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5702     }
5703 
5704     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5705       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5706     }
5707   }
5708 
5709   void generate_all() {
         // Generate the stubs listed below and record their entry points in
         // StubRoutines (shared fields) or StubRoutines::aarch64 (platform
         // fields). The StubGenerator constructor takes this path when its
         // 'all' argument is true.

5710     // Support for verify_oop (must happen after universe_init).
5711     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();

         // Exception-throwing stubs. Each one is built by
         // generate_throw_exception() from a stub name and the address of
         // the SharedRuntime routine it is associated with.
5712     StubRoutines::_throw_AbstractMethodError_entry =
5713       generate_throw_exception("AbstractMethodError throw_exception",
5714                                CAST_FROM_FN_PTR(address,
5715                                                 SharedRuntime::
5716                                                 throw_AbstractMethodError));
5717 
5718     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5719       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5720                                CAST_FROM_FN_PTR(address,
5721                                                 SharedRuntime::
5722                                                 throw_IncompatibleClassChangeError));
5723 
5724     StubRoutines::_throw_NullPointerException_at_call_entry =
5725       generate_throw_exception("NullPointerException at call throw_exception",
5726                                CAST_FROM_FN_PTR(address,
5727                                                 SharedRuntime::
5728                                                 throw_NullPointerException_at_call));
5729 
5730     // Arraycopy stubs used by the compilers.
5731     generate_arraycopy_stubs();
5732 
5733     // "Has negatives" stub for large arrays. The generator is also
         // handed _has_negatives_long; presumably it fills in that second
         // entry point itself (TODO confirm against generate_has_negatives).
5734     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5735 
5736     // Array-equals stub for large arrays; skipped when
         // UseSimpleArrayEquals is set.
5737     if (!UseSimpleArrayEquals) {
5738       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5739     }
5740 
         // These two generators return nothing here; they are expected to
         // store their own entry points (not visible in this function).
5741     generate_compare_long_strings();
5742 
5743     generate_string_indexof_stubs();
5744 
5745     // byte_array_inflate stub for large arrays.
5746     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
5747 
         // The big-integer intrinsic stubs below are only generated in
         // builds where COMPILER2 is defined, and each one only when its
         // Use*Intrinsic flag is set.
5748 #ifdef COMPILER2
5749     if (UseMultiplyToLenIntrinsic) {
5750       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5751     }
5752 
5753     if (UseSquareToLenIntrinsic) {
5754       StubRoutines::_squareToLen = generate_squareToLen();
5755     }
5756 
5757     if (UseMulAddIntrinsic) {
5758       StubRoutines::_mulAdd = generate_mulAdd();
5759     }
5760 
         // Montgomery multiply and square are emitted through a separate
         // MontgomeryMultiplyGenerator. The StubCodeMark lives only for the
         // duration of the enclosing if-block, so it brackets exactly the
         // code emitted by that generator call.
5761     if (UseMontgomeryMultiplyIntrinsic) {
5762       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5763       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5764       StubRoutines::_montgomeryMultiply = g.generate_multiply();
5765     }
5766 
5767     if (UseMontgomerySquareIntrinsic) {
5768       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5769       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5770       // We use generate_multiply() rather than generate_square()
5771       // because it's faster for the sizes of modulus we care about.
           // Only the 'squaring' constructor flag differs from the
           // multiply case above.
5772       StubRoutines::_montgomerySquare = g.generate_multiply();
5773     }
5774 #endif // COMPILER2
5775 
5776     // Generate GHASH intrinsics code.
5777     if (UseGHASHIntrinsics) {
5778       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5779     }
5780 
5781     // Data cache line writeback stubs; generated unconditionally.
5782     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
5783     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
5784 
         // AES: single-block encrypt/decrypt plus the CBC encrypt/decrypt
         // variants, all gated by the one UseAESIntrinsics flag.
5785     if (UseAESIntrinsics) {
5786       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5787       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5788       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5789       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
5790     }
5791 
         // SHA-1 and SHA-256: each generator is invoked twice, with a
         // boolean selecting the "MB" variant and the stub name to use.
5792     if (UseSHA1Intrinsics) {
5793       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5794       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5795     }
5796     if (UseSHA256Intrinsics) {
5797       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5798       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5799     }
5800 
5801     // Generate Adler32 intrinsics code.
5802     if (UseAdler32Intrinsics) {
5803       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
5804     }
5805 
5806     // SafeFetch stubs, one for int-sized and one for intptr_t-sized
         // accesses. Each call passes the stub name, the access size, and
         // the addresses of the three StubRoutines fields (entry, fault pc,
         // continuation pc) for the generator to work with.
5807     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5808                                                        &StubRoutines::_safefetch32_fault_pc,
5809                                                        &StubRoutines::_safefetch32_continuation_pc);
5810     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5811                                                        &StubRoutines::_safefetchN_fault_pc,
5812                                                        &StubRoutines::_safefetchN_continuation_pc);
         // Last step: tell the aarch64 StubRoutines that this pass is done.
5813     StubRoutines::aarch64::set_completed();
5814   }
5815 
5816  public:
5817   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
         // Constructing a StubGenerator performs the generation itself:
         // 'code' is forwarded to the StubCodeGenerator base, then
         // generate_all() runs when 'all' is true and generate_initial()
         // runs otherwise. Exactly one of the two runs per instance.
5818     if (all) {
5819       generate_all();
5820     } else {
5821       generate_initial();
5822     }
5823   }
5824 }; // end class declaration
5825 
5826 #define UCM_TABLE_MAX_ENTRIES 8  // size argument for UnsafeCopyMemory::create_table() below
     // Free-function entry point for stub generation. Makes sure the
     // UnsafeCopyMemory table exists, then constructs a StubGenerator; the
     // constructor does the actual work, with 'all' selecting generate_all()
     // (true) or generate_initial() (false). 'code' is passed straight
     // through to the generator.
5827 void StubGenerator_generate(CodeBuffer* code, bool all) {
       // Only create the table while _table is still NULL.
5828   if (UnsafeCopyMemory::_table == NULL) {
5829     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
5830   }
       // The local exists only for its constructor's side effects.
5831   StubGenerator g(code, all);
5832 }