1 /*
   2  * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "nativeInst_aarch64.hpp"
  33 #include "oops/instanceOop.hpp"
  34 #include "oops/method.hpp"
  35 #include "oops/objArrayKlass.hpp"
  36 #include "oops/oop.inline.hpp"
  37 #include "prims/methodHandles.hpp"
  38 #include "runtime/frame.inline.hpp"
  39 #include "runtime/handles.inline.hpp"
  40 #include "runtime/sharedRuntime.hpp"
  41 #include "runtime/stubCodeGenerator.hpp"
  42 #include "runtime/stubRoutines.hpp"
  43 #include "runtime/thread.inline.hpp"
  44 #include "utilities/align.hpp"
  45 #ifdef COMPILER2
  46 #include "opto/runtime.hpp"
  47 #endif
  48 
  49 #ifdef BUILTIN_SIM
  50 #include "../../../../../../simulator/simulator.hpp"
  51 #endif
  52 
  53 // Declaration and definition of StubGenerator (no .hpp file).
  54 // For a more detailed description of the stub routine structure
  55 // see the comment in stubRoutines.hpp
  56 
  57 #undef __
  58 #define __ _masm->
  59 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
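// TIMES_OOP (above) is the scaled-index addressing mode used for oop
// arrays: the 32-bit index is sign-extended and shifted by 2 when
// compressed oops (4-byte elements) are in use, otherwise by 3.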
  60 
  61 #ifdef PRODUCT
  62 #define BLOCK_COMMENT(str) /* nothing */
  63 #else
  64 #define BLOCK_COMMENT(str) __ block_comment(str)
  65 #endif
  66 
  67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  68 
  69 // Stub Code definitions
  70 
  71 class StubGenerator: public StubCodeGenerator {
  72  private:
  73 
  74 #ifdef PRODUCT
  75 #define inc_counter_np(counter) ((void)0)
  76 #else
  77   void inc_counter_np_(int& counter) {
  78     __ lea(rscratch2, ExternalAddress((address)&counter));
  79     __ ldrw(rscratch1, Address(rscratch2));
  80     __ addw(rscratch1, rscratch1, 1);
  81     __ strw(rscratch1, Address(rscratch2));
  82   }
  83 #define inc_counter_np(counter) \
  84   BLOCK_COMMENT("inc_counter " #counter); \
  85   inc_counter_np_(counter);
  86 #endif
  87 
  88   // Call stubs are used to call Java from C
  89   //
  90   // Arguments:
  91   //    c_rarg0:   call wrapper address                   address
  92   //    c_rarg1:   result                                 address
  93   //    c_rarg2:   result type                            BasicType
  94   //    c_rarg3:   method                                 Method*
  95   //    c_rarg4:   (interpreter) entry point              address
  96   //    c_rarg5:   parameters                             intptr_t*
  97   //    c_rarg6:   parameter size (in words)              int
  98   //    c_rarg7:   thread                                 Thread*
  99   //
 100   // There is no return from the stub itself as any Java result
 101   // is written to result
 102   //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer, then install sp
  // (r31) into fp.
  //
  // we save r0-r7, which accounts for all the C arguments.
 108   //
 109   // TODO: strictly do we need to save them all? they are treated as
 110   // volatile by C so could we omit saving the ones we are going to
 111   // place in global registers (thread? method?) or those we only use
 112   // during setup of the Java call?
 113   //
 114   // we don't need to save r8 which C uses as an indirect result location
 115   // return register.
 116   //
 117   // we don't need to save r9-r15 which both C and Java treat as
 118   // volatile
 119   //
  // we don't need to save r16-r18 because Java does not use them
 121   //
 122   // we save r19-r28 which Java uses as scratch registers and C
 123   // expects to be callee-save
 124   //
 125   // we save the bottom 64 bits of each value stored in v8-v15; it is
 126   // the responsibility of the caller to preserve larger values.
 127   //
 128   // so the stub frame looks like this when we enter Java code
 129   //
 130   //     [ return_from_Java     ] <--- sp
 131   //     [ argument word n      ]
 132   //      ...
 133   // -27 [ argument word 1      ]
 134   // -26 [ saved v15            ] <--- sp_after_call
 135   // -25 [ saved v14            ]
 136   // -24 [ saved v13            ]
 137   // -23 [ saved v12            ]
 138   // -22 [ saved v11            ]
 139   // -21 [ saved v10            ]
 140   // -20 [ saved v9             ]
 141   // -19 [ saved v8             ]
 142   // -18 [ saved r28            ]
 143   // -17 [ saved r27            ]
 144   // -16 [ saved r26            ]
 145   // -15 [ saved r25            ]
 146   // -14 [ saved r24            ]
 147   // -13 [ saved r23            ]
 148   // -12 [ saved r22            ]
 149   // -11 [ saved r21            ]
 150   // -10 [ saved r20            ]
 151   //  -9 [ saved r19            ]
 152   //  -8 [ call wrapper    (r0) ]
 153   //  -7 [ result          (r1) ]
 154   //  -6 [ result type     (r2) ]
 155   //  -5 [ method          (r3) ]
 156   //  -4 [ entry point     (r4) ]
 157   //  -3 [ parameters      (r5) ]
 158   //  -2 [ parameter size  (r6) ]
  //  -1 [ thread          (r7) ]
 160   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 161   //   1 [ saved lr       (r30) ]
 162 
 163   // Call stub stack layout word offsets from fp
 164   enum call_stub_layout {
 165     sp_after_call_off = -26,
 166 
 167     d15_off            = -26,
 168     d13_off            = -24,
 169     d11_off            = -22,
 170     d9_off             = -20,
 171 
 172     r28_off            = -18,
 173     r26_off            = -16,
 174     r24_off            = -14,
 175     r22_off            = -12,
 176     r20_off            = -10,
 177     call_wrapper_off   =  -8,
 178     result_off         =  -7,
 179     result_type_off    =  -6,
 180     method_off         =  -5,
 181     entry_point_off    =  -4,
 182     parameter_size_off =  -2,
 183     thread_off         =  -1,
 184     fp_f               =   0,
 185     retaddr_off        =   1,
 186   };
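  // n.b. only one offset is named per saved pair: each stp/stpd below
  // stores two registers, so e.g. r19 is saved at r20_off + 1 and d14 at
  // d15_off + 1, matching the frame diagram above.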
 187 
 188   address generate_call_stub(address& return_address) {
 189     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 190            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 191            "adjust this code");
 192 
 193     StubCodeMark mark(this, "StubRoutines", "call_stub");
 194     address start = __ pc();
 195 
 196     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 197 
 198     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 199     const Address result        (rfp, result_off         * wordSize);
 200     const Address result_type   (rfp, result_type_off    * wordSize);
 201     const Address method        (rfp, method_off         * wordSize);
 202     const Address entry_point   (rfp, entry_point_off    * wordSize);
 203     const Address parameter_size(rfp, parameter_size_off * wordSize);
 204 
 205     const Address thread        (rfp, thread_off         * wordSize);
 206 
 207     const Address d15_save      (rfp, d15_off * wordSize);
 208     const Address d13_save      (rfp, d13_off * wordSize);
 209     const Address d11_save      (rfp, d11_off * wordSize);
 210     const Address d9_save       (rfp, d9_off * wordSize);
 211 
 212     const Address r28_save      (rfp, r28_off * wordSize);
 213     const Address r26_save      (rfp, r26_off * wordSize);
 214     const Address r24_save      (rfp, r24_off * wordSize);
 215     const Address r22_save      (rfp, r22_off * wordSize);
 216     const Address r20_save      (rfp, r20_off * wordSize);
 217 
 218     // stub code
 219 
 220     // we need a C prolog to bootstrap the x86 caller into the sim
 221     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 222 
 223     address aarch64_entry = __ pc();
 224 
 225 #ifdef BUILTIN_SIM
 226     // Save sender's SP for stack traces.
 227     __ mov(rscratch1, sp);
 228     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 229 #endif
 230     // set up frame and move sp to end of save area
 231     __ enter();
 232     __ sub(sp, rfp, -sp_after_call_off * wordSize);
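    // sp_after_call_off is -26, so sp now sits 26 words below the new fp,
    // leaving exactly the register save area laid out above.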
 233 
 234     // save register parameters and Java scratch/global registers
 235     // n.b. we save thread even though it gets installed in
 236     // rthread because we want to sanity check rthread later
 237     __ str(c_rarg7,  thread);
 238     __ strw(c_rarg6, parameter_size);
 239     __ stp(c_rarg4, c_rarg5,  entry_point);
 240     __ stp(c_rarg2, c_rarg3,  result_type);
 241     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 242 
 243     __ stp(r20, r19,   r20_save);
 244     __ stp(r22, r21,   r22_save);
 245     __ stp(r24, r23,   r24_save);
 246     __ stp(r26, r25,   r26_save);
 247     __ stp(r28, r27,   r28_save);
 248 
 249     __ stpd(v9,  v8,   d9_save);
 250     __ stpd(v11, v10,  d11_save);
 251     __ stpd(v13, v12,  d13_save);
 252     __ stpd(v15, v14,  d15_save);
 253 
 254     // install Java thread in global register now we have saved
 255     // whatever value it held
 256     __ mov(rthread, c_rarg7);
 257     // And method
 258     __ mov(rmethod, c_rarg3);
 259 
 260     // set up the heapbase register
 261     __ reinit_heapbase();
 262 
 263 #ifdef ASSERT
 264     // make sure we have no pending exceptions
 265     {
 266       Label L;
 267       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 268       __ cmp(rscratch1, (u1)NULL_WORD);
 269       __ br(Assembler::EQ, L);
 270       __ stop("StubRoutines::call_stub: entered with pending exception");
 271       __ BIND(L);
 272     }
 273 #endif
 274     // pass parameters if any
 275     __ mov(esp, sp);
 276     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 277     __ andr(sp, rscratch1, -2 * wordSize);
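    // -2 * wordSize == -16, so this rounds sp down to the 16-byte
    // alignment required for the AArch64 stack pointer.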
 278 
 279     BLOCK_COMMENT("pass parameters if any");
 280     Label parameters_done;
 281     // parameter count is still in c_rarg6
 282     // and parameter pointer identifying param 1 is in c_rarg5
 283     __ cbzw(c_rarg6, parameters_done);
 284 
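    // copy the parameters one word at a time from the parameter array
    // (c_rarg5) onto the stack, counting down in c_rarg6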
 285     address loop = __ pc();
 286     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 287     __ subsw(c_rarg6, c_rarg6, 1);
 288     __ push(rscratch1);
 289     __ br(Assembler::GT, loop);
 290 
 291     __ BIND(parameters_done);
 292 
    // call Java entry -- passing the Method* and current sp
 294     //      rmethod: Method*
 295     //      r13: sender sp
 296     BLOCK_COMMENT("call Java function");
 297     __ mov(r13, sp);
 298     __ blr(c_rarg4);
 299 
 300     // tell the simulator we have returned to the stub
 301 
 302     // we do this here because the notify will already have been done
 303     // if we get to the next instruction via an exception
 304     //
 305     // n.b. adding this instruction here affects the calculation of
 306     // whether or not a routine returns to the call stub (used when
 307     // doing stack walks) since the normal test is to check the return
 308     // pc against the address saved below. so we may need to allow for
 309     // this extra instruction in the check.
 310 
 311     if (NotifySimulator) {
 312       __ notify(Assembler::method_reentry);
 313     }
 314     // save current address for use by exception handling code
 315 
 316     return_address = __ pc();
 317 
 318     // store result depending on type (everything that is not
 319     // T_OBJECT, T_VALUETYPE, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 320     // n.b. this assumes Java returns an integral result in r0
 321     // and a floating result in j_farg0
 322     __ ldr(j_rarg2, result);
 323     Label is_long, is_float, is_double, exit;
 324     __ ldr(j_rarg1, result_type);
 325     __ cmp(j_rarg1, (u1)T_OBJECT);
 326     __ br(Assembler::EQ, is_long);
 327     __ cmp(j_rarg1, (u1)T_VALUETYPE);
 328     __ br(Assembler::EQ, is_long);
 329     __ cmp(j_rarg1, (u1)T_LONG);
 330     __ br(Assembler::EQ, is_long);
 331     __ cmp(j_rarg1, (u1)T_FLOAT);
 332     __ br(Assembler::EQ, is_float);
 333     __ cmp(j_rarg1, (u1)T_DOUBLE);
 334     __ br(Assembler::EQ, is_double);
 335 
 336     // handle T_INT case
 337     __ strw(r0, Address(j_rarg2));
 338 
 339     __ BIND(exit);
 340 
 341     // pop parameters
 342     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 343 
 344 #ifdef ASSERT
 345     // verify that threads correspond
 346     {
 347       Label L, S;
 348       __ ldr(rscratch1, thread);
 349       __ cmp(rthread, rscratch1);
 350       __ br(Assembler::NE, S);
 351       __ get_thread(rscratch1);
 352       __ cmp(rthread, rscratch1);
 353       __ br(Assembler::EQ, L);
 354       __ BIND(S);
 355       __ stop("StubRoutines::call_stub: threads must correspond");
 356       __ BIND(L);
 357     }
 358 #endif
 359 
 360     // restore callee-save registers
 361     __ ldpd(v15, v14,  d15_save);
 362     __ ldpd(v13, v12,  d13_save);
 363     __ ldpd(v11, v10,  d11_save);
 364     __ ldpd(v9,  v8,   d9_save);
 365 
 366     __ ldp(r28, r27,   r28_save);
 367     __ ldp(r26, r25,   r26_save);
 368     __ ldp(r24, r23,   r24_save);
 369     __ ldp(r22, r21,   r22_save);
 370     __ ldp(r20, r19,   r20_save);
 371 
 372     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 373     __ ldrw(c_rarg2, result_type);
 374     __ ldr(c_rarg3,  method);
 375     __ ldp(c_rarg4, c_rarg5,  entry_point);
 376     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 377 
 378 #ifndef PRODUCT
 379     // tell the simulator we are about to end Java execution
 380     if (NotifySimulator) {
 381       __ notify(Assembler::method_exit);
 382     }
 383 #endif
 384     // leave frame and return to caller
 385     __ leave();
 386     __ ret(lr);
 387 
 388     // handle return types different from T_INT
 389 
 390     __ BIND(is_long);
 391     __ str(r0, Address(j_rarg2, 0));
 392     __ br(Assembler::AL, exit);
 393 
 394     __ BIND(is_float);
 395     __ strs(j_farg0, Address(j_rarg2, 0));
 396     __ br(Assembler::AL, exit);
 397 
 398     __ BIND(is_double);
 399     __ strd(j_farg0, Address(j_rarg2, 0));
 400     __ br(Assembler::AL, exit);
 401 
 402     return start;
 403   }
 404 
 405   // Return point for a Java call if there's an exception thrown in
 406   // Java code.  The exception is caught and transformed into a
 407   // pending exception stored in JavaThread that can be tested from
 408   // within the VM.
 409   //
 410   // Note: Usually the parameters are removed by the callee. In case
 411   // of an exception crossing an activation frame boundary, that is
 412   // not the case if the callee is compiled code => need to setup the
 413   // rsp.
 414   //
 415   // r0: exception oop
 416 
 417   // NOTE: this is used as a target from the signal handler so it
 418   // needs an x86 prolog which returns into the current simulator
 419   // executing the generated catch_exception code. so the prolog
 420   // needs to install rax in a sim register and adjust the sim's
 421   // restart pc to enter the generated code at the start position
 422   // then return from native to simulated execution.
 423 
 424   address generate_catch_exception() {
 425     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 426     address start = __ pc();
 427 
 428     // same as in generate_call_stub():
 429     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 430     const Address thread        (rfp, thread_off         * wordSize);
 431 
 432 #ifdef ASSERT
 433     // verify that threads correspond
 434     {
 435       Label L, S;
 436       __ ldr(rscratch1, thread);
 437       __ cmp(rthread, rscratch1);
 438       __ br(Assembler::NE, S);
 439       __ get_thread(rscratch1);
 440       __ cmp(rthread, rscratch1);
 441       __ br(Assembler::EQ, L);
 442       __ bind(S);
 443       __ stop("StubRoutines::catch_exception: threads must correspond");
 444       __ bind(L);
 445     }
 446 #endif
 447 
 448     // set pending exception
 449     __ verify_oop(r0);
 450 
 451     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 452     __ mov(rscratch1, (address)__FILE__);
 453     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 454     __ movw(rscratch1, (int)__LINE__);
 455     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 456 
 457     // complete return to VM
 458     assert(StubRoutines::_call_stub_return_address != NULL,
 459            "_call_stub_return_address must have been generated before");
 460     __ b(StubRoutines::_call_stub_return_address);
 461 
 462     return start;
 463   }
 464 
 465   // Continuation point for runtime calls returning with a pending
 466   // exception.  The pending exception check happened in the runtime
 467   // or native call stub.  The pending exception in Thread is
 468   // converted into a Java-level exception.
 469   //
 470   // Contract with Java-level exception handlers:
 471   // r0: exception
 472   // r3: throwing pc
 473   //
 474   // NOTE: At entry of this stub, exception-pc must be in LR !!
 475 
 476   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog
 478 
 479   address generate_forward_exception() {
 480     StubCodeMark mark(this, "StubRoutines", "forward exception");
 481     address start = __ pc();
 482 
 483     // Upon entry, LR points to the return address returning into
 484     // Java (interpreted or compiled) code; i.e., the return address
 485     // becomes the throwing pc.
 486     //
 487     // Arguments pushed before the runtime call are still on the stack
 488     // but the exception handler will reset the stack pointer ->
 489     // ignore them.  A potential result in registers can be ignored as
 490     // well.
 491 
 492 #ifdef ASSERT
 493     // make sure this code is only executed if there is a pending exception
 494     {
 495       Label L;
 496       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 497       __ cbnz(rscratch1, L);
 498       __ stop("StubRoutines::forward exception: no pending exception (1)");
 499       __ bind(L);
 500     }
 501 #endif
 502 
 503     // compute exception handler into r19
 504 
 505     // call the VM to find the handler address associated with the
 506     // caller address. pass thread in r0 and caller pc (ret address)
 507     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 508     // the stack.
 509     __ mov(c_rarg1, lr);
 510     // lr will be trashed by the VM call so we move it to R19
 511     // (callee-saved) because we also need to pass it to the handler
 512     // returned by this call.
 513     __ mov(r19, lr);
 514     BLOCK_COMMENT("call exception_handler_for_return_address");
 515     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 516                          SharedRuntime::exception_handler_for_return_address),
 517                     rthread, c_rarg1);
 518     // we should not really care that lr is no longer the callee
 519     // address. we saved the value the handler needs in r19 so we can
 520     // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
 522     // the PC for the frame above the handler belongs to a compiled
 523     // Java method. So, we restore lr here to satisfy that assert.
 524     __ mov(lr, r19);
 525     // setup r0 & r3 & clear pending exception
 526     __ mov(r3, r19);
 527     __ mov(r19, r0);
 528     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 529     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 530 
 531 #ifdef ASSERT
 532     // make sure exception is set
 533     {
 534       Label L;
 535       __ cbnz(r0, L);
 536       __ stop("StubRoutines::forward exception: no pending exception (2)");
 537       __ bind(L);
 538     }
 539 #endif
 540 
 541     // continue at exception handler
 542     // r0: exception
 543     // r3: throwing pc
 544     // r19: exception handler
 545     __ verify_oop(r0);
 546     __ br(r19);
 547 
 548     return start;
 549   }
 550 
 551   // Non-destructive plausibility checks for oops
 552   //
 553   // Arguments:
 554   //    r0: oop to verify
 555   //    rscratch1: error message
 556   //
 557   // Stack after saving c_rarg3:
 558   //    [tos + 0]: saved c_rarg3
 559   //    [tos + 1]: saved c_rarg2
 560   //    [tos + 2]: saved lr
 561   //    [tos + 3]: saved rscratch2
 562   //    [tos + 4]: saved r0
 563   //    [tos + 5]: saved rscratch1
 564   address generate_verify_oop() {
 565 
 566     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 567     address start = __ pc();
 568 
 569     Label exit, error;
 570 
 571     // save c_rarg2 and c_rarg3
 572     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 573 
 574     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 575     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 576     __ ldr(c_rarg3, Address(c_rarg2));
 577     __ add(c_rarg3, c_rarg3, 1);
 578     __ str(c_rarg3, Address(c_rarg2));
 579 
 580     // object is in r0
 581     // make sure object is 'reasonable'
 582     __ cbz(r0, exit); // if obj is NULL it is OK
 583 
 584     // Check if the oop is in the right area of memory
 585     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 586     __ andr(c_rarg2, r0, c_rarg3);
 587     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 588 
 589     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 590     // instruction here because the flags register is live.
 591     __ eor(c_rarg2, c_rarg2, c_rarg3);
 592     __ cbnz(c_rarg2, error);
 593 
    // make sure klass is 'reasonable', i.e. not NULL.
 595     __ load_klass(r0, r0);  // get klass
 596     __ cbz(r0, error);      // if klass is NULL it is broken
 597 
 598     // return if everything seems ok
 599     __ bind(exit);
 600 
 601     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 602     __ ret(lr);
 603 
 604     // handle errors
 605     __ bind(error);
 606     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 607 
 608     __ push(RegSet::range(r0, r29), sp);
 609     // debug(char* msg, int64_t pc, int64_t regs[])
 610     __ mov(c_rarg0, rscratch1);      // pass address of error message
 611     __ mov(c_rarg1, lr);             // pass return address
 612     __ mov(c_rarg2, sp);             // pass address of regs on stack
 613 #ifndef PRODUCT
 614     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 615 #endif
 616     BLOCK_COMMENT("call MacroAssembler::debug");
 617     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 618     __ blrt(rscratch1, 3, 0, 1);
 619 
 620     return start;
 621   }
 622 
 623   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 624 
 625   // The inner part of zero_words().  This is the bulk operation,
 626   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 627   // caller is responsible for zeroing the last few words.
 628   //
 629   // Inputs:
 630   // r10: the HeapWord-aligned base address of an array to zero.
 631   // r11: the count in HeapWords, r11 > 0.
 632   //
 633   // Returns r10 and r11, adjusted for the caller to clear.
 634   // r10: the base address of the tail of words left to clear.
 635   // r11: the number of words in the tail.
 636   //      r11 < MacroAssembler::zero_words_block_size.
 637 
 638   address generate_zero_blocks() {
 639     Label done;
 640     Label base_aligned;
 641 
 642     Register base = r10, cnt = r11;
 643 
 644     __ align(CodeEntryAlignment);
 645     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 646     address start = __ pc();
 647 
 648     if (UseBlockZeroing) {
 649       int zva_length = VM_Version::zva_length();
 650 
 651       // Ensure ZVA length can be divided by 16. This is required by
 652       // the subsequent operations.
 653       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 654 
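      // if base is only 8-byte aligned (bit 3 set), zero a single word to
      // bring it to a 16-byte boundary before using paired/ZVA stores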
 655       __ tbz(base, 3, base_aligned);
 656       __ str(zr, Address(__ post(base, 8)));
 657       __ sub(cnt, cnt, 1);
 658       __ bind(base_aligned);
 659 
 660       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 661       // alignment.
 662       Label small;
 663       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
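      // low_limit is in bytes but cnt counts words, hence the >> 3 below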
 664       __ subs(rscratch1, cnt, low_limit >> 3);
 665       __ br(Assembler::LT, small);
 666       __ zero_dcache_blocks(base, cnt);
 667       __ bind(small);
 668     }
 669 
 670     {
 671       // Number of stp instructions we'll unroll
 672       const int unroll =
 673         MacroAssembler::zero_words_block_size / 2;
 674       // Clear the remaining blocks.
 675       Label loop;
 676       __ subs(cnt, cnt, unroll * 2);
 677       __ br(Assembler::LT, done);
 678       __ bind(loop);
 679       for (int i = 0; i < unroll; i++)
 680         __ stp(zr, zr, __ post(base, 16));
 681       __ subs(cnt, cnt, unroll * 2);
 682       __ br(Assembler::GE, loop);
 683       __ bind(done);
 684       __ add(cnt, cnt, unroll * 2);
 685     }
 686 
 687     __ ret(lr);
 688 
 689     return start;
 690   }
 691 
 692 
 693   typedef enum {
 694     copy_forwards = 1,
 695     copy_backwards = -1
 696   } copy_direction;
 697 
 698   // Bulk copy of blocks of 8 words.
 699   //
 700   // count is a count of words.
 701   //
 702   // Precondition: count >= 8
 703   //
 704   // Postconditions:
 705   //
 706   // The least significant bit of count contains the remaining count
 707   // of words to copy.  The rest of count is trash.
 708   //
 709   // s and d are adjusted to point to the remaining words to copy
 710   //
 711   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 712                            copy_direction direction) {
 713     int unit = wordSize * direction;
 714     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 715 
 716     int offset;
 717     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 718       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 719     const Register stride = r13;
 720 
 721     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 722     assert_different_registers(s, d, count, rscratch1);
 723 
 724     Label again, drain;
 725     const char *stub_name;
 726     if (direction == copy_forwards)
 727       stub_name = "forward_copy_longs";
 728     else
 729       stub_name = "backward_copy_longs";
 730 
 731     __ align(CodeEntryAlignment);
 732 
 733     StubCodeMark mark(this, "StubRoutines", stub_name);
 734 
 735     __ bind(start);
 736 
 737     Label unaligned_copy_long;
 738     if (AvoidUnalignedAccesses) {
 739       __ tbnz(d, 3, unaligned_copy_long);
 740     }
 741 
 742     if (direction == copy_forwards) {
 743       __ sub(s, s, bias);
 744       __ sub(d, d, bias);
 745     }
 746 
 747 #ifdef ASSERT
 748     // Make sure we are never given < 8 words
 749     {
 750       Label L;
 751       __ cmp(count, (u1)8);
 752       __ br(Assembler::GE, L);
 753       __ stop("genrate_copy_longs called with < 8 words");
 754       __ bind(L);
 755     }
 756 #endif
 757 
 758     // Fill 8 registers
 759     if (UseSIMDForMemoryOps) {
 760       __ ldpq(v0, v1, Address(s, 4 * unit));
 761       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 762     } else {
 763       __ ldp(t0, t1, Address(s, 2 * unit));
 764       __ ldp(t2, t3, Address(s, 4 * unit));
 765       __ ldp(t4, t5, Address(s, 6 * unit));
 766       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 767     }
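    // the main loop below is software pipelined: each iteration stores the
    // eight words loaded by the previous iteration (or by the initial fill
    // above) and loads the next eight; the final batch is written out at
    // 'drain'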
 768 
 769     __ subs(count, count, 16);
 770     __ br(Assembler::LO, drain);
 771 
 772     int prefetch = PrefetchCopyIntervalInBytes;
 773     bool use_stride = false;
 774     if (direction == copy_backwards) {
 775        use_stride = prefetch > 256;
 776        prefetch = -prefetch;
 777        if (use_stride) __ mov(stride, prefetch);
 778     }
 779 
 780     __ bind(again);
 781 
 782     if (PrefetchCopyIntervalInBytes > 0)
 783       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 784 
 785     if (UseSIMDForMemoryOps) {
 786       __ stpq(v0, v1, Address(d, 4 * unit));
 787       __ ldpq(v0, v1, Address(s, 4 * unit));
 788       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 789       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 790     } else {
 791       __ stp(t0, t1, Address(d, 2 * unit));
 792       __ ldp(t0, t1, Address(s, 2 * unit));
 793       __ stp(t2, t3, Address(d, 4 * unit));
 794       __ ldp(t2, t3, Address(s, 4 * unit));
 795       __ stp(t4, t5, Address(d, 6 * unit));
 796       __ ldp(t4, t5, Address(s, 6 * unit));
 797       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 798       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 799     }
 800 
 801     __ subs(count, count, 8);
 802     __ br(Assembler::HS, again);
 803 
 804     // Drain
 805     __ bind(drain);
 806     if (UseSIMDForMemoryOps) {
 807       __ stpq(v0, v1, Address(d, 4 * unit));
 808       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 809     } else {
 810       __ stp(t0, t1, Address(d, 2 * unit));
 811       __ stp(t2, t3, Address(d, 4 * unit));
 812       __ stp(t4, t5, Address(d, 6 * unit));
 813       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 814     }
 815 
 816     {
 817       Label L1, L2;
 818       __ tbz(count, exact_log2(4), L1);
 819       if (UseSIMDForMemoryOps) {
 820         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 821         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 822       } else {
 823         __ ldp(t0, t1, Address(s, 2 * unit));
 824         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 825         __ stp(t0, t1, Address(d, 2 * unit));
 826         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 827       }
 828       __ bind(L1);
 829 
 830       if (direction == copy_forwards) {
 831         __ add(s, s, bias);
 832         __ add(d, d, bias);
 833       }
 834 
 835       __ tbz(count, 1, L2);
 836       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 837       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 838       __ bind(L2);
 839     }
 840 
 841     __ ret(lr);
 842 
 843     if (AvoidUnalignedAccesses) {
 844       Label drain, again;
 845       // Register order for storing. Order is different for backward copy.
 846 
 847       __ bind(unaligned_copy_long);
 848 
 849       // source address is even aligned, target odd aligned
 850       //
 851       // when forward copying word pairs we read long pairs at offsets
 852       // {0, 2, 4, 6} (in long words). when backwards copying we read
 853       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 854       // address by -2 in the forwards case so we can compute the
 855       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 856       // or -1.
 857       //
 858       // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1, which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 862       //
      // When backwards copying we need to store 1 word, 3 pairs and
 864       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 865       // offsets {1, 3, 5, 7, 8} * unit.
 866 
 867       if (direction == copy_forwards) {
 868         __ sub(s, s, 16);
 869         __ sub(d, d, 8);
 870       }
 871 
 872       // Fill 8 registers
 873       //
 874       // for forwards copy s was offset by -16 from the original input
 875       // value of s so the register contents are at these offsets
      // relative to the 64 byte block addressed by that original input
 877       // and so on for each successive 64 byte block when s is updated
 878       //
 879       // t0 at offset 0,  t1 at offset 8
 880       // t2 at offset 16, t3 at offset 24
 881       // t4 at offset 32, t5 at offset 40
 882       // t6 at offset 48, t7 at offset 56
 883 
 884       // for backwards copy s was not offset so the register contents
 885       // are at these offsets into the preceding 64 byte block
 886       // relative to that original input and so on for each successive
 887       // preceding 64 byte block when s is updated. this explains the
 888       // slightly counter-intuitive looking pattern of register usage
 889       // in the stp instructions for backwards copy.
 890       //
 891       // t0 at offset -16, t1 at offset -8
 892       // t2 at offset -32, t3 at offset -24
 893       // t4 at offset -48, t5 at offset -40
 894       // t6 at offset -64, t7 at offset -56
 895 
 896       __ ldp(t0, t1, Address(s, 2 * unit));
 897       __ ldp(t2, t3, Address(s, 4 * unit));
 898       __ ldp(t4, t5, Address(s, 6 * unit));
 899       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 900 
 901       __ subs(count, count, 16);
 902       __ br(Assembler::LO, drain);
 903 
 904       int prefetch = PrefetchCopyIntervalInBytes;
 905       bool use_stride = false;
 906       if (direction == copy_backwards) {
 907          use_stride = prefetch > 256;
 908          prefetch = -prefetch;
 909          if (use_stride) __ mov(stride, prefetch);
 910       }
 911 
 912       __ bind(again);
 913 
 914       if (PrefetchCopyIntervalInBytes > 0)
 915         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 916 
 917       if (direction == copy_forwards) {
 918        // allowing for the offset of -8 the store instructions place
       // registers into the target 64 byte block at the following
 920        // offsets
 921        //
 922        // t0 at offset 0
 923        // t1 at offset 8,  t2 at offset 16
 924        // t3 at offset 24, t4 at offset 32
 925        // t5 at offset 40, t6 at offset 48
 926        // t7 at offset 56
 927 
 928         __ str(t0, Address(d, 1 * unit));
 929         __ stp(t1, t2, Address(d, 2 * unit));
 930         __ ldp(t0, t1, Address(s, 2 * unit));
 931         __ stp(t3, t4, Address(d, 4 * unit));
 932         __ ldp(t2, t3, Address(s, 4 * unit));
 933         __ stp(t5, t6, Address(d, 6 * unit));
 934         __ ldp(t4, t5, Address(s, 6 * unit));
 935         __ str(t7, Address(__ pre(d, 8 * unit)));
 936         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 937       } else {
 938        // d was not offset when we started so the registers are
       // written into the 64 byte block preceding d with the following
 940        // offsets
 941        //
 942        // t1 at offset -8
 943        // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
 945        // t7 at offset -56, t4 at offset -48
 946        //                   t6 at offset -64
 947        //
 948        // note that this matches the offsets previously noted for the
 949        // loads
 950 
 951         __ str(t1, Address(d, 1 * unit));
 952         __ stp(t3, t0, Address(d, 3 * unit));
 953         __ ldp(t0, t1, Address(s, 2 * unit));
 954         __ stp(t5, t2, Address(d, 5 * unit));
 955         __ ldp(t2, t3, Address(s, 4 * unit));
 956         __ stp(t7, t4, Address(d, 7 * unit));
 957         __ ldp(t4, t5, Address(s, 6 * unit));
 958         __ str(t6, Address(__ pre(d, 8 * unit)));
 959         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 960       }
 961 
 962       __ subs(count, count, 8);
 963       __ br(Assembler::HS, again);
 964 
 965       // Drain
 966       //
 967       // this uses the same pattern of offsets and register arguments
 968       // as above
 969       __ bind(drain);
 970       if (direction == copy_forwards) {
 971         __ str(t0, Address(d, 1 * unit));
 972         __ stp(t1, t2, Address(d, 2 * unit));
 973         __ stp(t3, t4, Address(d, 4 * unit));
 974         __ stp(t5, t6, Address(d, 6 * unit));
 975         __ str(t7, Address(__ pre(d, 8 * unit)));
 976       } else {
 977         __ str(t1, Address(d, 1 * unit));
 978         __ stp(t3, t0, Address(d, 3 * unit));
 979         __ stp(t5, t2, Address(d, 5 * unit));
 980         __ stp(t7, t4, Address(d, 7 * unit));
 981         __ str(t6, Address(__ pre(d, 8 * unit)));
 982       }
 983       // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
 987       {
 988         Label L1, L2;
 989         __ tbz(count, exact_log2(4), L1);
 990        // this is the same as above but copying only 4 longs hence
       // with only one intervening stp between the str instructions
 992        // but note that the offsets and registers still follow the
 993        // same pattern
 994         __ ldp(t0, t1, Address(s, 2 * unit));
 995         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 996         if (direction == copy_forwards) {
 997           __ str(t0, Address(d, 1 * unit));
 998           __ stp(t1, t2, Address(d, 2 * unit));
 999           __ str(t3, Address(__ pre(d, 4 * unit)));
1000         } else {
1001           __ str(t1, Address(d, 1 * unit));
1002           __ stp(t3, t0, Address(d, 3 * unit));
1003           __ str(t2, Address(__ pre(d, 4 * unit)));
1004         }
1005         __ bind(L1);
1006 
1007         __ tbz(count, 1, L2);
1008        // this is the same as above but copying only 2 longs hence
1009        // there is no intervening stp between the str instructions
1010        // but note that the offset and register patterns are still
1011        // the same
1012         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1013         if (direction == copy_forwards) {
1014           __ str(t0, Address(d, 1 * unit));
1015           __ str(t1, Address(__ pre(d, 2 * unit)));
1016         } else {
1017           __ str(t1, Address(d, 1 * unit));
1018           __ str(t0, Address(__ pre(d, 2 * unit)));
1019         }
1020         __ bind(L2);
1021 
1022        // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written
1024 
1025        if (direction == copy_forwards) {
1026          __ add(s, s, 16);
1027          __ add(d, d, 8);
1028        }
1029 
1030       }
1031 
1032       __ ret(lr);
    }
1034   }
1035 
1036   // Small copy: less than 16 bytes.
1037   //
1038   // NB: Ignores all of the bits of count which represent more than 15
1039   // bytes, so a caller doesn't have to mask them.
1040 
1041   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1042     bool is_backwards = step < 0;
1043     size_t granularity = uabs(step);
1044     int direction = is_backwards ? -1 : 1;
1045     int unit = wordSize * direction;
1046 
1047     Label Lword, Lint, Lshort, Lbyte;
1048 
1049     assert(granularity
1050            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1051 
1052     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1053 
1054     // ??? I don't know if this bit-test-and-branch is the right thing
1055     // to do.  It does a lot of jumping, resulting in several
1056     // mispredicted branches.  It might make more sense to do this
1057     // with something like Duff's device with a single computed branch.
1058 
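    // each tbz below tests the bit of count that corresponds to one 8-,
    // 4-, 2- or 1-byte chunk at this granularity; if the bit is set we
    // copy that chunk and step s and d by the same amount
    //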
1059     __ tbz(count, 3 - exact_log2(granularity), Lword);
1060     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1061     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1062     __ bind(Lword);
1063 
1064     if (granularity <= sizeof (jint)) {
1065       __ tbz(count, 2 - exact_log2(granularity), Lint);
1066       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1067       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1068       __ bind(Lint);
1069     }
1070 
1071     if (granularity <= sizeof (jshort)) {
1072       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1073       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1074       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1075       __ bind(Lshort);
1076     }
1077 
1078     if (granularity <= sizeof (jbyte)) {
1079       __ tbz(count, 0, Lbyte);
1080       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1081       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1082       __ bind(Lbyte);
1083     }
1084   }
1085 
1086   Label copy_f, copy_b;
1087 
1088   // All-singing all-dancing memory copy.
1089   //
1090   // Copy count units of memory from s to d.  The size of a unit is
1091   // step, which can be positive or negative depending on the direction
1092   // of copy.  If is_aligned is false, we align the source address.
1093   //
1094 
1095   void copy_memory(bool is_aligned, Register s, Register d,
1096                    Register count, Register tmp, int step) {
1097     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1098     bool is_backwards = step < 0;
1099     int granularity = uabs(step);
1100     const Register t0 = r3, t1 = r4;
1101 
1102     // <= 96 bytes do inline. Direction doesn't matter because we always
1103     // load all the data before writing anything
1104     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1105     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1106     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1107     const Register send = r17, dend = r18;
1108 
1109     if (PrefetchCopyIntervalInBytes > 0)
1110       __ prfm(Address(s, 0), PLDL1KEEP);
1111     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1112     __ br(Assembler::HI, copy_big);
1113 
1114     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1115     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
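    // send/dend point one past the last source/destination byte; the
    // copies below use negative offsets from them so that overlapping
    // loads and stores cover the tail without needing an exact length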
1116 
1117     __ cmp(count, u1(16/granularity));
1118     __ br(Assembler::LS, copy16);
1119 
1120     __ cmp(count, u1(64/granularity));
1121     __ br(Assembler::HI, copy80);
1122 
1123     __ cmp(count, u1(32/granularity));
1124     __ br(Assembler::LS, copy32);
1125 
1126     // 33..64 bytes
1127     if (UseSIMDForMemoryOps) {
1128       __ ldpq(v0, v1, Address(s, 0));
1129       __ ldpq(v2, v3, Address(send, -32));
1130       __ stpq(v0, v1, Address(d, 0));
1131       __ stpq(v2, v3, Address(dend, -32));
1132     } else {
1133       __ ldp(t0, t1, Address(s, 0));
1134       __ ldp(t2, t3, Address(s, 16));
1135       __ ldp(t4, t5, Address(send, -32));
1136       __ ldp(t6, t7, Address(send, -16));
1137 
1138       __ stp(t0, t1, Address(d, 0));
1139       __ stp(t2, t3, Address(d, 16));
1140       __ stp(t4, t5, Address(dend, -32));
1141       __ stp(t6, t7, Address(dend, -16));
1142     }
1143     __ b(finish);
1144 
1145     // 17..32 bytes
1146     __ bind(copy32);
1147     __ ldp(t0, t1, Address(s, 0));
1148     __ ldp(t2, t3, Address(send, -16));
1149     __ stp(t0, t1, Address(d, 0));
1150     __ stp(t2, t3, Address(dend, -16));
1151     __ b(finish);
1152 
1153     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
1155     __ bind(copy80);
1156     if (UseSIMDForMemoryOps) {
1157       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1158       __ ldpq(v4, v5, Address(send, -32));
1159       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1160       __ stpq(v4, v5, Address(dend, -32));
1161     } else {
1162       __ ldp(t0, t1, Address(s, 0));
1163       __ ldp(t2, t3, Address(s, 16));
1164       __ ldp(t4, t5, Address(s, 32));
1165       __ ldp(t6, t7, Address(s, 48));
1166       __ ldp(t8, t9, Address(send, -16));
1167 
1168       __ stp(t0, t1, Address(d, 0));
1169       __ stp(t2, t3, Address(d, 16));
1170       __ stp(t4, t5, Address(d, 32));
1171       __ stp(t6, t7, Address(d, 48));
1172       __ stp(t8, t9, Address(dend, -16));
1173     }
1174     __ b(finish);
1175 
1176     // 0..16 bytes
1177     __ bind(copy16);
1178     __ cmp(count, u1(8/granularity));
1179     __ br(Assembler::LO, copy8);
1180 
1181     // 8..16 bytes
1182     __ ldr(t0, Address(s, 0));
1183     __ ldr(t1, Address(send, -8));
1184     __ str(t0, Address(d, 0));
1185     __ str(t1, Address(dend, -8));
1186     __ b(finish);
1187 
1188     if (granularity < 8) {
1189       // 4..7 bytes
1190       __ bind(copy8);
1191       __ tbz(count, 2 - exact_log2(granularity), copy4);
1192       __ ldrw(t0, Address(s, 0));
1193       __ ldrw(t1, Address(send, -4));
1194       __ strw(t0, Address(d, 0));
1195       __ strw(t1, Address(dend, -4));
1196       __ b(finish);
1197       if (granularity < 4) {
1198         // 0..3 bytes
1199         __ bind(copy4);
1200         __ cbz(count, finish); // get rid of 0 case
1201         if (granularity == 2) {
1202           __ ldrh(t0, Address(s, 0));
1203           __ strh(t0, Address(d, 0));
1204         } else { // granularity == 1
1205           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1206           // the first and last byte.
1207           // Handle the 3 byte case by loading and storing base + count/2
1208           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean that in the 1 byte case we load/store the same
1210           // byte 3 times.
1211           __ lsr(count, count, 1);
1212           __ ldrb(t0, Address(s, 0));
1213           __ ldrb(t1, Address(send, -1));
1214           __ ldrb(t2, Address(s, count));
1215           __ strb(t0, Address(d, 0));
1216           __ strb(t1, Address(dend, -1));
1217           __ strb(t2, Address(d, count));
1218         }
1219         __ b(finish);
1220       }
1221     }
1222 
1223     __ bind(copy_big);
1224     if (is_backwards) {
1225       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1226       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1227     }
1228 
    // Now that we've got the small case out of the way, we can align
    // the source address on a 2-word boundary.
1231 
1232     Label aligned;
1233 
1234     if (is_aligned) {
1235       // We may have to adjust by 1 word to get s 2-word-aligned.
1236       __ tbz(s, exact_log2(wordSize), aligned);
1237       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1238       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1239       __ sub(count, count, wordSize/granularity);
1240     } else {
1241       if (is_backwards) {
1242         __ andr(rscratch2, s, 2 * wordSize - 1);
1243       } else {
1244         __ neg(rscratch2, s);
1245         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1246       }
1247       // rscratch2 is the byte adjustment needed to align s.
1248       __ cbz(rscratch2, aligned);
1249       int shift = exact_log2(granularity);
1250       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1251       __ sub(count, count, rscratch2);
1252 
1253 #if 0
1254       // ?? This code is only correct for a disjoint copy.  It may or
1255       // may not make sense to use it in that case.
1256 
1257       // Copy the first pair; s and d may not be aligned.
1258       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1259       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1260 
1261       // Align s and d, adjust count
1262       if (is_backwards) {
1263         __ sub(s, s, rscratch2);
1264         __ sub(d, d, rscratch2);
1265       } else {
1266         __ add(s, s, rscratch2);
1267         __ add(d, d, rscratch2);
1268       }
1269 #else
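      // copy the rscratch2 misaligned leading elements one chunk at a
      // time, leaving s 2-word aligned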
1270       copy_memory_small(s, d, rscratch2, rscratch1, step);
1271 #endif
1272     }
1273 
1274     __ bind(aligned);
1275 
1276     // s is now 2-word-aligned.
1277 
1278     // We have a count of units and some trailing bytes.  Adjust the
1279     // count and do a bulk copy of words.
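    // The shifted word count goes in the register the copy_f/copy_b blobs
    // expect; they advance s and d past the words they copy, and the
    // sub-word tail is finished by copy_memory_small below.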
1280     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1281     if (direction == copy_forwards)
1282       __ bl(copy_f);
1283     else
1284       __ bl(copy_b);
1285 
1286     // And the tail.
1287     copy_memory_small(s, d, count, tmp, step);
1288 
1289     if (granularity >= 8) __ bind(copy8);
1290     if (granularity >= 4) __ bind(copy4);
1291     __ bind(finish);
1292   }
1293 
1294 
1295   void clobber_registers() {
1296 #ifdef ASSERT
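    // build 0xdeadbeefdeadbeef in rscratch1 and copy it into r3..r18 so
    // that stale register contents are easy to spot when debugging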
1297     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1298     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1299     for (Register r = r3; r <= r18; r++)
1300       if (r != rscratch1) __ mov(r, rscratch1);
1301 #endif
1302   }
1303 
1304   // Scan over array at a for count oops, verifying each one.
1305   // Preserves a and count, clobbers rscratch1 and rscratch2.
1306   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1307     Label loop, end;
1308     __ mov(rscratch1, a);
1309     __ mov(rscratch2, zr);
1310     __ bind(loop);
1311     __ cmp(rscratch2, count);
1312     __ br(Assembler::HS, end);
1313     if (size == (size_t)wordSize) {
1314       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1315       __ verify_oop(temp);
1316     } else {
1317       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1318       __ decode_heap_oop(temp); // calls verify_oop
1319     }
1320     __ add(rscratch2, rscratch2, size);
1321     __ b(loop);
1322     __ bind(end);
1323   }
1324 
1325   // Arguments:
1326   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1327   //             ignored
1328   //   is_oop  - true => oop array, so generate store check code
1329   //   name    - stub name string
1330   //
1331   // Inputs:
1332   //   c_rarg0   - source array address
1333   //   c_rarg1   - destination array address
1334   //   c_rarg2   - element count, treated as ssize_t, can be zero
1335   //
1336   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1337   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1339   //
1340   // Side Effects:
1341   //   disjoint_int_copy_entry is set to the no-overlap entry point
1342   //   used by generate_conjoint_int_oop_copy().
1343   //
1344   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1345                                   const char *name, bool dest_uninitialized = false) {
1346     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1347     RegSet saved_reg = RegSet::of(s, d, count);
1348     __ align(CodeEntryAlignment);
1349     StubCodeMark mark(this, "StubRoutines", name);
1350     address start = __ pc();
1351     __ enter();
1352 
1353     if (entry != NULL) {
1354       *entry = __ pc();
1355       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1356       BLOCK_COMMENT("Entry:");
1357     }
1358 
1359     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1360     if (dest_uninitialized) {
1361       decorators |= IS_DEST_UNINITIALIZED;
1362     }
1363     if (aligned) {
1364       decorators |= ARRAYCOPY_ALIGNED;
1365     }
1366 
1367     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1368     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg);
1369 
1370     if (is_oop) {
1371       // save regs before copy_memory
1372       __ push(RegSet::of(d, count), sp);
1373     }
1374     copy_memory(aligned, s, d, count, rscratch1, size);
1375 
1376     if (is_oop) {
1377       __ pop(RegSet::of(d, count), sp);
1378       if (VerifyOops)
1379         verify_oop_array(size, d, count, r16);
1380       __ sub(count, count, 1); // make an inclusive end pointer
1381       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1382     }
1383 
1384     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1385 
1386     __ leave();
1387     __ mov(r0, zr); // return 0
1388     __ ret(lr);
1389 #ifdef BUILTIN_SIM
1390     {
1391       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1392       sim->notifyCompile(const_cast<char*>(name), start);
1393     }
1394 #endif
1395     return start;
1396   }
1397 
1398   // Arguments:
1399   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1400   //             ignored
1401   //   is_oop  - true => oop array, so generate store check code
1402   //   name    - stub name string
1403   //
1404   // Inputs:
1405   //   c_rarg0   - source array address
1406   //   c_rarg1   - destination array address
1407   //   c_rarg2   - element count, treated as ssize_t, can be zero
1408   //
1409   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1410   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1412   //
1413   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1414                                  address *entry, const char *name,
1415                                  bool dest_uninitialized = false) {
1416     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1417     RegSet saved_regs = RegSet::of(s, d, count);
1418     StubCodeMark mark(this, "StubRoutines", name);
1419     address start = __ pc();
1420     __ enter();
1421 
1422     if (entry != NULL) {
1423       *entry = __ pc();
1424       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1425       BLOCK_COMMENT("Entry:");
1426     }
1427 
1428     // use fwd copy when (d-s) above_equal (count*size)
1429     __ sub(rscratch1, d, s);
1430     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1431     __ br(Assembler::HS, nooverlap_target);
1432 
1433     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1434     if (dest_uninitialized) {
1435       decorators |= IS_DEST_UNINITIALIZED;
1436     }
1437     if (aligned) {
1438       decorators |= ARRAYCOPY_ALIGNED;
1439     }
1440 
1441     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1442     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs);
1443 
1444     if (is_oop) {
1445       // save regs before copy_memory
1446       __ push(RegSet::of(d, count), sp);
1447     }
1448     copy_memory(aligned, s, d, count, rscratch1, -size);
1449     if (is_oop) {
1450       __ pop(RegSet::of(d, count), sp);
1451       if (VerifyOops)
1452         verify_oop_array(size, d, count, r16);
1453       __ sub(count, count, 1); // make an inclusive end pointer
1454       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1455     }
1456     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1457     __ leave();
1458     __ mov(r0, zr); // return 0
1459     __ ret(lr);
1460 #ifdef BUILTIN_SIM
1461     {
1462       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1463       sim->notifyCompile(const_cast<char*>(name), start);
1464     }
1465 #endif
1466     return start;
  }
1468 
1469   // Arguments:
1470   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1471   //             ignored
1472   //   name    - stub name string
1473   //
1474   // Inputs:
1475   //   c_rarg0   - source array address
1476   //   c_rarg1   - destination array address
1477   //   c_rarg2   - element count, treated as ssize_t, can be zero
1478   //
1479   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1480   // we let the hardware handle it.  The one to eight bytes within words,
1481   // dwords or qwords that span cache line boundaries will still be loaded
1482   // and stored atomically.
1483   //
1491   // Side Effects:
1492   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1493   //   used by generate_conjoint_byte_copy().
1494   //
1495   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1496     const bool not_oop = false;
1497     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1498   }
1499 
1500   // Arguments:
1501   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1502   //             ignored
1503   //   name    - stub name string
1504   //
1505   // Inputs:
1506   //   c_rarg0   - source array address
1507   //   c_rarg1   - destination array address
1508   //   c_rarg2   - element count, treated as ssize_t, can be zero
1509   //
1510   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1511   // we let the hardware handle it.  The one to eight bytes within words,
1512   // dwords or qwords that span cache line boundaries will still be loaded
1513   // and stored atomically.
1514   //
1515   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1516                                       address* entry, const char *name) {
1517     const bool not_oop = false;
1518     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1519   }
1520 
1521   // Arguments:
1522   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1523   //             ignored
1524   //   name    - stub name string
1525   //
1526   // Inputs:
1527   //   c_rarg0   - source array address
1528   //   c_rarg1   - destination array address
1529   //   c_rarg2   - element count, treated as ssize_t, can be zero
1530   //
1531   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1532   // let the hardware handle it.  The two or four words within dwords
1533   // or qwords that span cache line boundaries will still be loaded
1534   // and stored atomically.
1535   //
1536   // Side Effects:
1537   //   disjoint_short_copy_entry is set to the no-overlap entry point
1538   //   used by generate_conjoint_short_copy().
1539   //
1540   address generate_disjoint_short_copy(bool aligned,
1541                                        address* entry, const char *name) {
1542     const bool not_oop = false;
1543     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1544   }
1545 
1546   // Arguments:
1547   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1548   //             ignored
1549   //   name    - stub name string
1550   //
1551   // Inputs:
1552   //   c_rarg0   - source array address
1553   //   c_rarg1   - destination array address
1554   //   c_rarg2   - element count, treated as ssize_t, can be zero
1555   //
1556   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1557   // let the hardware handle it.  The two or four words within dwords
1558   // or qwords that span cache line boundaries will still be loaded
1559   // and stored atomically.
1560   //
1561   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1562                                        address *entry, const char *name) {
1563     const bool not_oop = false;
1564     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1565   }
1566 
1567   // Arguments:
1568   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1569   //             ignored
1570   //   name    - stub name string
1571   //
1572   // Inputs:
1573   //   c_rarg0   - source array address
1574   //   c_rarg1   - destination array address
1575   //   c_rarg2   - element count, treated as ssize_t, can be zero
1576   //
1577   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1578   // the hardware handle it.  The two dwords within qwords that span
1579   // cache line boundaries will still be loaded and stored atomically.
1580   //
1581   // Side Effects:
1582   //   disjoint_int_copy_entry is set to the no-overlap entry point
1583   //   used by generate_conjoint_int_oop_copy().
1584   //
1585   address generate_disjoint_int_copy(bool aligned, address *entry,
1586                                          const char *name, bool dest_uninitialized = false) {
1587     const bool not_oop = false;
1588     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1589   }
1590 
1591   // Arguments:
1592   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1593   //             ignored
1594   //   name    - stub name string
1595   //
1596   // Inputs:
1597   //   c_rarg0   - source array address
1598   //   c_rarg1   - destination array address
1599   //   c_rarg2   - element count, treated as ssize_t, can be zero
1600   //
1601   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1602   // the hardware handle it.  The two dwords within qwords that span
1603   // cache line boundaries will still be loaded and stored atomically.
1604   //
1605   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1606                                      address *entry, const char *name,
1607                                      bool dest_uninitialized = false) {
1608     const bool not_oop = false;
1609     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1610   }
1611 
1612 
1613   // Arguments:
1614   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1615   //             ignored
1616   //   name    - stub name string
1617   //
1618   // Inputs:
1619   //   c_rarg0   - source array address
1620   //   c_rarg1   - destination array address
1621   //   c_rarg2   - element count, treated as size_t, can be zero
1622   //
1623   // Side Effects:
1624   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1625   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1626   //
1627   address generate_disjoint_long_copy(bool aligned, address *entry,
1628                                           const char *name, bool dest_uninitialized = false) {
1629     const bool not_oop = false;
1630     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1631   }
1632 
1633   // Arguments:
1634   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1635   //             ignored
1636   //   name    - stub name string
1637   //
1638   // Inputs:
1639   //   c_rarg0   - source array address
1640   //   c_rarg1   - destination array address
1641   //   c_rarg2   - element count, treated as size_t, can be zero
1642   //
1643   address generate_conjoint_long_copy(bool aligned,
1644                                       address nooverlap_target, address *entry,
1645                                       const char *name, bool dest_uninitialized = false) {
1646     const bool not_oop = false;
1647     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1648   }
1649 
1650   // Arguments:
1651   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1652   //             ignored
1653   //   name    - stub name string
1654   //
1655   // Inputs:
1656   //   c_rarg0   - source array address
1657   //   c_rarg1   - destination array address
1658   //   c_rarg2   - element count, treated as size_t, can be zero
1659   //
1660   // Side Effects:
1661   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1662   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1663   //
1664   address generate_disjoint_oop_copy(bool aligned, address *entry,
1665                                      const char *name, bool dest_uninitialized) {
1666     const bool is_oop = true;
1667     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1668     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1669   }
1670 
1671   // Arguments:
1672   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1673   //             ignored
1674   //   name    - stub name string
1675   //
1676   // Inputs:
1677   //   c_rarg0   - source array address
1678   //   c_rarg1   - destination array address
1679   //   c_rarg2   - element count, treated as size_t, can be zero
1680   //
1681   address generate_conjoint_oop_copy(bool aligned,
1682                                      address nooverlap_target, address *entry,
1683                                      const char *name, bool dest_uninitialized) {
1684     const bool is_oop = true;
1685     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1686     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1687                                   name, dest_uninitialized);
1688   }
1689 
1690 
1691   // Helper for generating a dynamic type check.
1692   // Smashes rscratch1.
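       // Roughly equivalent to:
       //   if (sub_klass->is_subtype_of(super_klass)) goto L_success;
       //   // otherwise fall through to the failure path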
1693   void generate_type_check(Register sub_klass,
1694                            Register super_check_offset,
1695                            Register super_klass,
1696                            Label& L_success) {
1697     assert_different_registers(sub_klass, super_check_offset, super_klass);
1698 
1699     BLOCK_COMMENT("type_check:");
1700 
1701     Label L_miss;
1702 
1703     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1704                                      super_check_offset);
1705     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1706 
1707     // Fall through on failure!
1708     __ BIND(L_miss);
1709   }
1710 
1711   //
1712   //  Generate checkcasting array copy stub
1713   //
1714   //  Input:
1715   //    c_rarg0   - source array address
1716   //    c_rarg1   - destination array address
1717   //    c_rarg2   - element count, treated as ssize_t, can be zero
1718   //    c_rarg3   - size_t ckoff (super_check_offset)
1719   //    c_rarg4   - oop ckval (super_klass)
1720   //
1721   //  Output:
1722   //    r0 ==  0  -  success
1723   //    r0 == -1^K - failure, where K is partial transfer count
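  //                (i.e. r0 = ~K; for example, if 3 elements were copied
  //                 before the failing element, r0 == ~3 == -4, and the
  //                 caller recovers K as ~r0)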
1724   //
1725   address generate_checkcast_copy(const char *name, address *entry,
1726                                   bool dest_uninitialized = false) {
1727 
1728     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1729 
1730     // Input registers (after setup_arg_regs)
1731     const Register from        = c_rarg0;   // source array address
1732     const Register to          = c_rarg1;   // destination array address
1733     const Register count       = c_rarg2;   // elements count
1734     const Register ckoff       = c_rarg3;   // super_check_offset
1735     const Register ckval       = c_rarg4;   // super_klass
1736 
1737     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1738     RegSet wb_post_saved_regs = RegSet::of(count);
1739 
1740     // Registers used as temps (r18, r19, r20 are save-on-entry)
1741     const Register count_save  = r21;       // orig elements count
1742     const Register start_to    = r20;       // destination array start address
1743     const Register copied_oop  = r18;       // actual oop copied
1744     const Register r19_klass   = r19;       // oop._klass
1745 
1746     //---------------------------------------------------------------
1747     // Assembler stub will be used for this call to arraycopy
1748     // if the two arrays are subtypes of Object[] but the
1749     // destination array type is not equal to or a supertype
1750     // of the source type.  Each element must be separately
1751     // checked.
1752 
1753     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1754                                copied_oop, r19_klass, count_save);
1755 
1756     __ align(CodeEntryAlignment);
1757     StubCodeMark mark(this, "StubRoutines", name);
1758     address start = __ pc();
1759 
1760     __ enter(); // required for proper stackwalking of RuntimeStub frame
1761 
1762 #ifdef ASSERT
1763     // caller guarantees that the arrays really are different
1764     // otherwise, we would have to make conjoint checks
1765     { Label L;
1766       array_overlap_test(L, TIMES_OOP);
1767       __ stop("checkcast_copy within a single array");
1768       __ bind(L);
1769     }
1770 #endif //ASSERT
1771 
1772     // Caller of this entry point must set up the argument registers.
1773     if (entry != NULL) {
1774       *entry = __ pc();
1775       BLOCK_COMMENT("Entry:");
1776     }
1777 
1778     // Empty array:  Nothing to do.
1779     __ cbz(count, L_done);
1780 
1781     __ push(RegSet::of(r18, r19, r20, r21), sp);
1782 
1783 #ifdef ASSERT
1784     BLOCK_COMMENT("assert consistent ckoff/ckval");
1785     // The ckoff and ckval must be mutually consistent,
1786     // even though caller generates both.
1787     { Label L;
1788       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1789       __ ldrw(start_to, Address(ckval, sco_offset));
1790       __ cmpw(ckoff, start_to);
1791       __ br(Assembler::EQ, L);
1792       __ stop("super_check_offset inconsistent");
1793       __ bind(L);
1794     }
1795 #endif //ASSERT
1796 
1797     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
1798     bool is_oop = true;
1799     if (dest_uninitialized) {
1800       decorators |= IS_DEST_UNINITIALIZED;
1801     }
1802 
1803     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1804     bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs);
1805 
1806     // save the original count
1807     __ mov(count_save, count);
1808 
1809     // Copy from low to high addresses
1810     __ mov(start_to, to);              // Save destination array start address
1811     __ b(L_load_element);
1812 
1813     // ======== begin loop ========
1814     // (Loop is rotated; its entry is L_load_element.)
1815     // Loop control:
1816     //   for (; count != 0; count--) {
1817     //     copied_oop = load_heap_oop(from++);
1818     //     ... generate_type_check ...;
1819     //     store_heap_oop(to++, copied_oop);
1820     //   }
1821     __ align(OptoLoopAlignment);
1822 
1823     __ BIND(L_store_element);
1824     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1825     __ sub(count, count, 1);
1826     __ cbz(count, L_do_card_marks);
1827 
1828     // ======== loop entry is here ========
1829     __ BIND(L_load_element);
1830     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1831     __ cbz(copied_oop, L_store_element);
1832 
1833     __ load_klass(r19_klass, copied_oop);// query the object klass
1834     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1835     // ======== end loop ========
1836 
1837     // It was a real error; we must depend on the caller to finish the job.
1838     // Register count = remaining oops, count_save = total oops.
1839     // Emit GC store barriers for the oops we have copied and report
1840     // their number to the caller.
1841 
1842     __ subs(count, count_save, count);     // K = partially copied oop count
1843     __ eon(count, count, zr);                   // report (-1^K) to caller
1844     __ br(Assembler::EQ, L_done_pop);
1845 
1846     __ BIND(L_do_card_marks);
1847     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1848     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1, wb_post_saved_regs);
1849 
1850     __ bind(L_done_pop);
1851     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1852     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1853 
1854     __ bind(L_done);
1855     __ mov(r0, count);
1856     __ leave();
1857     __ ret(lr);
1858 
1859     return start;
1860   }
1861 
1862   // Perform range checks on the proposed arraycopy.
1863   // Kills temp, but nothing else.
1864   // Also, clean the sign bits of src_pos and dst_pos.
1865   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1866                               Register src_pos, // source position (c_rarg1)
1867                               Register dst,     // destination array oop (c_rarg2)
1868                               Register dst_pos, // destination position (c_rarg3)
1869                               Register length,
1870                               Register temp,
1871                               Label& L_failed) {
1872     BLOCK_COMMENT("arraycopy_range_checks:");
1873 
1874     assert_different_registers(rscratch1, temp);
1875 
1876     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1877     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1878     __ addw(temp, length, src_pos);
1879     __ cmpw(temp, rscratch1);
1880     __ br(Assembler::HI, L_failed);
1881 
1882     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1883     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1884     __ addw(temp, length, dst_pos);
1885     __ cmpw(temp, rscratch1);
1886     __ br(Assembler::HI, L_failed);
1887 
1888     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
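         // (a 32-bit movw of a register onto itself zero-extends, so this also
         //  clears bits 63:32)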
1889     __ movw(src_pos, src_pos);
1890     __ movw(dst_pos, dst_pos);
1891 
1892     BLOCK_COMMENT("arraycopy_range_checks done");
1893   }
1894 
1895   // These stubs get called from some dumb test routine.
1896   // I'll write them properly when they're called from
1897   // something that's actually doing something.
1898   static void fake_arraycopy_stub(address src, address dst, int count) {
1899     assert(count == 0, "huh?");
1900   }
1901 
1902 
1903   //
1904   //  Generate 'unsafe' array copy stub
1905   //  Though just as safe as the other stubs, it takes an unscaled
1906   //  size_t argument instead of an element count.
1907   //
1908   //  Input:
1909   //    c_rarg0   - source array address
1910   //    c_rarg1   - destination array address
1911   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1912   //
1913   // Examines the alignment of the operands and dispatches
1914   // to a long, int, short, or byte copy loop.
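       // Roughly:
       //   bits = s | d | count;
       //   if ((bits & 7) == 0)      goto long_copy  (count >>= 3);
       //   else if ((bits & 3) == 0) goto int_copy   (count >>= 2);
       //   else if ((bits & 1) == 0) goto short_copy (count >>= 1);
       //   else                      goto byte_copy;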
1915   //
1916   address generate_unsafe_copy(const char *name,
1917                                address byte_copy_entry,
1918                                address short_copy_entry,
1919                                address int_copy_entry,
1920                                address long_copy_entry) {
1921     Label L_long_aligned, L_int_aligned, L_short_aligned;
1922     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1923 
1924     __ align(CodeEntryAlignment);
1925     StubCodeMark mark(this, "StubRoutines", name);
1926     address start = __ pc();
1927     __ enter(); // required for proper stackwalking of RuntimeStub frame
1928 
1929     // bump this on entry, not on exit:
1930     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1931 
1932     __ orr(rscratch1, s, d);
1933     __ orr(rscratch1, rscratch1, count);
1934 
1935     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1936     __ cbz(rscratch1, L_long_aligned);
1937     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1938     __ cbz(rscratch1, L_int_aligned);
1939     __ tbz(rscratch1, 0, L_short_aligned);
1940     __ b(RuntimeAddress(byte_copy_entry));
1941 
1942     __ BIND(L_short_aligned);
1943     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1944     __ b(RuntimeAddress(short_copy_entry));
1945     __ BIND(L_int_aligned);
1946     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1947     __ b(RuntimeAddress(int_copy_entry));
1948     __ BIND(L_long_aligned);
1949     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1950     __ b(RuntimeAddress(long_copy_entry));
1951 
1952     return start;
1953   }
1954 
1955   //
1956   //  Generate generic array copy stubs
1957   //
1958   //  Input:
1959   //    c_rarg0    -  src oop
1960   //    c_rarg1    -  src_pos (32-bits)
1961   //    c_rarg2    -  dst oop
1962   //    c_rarg3    -  dst_pos (32-bits)
1963   //    c_rarg4    -  element count (32-bits)
1964   //
1965   //  Output:
1966   //    r0 ==  0  -  success
1967   //    r0 == -1^K - failure, where K is partial transfer count
1968   //
1969   address generate_generic_copy(const char *name,
1970                                 address byte_copy_entry, address short_copy_entry,
1971                                 address int_copy_entry, address oop_copy_entry,
1972                                 address long_copy_entry, address checkcast_copy_entry) {
1973 
1974     Label L_failed, L_objArray;
1975     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1976 
1977     // Input registers
1978     const Register src        = c_rarg0;  // source array oop
1979     const Register src_pos    = c_rarg1;  // source position
1980     const Register dst        = c_rarg2;  // destination array oop
1981     const Register dst_pos    = c_rarg3;  // destination position
1982     const Register length     = c_rarg4;
1983 
1984     __ align(CodeEntryAlignment);
1985 
1986     StubCodeMark mark(this, "StubRoutines", name);
1987 
1988     address start = __ pc();
1989 
1990     __ enter(); // required for proper stackwalking of RuntimeStub frame
1991 
1992     // bump this on entry, not on exit:
1993     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1994 
1995     //-----------------------------------------------------------------------
1996     // Assembler stub will be used for this call to arraycopy
1997     // if the following conditions are met:
1998     //
1999     // (1) src and dst must not be null.
2000     // (2) src_pos must not be negative.
2001     // (3) dst_pos must not be negative.
2002     // (4) length  must not be negative.
2003     // (5) src klass and dst klass should be the same and not NULL.
2004     // (6) src and dst should be arrays.
2005     // (7) src_pos + length must not exceed length of src.
2006     // (8) dst_pos + length must not exceed length of dst.
2007     //
2008 
2009     //  if (src == NULL) return -1;
2010     __ cbz(src, L_failed);
2011 
2012     //  if (src_pos < 0) return -1;
2013     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2014 
2015     //  if (dst == NULL) return -1;
2016     __ cbz(dst, L_failed);
2017 
2018     //  if (dst_pos < 0) return -1;
2019     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2020 
2021     // registers used as temp
2022     const Register scratch_length    = r16; // elements count to copy
2023     const Register scratch_src_klass = r17; // array klass
2024     const Register lh                = r18; // layout helper
2025 
2026     //  if (length < 0) return -1;
2027     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2028     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2029 
2030     __ load_klass(scratch_src_klass, src);
2031 #ifdef ASSERT
2032     //  assert(src->klass() != NULL);
2033     {
2034       BLOCK_COMMENT("assert klasses not null {");
2035       Label L1, L2;
2036       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2037       __ bind(L1);
2038       __ stop("broken null klass");
2039       __ bind(L2);
2040       __ load_klass(rscratch1, dst);
2041       __ cbz(rscratch1, L1);     // this would be broken also
2042       BLOCK_COMMENT("} assert klasses not null done");
2043     }
2044 #endif
2045 
2046     // Load layout helper (32-bits)
2047     //
2048     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2049     // 32        30    24            16              8     2                 0
2050     //
2051     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
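         //   The code below decodes these fields as:
         //     log2_element_size =  lh                            & _lh_log2_element_size_mask
         //     header_size       = (lh >> _lh_header_size_shift)  & _lh_header_size_mask
         //     array_tag         = (unsigned)lh >> _lh_array_tag_shift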
2052     //
2053 
2054     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2055 
2056     // Handle objArrays completely differently...
2057     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2058     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2059     __ movw(rscratch1, objArray_lh);
2060     __ eorw(rscratch2, lh, rscratch1);
2061     __ cbzw(rscratch2, L_objArray);
2062 
2063     //  if (src->klass() != dst->klass()) return -1;
2064     __ load_klass(rscratch2, dst);
2065     __ eor(rscratch2, rscratch2, scratch_src_klass);
2066     __ cbnz(rscratch2, L_failed);
2067 
2068     //  if (!src->is_Array()) return -1;
2069     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2070 
2071     // At this point, it is known to be a typeArray (array_tag 0x3).
2072 #ifdef ASSERT
2073     {
2074       BLOCK_COMMENT("assert primitive array {");
2075       Label L;
2076       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2077       __ cmpw(lh, rscratch2);
2078       __ br(Assembler::GE, L);
2079       __ stop("must be a primitive array");
2080       __ bind(L);
2081       BLOCK_COMMENT("} assert primitive array done");
2082     }
2083 #endif
2084 
2085     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2086                            rscratch2, L_failed);
2087 
2088     // TypeArrayKlass
2089     //
2090     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2091     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2092     //
2093 
2094     const Register rscratch1_offset = rscratch1;    // array offset
2095     const Register r18_elsize = lh; // element size
2096 
2097     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2098            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2099     __ add(src, src, rscratch1_offset);           // src array offset
2100     __ add(dst, dst, rscratch1_offset);           // dst array offset
2101     BLOCK_COMMENT("choose copy loop based on element size");
2102 
2103     // next registers should be set before the jump to corresponding stub
2104     const Register from     = c_rarg0;  // source array address
2105     const Register to       = c_rarg1;  // destination array address
2106     const Register count    = c_rarg2;  // elements count
2107 
2108     // 'from', 'to' and 'count' must be set in this order, since they
2109     // alias 'src', 'src_pos' and 'dst' respectively.
2110 
2111     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2112 
2113     // The possible values of elsize are 0-3, i.e. exact_log2(element
2114     // size in bytes).  We do a simple bitwise binary search.
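         //   log2 elsize 0 -> jbyte, 1 -> jshort, 2 -> jint, 3 -> jlong:
         //   bit 1 selects the int/long half, bit 0 then selects the wider
         //   element within each half.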
2115   __ BIND(L_copy_bytes);
2116     __ tbnz(r18_elsize, 1, L_copy_ints);
2117     __ tbnz(r18_elsize, 0, L_copy_shorts);
2118     __ lea(from, Address(src, src_pos));// src_addr
2119     __ lea(to,   Address(dst, dst_pos));// dst_addr
2120     __ movw(count, scratch_length); // length
2121     __ b(RuntimeAddress(byte_copy_entry));
2122 
2123   __ BIND(L_copy_shorts);
2124     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2125     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2126     __ movw(count, scratch_length); // length
2127     __ b(RuntimeAddress(short_copy_entry));
2128 
2129   __ BIND(L_copy_ints);
2130     __ tbnz(r18_elsize, 0, L_copy_longs);
2131     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2132     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2133     __ movw(count, scratch_length); // length
2134     __ b(RuntimeAddress(int_copy_entry));
2135 
2136   __ BIND(L_copy_longs);
2137 #ifdef ASSERT
2138     {
2139       BLOCK_COMMENT("assert long copy {");
2140       Label L;
2141       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2142       __ cmpw(r18_elsize, LogBytesPerLong);
2143       __ br(Assembler::EQ, L);
2144       __ stop("must be long copy, but elsize is wrong");
2145       __ bind(L);
2146       BLOCK_COMMENT("} assert long copy done");
2147     }
2148 #endif
2149     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2150     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2151     __ movw(count, scratch_length); // length
2152     __ b(RuntimeAddress(long_copy_entry));
2153 
2154     // ObjArrayKlass
2155   __ BIND(L_objArray);
2156     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2157 
2158     Label L_plain_copy, L_checkcast_copy;
2159     //  test array classes for subtyping
2160     __ load_klass(r18, dst);
2161     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2162     __ br(Assembler::NE, L_checkcast_copy);
2163 
2164     // Identically typed arrays can be copied without element-wise checks.
2165     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2166                            rscratch2, L_failed);
2167 
2168     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2169     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2170     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2171     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2172     __ movw(count, scratch_length); // length
2173   __ BIND(L_plain_copy);
2174     __ b(RuntimeAddress(oop_copy_entry));
2175 
2176   __ BIND(L_checkcast_copy);
2177     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2178     {
2179       // Before looking at dst.length, make sure dst is also an objArray.
2180       __ ldrw(rscratch1, Address(r18, lh_offset));
2181       __ movw(rscratch2, objArray_lh);
2182       __ eorw(rscratch1, rscratch1, rscratch2);
2183       __ cbnzw(rscratch1, L_failed);
2184 
2185       // It is safe to examine both src.length and dst.length.
2186       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2187                              r18, L_failed);
2188 
2189       const Register rscratch2_dst_klass = rscratch2;
2190       __ load_klass(rscratch2_dst_klass, dst); // reload
2191 
2192       // Marshal the base address arguments now, freeing registers.
2193       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2194       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2195       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2196       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2197       __ movw(count, length);           // length (reloaded)
2198       Register sco_temp = c_rarg3;      // this register is free now
2199       assert_different_registers(from, to, count, sco_temp,
2200                                  rscratch2_dst_klass, scratch_src_klass);
2201       // assert_clean_int(count, sco_temp);
2202 
2203       // Generate the type check.
2204       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2205       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2206       // assert_clean_int(sco_temp, r18);
2207       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2208 
2209       // Fetch destination element klass from the ObjArrayKlass header.
2210       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2211       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2212       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2213 
2214       // the checkcast_copy loop needs two extra arguments:
2215       assert(c_rarg3 == sco_temp, "#3 already in place");
2216       // Set up arguments for checkcast_copy_entry.
2217       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2218       __ b(RuntimeAddress(checkcast_copy_entry));
2219     }
2220 
2221   __ BIND(L_failed);
2222     __ mov(r0, -1);
2223     __ leave();   // required for proper stackwalking of RuntimeStub frame
2224     __ ret(lr);
2225 
2226     return start;
2227   }
2228 
2229   //
2230   // Generate stub for array fill. If "aligned" is true, the
2231   // "to" address is assumed to be heapword aligned.
2232   //
2233   // Arguments for generated stub:
2234   //   to:    c_rarg0
2235   //   value: c_rarg1
2236   //   count: c_rarg2 treated as signed
2237   //
2238   address generate_fill(BasicType t, bool aligned, const char *name) {
2239     __ align(CodeEntryAlignment);
2240     StubCodeMark mark(this, "StubRoutines", name);
2241     address start = __ pc();
2242 
2243     BLOCK_COMMENT("Entry:");
2244 
2245     const Register to        = c_rarg0;  // destination array address
2246     const Register value     = c_rarg1;  // value
2247     const Register count     = c_rarg2;  // elements count
2248 
2249     const Register bz_base = r10;        // base for block_zero routine
2250     const Register cnt_words = r11;      // temp register
2251 
2252     __ enter();
2253 
2254     Label L_fill_elements, L_exit1;
2255 
2256     int shift = -1;
2257     switch (t) {
2258       case T_BYTE:
2259         shift = 0;
2260         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2261         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2262         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2263         __ br(Assembler::LO, L_fill_elements);
2264         break;
2265       case T_SHORT:
2266         shift = 1;
2267         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2268         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2269         __ br(Assembler::LO, L_fill_elements);
2270         break;
2271       case T_INT:
2272         shift = 2;
2273         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2274         __ br(Assembler::LO, L_fill_elements);
2275         break;
2276       default: ShouldNotReachHere();
2277     }
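         // At this point 'value' holds the fill pattern replicated to 32 bits
         // (e.g. a T_BYTE value of 0xAB becomes 0xABABABAB); it is widened to
         // 64 bits below before the word-at-a-time fill.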
2278 
2279     // Align source address at 8 bytes address boundary.
2280     Label L_skip_align1, L_skip_align2, L_skip_align4;
2281     if (!aligned) {
2282       switch (t) {
2283         case T_BYTE:
2284           // One byte misalignment happens only for byte arrays.
2285           __ tbz(to, 0, L_skip_align1);
2286           __ strb(value, Address(__ post(to, 1)));
2287           __ subw(count, count, 1);
2288           __ bind(L_skip_align1);
2289           // Fallthrough
2290         case T_SHORT:
2291           // Two bytes misalignment happens only for byte and short (char) arrays.
2292           __ tbz(to, 1, L_skip_align2);
2293           __ strh(value, Address(__ post(to, 2)));
2294           __ subw(count, count, 2 >> shift);
2295           __ bind(L_skip_align2);
2296           // Fallthrough
2297         case T_INT:
2298           // Align to 8 bytes, we know we are 4 byte aligned to start.
2299           __ tbz(to, 2, L_skip_align4);
2300           __ strw(value, Address(__ post(to, 4)));
2301           __ subw(count, count, 4 >> shift);
2302           __ bind(L_skip_align4);
2303           break;
2304         default: ShouldNotReachHere();
2305       }
2306     }
2307 
2308     //
2309     //  Fill large chunks
2310     //
2311     __ lsrw(cnt_words, count, 3 - shift); // number of words
2312     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2313     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2314     if (UseBlockZeroing) {
2315       Label non_block_zeroing, rest;
2316       // If the fill value is zero we can use the fast zero_words().
2317       __ cbnz(value, non_block_zeroing);
2318       __ mov(bz_base, to);
2319       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2320       __ zero_words(bz_base, cnt_words);
2321       __ b(rest);
2322       __ bind(non_block_zeroing);
2323       __ fill_words(to, cnt_words, value);
2324       __ bind(rest);
2325     } else {
2326       __ fill_words(to, cnt_words, value);
2327     }
2328 
2329     // Remaining count is less than 8 bytes. Fill it by a single store.
2330     // Note that the total length is no less than 8 bytes.
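         // The 8-byte store below ends exactly at the true end of the array;
         // any bytes it re-writes were already filled with the same pattern
         // by the loop above, so the overlap is harmless.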
2331     if (t == T_BYTE || t == T_SHORT) {
2333       __ cbzw(count, L_exit1);
2334       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2335       __ str(value, Address(to, -8));    // overwrite some elements
2336       __ bind(L_exit1);
2337       __ leave();
2338       __ ret(lr);
2339     }
2340 
2341     // Handle fills of less than 8 bytes.
2342     Label L_fill_2, L_fill_4, L_exit2;
2343     __ bind(L_fill_elements);
2344     switch (t) {
2345       case T_BYTE:
2346         __ tbz(count, 0, L_fill_2);
2347         __ strb(value, Address(__ post(to, 1)));
2348         __ bind(L_fill_2);
2349         __ tbz(count, 1, L_fill_4);
2350         __ strh(value, Address(__ post(to, 2)));
2351         __ bind(L_fill_4);
2352         __ tbz(count, 2, L_exit2);
2353         __ strw(value, Address(to));
2354         break;
2355       case T_SHORT:
2356         __ tbz(count, 0, L_fill_4);
2357         __ strh(value, Address(__ post(to, 2)));
2358         __ bind(L_fill_4);
2359         __ tbz(count, 1, L_exit2);
2360         __ strw(value, Address(to));
2361         break;
2362       case T_INT:
2363         __ cbzw(count, L_exit2);
2364         __ strw(value, Address(to));
2365         break;
2366       default: ShouldNotReachHere();
2367     }
2368     __ bind(L_exit2);
2369     __ leave();
2370     __ ret(lr);
2371     return start;
2372   }
2373 
2374   void generate_arraycopy_stubs() {
2375     address entry;
2376     address entry_jbyte_arraycopy;
2377     address entry_jshort_arraycopy;
2378     address entry_jint_arraycopy;
2379     address entry_oop_arraycopy;
2380     address entry_jlong_arraycopy;
2381     address entry_checkcast_arraycopy;
2382 
2383     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2384     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2385 
2386     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2387 
2388     //*** jbyte
2389     // Always need aligned and unaligned versions
2390     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2391                                                                                   "jbyte_disjoint_arraycopy");
2392     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2393                                                                                   &entry_jbyte_arraycopy,
2394                                                                                   "jbyte_arraycopy");
2395     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2396                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2397     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2398                                                                                   "arrayof_jbyte_arraycopy");
2399 
2400     //*** jshort
2401     // Always need aligned and unaligned versions
2402     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2403                                                                                     "jshort_disjoint_arraycopy");
2404     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2405                                                                                     &entry_jshort_arraycopy,
2406                                                                                     "jshort_arraycopy");
2407     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2408                                                                                     "arrayof_jshort_disjoint_arraycopy");
2409     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2410                                                                                     "arrayof_jshort_arraycopy");
2411 
2412     //*** jint
2413     // Aligned versions
2414     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2415                                                                                 "arrayof_jint_disjoint_arraycopy");
2416     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2417                                                                                 "arrayof_jint_arraycopy");
2418     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2419     // entry_jint_arraycopy always points to the unaligned version
2420     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2421                                                                                 "jint_disjoint_arraycopy");
2422     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2423                                                                                 &entry_jint_arraycopy,
2424                                                                                 "jint_arraycopy");
2425 
2426     //*** jlong
2427     // It is always aligned
2428     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2429                                                                                   "arrayof_jlong_disjoint_arraycopy");
2430     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2431                                                                                   "arrayof_jlong_arraycopy");
2432     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2433     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2434 
2435     //*** oops
2436     {
2437       // With compressed oops we need unaligned versions; notice that
2438       // we overwrite entry_oop_arraycopy.
2439       bool aligned = !UseCompressedOops;
2440 
2441       StubRoutines::_arrayof_oop_disjoint_arraycopy
2442         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2443                                      /*dest_uninitialized*/false);
2444       StubRoutines::_arrayof_oop_arraycopy
2445         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2446                                      /*dest_uninitialized*/false);
2447       // Aligned versions without pre-barriers
2448       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2449         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2450                                      /*dest_uninitialized*/true);
2451       StubRoutines::_arrayof_oop_arraycopy_uninit
2452         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2453                                      /*dest_uninitialized*/true);
2454     }
2455 
2456     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2457     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2458     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2459     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2460 
2461     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2462     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2463                                                                         /*dest_uninitialized*/true);
2464 
2465     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2466                                                               entry_jbyte_arraycopy,
2467                                                               entry_jshort_arraycopy,
2468                                                               entry_jint_arraycopy,
2469                                                               entry_jlong_arraycopy);
2470 
2471     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2472                                                                entry_jbyte_arraycopy,
2473                                                                entry_jshort_arraycopy,
2474                                                                entry_jint_arraycopy,
2475                                                                entry_oop_arraycopy,
2476                                                                entry_jlong_arraycopy,
2477                                                                entry_checkcast_arraycopy);
2478 
2479     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2480     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2481     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2482     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2483     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2484     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2485   }
2486 
2487   void generate_math_stubs() { Unimplemented(); }
2488 
2489   // Arguments:
2490   //
2491   // Inputs:
2492   //   c_rarg0   - source byte array address
2493   //   c_rarg1   - destination byte array address
2494   //   c_rarg2   - K (key) in little endian int array
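       //               (expanded key schedule: 44, 52 or 60 ints for AES-128,
       //                AES-192 or AES-256; the keylen compares against 44 and
       //                52 below select the number of rounds)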
2495   //
2496   address generate_aescrypt_encryptBlock() {
2497     __ align(CodeEntryAlignment);
2498     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2499 
2500     Label L_doLast;
2501 
2502     const Register from        = c_rarg0;  // source array address
2503     const Register to          = c_rarg1;  // destination array address
2504     const Register key         = c_rarg2;  // key array address
2505     const Register keylen      = rscratch1;
2506 
2507     address start = __ pc();
2508     __ enter();
2509 
2510     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2511 
2512     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2513 
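         // Each 128-bit round key is byte-reversed within its 32-bit words
         // (rev32), presumably to turn the little-endian int key schedule
         // into the byte order consumed by the AESE/AESMC instructions.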
2514     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2515     __ rev32(v1, __ T16B, v1);
2516     __ rev32(v2, __ T16B, v2);
2517     __ rev32(v3, __ T16B, v3);
2518     __ rev32(v4, __ T16B, v4);
2519     __ aese(v0, v1);
2520     __ aesmc(v0, v0);
2521     __ aese(v0, v2);
2522     __ aesmc(v0, v0);
2523     __ aese(v0, v3);
2524     __ aesmc(v0, v0);
2525     __ aese(v0, v4);
2526     __ aesmc(v0, v0);
2527 
2528     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2529     __ rev32(v1, __ T16B, v1);
2530     __ rev32(v2, __ T16B, v2);
2531     __ rev32(v3, __ T16B, v3);
2532     __ rev32(v4, __ T16B, v4);
2533     __ aese(v0, v1);
2534     __ aesmc(v0, v0);
2535     __ aese(v0, v2);
2536     __ aesmc(v0, v0);
2537     __ aese(v0, v3);
2538     __ aesmc(v0, v0);
2539     __ aese(v0, v4);
2540     __ aesmc(v0, v0);
2541 
2542     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2543     __ rev32(v1, __ T16B, v1);
2544     __ rev32(v2, __ T16B, v2);
2545 
2546     __ cmpw(keylen, 44);
2547     __ br(Assembler::EQ, L_doLast);
2548 
2549     __ aese(v0, v1);
2550     __ aesmc(v0, v0);
2551     __ aese(v0, v2);
2552     __ aesmc(v0, v0);
2553 
2554     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2555     __ rev32(v1, __ T16B, v1);
2556     __ rev32(v2, __ T16B, v2);
2557 
2558     __ cmpw(keylen, 52);
2559     __ br(Assembler::EQ, L_doLast);
2560 
2561     __ aese(v0, v1);
2562     __ aesmc(v0, v0);
2563     __ aese(v0, v2);
2564     __ aesmc(v0, v0);
2565 
2566     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2567     __ rev32(v1, __ T16B, v1);
2568     __ rev32(v2, __ T16B, v2);
2569 
2570     __ BIND(L_doLast);
2571 
2572     __ aese(v0, v1);
2573     __ aesmc(v0, v0);
2574     __ aese(v0, v2);
2575 
2576     __ ld1(v1, __ T16B, key);
2577     __ rev32(v1, __ T16B, v1);
2578     __ eor(v0, __ T16B, v0, v1);
2579 
2580     __ st1(v0, __ T16B, to);
2581 
2582     __ mov(r0, 0);
2583 
2584     __ leave();
2585     __ ret(lr);
2586 
2587     return start;
2588   }
2589 
2590   // Arguments:
2591   //
2592   // Inputs:
2593   //   c_rarg0   - source byte array address
2594   //   c_rarg1   - destination byte array address
2595   //   c_rarg2   - K (key) in little endian int array
2596   //
2597   address generate_aescrypt_decryptBlock() {
2598     assert(UseAES, "need AES cryptographic extension support");
2599     __ align(CodeEntryAlignment);
2600     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2601     Label L_doLast;
2602 
2603     const Register from        = c_rarg0;  // source array address
2604     const Register to          = c_rarg1;  // destination array address
2605     const Register key         = c_rarg2;  // key array address
2606     const Register keylen      = rscratch1;
2607 
2608     address start = __ pc();
2609     __ enter(); // required for proper stackwalking of RuntimeStub frame
2610 
2611     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2612 
2613     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2614 
2615     __ ld1(v5, __ T16B, __ post(key, 16));
2616     __ rev32(v5, __ T16B, v5);
2617 
2618     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2619     __ rev32(v1, __ T16B, v1);
2620     __ rev32(v2, __ T16B, v2);
2621     __ rev32(v3, __ T16B, v3);
2622     __ rev32(v4, __ T16B, v4);
2623     __ aesd(v0, v1);
2624     __ aesimc(v0, v0);
2625     __ aesd(v0, v2);
2626     __ aesimc(v0, v0);
2627     __ aesd(v0, v3);
2628     __ aesimc(v0, v0);
2629     __ aesd(v0, v4);
2630     __ aesimc(v0, v0);
2631 
2632     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2633     __ rev32(v1, __ T16B, v1);
2634     __ rev32(v2, __ T16B, v2);
2635     __ rev32(v3, __ T16B, v3);
2636     __ rev32(v4, __ T16B, v4);
2637     __ aesd(v0, v1);
2638     __ aesimc(v0, v0);
2639     __ aesd(v0, v2);
2640     __ aesimc(v0, v0);
2641     __ aesd(v0, v3);
2642     __ aesimc(v0, v0);
2643     __ aesd(v0, v4);
2644     __ aesimc(v0, v0);
2645 
2646     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2647     __ rev32(v1, __ T16B, v1);
2648     __ rev32(v2, __ T16B, v2);
2649 
2650     __ cmpw(keylen, 44);
2651     __ br(Assembler::EQ, L_doLast);
2652 
2653     __ aesd(v0, v1);
2654     __ aesimc(v0, v0);
2655     __ aesd(v0, v2);
2656     __ aesimc(v0, v0);
2657 
2658     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2659     __ rev32(v1, __ T16B, v1);
2660     __ rev32(v2, __ T16B, v2);
2661 
2662     __ cmpw(keylen, 52);
2663     __ br(Assembler::EQ, L_doLast);
2664 
2665     __ aesd(v0, v1);
2666     __ aesimc(v0, v0);
2667     __ aesd(v0, v2);
2668     __ aesimc(v0, v0);
2669 
2670     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2671     __ rev32(v1, __ T16B, v1);
2672     __ rev32(v2, __ T16B, v2);
2673 
2674     __ BIND(L_doLast);
2675 
2676     __ aesd(v0, v1);
2677     __ aesimc(v0, v0);
2678     __ aesd(v0, v2);
2679 
2680     __ eor(v0, __ T16B, v0, v5);
2681 
2682     __ st1(v0, __ T16B, to);
2683 
2684     __ mov(r0, 0);
2685 
2686     __ leave();
2687     __ ret(lr);
2688 
2689     return start;
2690   }
2691 
2692   // Arguments:
2693   //
2694   // Inputs:
2695   //   c_rarg0   - source byte array address
2696   //   c_rarg1   - destination byte array address
2697   //   c_rarg2   - K (key) in little endian int array
2698   //   c_rarg3   - r vector byte array address
2699   //   c_rarg4   - input length
2700   //
2701   // Output:
2702   //   r0        - input length
2703   //
2704   address generate_cipherBlockChaining_encryptAESCrypt() {
2705     assert(UseAES, "need AES cryptographic extension support");
2706     __ align(CodeEntryAlignment);
2707     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2708 
2709     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2710 
2711     const Register from        = c_rarg0;  // source array address
2712     const Register to          = c_rarg1;  // destination array address
2713     const Register key         = c_rarg2;  // key array address
2714     const Register rvec        = c_rarg3;  // r vector byte array address; initialized from the IV
2715                                            // and left holding the last cipher block on exit
2716     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2717     const Register keylen      = rscratch1;
2718 
2719     address start = __ pc();
2720 
2721       __ enter();
2722 
2723       __ movw(rscratch2, len_reg);
2724 
2725       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2726 
2727       __ ld1(v0, __ T16B, rvec);
2728 
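           // Load the round keys, skipping the leading rounds for shorter keys:
           // keylen 60 (AES-256) falls through and also loads v17..v18,
           // keylen 52 (AES-192) enters at L_loadkeys_52,
           // keylen 44 (AES-128) enters at L_loadkeys_44.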
2729       __ cmpw(keylen, 52);
2730       __ br(Assembler::CC, L_loadkeys_44);
2731       __ br(Assembler::EQ, L_loadkeys_52);
2732 
2733       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2734       __ rev32(v17, __ T16B, v17);
2735       __ rev32(v18, __ T16B, v18);
2736     __ BIND(L_loadkeys_52);
2737       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2738       __ rev32(v19, __ T16B, v19);
2739       __ rev32(v20, __ T16B, v20);
2740     __ BIND(L_loadkeys_44);
2741       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2742       __ rev32(v21, __ T16B, v21);
2743       __ rev32(v22, __ T16B, v22);
2744       __ rev32(v23, __ T16B, v23);
2745       __ rev32(v24, __ T16B, v24);
2746       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2747       __ rev32(v25, __ T16B, v25);
2748       __ rev32(v26, __ T16B, v26);
2749       __ rev32(v27, __ T16B, v27);
2750       __ rev32(v28, __ T16B, v28);
2751       __ ld1(v29, v30, v31, __ T16B, key);
2752       __ rev32(v29, __ T16B, v29);
2753       __ rev32(v30, __ T16B, v30);
2754       __ rev32(v31, __ T16B, v31);
2755 
2756     __ BIND(L_aes_loop);
2757       __ ld1(v1, __ T16B, __ post(from, 16));
2758       __ eor(v0, __ T16B, v0, v1);
2759 
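           // The condition flags still reflect the keylen compare done before
           // the loop: nothing in the loop body (loads, vector ops, subw,
           // cbnzw) modifies NZCV, so CC/EQ still select the round count.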
2760       __ br(Assembler::CC, L_rounds_44);
2761       __ br(Assembler::EQ, L_rounds_52);
2762 
2763       __ aese(v0, v17); __ aesmc(v0, v0);
2764       __ aese(v0, v18); __ aesmc(v0, v0);
2765     __ BIND(L_rounds_52);
2766       __ aese(v0, v19); __ aesmc(v0, v0);
2767       __ aese(v0, v20); __ aesmc(v0, v0);
2768     __ BIND(L_rounds_44);
2769       __ aese(v0, v21); __ aesmc(v0, v0);
2770       __ aese(v0, v22); __ aesmc(v0, v0);
2771       __ aese(v0, v23); __ aesmc(v0, v0);
2772       __ aese(v0, v24); __ aesmc(v0, v0);
2773       __ aese(v0, v25); __ aesmc(v0, v0);
2774       __ aese(v0, v26); __ aesmc(v0, v0);
2775       __ aese(v0, v27); __ aesmc(v0, v0);
2776       __ aese(v0, v28); __ aesmc(v0, v0);
2777       __ aese(v0, v29); __ aesmc(v0, v0);
2778       __ aese(v0, v30);
2779       __ eor(v0, __ T16B, v0, v31);
2780 
2781       __ st1(v0, __ T16B, __ post(to, 16));
2782 
2783       __ subw(len_reg, len_reg, 16);
2784       __ cbnzw(len_reg, L_aes_loop);
2785 
2786       __ st1(v0, __ T16B, rvec);
2787 
2788       __ mov(r0, rscratch2);
2789 
2790       __ leave();
2791       __ ret(lr);
2792 
2793       return start;
2794   }
2795 
2796   // Arguments:
2797   //
2798   // Inputs:
2799   //   c_rarg0   - source byte array address
2800   //   c_rarg1   - destination byte array address
2801   //   c_rarg2   - K (key) in little endian int array
2802   //   c_rarg3   - r vector byte array address
2803   //   c_rarg4   - input length
2804   //
2805   // Output:
2806   //   r0        - input length
2807   //
2808   address generate_cipherBlockChaining_decryptAESCrypt() {
2809     assert(UseAES, "need AES instructions");
2810     __ align(CodeEntryAlignment);
2811     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2812 
2813     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2814 
2815     const Register from        = c_rarg0;  // source array address
2816     const Register to          = c_rarg1;  // destination array address
2817     const Register key         = c_rarg2;  // key array address
2818     const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector array address,
2819                                            // and left holding the last ciphertext block (the new chaining value)
2820     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2821     const Register keylen      = rscratch1;
2822 
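    // For decryption the chaining runs the other way: each plaintext block is
    // D(ciphertext[i]) ^ ciphertext[i-1], with rvec supplying ciphertext[-1].
    // That is why the loop below keeps an untouched copy of each input block
    // (v1, then v2) to XOR in and to store back to rvec as the next chaining
    // value. Sketch, with D a block-decrypt primitive (illustrative only):
    //
    //   for (int off = 0; off < len; off += 16) {
    //     uint8_t blk[16];
    //     D(key, from + off, blk);                                  // one AES block decryption
    //     for (int i = 0; i < 16; i++) to[off + i] = blk[i] ^ rvec[i];
    //     memcpy(rvec, from + off, 16);                             // ciphertext chains forward
    //   }
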
2823     address start = __ pc();
2824 
2825       __ enter();
2826 
2827       __ movw(rscratch2, len_reg);
2828 
2829       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2830 
2831       __ ld1(v2, __ T16B, rvec);
2832 
2833       __ ld1(v31, __ T16B, __ post(key, 16));
2834       __ rev32(v31, __ T16B, v31);
2835 
2836       __ cmpw(keylen, 52);
2837       __ br(Assembler::CC, L_loadkeys_44);
2838       __ br(Assembler::EQ, L_loadkeys_52);
2839 
2840       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2841       __ rev32(v17, __ T16B, v17);
2842       __ rev32(v18, __ T16B, v18);
2843     __ BIND(L_loadkeys_52);
2844       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2845       __ rev32(v19, __ T16B, v19);
2846       __ rev32(v20, __ T16B, v20);
2847     __ BIND(L_loadkeys_44);
2848       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2849       __ rev32(v21, __ T16B, v21);
2850       __ rev32(v22, __ T16B, v22);
2851       __ rev32(v23, __ T16B, v23);
2852       __ rev32(v24, __ T16B, v24);
2853       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2854       __ rev32(v25, __ T16B, v25);
2855       __ rev32(v26, __ T16B, v26);
2856       __ rev32(v27, __ T16B, v27);
2857       __ rev32(v28, __ T16B, v28);
2858       __ ld1(v29, v30, __ T16B, key);
2859       __ rev32(v29, __ T16B, v29);
2860       __ rev32(v30, __ T16B, v30);
2861 
2862     __ BIND(L_aes_loop);
2863       __ ld1(v0, __ T16B, __ post(from, 16));
2864       __ orr(v1, __ T16B, v0, v0);
2865 
2866       __ br(Assembler::CC, L_rounds_44);
2867       __ br(Assembler::EQ, L_rounds_52);
2868 
2869       __ aesd(v0, v17); __ aesimc(v0, v0);
2870       __ aesd(v0, v18); __ aesimc(v0, v0);
2871     __ BIND(L_rounds_52);
2872       __ aesd(v0, v19); __ aesimc(v0, v0);
2873       __ aesd(v0, v20); __ aesimc(v0, v0);
2874     __ BIND(L_rounds_44);
2875       __ aesd(v0, v21); __ aesimc(v0, v0);
2876       __ aesd(v0, v22); __ aesimc(v0, v0);
2877       __ aesd(v0, v23); __ aesimc(v0, v0);
2878       __ aesd(v0, v24); __ aesimc(v0, v0);
2879       __ aesd(v0, v25); __ aesimc(v0, v0);
2880       __ aesd(v0, v26); __ aesimc(v0, v0);
2881       __ aesd(v0, v27); __ aesimc(v0, v0);
2882       __ aesd(v0, v28); __ aesimc(v0, v0);
2883       __ aesd(v0, v29); __ aesimc(v0, v0);
2884       __ aesd(v0, v30);
2885       __ eor(v0, __ T16B, v0, v31);
2886       __ eor(v0, __ T16B, v0, v2);
2887 
2888       __ st1(v0, __ T16B, __ post(to, 16));
2889       __ orr(v2, __ T16B, v1, v1);
2890 
2891       __ subw(len_reg, len_reg, 16);
2892       __ cbnzw(len_reg, L_aes_loop);
2893 
2894       __ st1(v2, __ T16B, rvec);
2895 
2896       __ mov(r0, rscratch2);
2897 
2898       __ leave();
2899       __ ret(lr);
2900 
2901     return start;
2902   }
2903 
2904   // Arguments:
2905   //
2906   // Inputs:
2907   //   c_rarg0   - byte[]  source+offset
2908   //   c_rarg1   - int[]   SHA.state
2909   //   c_rarg2   - int     offset
2910   //   c_rarg3   - int     limit
2911   //
2912   address generate_sha1_implCompress(bool multi_block, const char *name) {
2913     __ align(CodeEntryAlignment);
2914     StubCodeMark mark(this, "StubRoutines", name);
2915     address start = __ pc();
2916 
2917     Register buf   = c_rarg0;
2918     Register state = c_rarg1;
2919     Register ofs   = c_rarg2;
2920     Register limit = c_rarg3;
2921 
2922     Label keys;
2923     Label sha1_loop;
2924 
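    // Each iteration of the round loop below performs 4 of SHA-1's 80 rounds
    // with the crypto extensions: v0..v3 are preloaded with the four 20-round
    // constants K0..K3 (the words emitted at the `keys` label), and the
    // sha1c/sha1p/sha1m selection picks the choose/parity/majority round
    // function for the corresponding 20-round group.
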
2925     // load the keys into v0..v3
2926     __ adr(rscratch1, keys);
2927     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2928     // load the 5-word state into v6, v7
2929     __ ldrq(v6, Address(state, 0));
2930     __ ldrs(v7, Address(state, 16));
2931 
2932 
2933     __ BIND(sha1_loop);
2934     // load 64 bytes of data into v16..v19
2935     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2936     __ rev32(v16, __ T16B, v16);
2937     __ rev32(v17, __ T16B, v17);
2938     __ rev32(v18, __ T16B, v18);
2939     __ rev32(v19, __ T16B, v19);
2940 
2941     // do the sha1
2942     __ addv(v4, __ T4S, v16, v0);
2943     __ orr(v20, __ T16B, v6, v6);
2944 
2945     FloatRegister d0 = v16;
2946     FloatRegister d1 = v17;
2947     FloatRegister d2 = v18;
2948     FloatRegister d3 = v19;
2949 
2950     for (int round = 0; round < 20; round++) {
2951       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2952       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2953       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2954       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2955       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2956 
2957       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2958       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2959       __ sha1h(tmp2, __ T4S, v20);
2960       if (round < 5)
2961         __ sha1c(v20, __ T4S, tmp3, tmp4);
2962       else if (round < 10 || round >= 15)
2963         __ sha1p(v20, __ T4S, tmp3, tmp4);
2964       else
2965         __ sha1m(v20, __ T4S, tmp3, tmp4);
2966       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2967 
2968       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2969     }
2970 
2971     __ addv(v7, __ T2S, v7, v21);
2972     __ addv(v6, __ T4S, v6, v20);
2973 
2974     if (multi_block) {
2975       __ add(ofs, ofs, 64);
2976       __ cmp(ofs, limit);
2977       __ br(Assembler::LE, sha1_loop);
2978       __ mov(c_rarg0, ofs); // return ofs
2979     }
2980 
2981     __ strq(v6, Address(state, 0));
2982     __ strs(v7, Address(state, 16));
2983 
2984     __ ret(lr);
2985 
2986     __ bind(keys);
2987     __ emit_int32(0x5a827999);
2988     __ emit_int32(0x6ed9eba1);
2989     __ emit_int32(0x8f1bbcdc);
2990     __ emit_int32(0xca62c1d6);
2991 
2992     return start;
2993   }
2994 
2995 
2996   // Arguments:
2997   //
2998   // Inputs:
2999   //   c_rarg0   - byte[]  source+offset
3000   //   c_rarg1   - int[]   SHA.state
3001   //   c_rarg2   - int     offset
3002   //   c_rarg3   - int     limit
3003   //
3004   address generate_sha256_implCompress(bool multi_block, const char *name) {
3005     static const uint32_t round_consts[64] = {
3006       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3007       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3008       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3009       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3010       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3011       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3012       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3013       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3014       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3015       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3016       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3017       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3018       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3019       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3020       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3021       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3022     };
3023     __ align(CodeEntryAlignment);
3024     StubCodeMark mark(this, "StubRoutines", name);
3025     address start = __ pc();
3026 
3027     Register buf   = c_rarg0;
3028     Register state = c_rarg1;
3029     Register ofs   = c_rarg2;
3030     Register limit = c_rarg3;
3031 
3032     Label sha256_loop;
3033 
3034     __ stpd(v8, v9, __ pre(sp, -32));
3035     __ stpd(v10, v11, Address(sp, 16));
3036 
3037 // dga == v0
3038 // dgb == v1
3039 // dg0 == v2
3040 // dg1 == v3
3041 // dg2 == v4
3042 // t0 == v6
3043 // t1 == v7
3044 
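    // Each iteration of the round loop below performs 4 of SHA-256's 64 rounds
    // via sha256h/sha256h2, consuming one of the 16 round-constant vectors
    // preloaded into v16..v31 (round_consts holds the standard SHA-256 K
    // values), while sha256su0/sha256su1 advance the message schedule.
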
3045     // load 16 keys to v16..v31
3046     __ lea(rscratch1, ExternalAddress((address)round_consts));
3047     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3048     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3049     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3050     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3051 
3052     // load the 8-word (256-bit) state
3053     __ ldpq(v0, v1, state);
3054 
3055     __ BIND(sha256_loop);
3056     // load 64 bytes of data into v8..v11
3057     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3058     __ rev32(v8, __ T16B, v8);
3059     __ rev32(v9, __ T16B, v9);
3060     __ rev32(v10, __ T16B, v10);
3061     __ rev32(v11, __ T16B, v11);
3062 
3063     __ addv(v6, __ T4S, v8, v16);
3064     __ orr(v2, __ T16B, v0, v0);
3065     __ orr(v3, __ T16B, v1, v1);
3066 
3067     FloatRegister d0 = v8;
3068     FloatRegister d1 = v9;
3069     FloatRegister d2 = v10;
3070     FloatRegister d3 = v11;
3071 
3072 
3073     for (int round = 0; round < 16; round++) {
3074       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3075       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3076       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3077       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3078 
3079       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3080        __ orr(v4, __ T16B, v2, v2);
3081       if (round < 15)
3082         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3083       __ sha256h(v2, __ T4S, v3, tmp2);
3084       __ sha256h2(v3, __ T4S, v4, tmp2);
3085       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3086 
3087       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3088     }
3089 
3090     __ addv(v0, __ T4S, v0, v2);
3091     __ addv(v1, __ T4S, v1, v3);
3092 
3093     if (multi_block) {
3094       __ add(ofs, ofs, 64);
3095       __ cmp(ofs, limit);
3096       __ br(Assembler::LE, sha256_loop);
3097       __ mov(c_rarg0, ofs); // return ofs
3098     }
3099 
3100     __ ldpd(v10, v11, Address(sp, 16));
3101     __ ldpd(v8, v9, __ post(sp, 32));
3102 
3103     __ stpq(v0, v1, state);
3104 
3105     __ ret(lr);
3106 
3107     return start;
3108   }
3109 
3110 #ifndef BUILTIN_SIM
3111   // Safefetch stubs.
3112   void generate_safefetch(const char* name, int size, address* entry,
3113                           address* fault_pc, address* continuation_pc) {
3114     // safefetch signatures:
3115     //   int      SafeFetch32(int*      adr, int      errValue);
3116     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3117     //
3118     // arguments:
3119     //   c_rarg0 = adr
3120     //   c_rarg1 = errValue
3121     //
3122     // result:
3123     //   r0       = *adr or errValue
3124 
3125     StubCodeMark mark(this, "StubRoutines", name);
3126 
3127     // Entry point, pc or function descriptor.
3128     *entry = __ pc();
3129 
3130     // Load *adr into c_rarg1, may fault.
3131     *fault_pc = __ pc();
3132     switch (size) {
3133       case 4:
3134         // int32_t
3135         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3136         break;
3137       case 8:
3138         // int64_t
3139         __ ldr(c_rarg1, Address(c_rarg0, 0));
3140         break;
3141       default:
3142         ShouldNotReachHere();
3143     }
3144 
3145     // return errValue or *adr
3146     *continuation_pc = __ pc();
3147     __ mov(r0, c_rarg1);
3148     __ ret(lr);
3149   }
3150 #endif
3151 
3152   /**
3153    *  Arguments:
3154    *
3155    * Inputs:
3156    *   c_rarg0   - int crc
3157    *   c_rarg1   - byte* buf
3158    *   c_rarg2   - int length
3159    *
3160    * Output:
3161    *       r0    - int crc result
3162    */
3163   address generate_updateBytesCRC32() {
3164     assert(UseCRC32Intrinsics, "what are we doing here?");
3165 
3166     __ align(CodeEntryAlignment);
3167     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3168 
3169     address start = __ pc();
3170 
3171     const Register crc   = c_rarg0;  // crc
3172     const Register buf   = c_rarg1;  // source java byte array address
3173     const Register len   = c_rarg2;  // length
3174     const Register table0 = c_rarg3; // crc_table address
3175     const Register table1 = c_rarg4;
3176     const Register table2 = c_rarg5;
3177     const Register table3 = c_rarg6;
3178     const Register tmp3 = c_rarg7;
3179 
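    // java.util.zip.CRC32 is the standard (zlib) CRC-32, i.e. the reflected
    // polynomial 0xEDB88320. A bitwise reference of the value kernel_crc32
    // produces, for illustration only (kernel_crc32 itself uses tables and/or
    // the hardware CRC32 instructions):
    //
    //   uint32_t c = ~crc;
    //   for (int i = 0; i < len; i++) {
    //     c ^= (uint8_t)buf[i];
    //     for (int k = 0; k < 8; k++)
    //       c = (c >> 1) ^ (0xEDB88320u & (0u - (c & 1)));
    //   }
    //   return ~c;
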
3180     BLOCK_COMMENT("Entry:");
3181     __ enter(); // required for proper stackwalking of RuntimeStub frame
3182 
3183     __ kernel_crc32(crc, buf, len,
3184               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3185 
3186     __ leave(); // required for proper stackwalking of RuntimeStub frame
3187     __ ret(lr);
3188 
3189     return start;
3190   }
3191 
3192   /**
3193    *  Arguments:
3194    *
3195    * Inputs:
3196    *   c_rarg0   - int crc
3197    *   c_rarg1   - byte* buf
3198    *   c_rarg2   - int length
3199    *   c_rarg3   - int* table
3200    *
3201    * Output:
3202    *       r0   - int crc result
3203    */
3204   address generate_updateBytesCRC32C() {
3205     assert(UseCRC32CIntrinsics, "what are we doing here?");
3206 
3207     __ align(CodeEntryAlignment);
3208     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3209 
3210     address start = __ pc();
3211 
3212     const Register crc   = c_rarg0;  // crc
3213     const Register buf   = c_rarg1;  // source java byte array address
3214     const Register len   = c_rarg2;  // length
3215     const Register table0 = c_rarg3; // crc_table address
3216     const Register table1 = c_rarg4;
3217     const Register table2 = c_rarg5;
3218     const Register table3 = c_rarg6;
3219     const Register tmp3 = c_rarg7;
3220 
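    // Same shape as updateBytesCRC32 above; CRC-32C differs only in the
    // generator polynomial (Castagnoli, reflected form 0x82F63B78), which
    // kernel_crc32c computes with the ARMv8 CRC32C instructions.
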
3221     BLOCK_COMMENT("Entry:");
3222     __ enter(); // required for proper stackwalking of RuntimeStub frame
3223 
3224     __ kernel_crc32c(crc, buf, len,
3225               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3226 
3227     __ leave(); // required for proper stackwalking of RuntimeStub frame
3228     __ ret(lr);
3229 
3230     return start;
3231   }
3232 
3233   /**
3234    *  Arguments:
3235    *
3236    *  Inputs:
3237    *   c_rarg0   - int   adler
3238    *   c_rarg1   - byte* buff
3239    *   c_rarg2   - int   len
3240    *
3241    * Output:
3242    *   c_rarg0   - int adler result
3243    */
3244   address generate_updateBytesAdler32() {
3245     __ align(CodeEntryAlignment);
3246     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3247     address start = __ pc();
3248 
3249     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3250 
3251     // Aliases
3252     Register adler  = c_rarg0;
3253     Register s1     = c_rarg0;
3254     Register s2     = c_rarg3;
3255     Register buff   = c_rarg1;
3256     Register len    = c_rarg2;
3257     Register nmax  = r4;
3258     Register base = r5;
3259     Register count = r6;
3260     Register temp0 = rscratch1;
3261     Register temp1 = rscratch2;
3262     Register temp2 = r7;
3263 
3264     // Max number of bytes we can process before having to take the mod
3265     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3266     unsigned long BASE = 0xfff1;
3267     unsigned long NMAX = 0x15B0;
3268 
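    // Adler-32 keeps two sums modulo BASE: s1 is the running byte sum and s2
    // the running sum of s1. Scalar reference (illustrative only; the code
    // below unrolls this 16 bytes at a time and defers the mod):
    //
    //   for (int i = 0; i < len; i++) {
    //     s1 = (s1 + (uint8_t)buff[i]) % BASE;
    //     s2 = (s2 + s1)               % BASE;
    //   }
    //   return (s2 << 16) | s1;
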
3269     __ mov(base, BASE);
3270     __ mov(nmax, NMAX);
3271 
3272     // s1 is initialized to the lower 16 bits of adler
3273     // s2 is initialized to the upper 16 bits of adler
3274     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3275     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3276 
3277     // The pipelined loop needs at least 16 elements for one iteration.
3278     // It checks this itself, but it is cheaper to skip straight to the cleanup loop here.
3279     __ cmp(len, (u1)16);
3280     __ br(Assembler::HS, L_nmax);
3281     __ cbz(len, L_combine);
3282 
3283     __ bind(L_simple_by1_loop);
3284     __ ldrb(temp0, Address(__ post(buff, 1)));
3285     __ add(s1, s1, temp0);
3286     __ add(s2, s2, s1);
3287     __ subs(len, len, 1);
3288     __ br(Assembler::HI, L_simple_by1_loop);
3289 
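    // The "% BASE" blocks below avoid a division: since 2^16 mod 65521 == 15,
    //   x == (x >> 16) * 15 + (x & 0xffff)   (mod BASE)
    // which is computed as (x >> 16) * 16 - (x >> 16) (lsl #4, then sub) plus
    // the low 16 bits, followed by a conditional subtract of BASE.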
3290     // s1 = s1 % BASE
3291     __ subs(temp0, s1, base);
3292     __ csel(s1, temp0, s1, Assembler::HS);
3293 
3294     // s2 = s2 % BASE
3295     __ lsr(temp0, s2, 16);
3296     __ lsl(temp1, temp0, 4);
3297     __ sub(temp1, temp1, temp0);
3298     __ add(s2, temp1, s2, ext::uxth);
3299 
3300     __ subs(temp0, s2, base);
3301     __ csel(s2, temp0, s2, Assembler::HS);
3302 
3303     __ b(L_combine);
3304 
3305     __ bind(L_nmax);
3306     __ subs(len, len, nmax);
3307     __ sub(count, nmax, 16);
3308     __ br(Assembler::LO, L_by16);
3309 
3310     __ bind(L_nmax_loop);
3311 
3312     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3313 
3314     __ add(s1, s1, temp0, ext::uxtb);
3315     __ ubfx(temp2, temp0, 8, 8);
3316     __ add(s2, s2, s1);
3317     __ add(s1, s1, temp2);
3318     __ ubfx(temp2, temp0, 16, 8);
3319     __ add(s2, s2, s1);
3320     __ add(s1, s1, temp2);
3321     __ ubfx(temp2, temp0, 24, 8);
3322     __ add(s2, s2, s1);
3323     __ add(s1, s1, temp2);
3324     __ ubfx(temp2, temp0, 32, 8);
3325     __ add(s2, s2, s1);
3326     __ add(s1, s1, temp2);
3327     __ ubfx(temp2, temp0, 40, 8);
3328     __ add(s2, s2, s1);
3329     __ add(s1, s1, temp2);
3330     __ ubfx(temp2, temp0, 48, 8);
3331     __ add(s2, s2, s1);
3332     __ add(s1, s1, temp2);
3333     __ add(s2, s2, s1);
3334     __ add(s1, s1, temp0, Assembler::LSR, 56);
3335     __ add(s2, s2, s1);
3336 
3337     __ add(s1, s1, temp1, ext::uxtb);
3338     __ ubfx(temp2, temp1, 8, 8);
3339     __ add(s2, s2, s1);
3340     __ add(s1, s1, temp2);
3341     __ ubfx(temp2, temp1, 16, 8);
3342     __ add(s2, s2, s1);
3343     __ add(s1, s1, temp2);
3344     __ ubfx(temp2, temp1, 24, 8);
3345     __ add(s2, s2, s1);
3346     __ add(s1, s1, temp2);
3347     __ ubfx(temp2, temp1, 32, 8);
3348     __ add(s2, s2, s1);
3349     __ add(s1, s1, temp2);
3350     __ ubfx(temp2, temp1, 40, 8);
3351     __ add(s2, s2, s1);
3352     __ add(s1, s1, temp2);
3353     __ ubfx(temp2, temp1, 48, 8);
3354     __ add(s2, s2, s1);
3355     __ add(s1, s1, temp2);
3356     __ add(s2, s2, s1);
3357     __ add(s1, s1, temp1, Assembler::LSR, 56);
3358     __ add(s2, s2, s1);
3359 
3360     __ subs(count, count, 16);
3361     __ br(Assembler::HS, L_nmax_loop);
3362 
3363     // s1 = s1 % BASE
3364     __ lsr(temp0, s1, 16);
3365     __ lsl(temp1, temp0, 4);
3366     __ sub(temp1, temp1, temp0);
3367     __ add(temp1, temp1, s1, ext::uxth);
3368 
3369     __ lsr(temp0, temp1, 16);
3370     __ lsl(s1, temp0, 4);
3371     __ sub(s1, s1, temp0);
3372     __ add(s1, s1, temp1, ext::uxth);
3373 
3374     __ subs(temp0, s1, base);
3375     __ csel(s1, temp0, s1, Assembler::HS);
3376 
3377     // s2 = s2 % BASE
3378     __ lsr(temp0, s2, 16);
3379     __ lsl(temp1, temp0, 4);
3380     __ sub(temp1, temp1, temp0);
3381     __ add(temp1, temp1, s2, ext::uxth);
3382 
3383     __ lsr(temp0, temp1, 16);
3384     __ lsl(s2, temp0, 4);
3385     __ sub(s2, s2, temp0);
3386     __ add(s2, s2, temp1, ext::uxth);
3387 
3388     __ subs(temp0, s2, base);
3389     __ csel(s2, temp0, s2, Assembler::HS);
3390 
3391     __ subs(len, len, nmax);
3392     __ sub(count, nmax, 16);
3393     __ br(Assembler::HS, L_nmax_loop);
3394 
3395     __ bind(L_by16);
3396     __ adds(len, len, count);
3397     __ br(Assembler::LO, L_by1);
3398 
3399     __ bind(L_by16_loop);
3400 
3401     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3402 
3403     __ add(s1, s1, temp0, ext::uxtb);
3404     __ ubfx(temp2, temp0, 8, 8);
3405     __ add(s2, s2, s1);
3406     __ add(s1, s1, temp2);
3407     __ ubfx(temp2, temp0, 16, 8);
3408     __ add(s2, s2, s1);
3409     __ add(s1, s1, temp2);
3410     __ ubfx(temp2, temp0, 24, 8);
3411     __ add(s2, s2, s1);
3412     __ add(s1, s1, temp2);
3413     __ ubfx(temp2, temp0, 32, 8);
3414     __ add(s2, s2, s1);
3415     __ add(s1, s1, temp2);
3416     __ ubfx(temp2, temp0, 40, 8);
3417     __ add(s2, s2, s1);
3418     __ add(s1, s1, temp2);
3419     __ ubfx(temp2, temp0, 48, 8);
3420     __ add(s2, s2, s1);
3421     __ add(s1, s1, temp2);
3422     __ add(s2, s2, s1);
3423     __ add(s1, s1, temp0, Assembler::LSR, 56);
3424     __ add(s2, s2, s1);
3425 
3426     __ add(s1, s1, temp1, ext::uxtb);
3427     __ ubfx(temp2, temp1, 8, 8);
3428     __ add(s2, s2, s1);
3429     __ add(s1, s1, temp2);
3430     __ ubfx(temp2, temp1, 16, 8);
3431     __ add(s2, s2, s1);
3432     __ add(s1, s1, temp2);
3433     __ ubfx(temp2, temp1, 24, 8);
3434     __ add(s2, s2, s1);
3435     __ add(s1, s1, temp2);
3436     __ ubfx(temp2, temp1, 32, 8);
3437     __ add(s2, s2, s1);
3438     __ add(s1, s1, temp2);
3439     __ ubfx(temp2, temp1, 40, 8);
3440     __ add(s2, s2, s1);
3441     __ add(s1, s1, temp2);
3442     __ ubfx(temp2, temp1, 48, 8);
3443     __ add(s2, s2, s1);
3444     __ add(s1, s1, temp2);
3445     __ add(s2, s2, s1);
3446     __ add(s1, s1, temp1, Assembler::LSR, 56);
3447     __ add(s2, s2, s1);
3448 
3449     __ subs(len, len, 16);
3450     __ br(Assembler::HS, L_by16_loop);
3451 
3452     __ bind(L_by1);
3453     __ adds(len, len, 15);
3454     __ br(Assembler::LO, L_do_mod);
3455 
3456     __ bind(L_by1_loop);
3457     __ ldrb(temp0, Address(__ post(buff, 1)));
3458     __ add(s1, temp0, s1);
3459     __ add(s2, s2, s1);
3460     __ subs(len, len, 1);
3461     __ br(Assembler::HS, L_by1_loop);
3462 
3463     __ bind(L_do_mod);
3464     // s1 = s1 % BASE
3465     __ lsr(temp0, s1, 16);
3466     __ lsl(temp1, temp0, 4);
3467     __ sub(temp1, temp1, temp0);
3468     __ add(temp1, temp1, s1, ext::uxth);
3469 
3470     __ lsr(temp0, temp1, 16);
3471     __ lsl(s1, temp0, 4);
3472     __ sub(s1, s1, temp0);
3473     __ add(s1, s1, temp1, ext::uxth);
3474 
3475     __ subs(temp0, s1, base);
3476     __ csel(s1, temp0, s1, Assembler::HS);
3477 
3478     // s2 = s2 % BASE
3479     __ lsr(temp0, s2, 16);
3480     __ lsl(temp1, temp0, 4);
3481     __ sub(temp1, temp1, temp0);
3482     __ add(temp1, temp1, s2, ext::uxth);
3483 
3484     __ lsr(temp0, temp1, 16);
3485     __ lsl(s2, temp0, 4);
3486     __ sub(s2, s2, temp0);
3487     __ add(s2, s2, temp1, ext::uxth);
3488 
3489     __ subs(temp0, s2, base);
3490     __ csel(s2, temp0, s2, Assembler::HS);
3491 
3492     // Combine lower bits and higher bits
3493     __ bind(L_combine);
3494     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3495 
3496     __ ret(lr);
3497 
3498     return start;
3499   }
3500 
3501   /**
3502    *  Arguments:
3503    *
3504    *  Input:
3505    *    c_rarg0   - x address
3506    *    c_rarg1   - x length
3507    *    c_rarg2   - y address
3508    *    c_rarg3   - y length
3509    *    c_rarg4   - z address
3510    *    c_rarg5   - z length
3511    */
3512   address generate_multiplyToLen() {
3513     __ align(CodeEntryAlignment);
3514     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3515 
3516     address start = __ pc();
3517     const Register x     = r0;
3518     const Register xlen  = r1;
3519     const Register y     = r2;
3520     const Register ylen  = r3;
3521     const Register z     = r4;
3522     const Register zlen  = r5;
3523 
3524     const Register tmp1  = r10;
3525     const Register tmp2  = r11;
3526     const Register tmp3  = r12;
3527     const Register tmp4  = r13;
3528     const Register tmp5  = r14;
3529     const Register tmp6  = r15;
3530     const Register tmp7  = r16;
3531 
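    // multiply_to_len computes the same product as the Java fallback in
    // java.math.BigInteger.multiplyToLen: schoolbook multiplication of two
    // big-endian int arrays into z. Reference sketch (illustrative only;
    // this sketch assumes z is zero-initialized):
    //
    //   for (int i = xlen - 1; i >= 0; i--) {
    //     uint64_t carry = 0;
    //     for (int j = ylen - 1, k = ylen + i; j >= 0; j--, k--) {
    //       uint64_t product = (uint64_t)(uint32_t)y[j] * (uint32_t)x[i]
    //                        + (uint32_t)z[k] + carry;
    //       z[k] = (int)product;
    //       carry = product >> 32;
    //     }
    //     z[i] = (int)carry;
    //   }
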
3532     BLOCK_COMMENT("Entry:");
3533     __ enter(); // required for proper stackwalking of RuntimeStub frame
3534     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3535     __ leave(); // required for proper stackwalking of RuntimeStub frame
3536     __ ret(lr);
3537 
3538     return start;
3539   }
3540 
3541   address generate_squareToLen() {
3542     // The squareToLen algorithm for sizes 1..127 described in the Java code is
3543     // faster than multiply_to_len on some CPUs and slower on others, but
3544     // multiply_to_len shows slightly better results overall.
3545     __ align(CodeEntryAlignment);
3546     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3547     address start = __ pc();
3548 
3549     const Register x     = r0;
3550     const Register xlen  = r1;
3551     const Register z     = r2;
3552     const Register zlen  = r3;
3553     const Register y     = r4; // == x
3554     const Register ylen  = r5; // == xlen
3555 
3556     const Register tmp1  = r10;
3557     const Register tmp2  = r11;
3558     const Register tmp3  = r12;
3559     const Register tmp4  = r13;
3560     const Register tmp5  = r14;
3561     const Register tmp6  = r15;
3562     const Register tmp7  = r16;
3563 
3564     RegSet spilled_regs = RegSet::of(y, ylen);
3565     BLOCK_COMMENT("Entry:");
3566     __ enter();
3567     __ push(spilled_regs, sp);
3568     __ mov(y, x);
3569     __ mov(ylen, xlen);
3570     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3571     __ pop(spilled_regs, sp);
3572     __ leave();
3573     __ ret(lr);
3574     return start;
3575   }
3576 
3577   address generate_mulAdd() {
3578     __ align(CodeEntryAlignment);
3579     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3580 
3581     address start = __ pc();
3582 
3583     const Register out     = r0;
3584     const Register in      = r1;
3585     const Register offset  = r2;
3586     const Register len     = r3;
3587     const Register k       = r4;
3588 
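    // mul_add corresponds to java.math.BigInteger.implMulAdd: it multiplies
    // len ints of `in` by the unsigned 32-bit scalar k, accumulates the result
    // into `out` at the given offset, and returns the final carry word (the
    // stub's int result, in r0). Per-word step (illustrative only):
    //
    //   product  = (uint64_t)in_word * k + out_word + carry;
    //   out_word = (uint32_t)product;
    //   carry    = product >> 32;
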
3589     BLOCK_COMMENT("Entry:");
3590     __ enter();
3591     __ mul_add(out, in, offset, len, k);
3592     __ leave();
3593     __ ret(lr);
3594 
3595     return start;
3596   }
3597 
3598   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3599                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3600                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3601     // Karatsuba multiplication performs a 128*128 -> 256-bit
3602     // multiplication in three 128-bit multiplications and a few
3603     // additions.
3604     //
3605     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3606     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3607     //
3608     // Inputs:
3609     //
3610     // A0 in a.d[0]     (subkey)
3611     // A1 in a.d[1]
3612     // (A1+A0) in a1_xor_a0.d[0]
3613     //
3614     // B0 in b.d[0]     (state)
3615     // B1 in b.d[1]
3616 
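    // All additions above are carry-less (XOR in GF(2)[x]), so the middle term
    // follows from the identity
    //   (A0+A1)(B0+B1) = A0*B0 + A0*B1 + A1*B0 + A1*B1
    // i.e.  A0*B1 + A1*B0 = E + C + D,
    // which the eor sequence below folds (together with the overlapping halves
    // of C and D) into tmp2 to form the middle 128 bits of the 256-bit product.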
3617     __ ext(tmp1, __ T16B, b, b, 0x08);
3618     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3619     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3620     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3621     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3622 
3623     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3624     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3625     __ eor(tmp2, __ T16B, tmp2, tmp4);
3626     __ eor(tmp2, __ T16B, tmp2, tmp3);
3627 
3628     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3629     __ ins(result_hi, __ D, tmp2, 0, 1);
3630     __ ins(result_lo, __ D, tmp2, 1, 0);
3631   }
3632 
3633   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3634                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3635     const FloatRegister t0 = result;
3636 
3637     // The GCM field polynomial f is z^128 + p(z), where p =
3638     // z^7+z^2+z+1.
3639     //
3640     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3641     //
3642     // so, given that the product we're reducing is
3643     //    a == lo + hi * z^128
3644     // substituting,
3645     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3646     //
3647     // we reduce by multiplying hi by p(z) and subtracting the result
3648     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3649     // bits we can do this with two 64-bit multiplications, lo*p and
3650     // hi*p.
3651 
3652     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3653     __ ext(t1, __ T16B, t0, z, 8);
3654     __ eor(hi, __ T16B, hi, t1);
3655     __ ext(t1, __ T16B, z, t0, 8);
3656     __ eor(lo, __ T16B, lo, t1);
3657     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3658     __ eor(result, __ T16B, lo, t0);
3659   }
3660 
3661   address generate_has_negatives(address &has_negatives_long) {
3662     const u1 large_loop_size = 64;
3663     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3664     int dcache_line = VM_Version::dcache_line_size();
3665 
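    // Semantically this stub answers "does any byte of ary1[0..len) have its
    // top bit set?", i.e. the scalar equivalent is (illustrative only):
    //
    //   for (int i = 0; i < len; i++)
    //     if (ary1[i] & 0x80) return true;
    //   return false;
    //
    // All paths below do this 8, 16 or 64 bytes at a time by OR-ing words
    // together and testing the result against UPPER_BIT_MASK.
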
3666     Register ary1 = r1, len = r2, result = r0;
3667 
3668     __ align(CodeEntryAlignment);
3669 
3670     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3671 
3672     address entry = __ pc();
3673 
3674     __ enter();
3675 
3676   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3677         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3678 
3679   __ cmp(len, (u1)15);
3680   __ br(Assembler::GT, LEN_OVER_15);
3681   // The only case in which execution falls into this code is when the pointer is
3682   // near the end of a memory page and we must avoid reading into the next page
3683   __ add(ary1, ary1, len);
3684   __ subs(len, len, 8);
3685   __ br(Assembler::GT, LEN_OVER_8);
3686   __ ldr(rscratch2, Address(ary1, -8));
3687   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3688   __ lsrv(rscratch2, rscratch2, rscratch1);
3689   __ tst(rscratch2, UPPER_BIT_MASK);
3690   __ cset(result, Assembler::NE);
3691   __ leave();
3692   __ ret(lr);
3693   __ bind(LEN_OVER_8);
3694   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3695   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3696   __ tst(rscratch2, UPPER_BIT_MASK);
3697   __ br(Assembler::NE, RET_TRUE_NO_POP);
3698   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3699   __ lsrv(rscratch1, rscratch1, rscratch2);
3700   __ tst(rscratch1, UPPER_BIT_MASK);
3701   __ cset(result, Assembler::NE);
3702   __ leave();
3703   __ ret(lr);
3704 
3705   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3706   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3707 
3708   has_negatives_long = __ pc(); // 2nd entry point
3709 
3710   __ enter();
3711 
3712   __ bind(LEN_OVER_15);
3713     __ push(spilled_regs, sp);
3714     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3715     __ cbz(rscratch2, ALIGNED);
3716     __ ldp(tmp6, tmp1, Address(ary1));
3717     __ mov(tmp5, 16);
3718     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3719     __ add(ary1, ary1, rscratch1);
3720     __ sub(len, len, rscratch1);
3721     __ orr(tmp6, tmp6, tmp1);
3722     __ tst(tmp6, UPPER_BIT_MASK);
3723     __ br(Assembler::NE, RET_TRUE);
3724 
3725   __ bind(ALIGNED);
3726     __ cmp(len, large_loop_size);
3727     __ br(Assembler::LT, CHECK_16);
3728     // Perform a 16-byte load as an early return in the pre-loop to handle the
3729     // case where an initially aligned large array has negative values in its
3730     // starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in the
3731     // worst case, which is slower. Cases with negative bytes further ahead are
3732     // barely affected; in fact they become faster due to the early loads, fewer
3733     // instructions and fewer branches in LARGE_LOOP.
3734     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3735     __ sub(len, len, 16);
3736     __ orr(tmp6, tmp6, tmp1);
3737     __ tst(tmp6, UPPER_BIT_MASK);
3738     __ br(Assembler::NE, RET_TRUE);
3739     __ cmp(len, large_loop_size);
3740     __ br(Assembler::LT, CHECK_16);
3741 
3742     if (SoftwarePrefetchHintDistance >= 0
3743         && SoftwarePrefetchHintDistance >= dcache_line) {
3744       // initial prefetch
3745       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3746     }
3747   __ bind(LARGE_LOOP);
3748     if (SoftwarePrefetchHintDistance >= 0) {
3749       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3750     }
3751     // Issue the load instructions first, since that can save a few CPU/MEM cycles.
3752     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one per ldp),
3753     // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which
3754     // executes fewer instructions and has fewer branches; the trade-off is that this
3755     // disables early return, so all 64 bytes are loaded and checked every time.
3756     __ ldp(tmp2, tmp3, Address(ary1));
3757     __ ldp(tmp4, tmp5, Address(ary1, 16));
3758     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3759     __ ldp(tmp6, tmp1, Address(ary1, 48));
3760     __ add(ary1, ary1, large_loop_size);
3761     __ sub(len, len, large_loop_size);
3762     __ orr(tmp2, tmp2, tmp3);
3763     __ orr(tmp4, tmp4, tmp5);
3764     __ orr(rscratch1, rscratch1, rscratch2);
3765     __ orr(tmp6, tmp6, tmp1);
3766     __ orr(tmp2, tmp2, tmp4);
3767     __ orr(rscratch1, rscratch1, tmp6);
3768     __ orr(tmp2, tmp2, rscratch1);
3769     __ tst(tmp2, UPPER_BIT_MASK);
3770     __ br(Assembler::NE, RET_TRUE);
3771     __ cmp(len, large_loop_size);
3772     __ br(Assembler::GE, LARGE_LOOP);
3773 
3774   __ bind(CHECK_16); // small 16-byte load pre-loop
3775     __ cmp(len, (u1)16);
3776     __ br(Assembler::LT, POST_LOOP16);
3777 
3778   __ bind(LOOP16); // small 16-byte load loop
3779     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3780     __ sub(len, len, 16);
3781     __ orr(tmp2, tmp2, tmp3);
3782     __ tst(tmp2, UPPER_BIT_MASK);
3783     __ br(Assembler::NE, RET_TRUE);
3784     __ cmp(len, (u1)16);
3785     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3786 
3787   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3788     __ cmp(len, (u1)8);
3789     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3790     __ ldr(tmp3, Address(__ post(ary1, 8)));
3791     __ sub(len, len, 8);
3792     __ tst(tmp3, UPPER_BIT_MASK);
3793     __ br(Assembler::NE, RET_TRUE);
3794 
3795   __ bind(POST_LOOP16_LOAD_TAIL);
3796     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3797     __ ldr(tmp1, Address(ary1));
3798     __ mov(tmp2, 64);
3799     __ sub(tmp4, tmp2, len, __ LSL, 3);
3800     __ lslv(tmp1, tmp1, tmp4);
3801     __ tst(tmp1, UPPER_BIT_MASK);
3802     __ br(Assembler::NE, RET_TRUE);
3803     // Fallthrough
3804 
3805   __ bind(RET_FALSE);
3806     __ pop(spilled_regs, sp);
3807     __ leave();
3808     __ mov(result, zr);
3809     __ ret(lr);
3810 
3811   __ bind(RET_TRUE);
3812     __ pop(spilled_regs, sp);
3813   __ bind(RET_TRUE_NO_POP);
3814     __ leave();
3815     __ mov(result, 1);
3816     __ ret(lr);
3817 
3818   __ bind(DONE);
3819     __ pop(spilled_regs, sp);
3820     __ leave();
3821     __ ret(lr);
3822     return entry;
3823   }
3824 
3825   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3826         bool usePrefetch, Label &NOT_EQUAL) {
3827     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3828         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3829         tmp7 = r12, tmp8 = r13;
3830     Label LOOP;
3831 
3832     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3833     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3834     __ bind(LOOP);
3835     if (usePrefetch) {
3836       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3837       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3838     }
3839     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3840     __ eor(tmp1, tmp1, tmp2);
3841     __ eor(tmp3, tmp3, tmp4);
3842     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3843     __ orr(tmp1, tmp1, tmp3);
3844     __ cbnz(tmp1, NOT_EQUAL);
3845     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3846     __ eor(tmp5, tmp5, tmp6);
3847     __ eor(tmp7, tmp7, tmp8);
3848     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3849     __ orr(tmp5, tmp5, tmp7);
3850     __ cbnz(tmp5, NOT_EQUAL);
3851     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3852     __ eor(tmp1, tmp1, tmp2);
3853     __ eor(tmp3, tmp3, tmp4);
3854     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3855     __ orr(tmp1, tmp1, tmp3);
3856     __ cbnz(tmp1, NOT_EQUAL);
3857     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3858     __ eor(tmp5, tmp5, tmp6);
3859     __ sub(cnt1, cnt1, 8 * wordSize);
3860     __ eor(tmp7, tmp7, tmp8);
3861     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3862     // tmp6 is not used. MacroAssembler::subs is used here (rather than
3863     // cmp) because subs allows an unlimited range of immediate operand.
3864     __ subs(tmp6, cnt1, loopThreshold);
3865     __ orr(tmp5, tmp5, tmp7);
3866     __ cbnz(tmp5, NOT_EQUAL);
3867     __ br(__ GE, LOOP);
3868     // post-loop
3869     __ eor(tmp1, tmp1, tmp2);
3870     __ eor(tmp3, tmp3, tmp4);
3871     __ orr(tmp1, tmp1, tmp3);
3872     __ sub(cnt1, cnt1, 2 * wordSize);
3873     __ cbnz(tmp1, NOT_EQUAL);
3874   }
3875 
3876   void generate_large_array_equals_loop_simd(int loopThreshold,
3877         bool usePrefetch, Label &NOT_EQUAL) {
3878     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3879         tmp2 = rscratch2;
3880     Label LOOP;
3881 
3882     __ bind(LOOP);
3883     if (usePrefetch) {
3884       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3885       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3886     }
3887     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3888     __ sub(cnt1, cnt1, 8 * wordSize);
3889     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3890     __ subs(tmp1, cnt1, loopThreshold);
3891     __ eor(v0, __ T16B, v0, v4);
3892     __ eor(v1, __ T16B, v1, v5);
3893     __ eor(v2, __ T16B, v2, v6);
3894     __ eor(v3, __ T16B, v3, v7);
3895     __ orr(v0, __ T16B, v0, v1);
3896     __ orr(v1, __ T16B, v2, v3);
3897     __ orr(v0, __ T16B, v0, v1);
3898     __ umov(tmp1, v0, __ D, 0);
3899     __ umov(tmp2, v0, __ D, 1);
3900     __ orr(tmp1, tmp1, tmp2);
3901     __ cbnz(tmp1, NOT_EQUAL);
3902     __ br(__ GE, LOOP);
3903   }
3904 
3905   // a1 = r1 - array1 address
3906   // a2 = r2 - array2 address
3907   // result = r0 - return value. Already contains "false"
3908   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
3909   // r3-r5 are reserved temporary registers
3910   address generate_large_array_equals() {
3911     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3912         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3913         tmp7 = r12, tmp8 = r13;
3914     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3915         SMALL_LOOP, POST_LOOP;
3916     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3917     // calculate if at least 32 prefetched bytes are used
3918     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3919     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3920     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3921     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3922         tmp5, tmp6, tmp7, tmp8);
3923 
3924     __ align(CodeEntryAlignment);
3925 
3926     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3927 
3928     address entry = __ pc();
3929     __ enter();
3930     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
3931     // also advance pointers to use post-increment instead of pre-increment
3932     __ add(a1, a1, wordSize);
3933     __ add(a2, a2, wordSize);
3934     if (AvoidUnalignedAccesses) {
3935       // Both implementations (SIMD/non-SIMD) use relatively large load
3936       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
3937       // time) on some CPUs when the address is not at least 16-byte aligned.
3938       // Arrays are currently 8-byte aligned, so we can do an extra 8-byte load
3939       // if needed to make at least the first address 16-byte aligned.
3940       Label ALIGNED16;
3941       __ tbz(a1, 3, ALIGNED16);
3942       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3943       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3944       __ sub(cnt1, cnt1, wordSize);
3945       __ eor(tmp1, tmp1, tmp2);
3946       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3947       __ bind(ALIGNED16);
3948     }
3949     if (UseSIMDForArrayEquals) {
3950       if (SoftwarePrefetchHintDistance >= 0) {
3951         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3952         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3953         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3954             /* prfm = */ true, NOT_EQUAL);
3955         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3956         __ br(__ LT, TAIL);
3957       }
3958       __ bind(NO_PREFETCH_LARGE_LOOP);
3959       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3960           /* prfm = */ false, NOT_EQUAL);
3961     } else {
3962       __ push(spilled_regs, sp);
3963       if (SoftwarePrefetchHintDistance >= 0) {
3964         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3965         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3966         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3967             /* prfm = */ true, NOT_EQUAL);
3968         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3969         __ br(__ LT, TAIL);
3970       }
3971       __ bind(NO_PREFETCH_LARGE_LOOP);
3972       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3973           /* prfm = */ false, NOT_EQUAL);
3974     }
3975     __ bind(TAIL);
3976       __ cbz(cnt1, EQUAL);
3977       __ subs(cnt1, cnt1, wordSize);
3978       __ br(__ LE, POST_LOOP);
3979     __ bind(SMALL_LOOP);
3980       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3981       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3982       __ subs(cnt1, cnt1, wordSize);
3983       __ eor(tmp1, tmp1, tmp2);
3984       __ cbnz(tmp1, NOT_EQUAL);
3985       __ br(__ GT, SMALL_LOOP);
3986     __ bind(POST_LOOP);
3987       __ ldr(tmp1, Address(a1, cnt1));
3988       __ ldr(tmp2, Address(a2, cnt1));
3989       __ eor(tmp1, tmp1, tmp2);
3990       __ cbnz(tmp1, NOT_EQUAL);
3991     __ bind(EQUAL);
3992       __ mov(result, true);
3993     __ bind(NOT_EQUAL);
3994       if (!UseSIMDForArrayEquals) {
3995         __ pop(spilled_regs, sp);
3996       }
3997     __ bind(NOT_EQUAL_NO_POP);
3998     __ leave();
3999     __ ret(lr);
4000     return entry;
4001   }
4002 
4003   address generate_dsin_dcos(bool isCos) {
4004     __ align(CodeEntryAlignment);
4005     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
4006     address start = __ pc();
4007     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
4008         (address)StubRoutines::aarch64::_two_over_pi,
4009         (address)StubRoutines::aarch64::_pio2,
4010         (address)StubRoutines::aarch64::_dsin_coef,
4011         (address)StubRoutines::aarch64::_dcos_coef);
4012     return start;
4013   }
4014 
4015   address generate_dlog() {
4016     __ align(CodeEntryAlignment);
4017     StubCodeMark mark(this, "StubRoutines", "dlog");
4018     address entry = __ pc();
4019     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
4020         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
4021     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
4022     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
4023         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
4024     return entry;
4025   }
4026 
4027   // code for comparing 16 bytes of strings with same encoding
4028   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
4029     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
4030     __ ldr(rscratch1, Address(__ post(str1, 8)));
4031     __ eor(rscratch2, tmp1, tmp2);
4032     __ ldr(cnt1, Address(__ post(str2, 8)));
4033     __ cbnz(rscratch2, DIFF1);
4034     __ ldr(tmp1, Address(__ post(str1, 8)));
4035     __ eor(rscratch2, rscratch1, cnt1);
4036     __ ldr(tmp2, Address(__ post(str2, 8)));
4037     __ cbnz(rscratch2, DIFF2);
4038   }
4039 
4040   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
4041   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4042       Label &DIFF2) {
4043     Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12;
4044     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
4045 
4046     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4047     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4048     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4049     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4050 
4051     __ fmovd(tmpL, vtmp3);
4052     __ eor(rscratch2, tmp3, tmpL);
4053     __ cbnz(rscratch2, DIFF2);
4054 
4055     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4056     __ umov(tmpL, vtmp3, __ D, 1);
4057     __ eor(rscratch2, tmpU, tmpL);
4058     __ cbnz(rscratch2, DIFF1);
4059 
4060     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4061     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4062     __ fmovd(tmpL, vtmp);
4063     __ eor(rscratch2, tmp3, tmpL);
4064     __ cbnz(rscratch2, DIFF2);
4065 
4066     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4067     __ umov(tmpL, vtmp, __ D, 1);
4068     __ eor(rscratch2, tmpU, tmpL);
4069     __ cbnz(rscratch2, DIFF1);
4070   }
4071 
4072   // r0  = result
4073   // r1  = str1
4074   // r2  = cnt1
4075   // r3  = str2
4076   // r4  = cnt2
4077   // r10 = tmp1
4078   // r11 = tmp2
4079   address generate_compare_long_string_different_encoding(bool isLU) {
4080     __ align(CodeEntryAlignment);
4081     StubCodeMark mark(this, "StubRoutines", isLU
4082         ? "compare_long_string_different_encoding LU"
4083         : "compare_long_string_different_encoding UL");
4084     address entry = __ pc();
4085     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4086         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, SMALL_LOOP_ENTER,
4087         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4088     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4089         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4090     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4091     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4092 
4093     int prefetchLoopExitCondition = MAX(32, SoftwarePrefetchHintDistance/2);
4094 
4095     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4096     // cnt2 == number of characters left to compare
4097     // Check the already-loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
4098     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4099     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4100     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4101     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4102     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4103     __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1);
4104     __ eor(rscratch2, tmp1, tmp2);
4105     __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0);
4106     __ mov(rscratch1, tmp2);
4107     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4108     Register strU = isLU ? str2 : str1,
4109              strL = isLU ? str1 : str2,
4110              tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4111              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4112     __ push(spilled_regs, sp);
4113     __ sub(tmp2, strL, cnt2); // strL pointer to load from
4114     __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from
4115 
4116     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4117 
4118     if (SoftwarePrefetchHintDistance >= 0) {
4119       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4120       __ br(__ LT, SMALL_LOOP);
4121       __ bind(LARGE_LOOP_PREFETCH);
4122         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4123         __ mov(tmp4, 2);
4124         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4125         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4126           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4127           __ subs(tmp4, tmp4, 1);
4128           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4129           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4130           __ mov(tmp4, 2);
4131         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4132           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4133           __ subs(tmp4, tmp4, 1);
4134           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4135           __ sub(cnt2, cnt2, 64);
4136           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4137           __ br(__ GE, LARGE_LOOP_PREFETCH);
4138     }
4139     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4140     __ subs(cnt2, cnt2, 16);
4141     __ br(__ LT, TAIL);
4142     __ b(SMALL_LOOP_ENTER);
4143     __ bind(SMALL_LOOP); // smaller loop
4144       __ subs(cnt2, cnt2, 16);
4145     __ bind(SMALL_LOOP_ENTER);
4146       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4147       __ br(__ GE, SMALL_LOOP);
4148       __ cbz(cnt2, LOAD_LAST);
4149     __ bind(TAIL); // 1..15 characters left
4150       __ subs(zr, cnt2, -8);
4151       __ br(__ GT, TAIL_LOAD_16);
4152       __ ldrd(vtmp, Address(tmp2));
4153       __ zip1(vtmp3, __ T8B, vtmp, vtmpZ);
4154 
4155       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4156       __ fmovd(tmpL, vtmp3);
4157       __ eor(rscratch2, tmp3, tmpL);
4158       __ cbnz(rscratch2, DIFF2);
4159       __ umov(tmpL, vtmp3, __ D, 1);
4160       __ eor(rscratch2, tmpU, tmpL);
4161       __ cbnz(rscratch2, DIFF1);
4162       __ b(LOAD_LAST);
4163     __ bind(TAIL_LOAD_16);
4164       __ ldrq(vtmp, Address(tmp2));
4165       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4166       __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4167       __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4168       __ fmovd(tmpL, vtmp3);
4169       __ eor(rscratch2, tmp3, tmpL);
4170       __ cbnz(rscratch2, DIFF2);
4171 
4172       __ ldr(tmp3, Address(__ post(cnt1, 8)));
4173       __ umov(tmpL, vtmp3, __ D, 1);
4174       __ eor(rscratch2, tmpU, tmpL);
4175       __ cbnz(rscratch2, DIFF1);
4176 
4177       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4178       __ fmovd(tmpL, vtmp);
4179       __ eor(rscratch2, tmp3, tmpL);
4180       __ cbnz(rscratch2, DIFF2);
4181 
4182       __ umov(tmpL, vtmp, __ D, 1);
4183       __ eor(rscratch2, tmpU, tmpL);
4184       __ cbnz(rscratch2, DIFF1);
4185       __ b(LOAD_LAST);
4186     __ bind(DIFF2);
4187       __ mov(tmpU, tmp3);
4188     __ bind(DIFF1);
4189       __ pop(spilled_regs, sp);
4190       __ b(CALCULATE_DIFFERENCE);
4191     __ bind(LOAD_LAST);
4192       __ pop(spilled_regs, sp);
4193 
4194       __ ldrs(vtmp, Address(strL));
4195       __ ldr(tmpU, Address(strU));
4196       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4197       __ fmovd(tmpL, vtmp);
4198 
4199       __ eor(rscratch2, tmpU, tmpL);
4200       __ cbz(rscratch2, DONE);
4201 
4202     // Find the first different characters in the longwords and
4203     // compute their difference.
4204     __ bind(CALCULATE_DIFFERENCE);
4205       __ rev(rscratch2, rscratch2);
4206       __ clz(rscratch2, rscratch2);
4207       __ andr(rscratch2, rscratch2, -16);
4208       __ lsrv(tmp1, tmp1, rscratch2);
4209       __ uxthw(tmp1, tmp1);
4210       __ lsrv(rscratch1, rscratch1, rscratch2);
4211       __ uxthw(rscratch1, rscratch1);
4212       __ subw(result, tmp1, rscratch1);
4213     __ bind(DONE);
4214       __ ret(lr);
4215     return entry;
4216   }
4217 
4218   // r0  = result
4219   // r1  = str1
4220   // r2  = cnt1
4221   // r3  = str2
4222   // r4  = cnt2
4223   // r10 = tmp1
4224   // r11 = tmp2
4225   address generate_compare_long_string_same_encoding(bool isLL) {
4226     __ align(CodeEntryAlignment);
4227     StubCodeMark mark(this, "StubRoutines", isLL
4228         ? "compare_long_string_same_encoding LL"
4229         : "compare_long_string_same_encoding UU");
4230     address entry = __ pc();
4231     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4232         tmp1 = r10, tmp2 = r11;
4233     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4234         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4235         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4236     // exit from the large loop when fewer than 64 bytes are left to read or when
4237     // we are about to prefetch memory past the end of the array
4238     int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4239     // cnt1/cnt2 contain the number of characters to compare; cnt1 can be re-used.
4240     // Update the cnt2 counter to account for the 8 bytes already loaded.
4241     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4242     // update pointers, because of previous read
4243     __ add(str1, str1, wordSize);
4244     __ add(str2, str2, wordSize);
4245     if (SoftwarePrefetchHintDistance >= 0) {
4246       __ bind(LARGE_LOOP_PREFETCH);
4247         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4248         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4249         compare_string_16_bytes_same(DIFF, DIFF2);
4250         compare_string_16_bytes_same(DIFF, DIFF2);
4251         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4252         compare_string_16_bytes_same(DIFF, DIFF2);
4253         __ subs(rscratch2, cnt2, largeLoopExitCondition);
4254         compare_string_16_bytes_same(DIFF, DIFF2);
4255         __ br(__ GT, LARGE_LOOP_PREFETCH);
4256         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4257         // less than 16 bytes left?
4258         __ subs(cnt2, cnt2, isLL ? 16 : 8);
4259         __ br(__ LT, TAIL);
4260     }
4261     __ bind(SMALL_LOOP);
4262       compare_string_16_bytes_same(DIFF, DIFF2);
4263       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4264       __ br(__ GE, SMALL_LOOP);
4265     __ bind(TAIL);
4266       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4267       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4268       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4269       __ br(__ LE, CHECK_LAST);
4270       __ eor(rscratch2, tmp1, tmp2);
4271       __ cbnz(rscratch2, DIFF);
4272       __ ldr(tmp1, Address(__ post(str1, 8)));
4273       __ ldr(tmp2, Address(__ post(str2, 8)));
4274       __ sub(cnt2, cnt2, isLL ? 8 : 4);
4275     __ bind(CHECK_LAST);
4276       if (!isLL) {
4277         __ add(cnt2, cnt2, cnt2); // now in bytes
4278       }
4279       __ eor(rscratch2, tmp1, tmp2);
4280       __ cbnz(rscratch2, DIFF);
4281       __ ldr(rscratch1, Address(str1, cnt2));
4282       __ ldr(cnt1, Address(str2, cnt2));
4283       __ eor(rscratch2, rscratch1, cnt1);
4284       __ cbz(rscratch2, LENGTH_DIFF);
4285       // Find the first different characters in the longwords and
4286       // compute their difference.
4287     __ bind(DIFF2);
4288       __ rev(rscratch2, rscratch2);
4289       __ clz(rscratch2, rscratch2);
4290       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4291       __ lsrv(rscratch1, rscratch1, rscratch2);
4292       if (isLL) {
4293         __ lsrv(cnt1, cnt1, rscratch2);
4294         __ uxtbw(rscratch1, rscratch1);
4295         __ uxtbw(cnt1, cnt1);
4296       } else {
4297         __ lsrv(cnt1, cnt1, rscratch2);
4298         __ uxthw(rscratch1, rscratch1);
4299         __ uxthw(cnt1, cnt1);
4300       }
4301       __ subw(result, rscratch1, cnt1);
4302       __ b(LENGTH_DIFF);
4303     __ bind(DIFF);
4304       __ rev(rscratch2, rscratch2);
4305       __ clz(rscratch2, rscratch2);
4306       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4307       __ lsrv(tmp1, tmp1, rscratch2);
4308       if (isLL) {
4309         __ lsrv(tmp2, tmp2, rscratch2);
4310         __ uxtbw(tmp1, tmp1);
4311         __ uxtbw(tmp2, tmp2);
4312       } else {
4313         __ lsrv(tmp2, tmp2, rscratch2);
4314         __ uxthw(tmp1, tmp1);
4315         __ uxthw(tmp2, tmp2);
4316       }
4317       __ subw(result, tmp1, tmp2);
4318       __ b(LENGTH_DIFF);
4319     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4320       __ eor(rscratch2, tmp1, tmp2);
4321       __ cbnz(rscratch2, DIFF);
4322     __ bind(LENGTH_DIFF);
4323       __ ret(lr);
4324     return entry;
4325   }
4326 
4327   void generate_compare_long_strings() {
4328       StubRoutines::aarch64::_compare_long_string_LL
4329           = generate_compare_long_string_same_encoding(true);
4330       StubRoutines::aarch64::_compare_long_string_UU
4331           = generate_compare_long_string_same_encoding(false);
4332       StubRoutines::aarch64::_compare_long_string_LU
4333           = generate_compare_long_string_different_encoding(true);
4334       StubRoutines::aarch64::_compare_long_string_UL
4335           = generate_compare_long_string_different_encoding(false);
4336   }
4337 
4338   // R0 = result
4339   // R1 = str2
4340   // R2 = cnt1
4341   // R3 = str1
4342   // R4 = cnt2
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the first register of the pattern loaded
  // (since length >= 8) and skip reloading it (this helps on systems with a
  // single load pipeline)
  // 2) we can use a "fast" algorithm for finding the first character that
  // needs fewer branches (one branch per loaded register instead of one per
  // character); this is where constants like
  // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  // (a sketch of the trick follows below)
  // 3) after the first register of the source string has been loaded and
  // analyzed, it can be reused to search for every occurrence of the first
  // character, saving a few loads compared with a simpler-but-slower
  // implementation
  // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
  // re-initializes and compresses register values, which makes the code larger
  // and a bit less readable; however, most of the extra operations are issued
  // while loads or branches are in flight, so the penalty is minimal
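  //
  // In C, idea 2 above is approximately the following SWAR trick (a hedged
  // sketch for the LL case; the names are illustrative only):
  //   uint64_t pattern = first_char * 0x0101010101010101ULL; // char in every byte
  //   uint64_t x       = str2_chunk ^ pattern;               // matches become 0x00
  //   uint64_t hits    = (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
  //   // A set high bit in byte i of 'hits' means byte i of the chunk equals
  //   // first_char. Bytes above the first hit may be flagged spuriously because
  //   // of borrow propagation, but every candidate found this way is verified
  //   // by the compare loops below.
  // For UU the same trick is applied per 16-bit char using 0x0001...0001 and
  // 0x7fff...7fff (the code uses the equivalent orr/bics form of the mask).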
4357   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4358     const char* stubName = str1_isL
4359         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4360         : "indexof_linear_uu";
4361     __ align(CodeEntryAlignment);
4362     StubCodeMark mark(this, "StubRoutines", stubName);
4363     address entry = __ pc();
4364 
4365     int str1_chr_size = str1_isL ? 1 : 2;
4366     int str2_chr_size = str2_isL ? 1 : 2;
4367     int str1_chr_shift = str1_isL ? 0 : 1;
4368     int str2_chr_shift = str2_isL ? 0 : 1;
4369     bool isL = str1_isL && str2_isL;
    // parameters
4371     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4372     // temporary registers
4373     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4374     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4375     // redefinitions
4376     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4377 
4378     __ push(spilled_regs, sp);
4379     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
4380         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4381         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4382         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4383         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4384         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
    // Read a whole register from str1. It is safe because length >= 8 here.
    __ ldr(ch1, Address(str1));
    // Read a whole register from str2. It is safe because length >= 8 here.
4388     __ ldr(ch2, Address(str2));
4389     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4390     if (str1_isL != str2_isL) {
4391       __ eor(v0, __ T16B, v0, v0);
4392     }
4393     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4394     __ mul(first, first, tmp1);
4395     // check if we have less than 1 register to check
4396     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4397     if (str1_isL != str2_isL) {
4398       __ fmovd(v1, ch1);
4399     }
4400     __ br(__ LE, L_SMALL);
4401     __ eor(ch2, first, ch2);
4402     if (str1_isL != str2_isL) {
4403       __ zip1(v1, __ T16B, v1, v0);
4404     }
4405     __ sub(tmp2, ch2, tmp1);
4406     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4407     __ bics(tmp2, tmp2, ch2);
4408     if (str1_isL != str2_isL) {
4409       __ fmovd(ch1, v1);
4410     }
4411     __ br(__ NE, L_HAS_ZERO);
4412     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4413     __ add(result, result, wordSize/str2_chr_size);
4414     __ add(str2, str2, wordSize);
4415     __ br(__ LT, L_POST_LOOP);
4416     __ BIND(L_LOOP);
4417       __ ldr(ch2, Address(str2));
4418       __ eor(ch2, first, ch2);
4419       __ sub(tmp2, ch2, tmp1);
4420       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4421       __ bics(tmp2, tmp2, ch2);
4422       __ br(__ NE, L_HAS_ZERO);
4423     __ BIND(L_LOOP_PROCEED);
4424       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4425       __ add(str2, str2, wordSize);
4426       __ add(result, result, wordSize/str2_chr_size);
4427       __ br(__ GE, L_LOOP);
4428     __ BIND(L_POST_LOOP);
4429       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
4430       __ br(__ LE, NOMATCH);
4431       __ ldr(ch2, Address(str2));
4432       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4433       __ eor(ch2, first, ch2);
4434       __ sub(tmp2, ch2, tmp1);
4435       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4436       __ mov(tmp4, -1); // all bits set
4437       __ b(L_SMALL_PROCEED);
4438     __ align(OptoLoopAlignment);
4439     __ BIND(L_SMALL);
4440       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4441       __ eor(ch2, first, ch2);
4442       if (str1_isL != str2_isL) {
4443         __ zip1(v1, __ T16B, v1, v0);
4444       }
4445       __ sub(tmp2, ch2, tmp1);
4446       __ mov(tmp4, -1); // all bits set
4447       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4448       if (str1_isL != str2_isL) {
4449         __ fmovd(ch1, v1); // move converted 4 symbols
4450       }
4451     __ BIND(L_SMALL_PROCEED);
4452       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
4453       __ bic(tmp2, tmp2, ch2);
4454       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
4455       __ rbit(tmp2, tmp2);
4456       __ br(__ EQ, NOMATCH);
4457     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
4459       __ cmp(cnt1, u1(wordSize/str2_chr_size));
4460       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4461       if (str2_isL) { // LL
4462         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4463         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4464         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4465         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4466         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4467       } else {
4468         __ mov(ch2, 0xE); // all bits in byte set except last one
4469         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4470         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4471         __ lslv(tmp2, tmp2, tmp4);
4472         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4473         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4474         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4475         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4476       }
4477       __ cmp(ch1, ch2);
4478       __ mov(tmp4, wordSize/str2_chr_size);
4479       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4480     __ BIND(L_SMALL_CMP_LOOP);
4481       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4482                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4483       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4484                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4485       __ add(tmp4, tmp4, 1);
4486       __ cmp(tmp4, cnt1);
4487       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4488       __ cmp(first, ch2);
4489       __ br(__ EQ, L_SMALL_CMP_LOOP);
4490     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4491       __ cbz(tmp2, NOMATCH); // no more matches. exit
4492       __ clz(tmp4, tmp2);
4493       __ add(result, result, 1); // advance index
4494       __ add(str2, str2, str2_chr_size); // advance pointer
4495       __ b(L_SMALL_HAS_ZERO_LOOP);
4496     __ align(OptoLoopAlignment);
4497     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4498       __ cmp(first, ch2);
4499       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4500       __ b(DONE);
4501     __ align(OptoLoopAlignment);
4502     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4503       if (str2_isL) { // LL
4504         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4505         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4506         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4507         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4508         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4509       } else {
4510         __ mov(ch2, 0xE); // all bits in byte set except last one
4511         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4512         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4513         __ lslv(tmp2, tmp2, tmp4);
4514         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4515         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4516         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4517         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4518       }
4519       __ cmp(ch1, ch2);
4520       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4521       __ b(DONE);
4522     __ align(OptoLoopAlignment);
4523     __ BIND(L_HAS_ZERO);
4524       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
      // Now compress the counters (cnt2 and cnt1) into one register, i.e.
      // cnt2 := (cnt1 << 32) | cnt2. This is fine because both counters are
      // 32-bit and are not changed in this loop; they are restored on exit.
      // This frees cnt1 for re-use inside the loop.
4529       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
4530       __ sub(result, result, 1);
4531     __ BIND(L_HAS_ZERO_LOOP);
4532       __ mov(cnt1, wordSize/str2_chr_size);
4533       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4534       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4535       if (str2_isL) {
4536         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4537         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4538         __ lslv(tmp2, tmp2, tmp4);
4539         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4540         __ add(tmp4, tmp4, 1);
4541         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4542         __ lsl(tmp2, tmp2, 1);
4543         __ mov(tmp4, wordSize/str2_chr_size);
4544       } else {
4545         __ mov(ch2, 0xE);
4546         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4547         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4548         __ lslv(tmp2, tmp2, tmp4);
4549         __ add(tmp4, tmp4, 1);
4550         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4551         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4552         __ lsl(tmp2, tmp2, 1);
4553         __ mov(tmp4, wordSize/str2_chr_size);
4554         __ sub(str2, str2, str2_chr_size);
4555       }
4556       __ cmp(ch1, ch2);
4557       __ mov(tmp4, wordSize/str2_chr_size);
4558       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4559     __ BIND(L_CMP_LOOP);
4560       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4561                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4562       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4563                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4564       __ add(tmp4, tmp4, 1);
4565       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4566       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4567       __ cmp(cnt1, ch2);
4568       __ br(__ EQ, L_CMP_LOOP);
4569     __ BIND(L_CMP_LOOP_NOMATCH);
4570       // here we're not matched
4571       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4572       __ clz(tmp4, tmp2);
4573       __ add(str2, str2, str2_chr_size); // advance pointer
4574       __ b(L_HAS_ZERO_LOOP);
4575     __ align(OptoLoopAlignment);
4576     __ BIND(L_CMP_LOOP_LAST_CMP);
4577       __ cmp(cnt1, ch2);
4578       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4579       __ b(DONE);
4580     __ align(OptoLoopAlignment);
4581     __ BIND(L_CMP_LOOP_LAST_CMP2);
4582       if (str2_isL) {
4583         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4584         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4585         __ lslv(tmp2, tmp2, tmp4);
4586         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4587         __ add(tmp4, tmp4, 1);
4588         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4589         __ lsl(tmp2, tmp2, 1);
4590       } else {
4591         __ mov(ch2, 0xE);
4592         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4593         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4594         __ lslv(tmp2, tmp2, tmp4);
4595         __ add(tmp4, tmp4, 1);
4596         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4597         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4598         __ lsl(tmp2, tmp2, 1);
4599         __ sub(str2, str2, str2_chr_size);
4600       }
4601       __ cmp(ch1, ch2);
4602       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4603       __ b(DONE);
4604     __ align(OptoLoopAlignment);
4605     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. It was a multiple of
      // wordSize/str2_chr_size until the L_HAS_ZERO block. One octet of bytes
      // was analyzed in L_HAS_ZERO_LOOP, so result has been increased by at
      // most wordSize/str2_chr_size - 1 and the higher bits are unchanged.
      // L_LOOP_PROCEED will increase result by the number of analyzed
      // characters, so we can simply clear the lower bits of result here:
      // 2 lower bits for UU/UL and 3 bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next octet. result & 7 (or & 3) is the index of
      // the last analyzed position inside the current octet, so str2 is first
      // moved back to the octet's start address; L_LOOP_PROCEED then advances
      // it to the next octet. (A concrete sketch follows.)
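      // A hedged sketch of the three steps above for the LL case
      // (wordSize/str2_chr_size == 8); the names are illustrative only:
      //   analyzed = result & 7;                // offset inside the octet
      //   result  &= ~7;                        // 1) drop it from the index
      //   cnt1     = cnt2 >> 32;                // 2) unpack the saved cnt1
      //   cnt2     = (uint32_t)cnt2;            //    and the original cnt2
      //   str2    -= analyzed * str2_chr_size;  // 3) back to the octet start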
4616       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4617       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4618       __ bfm(result, zr, 0, 2 - str2_chr_shift);
4619       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4620       __ movw(cnt2, cnt2);
4621       __ b(L_LOOP_PROCEED);
4622     __ align(OptoLoopAlignment);
4623     __ BIND(NOMATCH);
4624       __ mov(result, -1);
4625     __ BIND(DONE);
4626       __ pop(spilled_regs, sp);
4627       __ ret(lr);
4628     return entry;
4629   }
4630 
4631   void generate_string_indexof_stubs() {
4632     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
4633     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
4634     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
4635   }
4636 
4637   void inflate_and_store_2_fp_registers(bool generatePrfm,
4638       FloatRegister src1, FloatRegister src2) {
4639     Register dst = r1;
4640     __ zip1(v1, __ T16B, src1, v0);
4641     __ zip2(v2, __ T16B, src1, v0);
4642     if (generatePrfm) {
4643       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4644     }
4645     __ zip1(v3, __ T16B, src2, v0);
4646     __ zip2(v4, __ T16B, src2, v0);
4647     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4648   }
4649 
4650   // R0 = src
4651   // R1 = dst
4652   // R2 = len
4653   // R3 = len >> 3
4654   // V0 = 0
  // V1 = loaded 8 bytes
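  //
  // In C, the stub computes approximately the following (a hedged sketch;
  // the caller has already loaded the first 8 source bytes into V1 and
  // passes len >> 3 in R3):
  //   void inflate(const uint8_t* src, uint16_t* dst, size_t len) {
  //     for (size_t i = 0; i < len; i++) {
  //       dst[i] = src[i];   // zero-extend each byte to a 16-bit char
  //     }
  //   }
  // The zip1/zip2 instructions above and below implement the widening by
  // interleaving each source byte with a byte from the zero register V0.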
4656   address generate_large_byte_array_inflate() {
4657     __ align(CodeEntryAlignment);
4658     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4659     address entry = __ pc();
4660     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4661     Register src = r0, dst = r1, len = r2, octetCounter = r3;
4662     const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
4663 
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also lets us write the first two chunks with a single store.
4666     __ ldrd(v2, __ post(src, 8));
4667     __ sub(octetCounter, octetCounter, 2);
4668     __ zip1(v1, __ T16B, v1, v0);
4669     __ zip1(v2, __ T16B, v2, v0);
4670     __ st1(v1, v2, __ T16B, __ post(dst, 32));
4671     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4672     __ subs(rscratch1, octetCounter, large_loop_threshold);
4673     __ br(__ LE, LOOP_START);
4674     __ b(LOOP_PRFM_START);
4675     __ bind(LOOP_PRFM);
4676       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4677     __ bind(LOOP_PRFM_START);
4678       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4679       __ sub(octetCounter, octetCounter, 8);
4680       __ subs(rscratch1, octetCounter, large_loop_threshold);
4681       inflate_and_store_2_fp_registers(true, v3, v4);
4682       inflate_and_store_2_fp_registers(true, v5, v6);
4683       __ br(__ GT, LOOP_PRFM);
4684       __ cmp(octetCounter, (u1)8);
4685       __ br(__ LT, DONE);
4686     __ bind(LOOP);
4687       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4688       __ bind(LOOP_START);
4689       __ sub(octetCounter, octetCounter, 8);
4690       __ cmp(octetCounter, (u1)8);
4691       inflate_and_store_2_fp_registers(false, v3, v4);
4692       inflate_and_store_2_fp_registers(false, v5, v6);
4693       __ br(__ GE, LOOP);
4694     __ bind(DONE);
4695       __ ret(lr);
4696     return entry;
4697   }
4698 
4699   /**
4700    *  Arguments:
4701    *
4702    *  Input:
4703    *  c_rarg0   - current state address
4704    *  c_rarg1   - H key address
4705    *  c_rarg2   - data address
4706    *  c_rarg3   - number of blocks
4707    *
4708    *  Output:
4709    *  Updated state at c_rarg0
4710    */
4711   address generate_ghash_processBlocks() {
4712     // Bafflingly, GCM uses little-endian for the byte order, but
4713     // big-endian for the bit order.  For example, the polynomial 1 is
4714     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4715     //
4716     // So, we must either reverse the bytes in each word and do
4717     // everything big-endian or reverse the bits in each byte and do
4718     // it little-endian.  On AArch64 it's more idiomatic to reverse
4719     // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order throughout the
4721     // calculation, bit-reversing the inputs and outputs.
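    //
    // For example, the polynomial 1 (the byte string 80 00 ... 00 above)
    // becomes 01 00 ... 00 after RBIT, i.e. the integer 1 in little-endian
    // bit order. At a high level, the loop below computes, for each 16-byte
    // block X (a hedged summary; it ignores the exact lane layout):
    //   state = (state ^ X) * H   in GF(2^128),
    // reduced modulo x^128 + x^7 + x^2 + x + 1. The 0x87 constant emitted
    // just below encodes the low-order terms x^7 + x^2 + x + 1 used by
    // ghash_reduce.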
4722 
4723     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4724     __ align(wordSize * 2);
4725     address p = __ pc();
4726     __ emit_int64(0x87);  // The low-order bits of the field
4727                           // polynomial (i.e. p = z^7+z^2+z+1)
4728                           // repeated in the low and high parts of a
4729                           // 128-bit vector
4730     __ emit_int64(0x87);
4731 
4732     __ align(CodeEntryAlignment);
4733     address start = __ pc();
4734 
4735     Register state   = c_rarg0;
4736     Register subkeyH = c_rarg1;
4737     Register data    = c_rarg2;
4738     Register blocks  = c_rarg3;
4739 
4740     FloatRegister vzr = v30;
4741     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4742 
4743     __ ldrq(v0, Address(state));
4744     __ ldrq(v1, Address(subkeyH));
4745 
4746     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4747     __ rbit(v0, __ T16B, v0);
4748     __ rev64(v1, __ T16B, v1);
4749     __ rbit(v1, __ T16B, v1);
4750 
4751     __ ldrq(v26, p);
4752 
4753     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4754     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4755 
4756     {
4757       Label L_ghash_loop;
4758       __ bind(L_ghash_loop);
4759 
4760       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4761                                                  // reversing each byte
4762       __ rbit(v2, __ T16B, v2);
4763       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4764 
4765       // Multiply state in v2 by subkey in v1
4766       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4767                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4768                      /*temps*/v6, v20, v18, v21);
4769       // Reduce v7:v5 by the field polynomial
4770       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4771 
4772       __ sub(blocks, blocks, 1);
4773       __ cbnz(blocks, L_ghash_loop);
4774     }
4775 
4776     // The bit-reversed result is at this point in v0
4777     __ rev64(v1, __ T16B, v0);
4778     __ rbit(v1, __ T16B, v1);
4779 
4780     __ st1(v1, __ T16B, state);
4781     __ ret(lr);
4782 
4783     return start;
4784   }
4785 
4786   // Continuation point for throwing of implicit exceptions that are
4787   // not handled in the current activation. Fabricates an exception
4788   // oop and initiates normal exception dispatching in this
4789   // frame. Since we need to preserve callee-saved values (currently
4790   // only for C2, but done for C1 as well) we need a callee-saved oop
4791   // map and therefore have to make these stubs into RuntimeStubs
4792   // rather than BufferBlobs.  If the compiler needs all registers to
4793   // be preserved between the fault point and the exception handler
4794   // then it must assume responsibility for that in
4795   // AbstractCompiler::continuation_for_implicit_null_exception or
4796   // continuation_for_implicit_division_by_zero_exception. All other
4797   // implicit exceptions (e.g., NullPointerException or
4798   // AbstractMethodError on entry) are either at call sites or
4799   // otherwise assume that stack unwinding will be initiated, so
4800   // caller saved registers were assumed volatile in the compiler.
4801 
4802 #undef __
4803 #define __ masm->
4804 
4805   address generate_throw_exception(const char* name,
4806                                    address runtime_entry,
4807                                    Register arg1 = noreg,
4808                                    Register arg2 = noreg) {
4809     // Information about frame layout at time of blocking runtime call.
4810     // Note that we only have to preserve callee-saved registers since
4811     // the compilers are responsible for supplying a continuation point
4812     // if they expect all registers to be preserved.
4813     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4814     enum layout {
4815       rfp_off = 0,
4816       rfp_off2,
4817       return_off,
4818       return_off2,
4819       framesize // inclusive of return address
4820     };
4821 
4822     int insts_size = 512;
4823     int locs_size  = 64;
4824 
4825     CodeBuffer code(name, insts_size, locs_size);
4826     OopMapSet* oop_maps  = new OopMapSet();
4827     MacroAssembler* masm = new MacroAssembler(&code);
4828 
4829     address start = __ pc();
4830 
4831     // This is an inlined and slightly modified version of call_VM
4832     // which has the ability to fetch the return PC out of
4833     // thread-local storage and also sets up last_Java_sp slightly
4834     // differently than the real call_VM
4835 
4836     __ enter(); // Save FP and LR before call
4837 
4838     assert(is_even(framesize/2), "sp not 16-byte aligned");
4839 
4840     // lr and fp are already in place
4841     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4842 
4843     int frame_complete = __ pc() - start;
4844 
4845     // Set up last_Java_sp and last_Java_fp
4846     address the_pc = __ pc();
4847     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
4848 
4849     // Call runtime
4850     if (arg1 != noreg) {
4851       assert(arg2 != c_rarg1, "clobbered");
4852       __ mov(c_rarg1, arg1);
4853     }
4854     if (arg2 != noreg) {
4855       __ mov(c_rarg2, arg2);
4856     }
4857     __ mov(c_rarg0, rthread);
4858     BLOCK_COMMENT("call runtime_entry");
4859     __ mov(rscratch1, runtime_entry);
4860     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
4861 
4862     // Generate oop map
4863     OopMap* map = new OopMap(framesize, 0);
4864 
4865     oop_maps->add_gc_map(the_pc - start, map);
4866 
4867     __ reset_last_Java_frame(true);
4868     __ maybe_isb();
4869 
4870     __ leave();
4871 
4872     // check for pending exceptions
4873 #ifdef ASSERT
4874     Label L;
4875     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4876     __ cbnz(rscratch1, L);
4877     __ should_not_reach_here();
4878     __ bind(L);
4879 #endif // ASSERT
4880     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4881 
4882 
4883     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4884     RuntimeStub* stub =
4885       RuntimeStub::new_runtime_stub(name,
4886                                     &code,
4887                                     frame_complete,
4888                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4889                                     oop_maps, false);
4890     return stub->entry_point();
4891   }
4892 
4893   class MontgomeryMultiplyGenerator : public MacroAssembler {
4894 
4895     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4896       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4897 
4898     RegSet _toSave;
4899     bool _squaring;
4900 
4901   public:
4902     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4903       : MacroAssembler(as->code()), _squaring(squaring) {
4904 
4905       // Register allocation
4906 
4907       Register reg = c_rarg0;
4908       Pa_base = reg;       // Argument registers
4909       if (squaring)
4910         Pb_base = Pa_base;
4911       else
4912         Pb_base = ++reg;
4913       Pn_base = ++reg;
4914       Rlen= ++reg;
4915       inv = ++reg;
4916       Pm_base = ++reg;
4917 
4918                           // Working registers:
4919       Ra =  ++reg;        // The current digit of a, b, n, and m.
4920       Rb =  ++reg;
4921       Rm =  ++reg;
4922       Rn =  ++reg;
4923 
4924       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4925       Pb =  ++reg;
4926       Pm =  ++reg;
4927       Pn =  ++reg;
4928 
4929       t0 =  ++reg;        // Three registers which form a
      t1 =  ++reg;        // triple-precision accumulator.
4931       t2 =  ++reg;
4932 
4933       Ri =  ++reg;        // Inner and outer loop indexes.
4934       Rj =  ++reg;
4935 
4936       Rhi_ab = ++reg;     // Product registers: low and high parts
4937       Rlo_ab = ++reg;     // of a*b and m*n.
4938       Rhi_mn = ++reg;
4939       Rlo_mn = ++reg;
4940 
4941       // r19 and up are callee-saved.
4942       _toSave = RegSet::range(r19, reg) + Pm_base;
4943     }
4944 
4945   private:
4946     void save_regs() {
4947       push(_toSave, sp);
4948     }
4949 
4950     void restore_regs() {
4951       pop(_toSave, sp);
4952     }
4953 
4954     template <typename T>
4955     void unroll_2(Register count, T block) {
4956       Label loop, end, odd;
4957       tbnz(count, 0, odd);
4958       cbz(count, end);
4959       align(16);
4960       bind(loop);
4961       (this->*block)();
4962       bind(odd);
4963       (this->*block)();
4964       subs(count, count, 2);
4965       br(Assembler::GT, loop);
4966       bind(end);
4967     }
4968 
4969     template <typename T>
4970     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4971       Label loop, end, odd;
4972       tbnz(count, 0, odd);
4973       cbz(count, end);
4974       align(16);
4975       bind(loop);
4976       (this->*block)(d, s, tmp);
4977       bind(odd);
4978       (this->*block)(d, s, tmp);
4979       subs(count, count, 2);
4980       br(Assembler::GT, loop);
4981       bind(end);
4982     }
4983 
4984     void pre1(RegisterOrConstant i) {
4985       block_comment("pre1");
4986       // Pa = Pa_base;
4987       // Pb = Pb_base + i;
4988       // Pm = Pm_base;
4989       // Pn = Pn_base + i;
4990       // Ra = *Pa;
4991       // Rb = *Pb;
4992       // Rm = *Pm;
4993       // Rn = *Pn;
4994       ldr(Ra, Address(Pa_base));
4995       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4996       ldr(Rm, Address(Pm_base));
4997       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4998       lea(Pa, Address(Pa_base));
4999       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
5000       lea(Pm, Address(Pm_base));
5001       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5002 
5003       // Zero the m*n result.
5004       mov(Rhi_mn, zr);
5005       mov(Rlo_mn, zr);
5006     }
5007 
5008     // The core multiply-accumulate step of a Montgomery
5009     // multiplication.  The idea is to schedule operations as a
5010     // pipeline so that instructions with long latencies (loads and
5011     // multiplies) have time to complete before their results are
    // used.  This helps in-order implementations of the architecture
    // the most, but out-of-order ones also benefit.
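    //
    // The MACC(A, B, t0, t1, t2) used in the pseudocode further down is,
    // approximately, the following (a hedged C sketch of mul/umulh + acc(),
    // not a HotSpot macro):
    //   // Multiply-accumulate A*B into the triple-precision accumulator t2:t1:t0.
    //   unsigned __int128 p = (unsigned __int128)A * B;
    //   unsigned long lo = (unsigned long)p, hi = (unsigned long)(p >> 64);
    //   t0 += lo;  unsigned long c1 = (t0 < lo);   // adds
    //   t1 += hi;  unsigned long c2 = (t1 < hi);   // adcs...
    //   t1 += c1;  c2 += (t1 < c1);                // ...carry in and out
    //   t2 += c2;                                  // adc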
5014     void step() {
5015       block_comment("step");
5016       // MACC(Ra, Rb, t0, t1, t2);
5017       // Ra = *++Pa;
5018       // Rb = *--Pb;
5019       umulh(Rhi_ab, Ra, Rb);
5020       mul(Rlo_ab, Ra, Rb);
5021       ldr(Ra, pre(Pa, wordSize));
5022       ldr(Rb, pre(Pb, -wordSize));
5023       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
5024                                        // previous iteration.
5025       // MACC(Rm, Rn, t0, t1, t2);
5026       // Rm = *++Pm;
5027       // Rn = *--Pn;
5028       umulh(Rhi_mn, Rm, Rn);
5029       mul(Rlo_mn, Rm, Rn);
5030       ldr(Rm, pre(Pm, wordSize));
5031       ldr(Rn, pre(Pn, -wordSize));
5032       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5033     }
5034 
5035     void post1() {
5036       block_comment("post1");
5037 
5038       // MACC(Ra, Rb, t0, t1, t2);
5039       // Ra = *++Pa;
5040       // Rb = *--Pb;
5041       umulh(Rhi_ab, Ra, Rb);
5042       mul(Rlo_ab, Ra, Rb);
5043       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5044       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5045 
5046       // *Pm = Rm = t0 * inv;
5047       mul(Rm, t0, inv);
5048       str(Rm, Address(Pm));
5049 
5050       // MACC(Rm, Rn, t0, t1, t2);
5051       // t0 = t1; t1 = t2; t2 = 0;
5052       umulh(Rhi_mn, Rm, Rn);
5053 
5054 #ifndef PRODUCT
5055       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5056       {
5057         mul(Rlo_mn, Rm, Rn);
5058         add(Rlo_mn, t0, Rlo_mn);
5059         Label ok;
5060         cbz(Rlo_mn, ok); {
5061           stop("broken Montgomery multiply");
5062         } bind(ok);
5063       }
5064 #endif
5065       // We have very carefully set things up so that
5066       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5067       // the lower half of Rm * Rn because we know the result already:
5068       // it must be -t0.  t0 + (-t0) must generate a carry iff
5069       // t0 != 0.  So, rather than do a mul and an adds we just set
5070       // the carry flag iff t0 is nonzero.
5071       //
5072       // mul(Rlo_mn, Rm, Rn);
5073       // adds(zr, t0, Rlo_mn);
5074       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5075       adcs(t0, t1, Rhi_mn);
5076       adc(t1, t2, zr);
5077       mov(t2, zr);
5078     }
5079 
5080     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
5081       block_comment("pre2");
5082       // Pa = Pa_base + i-len;
5083       // Pb = Pb_base + len;
5084       // Pm = Pm_base + i-len;
5085       // Pn = Pn_base + len;
5086 
5087       if (i.is_register()) {
5088         sub(Rj, i.as_register(), len);
5089       } else {
5090         mov(Rj, i.as_constant());
5091         sub(Rj, Rj, len);
5092       }
5093       // Rj == i-len
5094 
5095       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5096       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5097       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5098       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5099 
5100       // Ra = *++Pa;
5101       // Rb = *--Pb;
5102       // Rm = *++Pm;
5103       // Rn = *--Pn;
5104       ldr(Ra, pre(Pa, wordSize));
5105       ldr(Rb, pre(Pb, -wordSize));
5106       ldr(Rm, pre(Pm, wordSize));
5107       ldr(Rn, pre(Pn, -wordSize));
5108 
5109       mov(Rhi_mn, zr);
5110       mov(Rlo_mn, zr);
5111     }
5112 
5113     void post2(RegisterOrConstant i, RegisterOrConstant len) {
5114       block_comment("post2");
5115       if (i.is_constant()) {
5116         mov(Rj, i.as_constant()-len.as_constant());
5117       } else {
5118         sub(Rj, i.as_register(), len);
5119       }
5120 
5121       adds(t0, t0, Rlo_mn); // The pending m*n, low part
5122 
5123       // As soon as we know the least significant digit of our result,
5124       // store it.
5125       // Pm_base[i-len] = t0;
5126       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5127 
5128       // t0 = t1; t1 = t2; t2 = 0;
5129       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5130       adc(t1, t2, zr);
5131       mov(t2, zr);
5132     }
5133 
5134     // A carry in t0 after Montgomery multiplication means that we
5135     // should subtract multiples of n from our result in m.  We'll
5136     // keep doing that until there is no carry.
5137     void normalize(RegisterOrConstant len) {
5138       block_comment("normalize");
5139       // while (t0)
5140       //   t0 = sub(Pm_base, Pn_base, t0, len);
5141       Label loop, post, again;
5142       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5143       cbz(t0, post); {
5144         bind(again); {
5145           mov(i, zr);
5146           mov(cnt, len);
5147           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5148           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5149           subs(zr, zr, zr); // set carry flag, i.e. no borrow
5150           align(16);
5151           bind(loop); {
5152             sbcs(Rm, Rm, Rn);
5153             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5154             add(i, i, 1);
5155             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5156             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5157             sub(cnt, cnt, 1);
5158           } cbnz(cnt, loop);
5159           sbc(t0, t0, zr);
5160         } cbnz(t0, again);
5161       } bind(post);
5162     }
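
    // The sub() in the pseudocode above is, approximately, the following
    // (a hedged C sketch; the helper name comes from the pseudocode, not
    // from a HotSpot header):
    //   // Subtract n from m as len-longword little-endian numbers and
    //   // propagate the final borrow into t0.
    //   static unsigned long sub(unsigned long Pm[], unsigned long Pn[],
    //                            unsigned long t0, int len) {
    //     unsigned long borrow = 0;
    //     for (int i = 0; i < len; i++) {
    //       unsigned long d = Pm[i] - Pn[i] - borrow;
    //       borrow = (Pm[i] < Pn[i]) || (Pm[i] == Pn[i] && borrow);
    //       Pm[i] = d;
    //     }
    //     return t0 - borrow;
    //   }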
5163 
5164     // Move memory at s to d, reversing words.
5165     //    Increments d to end of copied memory
5166     //    Destroys tmp1, tmp2
5167     //    Preserves len
5168     //    Leaves s pointing to the address which was in d at start
5169     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
5170       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5171 
5172       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
5173       mov(tmp1, len);
5174       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
5175       sub(s, d, len, ext::uxtw, LogBytesPerWord);
5176     }
5177     // where
5178     void reverse1(Register d, Register s, Register tmp) {
5179       ldr(tmp, pre(s, -wordSize));
5180       ror(tmp, tmp, 32);
5181       str(tmp, post(d, wordSize));
5182     }
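
    // In C, the data movement of reverse()/reverse1() above is approximately
    // the following (a hedged sketch that leaves out the register side effects
    // documented above):
    //   static void reverse(unsigned long *d, unsigned long *s, int len) {
    //     s += len;                          // start just past the source
    //     for (int i = 0; i < len; i++) {
    //       unsigned long x = *--s;          // walk the source backwards
    //       *d++ = (x >> 32) | (x << 32);    // ror #32 swaps the 32-bit halves
    //     }
    //   }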
5183 
5184     void step_squaring() {
5185       // An extra ACC
5186       step();
5187       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5188     }
5189 
5190     void last_squaring(RegisterOrConstant i) {
5191       Label dont;
5192       // if ((i & 1) == 0) {
5193       tbnz(i.as_register(), 0, dont); {
5194         // MACC(Ra, Rb, t0, t1, t2);
5195         // Ra = *++Pa;
5196         // Rb = *--Pb;
5197         umulh(Rhi_ab, Ra, Rb);
5198         mul(Rlo_ab, Ra, Rb);
5199         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5200       } bind(dont);
5201     }
5202 
5203     void extra_step_squaring() {
5204       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5205 
5206       // MACC(Rm, Rn, t0, t1, t2);
5207       // Rm = *++Pm;
5208       // Rn = *--Pn;
5209       umulh(Rhi_mn, Rm, Rn);
5210       mul(Rlo_mn, Rm, Rn);
5211       ldr(Rm, pre(Pm, wordSize));
5212       ldr(Rn, pre(Pn, -wordSize));
5213     }
5214 
5215     void post1_squaring() {
5216       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5217 
5218       // *Pm = Rm = t0 * inv;
5219       mul(Rm, t0, inv);
5220       str(Rm, Address(Pm));
5221 
5222       // MACC(Rm, Rn, t0, t1, t2);
5223       // t0 = t1; t1 = t2; t2 = 0;
5224       umulh(Rhi_mn, Rm, Rn);
5225 
5226 #ifndef PRODUCT
5227       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5228       {
5229         mul(Rlo_mn, Rm, Rn);
5230         add(Rlo_mn, t0, Rlo_mn);
5231         Label ok;
5232         cbz(Rlo_mn, ok); {
5233           stop("broken Montgomery multiply");
5234         } bind(ok);
5235       }
5236 #endif
5237       // We have very carefully set things up so that
5238       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5239       // the lower half of Rm * Rn because we know the result already:
5240       // it must be -t0.  t0 + (-t0) must generate a carry iff
5241       // t0 != 0.  So, rather than do a mul and an adds we just set
5242       // the carry flag iff t0 is nonzero.
5243       //
5244       // mul(Rlo_mn, Rm, Rn);
5245       // adds(zr, t0, Rlo_mn);
5246       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5247       adcs(t0, t1, Rhi_mn);
5248       adc(t1, t2, zr);
5249       mov(t2, zr);
5250     }
5251 
5252     void acc(Register Rhi, Register Rlo,
5253              Register t0, Register t1, Register t2) {
5254       adds(t0, t0, Rlo);
5255       adcs(t1, t1, Rhi);
5256       adc(t2, t2, zr);
5257     }
5258 
5259   public:
5260     /**
5261      * Fast Montgomery multiplication.  The derivation of the
5262      * algorithm is in A Cryptographic Library for the Motorola
5263      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5264      *
5265      * Arguments:
5266      *
5267      * Inputs for multiplication:
5268      *   c_rarg0   - int array elements a
5269      *   c_rarg1   - int array elements b
5270      *   c_rarg2   - int array elements n (the modulus)
5271      *   c_rarg3   - int length
5272      *   c_rarg4   - int inv
5273      *   c_rarg5   - int array elements m (the result)
5274      *
5275      * Inputs for squaring:
5276      *   c_rarg0   - int array elements a
5277      *   c_rarg1   - int array elements n (the modulus)
5278      *   c_rarg2   - int length
5279      *   c_rarg3   - int inv
5280      *   c_rarg4   - int array elements m (the result)
5281      *
5282      */
5283     address generate_multiply() {
5284       Label argh, nothing;
5285       bind(argh);
5286       stop("MontgomeryMultiply total_allocation must be <= 8192");
5287 
5288       align(CodeEntryAlignment);
5289       address entry = pc();
5290 
5291       cbzw(Rlen, nothing);
5292 
5293       enter();
5294 
5295       // Make room.
5296       cmpw(Rlen, 512);
5297       br(Assembler::HI, argh);
5298       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5299       andr(sp, Ra, -2 * wordSize);
5300 
5301       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5302 
5303       {
5304         // Copy input args, reversing as we go.  We use Ra as a
5305         // temporary variable.
5306         reverse(Ra, Pa_base, Rlen, t0, t1);
5307         if (!_squaring)
5308           reverse(Ra, Pb_base, Rlen, t0, t1);
5309         reverse(Ra, Pn_base, Rlen, t0, t1);
5310       }
5311 
5312       // Push all call-saved registers and also Pm_base which we'll need
5313       // at the end.
5314       save_regs();
5315 
5316 #ifndef PRODUCT
5317       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
5318       {
5319         ldr(Rn, Address(Pn_base, 0));
5320         mul(Rlo_mn, Rn, inv);
5321         subs(zr, Rlo_mn, -1);
5322         Label ok;
5323         br(EQ, ok); {
5324           stop("broken inverse in Montgomery multiply");
5325         } bind(ok);
5326       }
5327 #endif
5328 
5329       mov(Pm_base, Ra);
5330 
5331       mov(t0, zr);
5332       mov(t1, zr);
5333       mov(t2, zr);
5334 
5335       block_comment("for (int i = 0; i < len; i++) {");
5336       mov(Ri, zr); {
5337         Label loop, end;
5338         cmpw(Ri, Rlen);
5339         br(Assembler::GE, end);
5340 
5341         bind(loop);
5342         pre1(Ri);
5343 
5344         block_comment("  for (j = i; j; j--) {"); {
5345           movw(Rj, Ri);
5346           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5347         } block_comment("  } // j");
5348 
5349         post1();
5350         addw(Ri, Ri, 1);
5351         cmpw(Ri, Rlen);
5352         br(Assembler::LT, loop);
5353         bind(end);
5354         block_comment("} // i");
5355       }
5356 
5357       block_comment("for (int i = len; i < 2*len; i++) {");
5358       mov(Ri, Rlen); {
5359         Label loop, end;
5360         cmpw(Ri, Rlen, Assembler::LSL, 1);
5361         br(Assembler::GE, end);
5362 
5363         bind(loop);
5364         pre2(Ri, Rlen);
5365 
5366         block_comment("  for (j = len*2-i-1; j; j--) {"); {
5367           lslw(Rj, Rlen, 1);
5368           subw(Rj, Rj, Ri);
5369           subw(Rj, Rj, 1);
5370           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5371         } block_comment("  } // j");
5372 
5373         post2(Ri, Rlen);
5374         addw(Ri, Ri, 1);
5375         cmpw(Ri, Rlen, Assembler::LSL, 1);
5376         br(Assembler::LT, loop);
5377         bind(end);
5378       }
5379       block_comment("} // i");
5380 
5381       normalize(Rlen);
5382 
5383       mov(Ra, Pm_base);  // Save Pm_base in Ra
5384       restore_regs();  // Restore caller's Pm_base
5385 
5386       // Copy our result into caller's Pm_base
5387       reverse(Pm_base, Ra, Rlen, t0, t1);
5388 
5389       leave();
5390       bind(nothing);
5391       ret(lr);
5392 
5393       return entry;
5394     }
5395     // In C, approximately:
5396 
5397     // void
5398     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
5399     //                     unsigned long Pn_base[], unsigned long Pm_base[],
5400     //                     unsigned long inv, int len) {
5401     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5402     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5403     //   unsigned long Ra, Rb, Rn, Rm;
5404 
5405     //   int i;
5406 
5407     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5408 
5409     //   for (i = 0; i < len; i++) {
5410     //     int j;
5411 
5412     //     Pa = Pa_base;
5413     //     Pb = Pb_base + i;
5414     //     Pm = Pm_base;
5415     //     Pn = Pn_base + i;
5416 
5417     //     Ra = *Pa;
5418     //     Rb = *Pb;
5419     //     Rm = *Pm;
5420     //     Rn = *Pn;
5421 
5422     //     int iters = i;
5423     //     for (j = 0; iters--; j++) {
5424     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5425     //       MACC(Ra, Rb, t0, t1, t2);
5426     //       Ra = *++Pa;
5427     //       Rb = *--Pb;
5428     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5429     //       MACC(Rm, Rn, t0, t1, t2);
5430     //       Rm = *++Pm;
5431     //       Rn = *--Pn;
5432     //     }
5433 
5434     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
5435     //     MACC(Ra, Rb, t0, t1, t2);
5436     //     *Pm = Rm = t0 * inv;
5437     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5438     //     MACC(Rm, Rn, t0, t1, t2);
5439 
5440     //     assert(t0 == 0, "broken Montgomery multiply");
5441 
5442     //     t0 = t1; t1 = t2; t2 = 0;
5443     //   }
5444 
5445     //   for (i = len; i < 2*len; i++) {
5446     //     int j;
5447 
5448     //     Pa = Pa_base + i-len;
5449     //     Pb = Pb_base + len;
5450     //     Pm = Pm_base + i-len;
5451     //     Pn = Pn_base + len;
5452 
5453     //     Ra = *++Pa;
5454     //     Rb = *--Pb;
5455     //     Rm = *++Pm;
5456     //     Rn = *--Pn;
5457 
5458     //     int iters = len*2-i-1;
5459     //     for (j = i-len+1; iters--; j++) {
5460     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5461     //       MACC(Ra, Rb, t0, t1, t2);
5462     //       Ra = *++Pa;
5463     //       Rb = *--Pb;
5464     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5465     //       MACC(Rm, Rn, t0, t1, t2);
5466     //       Rm = *++Pm;
5467     //       Rn = *--Pn;
5468     //     }
5469 
5470     //     Pm_base[i-len] = t0;
5471     //     t0 = t1; t1 = t2; t2 = 0;
5472     //   }
5473 
5474     //   while (t0)
5475     //     t0 = sub(Pm_base, Pn_base, t0, len);
5476     // }
5477 
5478     /**
5479      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
5480      * multiplies than Montgomery multiplication so it should be up to
5481      * 25% faster.  However, its loop control is more complex and it
5482      * may actually run slower on some machines.
5483      *
5484      * Arguments:
5485      *
5486      * Inputs:
5487      *   c_rarg0   - int array elements a
5488      *   c_rarg1   - int array elements n (the modulus)
5489      *   c_rarg2   - int length
5490      *   c_rarg3   - int inv
5491      *   c_rarg4   - int array elements m (the result)
5492      *
5493      */
5494     address generate_square() {
5495       Label argh;
5496       bind(argh);
5497       stop("MontgomeryMultiply total_allocation must be <= 8192");
5498 
5499       align(CodeEntryAlignment);
5500       address entry = pc();
5501 
5502       enter();
5503 
5504       // Make room.
5505       cmpw(Rlen, 512);
5506       br(Assembler::HI, argh);
5507       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5508       andr(sp, Ra, -2 * wordSize);
5509 
5510       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5511 
5512       {
5513         // Copy input args, reversing as we go.  We use Ra as a
5514         // temporary variable.
5515         reverse(Ra, Pa_base, Rlen, t0, t1);
5516         reverse(Ra, Pn_base, Rlen, t0, t1);
5517       }
5518 
5519       // Push all call-saved registers and also Pm_base which we'll need
5520       // at the end.
5521       save_regs();
5522 
5523       mov(Pm_base, Ra);
5524 
5525       mov(t0, zr);
5526       mov(t1, zr);
5527       mov(t2, zr);
5528 
5529       block_comment("for (int i = 0; i < len; i++) {");
5530       mov(Ri, zr); {
5531         Label loop, end;
5532         bind(loop);
5533         cmp(Ri, Rlen);
5534         br(Assembler::GE, end);
5535 
5536         pre1(Ri);
5537 
5538         block_comment("for (j = (i+1)/2; j; j--) {"); {
5539           add(Rj, Ri, 1);
5540           lsr(Rj, Rj, 1);
5541           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5542         } block_comment("  } // j");
5543 
5544         last_squaring(Ri);
5545 
5546         block_comment("  for (j = i/2; j; j--) {"); {
5547           lsr(Rj, Ri, 1);
5548           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5549         } block_comment("  } // j");
5550 
5551         post1_squaring();
5552         add(Ri, Ri, 1);
5553         cmp(Ri, Rlen);
5554         br(Assembler::LT, loop);
5555 
5556         bind(end);
5557         block_comment("} // i");
5558       }
5559 
5560       block_comment("for (int i = len; i < 2*len; i++) {");
5561       mov(Ri, Rlen); {
5562         Label loop, end;
5563         bind(loop);
5564         cmp(Ri, Rlen, Assembler::LSL, 1);
5565         br(Assembler::GE, end);
5566 
5567         pre2(Ri, Rlen);
5568 
5569         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
5570           lsl(Rj, Rlen, 1);
5571           sub(Rj, Rj, Ri);
5572           sub(Rj, Rj, 1);
5573           lsr(Rj, Rj, 1);
5574           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5575         } block_comment("  } // j");
5576 
5577         last_squaring(Ri);
5578 
5579         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
5580           lsl(Rj, Rlen, 1);
5581           sub(Rj, Rj, Ri);
5582           lsr(Rj, Rj, 1);
5583           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5584         } block_comment("  } // j");
5585 
5586         post2(Ri, Rlen);
5587         add(Ri, Ri, 1);
5588         cmp(Ri, Rlen, Assembler::LSL, 1);
5589 
5590         br(Assembler::LT, loop);
5591         bind(end);
5592         block_comment("} // i");
5593       }
5594 
5595       normalize(Rlen);
5596 
5597       mov(Ra, Pm_base);  // Save Pm_base in Ra
5598       restore_regs();  // Restore caller's Pm_base
5599 
5600       // Copy our result into caller's Pm_base
5601       reverse(Pm_base, Ra, Rlen, t0, t1);
5602 
5603       leave();
5604       ret(lr);
5605 
5606       return entry;
5607     }
5608     // In C, approximately:
5609 
5610     // void
5611     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
5612     //                   unsigned long Pm_base[], unsigned long inv, int len) {
5613     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5614     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5615     //   unsigned long Ra, Rb, Rn, Rm;
5616 
5617     //   int i;
5618 
5619     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5620 
5621     //   for (i = 0; i < len; i++) {
5622     //     int j;
5623 
5624     //     Pa = Pa_base;
5625     //     Pb = Pa_base + i;
5626     //     Pm = Pm_base;
5627     //     Pn = Pn_base + i;
5628 
5629     //     Ra = *Pa;
5630     //     Rb = *Pb;
5631     //     Rm = *Pm;
5632     //     Rn = *Pn;
5633 
5634     //     int iters = (i+1)/2;
5635     //     for (j = 0; iters--; j++) {
5636     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5637     //       MACC2(Ra, Rb, t0, t1, t2);
5638     //       Ra = *++Pa;
5639     //       Rb = *--Pb;
5640     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5641     //       MACC(Rm, Rn, t0, t1, t2);
5642     //       Rm = *++Pm;
5643     //       Rn = *--Pn;
5644     //     }
5645     //     if ((i & 1) == 0) {
5646     //       assert(Ra == Pa_base[j], "must be");
5647     //       MACC(Ra, Ra, t0, t1, t2);
5648     //     }
5649     //     iters = i/2;
5650     //     assert(iters == i-j, "must be");
5651     //     for (; iters--; j++) {
5652     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5653     //       MACC(Rm, Rn, t0, t1, t2);
5654     //       Rm = *++Pm;
5655     //       Rn = *--Pn;
5656     //     }
5657 
5658     //     *Pm = Rm = t0 * inv;
5659     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5660     //     MACC(Rm, Rn, t0, t1, t2);
5661 
5662     //     assert(t0 == 0, "broken Montgomery multiply");
5663 
5664     //     t0 = t1; t1 = t2; t2 = 0;
5665     //   }
5666 
5667     //   for (i = len; i < 2*len; i++) {
5668     //     int start = i-len+1;
5669     //     int end = start + (len - start)/2;
5670     //     int j;
5671 
5672     //     Pa = Pa_base + i-len;
5673     //     Pb = Pa_base + len;
5674     //     Pm = Pm_base + i-len;
5675     //     Pn = Pn_base + len;
5676 
5677     //     Ra = *++Pa;
5678     //     Rb = *--Pb;
5679     //     Rm = *++Pm;
5680     //     Rn = *--Pn;
5681 
5682     //     int iters = (2*len-i-1)/2;
5683     //     assert(iters == end-start, "must be");
5684     //     for (j = start; iters--; j++) {
5685     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5686     //       MACC2(Ra, Rb, t0, t1, t2);
5687     //       Ra = *++Pa;
5688     //       Rb = *--Pb;
5689     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5690     //       MACC(Rm, Rn, t0, t1, t2);
5691     //       Rm = *++Pm;
5692     //       Rn = *--Pn;
5693     //     }
5694     //     if ((i & 1) == 0) {
5695     //       assert(Ra == Pa_base[j], "must be");
5696     //       MACC(Ra, Ra, t0, t1, t2);
5697     //     }
5698     //     iters =  (2*len-i)/2;
5699     //     assert(iters == len-j, "must be");
5700     //     for (; iters--; j++) {
5701     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5702     //       MACC(Rm, Rn, t0, t1, t2);
5703     //       Rm = *++Pm;
5704     //       Rn = *--Pn;
5705     //     }
5706     //     Pm_base[i-len] = t0;
5707     //     t0 = t1; t1 = t2; t2 = 0;
5708     //   }
5709 
5710     //   while (t0)
5711     //     t0 = sub(Pm_base, Pn_base, t0, len);
5712     // }
5713   };
5714 
5715 
5716   // Initialization
5717   void generate_initial() {
    // Generate the initial stubs and initialize the entry points
5719 
    // Entry points that exist on all platforms.  Note: this is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.
5725 
5726     StubRoutines::_forward_exception_entry = generate_forward_exception();
5727 
5728     StubRoutines::_call_stub_entry =
5729       generate_call_stub(StubRoutines::_call_stub_return_address);
5730 
5731     // is referenced by megamorphic call
5732     StubRoutines::_catch_exception_entry = generate_catch_exception();
5733 
5734     // Build this early so it's available for the interpreter.
5735     StubRoutines::_throw_StackOverflowError_entry =
5736       generate_throw_exception("StackOverflowError throw_exception",
5737                                CAST_FROM_FN_PTR(address,
5738                                                 SharedRuntime::throw_StackOverflowError));
5739     StubRoutines::_throw_delayed_StackOverflowError_entry =
5740       generate_throw_exception("delayed StackOverflowError throw_exception",
5741                                CAST_FROM_FN_PTR(address,
5742                                                 SharedRuntime::throw_delayed_StackOverflowError));
5743     if (UseCRC32Intrinsics) {
      // Set the table address before generating the stubs that use it
5745       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5746       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5747     }
5748 
5749     if (UseCRC32CIntrinsics) {
5750       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5751     }
5752 
5753     // Disabled until JDK-8210858 is fixed
5754     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5755     //   StubRoutines::_dlog = generate_dlog();
5756     // }
5757 
5758     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5759       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5760     }
5761 
5762     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5763       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5764     }
5765   }
5766 
5767   void generate_all() {
5768     // support for verify_oop (must happen after universe_init)
5769     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
5770     StubRoutines::_throw_AbstractMethodError_entry =
5771       generate_throw_exception("AbstractMethodError throw_exception",
5772                                CAST_FROM_FN_PTR(address,
5773                                                 SharedRuntime::
5774                                                 throw_AbstractMethodError));
5775 
5776     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5777       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5778                                CAST_FROM_FN_PTR(address,
5779                                                 SharedRuntime::
5780                                                 throw_IncompatibleClassChangeError));
5781 
5782     StubRoutines::_throw_NullPointerException_at_call_entry =
5783       generate_throw_exception("NullPointerException at call throw_exception",
5784                                CAST_FROM_FN_PTR(address,
5785                                                 SharedRuntime::
5786                                                 throw_NullPointerException_at_call));
5787 
5788     // arraycopy stubs used by compilers
5789     generate_arraycopy_stubs();
5790 
5791     // has_negatives stub for large arrays.
5792     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5793 
5794     // array equals stub for large arrays.
5795     if (!UseSimpleArrayEquals) {
5796       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5797     }
5798 
5799     generate_compare_long_strings();
5800 
5801     generate_string_indexof_stubs();
5802 
5803     // byte_array_inflate stub for large arrays.
5804     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
5805 
5806 #ifdef COMPILER2
5807     if (UseMultiplyToLenIntrinsic) {
5808       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5809     }
5810 
5811     if (UseSquareToLenIntrinsic) {
5812       StubRoutines::_squareToLen = generate_squareToLen();
5813     }
5814 
5815     if (UseMulAddIntrinsic) {
5816       StubRoutines::_mulAdd = generate_mulAdd();
5817     }
5818 
5819     if (UseMontgomeryMultiplyIntrinsic) {
5820       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5821       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5822       StubRoutines::_montgomeryMultiply = g.generate_multiply();
5823     }
5824 
5825     if (UseMontgomerySquareIntrinsic) {
5826       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5827       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5828       // We use generate_multiply() rather than generate_square()
5829       // because it's faster for the sizes of modulus we care about.
5830       StubRoutines::_montgomerySquare = g.generate_multiply();
5831     }
5832 #endif // COMPILER2
5833 
5834 #ifndef BUILTIN_SIM
5835     // generate GHASH intrinsics code
5836     if (UseGHASHIntrinsics) {
5837       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5838     }
5839 
5840     if (UseAESIntrinsics) {
5841       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5842       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5843       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5844       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
5845     }
5846 
5847     if (UseSHA1Intrinsics) {
5848       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5849       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5850     }
5851     if (UseSHA256Intrinsics) {
5852       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5853       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5854     }
5855 
5856     // generate Adler32 intrinsics code
5857     if (UseAdler32Intrinsics) {
5858       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
5859     }
5860 
5861     // SafeFetch stubs.
5862     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5863                                                        &StubRoutines::_safefetch32_fault_pc,
5864                                                        &StubRoutines::_safefetch32_continuation_pc);
5865     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5866                                                        &StubRoutines::_safefetchN_fault_pc,
5867                                                        &StubRoutines::_safefetchN_continuation_pc);
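         // SafeFetch32/SafeFetchN (declared in stubRoutines.hpp) let VM code
         // probe a possibly unmapped address without crashing: if the load
         // faults, the signal handler resumes execution at the continuation
         // pc and the caller sees the supplied error value instead.  A usage
         // sketch (the pointer and error value are placeholders, not real code):
         //   int v = SafeFetch32(possibly_bad_ptr, 0xBAADF00D);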
5868 #endif
5869     StubRoutines::aarch64::set_completed();
5870   }
5871 
5872  public:
5873   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5874     if (all) {
5875       generate_all();
5876     } else {
5877       generate_initial();
5878     }
5879   }
5880 }; // end class declaration
5881 
5882 void StubGenerator_generate(CodeBuffer* code, bool all) {
5883   StubGenerator g(code, all);
5884 }
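
     // A minimal, hypothetical sketch of how the two generation phases are
     // driven.  The real call sites live in the shared stub initialization
     // code (StubRoutines::initialize1/initialize2); the buffer and blob
     // names below are illustrative assumptions, not the VM's actual code:
     //
     //   CodeBuffer code1(stub_blob1);
     //   StubGenerator_generate(&code1, false);  // generate_initial(): call stub, CRC32, dsin/dcos
     //   // ... universe_init() and friends run here ...
     //   CodeBuffer code2(stub_blob2);
     //   StubGenerator_generate(&code2, true);   // generate_all(): arraycopy, crypto, SafeFetch, ...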